arch/x86/mm/numa_64.c

   1 /*
   2  * Generic VM initialization for x86-64 NUMA setups.
   3  * Copyright 2002,2003 Andi Kleen, SuSE Labs.
   4  */
   5 #include <linux/kernel.h>
   6 #include <linux/mm.h>
   7 #include <linux/string.h>
   8 #include <linux/init.h>
   9 #include <linux/bootmem.h>
  10 #include <linux/mmzone.h>
  11 #include <linux/ctype.h>
  12 #include <linux/module.h>
  13 #include <linux/nodemask.h>
  14 #include <linux/sched.h>
  15
  16 #include <asm/e820.h>
  17 #include <asm/proto.h>
  18 #include <asm/dma.h>
  19 #include <asm/numa.h>
  20 #include <asm/acpi.h>
  21 #include <asm/k8.h>
  22
/* Per-node pglist_data pointers, indexed by node id; exported to modules. */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

/* Holds the embedded_map that memnodemap points at by default. */
struct memnode memnode;

/* APIC id -> node id table; every entry starts out as NUMA_NO_NODE. */
s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

/* Set to 1 by the "numa=off" early parameter (see numa_setup()). */
int numa_off __initdata;
/*
 * Physical address and byte size of a memnodemap found through
 * find_e820_area(); both stay 0 when the embedded map is used or the
 * search failed.
 */
static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

/* Per-cpu node id, written by numa_set_node() for valid nodes. */
DEFINE_PER_CPU(int, node_number) = 0;
EXPORT_PER_CPU_SYMBOL(node_number);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
  44
  45 /*
  46  * Given a shift value, try to populate memnodemap[]
  47  * Returns :
  48  * 1 if OK
  49  * 0 if memnodmap[] too small (of shift too small)
  50  * -1 if node overlap or lost ram (shift too big)
  51  */
  52 static int __init populate_memnodemap(const struct bootnode *nodes,
  53                                       int numnodes, int shift, int *nodeids)
  54 {
  55         unsigned long addr, end;
  56         int i, res = -1;
  57
  58         memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
  59         for (i = 0; i < numnodes; i++) {
  60                 addr = nodes[i].start;
  61                 end = nodes[i].end;
  62                 if (addr >= end)
  63                         continue;
  64                 if ((end >> shift) >= memnodemapsize)
  65                         return 0;
  66                 do {
  67                         if (memnodemap[addr >> shift] != NUMA_NO_NODE)
  68                                 return -1;
  69
  70                         if (!nodeids)
  71                                 memnodemap[addr >> shift] = i;
  72                         else
  73                                 memnodemap[addr >> shift] = nodeids[i];
  74
  75                         addr += (1UL << shift);
  76                 } while (addr < end);
  77                 res = 1;
  78         }
  79         return res;
  80 }
  81
  82 static int __init allocate_cachealigned_memnodemap(void)
  83 {
  84         unsigned long addr;
  85
  86         memnodemap = memnode.embedded_map;
  87         if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
  88                 return 0;
  89
  90         addr = 0x8000;
  91         nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
  92         nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
  93                                       nodemap_size, L1_CACHE_BYTES);
  94         if (nodemap_addr == -1UL) {
  95                 printk(KERN_ERR
  96                        "NUMA: Unable to allocate Memory to Node hash map\n");
  97                 nodemap_addr = nodemap_size = 0;
  98                 return -1;
  99         }
 100         memnodemap = phys_to_virt(nodemap_addr);
 101         reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
 102
 103         printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
 104                nodemap_addr, nodemap_addr + nodemap_size);
 105         return 0;
 106 }
 107
 108 /*
 109  * The LSB of all start and end addresses in the node map is the value of the
 110  * maximum possible shift.
 111  */
 112 static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
 113                                          int numnodes)
 114 {
 115         int i, nodes_used = 0;
 116         unsigned long start, end;
 117         unsigned long bitfield = 0, memtop = 0;
 118
 119         for (i = 0; i < numnodes; i++) {
 120                 start = nodes[i].start;
 121                 end = nodes[i].end;
 122                 if (start >= end)
 123                         continue;
 124                 bitfield |= start;
 125                 nodes_used++;
 126                 if (end > memtop)
 127                         memtop = end;
 128         }
 129         if (nodes_used <= 1)
 130                 i = 63;
 131         else
 132                 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
 133         memnodemapsize = (memtop >> i)+1;
 134         return i;
 135 }
 136
 137 int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
 138                               int *nodeids)
 139 {
 140         int shift;
 141
 142         shift = extract_lsb_from_nodes(nodes, numnodes);
 143         if (allocate_cachealigned_memnodemap())
 144                 return -1;
 145         printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 146                 shift);
 147
 148         if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
 149                 printk(KERN_INFO "Your memory is not aligned you need to "
 150                        "rebuild your kernel with a bigger NODEMAPSIZE "
 151                        "shift=%d\n", shift);
 152                 return -1;
 153         }
 154         return shift;
 155 }
 156
 157 int __meminit  __early_pfn_to_nid(unsigned long pfn)
 158 {
 159         return phys_to_nid(pfn << PAGE_SHIFT);
 160 }
 161
 162 static void * __init early_node_mem(int nodeid, unsigned long start,
 163                                     unsigned long end, unsigned long size,
 164                                     unsigned long align)
 165 {
 166         unsigned long mem;
 167
 168         /*
 169          * put it on high as possible
 170          * something will go with NODE_DATA
 171          */
 172         if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
 173                 start = MAX_DMA_PFN<<PAGE_SHIFT;
 174         if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
 175             end > (MAX_DMA32_PFN<<PAGE_SHIFT))
 176                 start = MAX_DMA32_PFN<<PAGE_SHIFT;
 177         mem = find_e820_area(start, end, size, align);
 178         if (mem != -1L)
 179                 return __va(mem);
 180
 181         /* extend the search scope */
 182         end = max_pfn_mapped << PAGE_SHIFT;
 183         if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
 184                 start = MAX_DMA32_PFN<<PAGE_SHIFT;
 185         else
 186                 start = MAX_DMA_PFN<<PAGE_SHIFT;
 187         mem = find_e820_area(start, end, size, align);
 188         if (mem != -1L)
 189                 return __va(mem);
 190
 191         printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
 192                        size, nodeid);
 193
 194         return NULL;
 195 }
 196
 197 /* Initialize bootmem allocator for a node */
 198 void __init
 199 setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 200 {
 201         unsigned long start_pfn, last_pfn, nodedata_phys;
 202         const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 203         int nid;
 204 #ifndef CONFIG_NO_BOOTMEM
 205         unsigned long bootmap_start, bootmap_pages, bootmap_size;
 206         void *bootmap;
 207 #endif
 208
 209         if (!end)
 210                 return;
 211
 212         /*
 213          * Don't confuse VM with a node that doesn't have the
 214          * minimum amount of memory:
 215          */
 216         if (end && (end - start) < NODE_MIN_SIZE)
 217                 return;
 218
 219         start = roundup(start, ZONE_ALIGN);
 220
 221         printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
 222                start, end);
 223
 224         start_pfn = start >> PAGE_SHIFT;
 225         last_pfn = end >> PAGE_SHIFT;
 226
 227         node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
 228                                            SMP_CACHE_BYTES);
 229         if (node_data[nodeid] == NULL)
 230                 return;
 231         nodedata_phys = __pa(node_data[nodeid]);
 232         reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
 233         printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
 234                 nodedata_phys + pgdat_size - 1);
 235         nid = phys_to_nid(nodedata_phys);
 236         if (nid != nodeid)
 237                 printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);
 238
 239         memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
 240         NODE_DATA(nodeid)->node_id = nodeid;
 241         NODE_DATA(nodeid)->node_start_pfn = start_pfn;
 242         NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
 243
 244 #ifndef CONFIG_NO_BOOTMEM
 245         NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
 246
 247         /*
 248          * Find a place for the bootmem map
 249          * nodedata_phys could be on other nodes by alloc_bootmem,
 250          * so need to sure bootmap_start not to be small, otherwise
 251          * early_node_mem will get that with find_e820_area instead
 252          * of alloc_bootmem, that could clash with reserved range
 253          */
 254         bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
 255         bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
 256         /*
 257          * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
 258          * to use that to align to PAGE_SIZE
 259          */
 260         bootmap = early_node_mem(nodeid, bootmap_start, end,
 261                                  bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
 262         if (bootmap == NULL)  {
 263                 free_early(nodedata_phys, nodedata_phys + pgdat_size);
 264                 node_data[nodeid] = NULL;
 265                 return;
 266         }
 267         bootmap_start = __pa(bootmap);
 268         reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
 269                         "BOOTMAP");
 270
 271         bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
 272                                          bootmap_start >> PAGE_SHIFT,
 273                                          start_pfn, last_pfn);
 274
 275         printk(KERN_INFO "  bootmap [%016lx -  %016lx] pages %lx\n",
 276                  bootmap_start, bootmap_start + bootmap_size - 1,
 277                  bootmap_pages);
 278         nid = phys_to_nid(bootmap_start);
 279         if (nid != nodeid)
 280                 printk(KERN_INFO "    bootmap(%d) on node %d\n", nodeid, nid);
 281
 282         free_bootmem_with_active_regions(nodeid, end);
 283 #endif
 284
 285         node_set_online(nodeid);
 286 }
 287
 288 /*
 289  * There are unfortunately some poorly designed mainboards around that
 290  * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 291  * mapping. To avoid this fill in the mapping for all possible CPUs,
 292  * as the number of CPUs is not known yet. We round robin the existing
 293  * nodes.
 294  */
 295 void __init numa_init_array(void)
 296 {
 297         int rr, i;
 298
 299         rr = first_node(node_online_map);
 300         for (i = 0; i < nr_cpu_ids; i++) {
 301                 if (early_cpu_to_node(i) != NUMA_NO_NODE)
 302                         continue;
 303                 numa_set_node(i, rr);
 304                 rr = next_node(rr, node_online_map);
 305                 if (rr == MAX_NUMNODES)
 306                         rr = first_node(node_online_map);
 307         }
 308 }
 309
 310 #ifdef CONFIG_NUMA_EMU
/* Numa emulation */
/* Emulated (fake) node ranges, indexed by fake node id; see setup_node_range(). */
static struct bootnode nodes[MAX_NUMNODES] __initdata;
/*
 * Physical node ranges filled in by setup_physnodes().  The split_nodes_*()
 * helpers advance each entry's start as fake nodes are carved out of it.
 */
static struct bootnode physnodes[MAX_NUMNODES] __initdata;
/* Text following "fake=" in the numa= early parameter; NULL if not given. */
static char *cmdline __initdata;
 315
 316 static int __init setup_physnodes(unsigned long start, unsigned long end,
 317                                         int acpi, int k8)
 318 {
 319         int nr_nodes = 0;
 320         int ret = 0;
 321         int i;
 322
 323 #ifdef CONFIG_ACPI_NUMA
 324         if (acpi)
 325                 nr_nodes = acpi_get_nodes(physnodes);
 326 #endif
 327 #ifdef CONFIG_K8_NUMA
 328         if (k8)
 329                 nr_nodes = k8_get_nodes(physnodes);
 330 #endif
 331         /*
 332          * Basic sanity checking on the physical node map: there may be errors
 333          * if the SRAT or K8 incorrectly reported the topology or the mem=
 334          * kernel parameter is used.
 335          */
 336         for (i = 0; i < nr_nodes; i++) {
 337                 if (physnodes[i].start == physnodes[i].end)
 338                         continue;
 339                 if (physnodes[i].start > end) {
 340                         physnodes[i].end = physnodes[i].start;
 341                         continue;
 342                 }
 343                 if (physnodes[i].end < start) {
 344                         physnodes[i].start = physnodes[i].end;
 345                         continue;
 346                 }
 347                 if (physnodes[i].start < start)
 348                         physnodes[i].start = start;
 349                 if (physnodes[i].end > end)
 350                         physnodes[i].end = end;
 351         }
 352
 353         /*
 354          * Remove all nodes that have no memory or were truncated because of the
 355          * limited address range.
 356          */
 357         for (i = 0; i < nr_nodes; i++) {
 358                 if (physnodes[i].start == physnodes[i].end)
 359                         continue;
 360                 physnodes[ret].start = physnodes[i].start;
 361                 physnodes[ret].end = physnodes[i].end;
 362                 ret++;
 363         }
 364
 365         /*
 366          * If no physical topology was detected, a single node is faked to cover
 367          * the entire address space.
 368          */
 369         if (!ret) {
 370                 physnodes[ret].start = start;
 371                 physnodes[ret].end = end;
 372                 ret = 1;
 373         }
 374         return ret;
 375 }
 376
 377 /*
 378  * Setups up nid to range from addr to addr + size.  If the end
 379  * boundary is greater than max_addr, then max_addr is used instead.
 380  * The return value is 0 if there is additional memory left for
 381  * allocation past addr and -1 otherwise.  addr is adjusted to be at
 382  * the end of the node.
 383  */
 384 static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
 385 {
 386         int ret = 0;
 387         nodes[nid].start = *addr;
 388         *addr += size;
 389         if (*addr >= max_addr) {
 390                 *addr = max_addr;
 391                 ret = -1;
 392         }
 393         nodes[nid].end = *addr;
 394         node_set(nid, node_possible_map);
 395         printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
 396                nodes[nid].start, nodes[nid].end,
 397                (nodes[nid].end - nodes[nid].start) >> 20);
 398         return ret;
 399 }
 400
 401 /*
 402  * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
 403  * to max_addr.  The return value is the number of nodes allocated.
 404  */
 405 static int __init split_nodes_interleave(u64 addr, u64 max_addr,
 406                                                 int nr_phys_nodes, int nr_nodes)
 407 {
 408         nodemask_t physnode_mask = NODE_MASK_NONE;
 409         u64 size;
 410         int big;
 411         int ret = 0;
 412         int i;
 413
 414         if (nr_nodes <= 0)
 415                 return -1;
 416         if (nr_nodes > MAX_NUMNODES) {
 417                 pr_info("numa=fake=%d too large, reducing to %d\n",
 418                         nr_nodes, MAX_NUMNODES);
 419                 nr_nodes = MAX_NUMNODES;
 420         }
 421
 422         size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
 423         /*
 424          * Calculate the number of big nodes that can be allocated as a result
 425          * of consolidating the remainder.
 426          */
 427         big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
 428                 FAKE_NODE_MIN_SIZE;
 429
 430         size &= FAKE_NODE_MIN_HASH_MASK;
 431         if (!size) {
 432                 pr_err("Not enough memory for each node.  "
 433                         "NUMA emulation disabled.\n");
 434                 return -1;
 435         }
 436
 437         for (i = 0; i < nr_phys_nodes; i++)
 438                 if (physnodes[i].start != physnodes[i].end)
 439                         node_set(i, physnode_mask);
 440
 441         /*
 442          * Continue to fill physical nodes with fake nodes until there is no
 443          * memory left on any of them.
 444          */
 445         while (nodes_weight(physnode_mask)) {
 446                 for_each_node_mask(i, physnode_mask) {
 447                         u64 end = physnodes[i].start + size;
 448                         u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
 449
 450                         if (ret < big)
 451                                 end += FAKE_NODE_MIN_SIZE;
 452
 453                         /*
 454                          * Continue to add memory to this fake node if its
 455                          * non-reserved memory is less than the per-node size.
 456                          */
 457                         while (end - physnodes[i].start -
 458                                 e820_hole_size(physnodes[i].start, end) < size) {
 459                                 end += FAKE_NODE_MIN_SIZE;
 460                                 if (end > physnodes[i].end) {
 461                                         end = physnodes[i].end;
 462                                         break;
 463                                 }
 464                         }
 465
 466                         /*
 467                          * If there won't be at least FAKE_NODE_MIN_SIZE of
 468                          * non-reserved memory in ZONE_DMA32 for the next node,
 469                          * this one must extend to the boundary.
 470                          */
 471                         if (end < dma32_end && dma32_end - end -
 472                             e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
 473                                 end = dma32_end;
 474
 475                         /*
 476                          * If there won't be enough non-reserved memory for the
 477                          * next node, this one must extend to the end of the
 478                          * physical node.
 479                          */
 480                         if (physnodes[i].end - end -
 481                             e820_hole_size(end, physnodes[i].end) < size)
 482                                 end = physnodes[i].end;
 483
 484                         /*
 485                          * Avoid allocating more nodes than requested, which can
 486                          * happen as a result of rounding down each node's size
 487                          * to FAKE_NODE_MIN_SIZE.
 488                          */
 489                         if (nodes_weight(physnode_mask) + ret >= nr_nodes)
 490                                 end = physnodes[i].end;
 491
 492                         if (setup_node_range(ret++, &physnodes[i].start,
 493                                                 end - physnodes[i].start,
 494                                                 physnodes[i].end) < 0)
 495                                 node_clear(i, physnode_mask);
 496                 }
 497         }
 498         return ret;
 499 }
 500
 501 /*
 502  * Returns the end address of a node so that there is at least `size' amount of
 503  * non-reserved memory or `max_addr' is reached.
 504  */
 505 static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
 506 {
 507         u64 end = start + size;
 508
 509         while (end - start - e820_hole_size(start, end) < size) {
 510                 end += FAKE_NODE_MIN_SIZE;
 511                 if (end > max_addr) {
 512                         end = max_addr;
 513                         break;
 514                 }
 515         }
 516         return end;
 517 }
 518
 519 /*
 520  * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 521  * `addr' to `max_addr'.  The return value is the number of nodes allocated.
 522  */
 523 static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
 524 {
 525         nodemask_t physnode_mask = NODE_MASK_NONE;
 526         u64 min_size;
 527         int ret = 0;
 528         int i;
 529
 530         if (!size)
 531                 return -1;
 532         /*
 533          * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
 534          * increased accordingly if the requested size is too small.  This
 535          * creates a uniform distribution of node sizes across the entire
 536          * machine (but not necessarily over physical nodes).
 537          */
 538         min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
 539                                                 MAX_NUMNODES;
 540         min_size = max(min_size, FAKE_NODE_MIN_SIZE);
 541         if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
 542                 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
 543                                                 FAKE_NODE_MIN_HASH_MASK;
 544         if (size < min_size) {
 545                 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
 546                         size >> 20, min_size >> 20);
 547                 size = min_size;
 548         }
 549         size &= FAKE_NODE_MIN_HASH_MASK;
 550
 551         for (i = 0; i < MAX_NUMNODES; i++)
 552                 if (physnodes[i].start != physnodes[i].end)
 553                         node_set(i, physnode_mask);
 554         /*
 555          * Fill physical nodes with fake nodes of size until there is no memory
 556          * left on any of them.
 557          */
 558         while (nodes_weight(physnode_mask)) {
 559                 for_each_node_mask(i, physnode_mask) {
 560                         u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
 561                         u64 end;
 562
 563                         end = find_end_of_node(physnodes[i].start,
 564                                                 physnodes[i].end, size);
 565                         /*
 566                          * If there won't be at least FAKE_NODE_MIN_SIZE of
 567                          * non-reserved memory in ZONE_DMA32 for the next node,
 568                          * this one must extend to the boundary.
 569                          */
 570                         if (end < dma32_end && dma32_end - end -
 571                             e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
 572                                 end = dma32_end;
 573
 574                         /*
 575                          * If there won't be enough non-reserved memory for the
 576                          * next node, this one must extend to the end of the
 577                          * physical node.
 578                          */
 579                         if (physnodes[i].end - end -
 580                             e820_hole_size(end, physnodes[i].end) < size)
 581                                 end = physnodes[i].end;
 582
 583                         /*
 584                          * Setup the fake node that will be allocated as bootmem
 585                          * later.  If setup_node_range() returns non-zero, there
 586                          * is no more memory available on this physical node.
 587                          */
 588                         if (setup_node_range(ret++, &physnodes[i].start,
 589                                                 end - physnodes[i].start,
 590                                                 physnodes[i].end) < 0)
 591                                 node_clear(i, physnode_mask);
 592                 }
 593         }
 594         return ret;
 595 }
 596
 597 /*
 598  * Sets up the system RAM area from start_pfn to last_pfn according to the
 599  * numa=fake command-line option.
 600  */
 601 static int __init numa_emulation(unsigned long start_pfn,
 602                         unsigned long last_pfn, int acpi, int k8)
 603 {
 604         u64 addr = start_pfn << PAGE_SHIFT;
 605         u64 max_addr = last_pfn << PAGE_SHIFT;
 606         int num_phys_nodes;
 607         int num_nodes;
 608         int i;
 609
 610         num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
 611         /*
 612          * If the numa=fake command-line contains a 'M' or 'G', it represents
 613          * the fixed node size.  Otherwise, if it is just a single number N,
 614          * split the system RAM into N fake nodes.
 615          */
 616         if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
 617                 u64 size;
 618
 619                 size = memparse(cmdline, &cmdline);
 620                 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
 621         } else {
 622                 unsigned long n;
 623
 624                 n = simple_strtoul(cmdline, NULL, 0);
 625                 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
 626         }
 627
 628         if (num_nodes < 0)
 629                 return num_nodes;
 630         memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
 631         if (memnode_shift < 0) {
 632                 memnode_shift = 0;
 633                 printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
 634                        "disabled.\n");
 635                 return -1;
 636         }
 637
 638         /*
 639          * We need to vacate all active ranges that may have been registered for
 640          * the e820 memory map.
 641          */
 642         remove_all_active_ranges();
 643         for_each_node_mask(i, node_possible_map) {
 644                 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
 645                                                 nodes[i].end >> PAGE_SHIFT);
 646                 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
 647         }
 648         acpi_fake_nodes(nodes, num_nodes);
 649         numa_init_array();
 650         return 0;
 651 }
 652 #endif /* CONFIG_NUMA_EMU */
 653
 654 void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
 655                                 int acpi, int k8)
 656 {
 657         int i;
 658
 659         nodes_clear(node_possible_map);
 660         nodes_clear(node_online_map);
 661
 662 #ifdef CONFIG_NUMA_EMU
 663         if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
 664                 return;
 665         nodes_clear(node_possible_map);
 666         nodes_clear(node_online_map);
 667 #endif
 668
 669 #ifdef CONFIG_ACPI_NUMA
 670         if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
 671                                                   last_pfn << PAGE_SHIFT))
 672                 return;
 673         nodes_clear(node_possible_map);
 674         nodes_clear(node_online_map);
 675 #endif
 676
 677 #ifdef CONFIG_K8_NUMA
 678         if (!numa_off && k8 && !k8_scan_nodes())
 679                 return;
 680         nodes_clear(node_possible_map);
 681         nodes_clear(node_online_map);
 682 #endif
 683         printk(KERN_INFO "%s\n",
 684                numa_off ? "NUMA turned off" : "No NUMA configuration found");
 685
 686         printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
 687                start_pfn << PAGE_SHIFT,
 688                last_pfn << PAGE_SHIFT);
 689         /* setup dummy node covering all memory */
 690         memnode_shift = 63;
 691         memnodemap = memnode.embedded_map;
 692         memnodemap[0] = 0;
 693         node_set_online(0);
 694         node_set(0, node_possible_map);
 695         for (i = 0; i < nr_cpu_ids; i++)
 696                 numa_set_node(i, 0);
 697         e820_register_active_regions(0, start_pfn, last_pfn);
 698         setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
 699 }
 700
 701 unsigned long __init numa_free_all_bootmem(void)
 702 {
 703         unsigned long pages = 0;
 704         int i;
 705
 706         for_each_online_node(i)
 707                 pages += free_all_bootmem_node(NODE_DATA(i));
 708
 709 #ifdef CONFIG_NO_BOOTMEM
 710         pages += free_all_memory_core_early(MAX_NUMNODES);
 711 #endif
 712
 713         return pages;
 714 }
 715
 716 static __init int numa_setup(char *opt)
 717 {
 718         if (!opt)
 719                 return -EINVAL;
 720         if (!strncmp(opt, "off", 3))
 721                 numa_off = 1;
 722 #ifdef CONFIG_NUMA_EMU
 723         if (!strncmp(opt, "fake=", 5))
 724                 cmdline = opt + 5;
 725 #endif
 726 #ifdef CONFIG_ACPI_NUMA
 727         if (!strncmp(opt, "noacpi", 6))
 728                 acpi_numa = -1;
 729 #endif
 730         return 0;
 731 }
 732 early_param("numa", numa_setup);
 733
 734 #ifdef CONFIG_NUMA
 735
 736 static __init int find_near_online_node(int node)
 737 {
 738         int n, val;
 739         int min_val = INT_MAX;
 740         int best_node = -1;
 741
 742         for_each_online_node(n) {
 743                 val = node_distance(node, n);
 744
 745                 if (val < min_val) {
 746                         min_val = val;
 747                         best_node = n;
 748                 }
 749         }
 750
 751         return best_node;
 752 }
 753
 754 /*
 755  * Setup early cpu_to_node.
 756  *
 757  * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
 758  * and apicid_to_node[] tables have valid entries for a CPU.
 759  * This means we skip cpu_to_node[] initialisation for NUMA
 760  * emulation and faking node case (when running a kernel compiled
 761  * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
 762  * is already initialized in a round robin manner at numa_init_array,
 763  * prior to this call, and this initialization is good enough
 764  * for the fake NUMA cases.
 765  *
 766  * Called before the per_cpu areas are setup.
 767  */
 768 void __init init_cpu_to_node(void)
 769 {
 770         int cpu;
 771         u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
 772
 773         BUG_ON(cpu_to_apicid == NULL);
 774
 775         for_each_possible_cpu(cpu) {
 776                 int node;
 777                 u16 apicid = cpu_to_apicid[cpu];
 778
 779                 if (apicid == BAD_APICID)
 780                         continue;
 781                 node = apicid_to_node[apicid];
 782                 if (node == NUMA_NO_NODE)
 783                         continue;
 784                 if (!node_online(node))
 785                         node = find_near_online_node(node);
 786                 numa_set_node(cpu, node);
 787         }
 788 }
 789 #endif
 790
 791
 792 void __cpuinit numa_set_node(int cpu, int node)
 793 {
 794         int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 795
 796         /* early setting, no percpu area yet */
 797         if (cpu_to_node_map) {
 798                 cpu_to_node_map[cpu] = node;
 799                 return;
 800         }
 801
 802 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 803         if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
 804                 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
 805                 dump_stack();
 806                 return;
 807         }
 808 #endif
 809         per_cpu(x86_cpu_to_node_map, cpu) = node;
 810
 811         if (node != NUMA_NO_NODE)
 812                 per_cpu(node_number, cpu) = node;
 813 }
 814
 815 void __cpuinit numa_clear_node(int cpu)
 816 {
 817         numa_set_node(cpu, NUMA_NO_NODE);
 818 }
 819
 820 #ifndef CONFIG_DEBUG_PER_CPU_MAPS
 821
 822 void __cpuinit numa_add_cpu(int cpu)
 823 {
 824         cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 825 }
 826
 827 void __cpuinit numa_remove_cpu(int cpu)
 828 {
 829         cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 830 }
 831
 832 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
 833
 834 /*
 835  * --------- debug versions of the numa functions ---------
 836  */
 837 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 838 {
 839         int node = early_cpu_to_node(cpu);
 840         struct cpumask *mask;
 841         char buf[64];
 842
 843         mask = node_to_cpumask_map[node];
 844         if (mask == NULL) {
 845                 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
 846                 dump_stack();
 847                 return;
 848         }
 849
 850         if (enable)
 851                 cpumask_set_cpu(cpu, mask);
 852         else
 853                 cpumask_clear_cpu(cpu, mask);
 854
 855         cpulist_scnprintf(buf, sizeof(buf), mask);
 856         printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
 857                 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
 858 }
 859
 860 void __cpuinit numa_add_cpu(int cpu)
 861 {
 862         numa_set_cpumask(cpu, 1);
 863 }
 864
 865 void __cpuinit numa_remove_cpu(int cpu)
 866 {
 867         numa_set_cpumask(cpu, 0);
 868 }
 869
 870 int cpu_to_node(int cpu)
 871 {
 872         if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
 873                 printk(KERN_WARNING
 874                         "cpu_to_node(%d): usage too early!\n", cpu);
 875                 dump_stack();
 876                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
 877         }
 878         return per_cpu(x86_cpu_to_node_map, cpu);
 879 }
 880 EXPORT_SYMBOL(cpu_to_node);
 881
 882 /*
 883  * Same function as cpu_to_node() but used if called before the
 884  * per_cpu areas are setup.
 885  */
 886 int early_cpu_to_node(int cpu)
 887 {
 888         if (early_per_cpu_ptr(x86_cpu_to_node_map))
 889                 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
 890
 891         if (!cpu_possible(cpu)) {
 892                 printk(KERN_WARNING
 893                         "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
 894                 dump_stack();
 895                 return NUMA_NO_NODE;
 896         }
 897         return per_cpu(x86_cpu_to_node_map, cpu);
 898 }
 899
 900 /*
 901  * --------- end of debug versions of the numa functions ---------
 902  */
 903
 904 #endif /* CONFIG_DEBUG_PER_CPU_MAPS */