[PATCH] x86-64: configurable fake numa node sizes
[linux-2.6/x86.git] arch/x86_64/mm/numa.c
blob c55936bc6be64cf9dc8854e34edee4e55ca83d65
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;
/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 *  1 if OK
 *  0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
        int i;
        int res = -1;
        unsigned long addr, end;

        memset(memnodemap, 0xff, memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != 0xff)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}
static int __init allocate_cachealigned_memnodemap(void)
{
        unsigned long pad, pad_addr;

        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= 48)
                return 0;

        pad = L1_CACHE_BYTES - 1;
        pad_addr = 0x8000;
        nodemap_size = pad + memnodemapsize;
        nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
                                      nodemap_size);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
        pad_addr = (nodemap_addr + pad) & ~pad;
        memnodemap = phys_to_virt(pad_addr);

        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
        return 0;
}
/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init
extract_lsb_from_nodes(const struct bootnode *nodes, int numnodes)
{
        int i, nodes_used = 0;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;

        for (i = 0; i < numnodes; i++) {
                start = nodes[i].start;
                end = nodes[i].end;
                if (start >= end)
                        continue;
                bitfield |= start;
                nodes_used++;
                if (end > memtop)
                        memtop = end;
        }
        if (nodes_used <= 1)
                i = 63;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
        memnodemapsize = (memtop >> i) + 1;
        return i;
}
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
        int shift;

        shift = extract_lsb_from_nodes(nodes, numnodes);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
                printk(KERN_INFO
                       "Your memory is not aligned; you need to rebuild your "
                       "kernel with a bigger NODEMAPSIZE, shift=%d\n", shift);
                return -1;
        }
        return shift;
}
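
/*
 * Worked example (illustrative values): with two nodes starting at 0 and
 * 16MB, the OR of the start addresses has bit 24 as its lowest set bit, so
 * the shift is 24 and phys_to_nid(addr) reduces to memnodemap[addr >> 24],
 * i.e. one hash-map byte per 16MB of physical address space.
 */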
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
               unsigned long size)
{
        unsigned long mem = find_e820_area(start, end, size);
        void *ptr;

        if (mem != -1L)
                return __va(mem);
        ptr = __alloc_bootmem_nopanic(size,
                                      SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
        if (!ptr) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        void *bootmap;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages<<PAGE_SHIFT);
        if (bootmap == NULL) {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        free_bootmem_with_active_regions(nodeid, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn, memmapsize, limit;

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);

        /*
         * Try to allocate mem_map at the end to not fill up precious <4GB
         * memory.
         */
        memmapsize = sizeof(struct page) * (end_pfn - start_pfn);
        limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
        NODE_DATA(nodeid)->node_mem_map =
                __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
                                     memmapsize, SMP_CACHE_BYTES,
                                     round_down(limit - memmapsize, PAGE_SIZE),
                                     limit);
#endif
}
void __init numa_init_array(void)
{
        int rr, i;

        /*
         * There are unfortunately some poorly designed mainboards around
         * that only connect memory to a single CPU. This breaks the 1:1
         * cpu->node mapping. To avoid this, fill in the mapping for all
         * possible CPUs, as the number of CPUs is not known yet. We round
         * robin the existing nodes.
         */
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}
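
/*
 * Illustrative example: with nodes 0 and 1 online and eight possible CPUs
 * that have no node assigned yet, the loop above maps CPUs 0, 2, 4 and 6 to
 * node 0 and CPUs 1, 3, 5 and 7 to node 1.
 */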
#ifdef CONFIG_NUMA_EMU
/* Numa emulation */
#define E820_ADDR_HOLE_SIZE(start, end) \
        (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \
        PAGE_SHIFT)
char *cmdline __initdata;

/*
 * Sets up nid to range from addr to addr + size.  If the end boundary is
 * greater than max_addr, then max_addr is used instead.  The return value is 0
 * if there is additional memory left for allocation past addr and -1 otherwise.
 * addr is adjusted to be at the end of the node.
 */
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
                                   u64 size, u64 max_addr)
{
        int ret = 0;

        nodes[nid].start = *addr;
        *addr += size;
        if (*addr >= max_addr) {
                *addr = max_addr;
                ret = -1;
        }
        nodes[nid].end = *addr;
        node_set_online(nid);
        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
               nodes[nid].start, nodes[nid].end,
               (nodes[nid].end - nodes[nid].start) >> 20);
        return ret;
}
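
/*
 * Illustrative example: with *addr == 0, size == 512MB and max_addr == 448MB,
 * the node is clamped to 0-448MB, *addr ends up at max_addr and -1 is
 * returned to signal that no memory is left for further nodes.
 */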
/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start,
                                      int num_nodes)
{
        unsigned int big;
        u64 size;
        int i;

        if (num_nodes <= 0)
                return -1;
        if (num_nodes > MAX_NUMNODES)
                num_nodes = MAX_NUMNODES;
        size = (max_addr - *addr - E820_ADDR_HOLE_SIZE(*addr, max_addr)) /
               num_nodes;
        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the leftovers.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
              FAKE_NODE_MIN_SIZE;

        /* Round down to nearest FAKE_NODE_MIN_SIZE. */
        size &= FAKE_NODE_MIN_HASH_MASK;
        if (!size) {
                printk(KERN_ERR "Not enough memory for each node.  "
                       "NUMA emulation disabled.\n");
                return -1;
        }

        for (i = node_start; i < num_nodes + node_start; i++) {
                u64 end = *addr + size;

                if (i < big)
                        end += FAKE_NODE_MIN_SIZE;
                /*
                 * The final node can have the remaining system RAM.  Other
                 * nodes receive roughly the same amount of available pages.
                 */
                if (i == num_nodes + node_start - 1)
                        end = max_addr;
                else
                        while (end - *addr - E820_ADDR_HOLE_SIZE(*addr, end) <
                               size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > max_addr) {
                                        end = max_addr;
                                        break;
                                }
                        }
                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
                        break;
        }
        return i - node_start + 1;
}
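
/*
 * Worked example (illustrative, assuming FAKE_NODE_MIN_SIZE is 64MB): with
 * 1024MB of hole-free RAM and num_nodes == 3, size is ~341MB and rounds down
 * to 320MB; the ~21MB leftovers per node do not add up to another 64MB
 * chunk, so big == 0.  Nodes 0 and 1 get 320MB each and the final node
 * absorbs the remaining 384MB up to max_addr.
 */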
/*
 * Sets up the system RAM area from start_pfn to end_pfn according to the
 * numa=fake command-line option.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        struct bootnode nodes[MAX_NUMNODES];
        u64 addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = end_pfn << PAGE_SHIFT;
        unsigned int coeff;
        unsigned int num = 0;
        int num_nodes = 0;
        u64 size;
        int i;

        memset(&nodes, 0, sizeof(nodes));

        /*
         * If the numa=fake command-line is just a single number N, split the
         * system RAM into N fake nodes.
         */
        if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
                                                simple_strtol(cmdline, NULL, 0));
                if (num_nodes < 0)
                        return num_nodes;
                goto out;
        }

        /* Parse the command line. */
        for (coeff = 1; ; cmdline++) {
                if (*cmdline && isdigit(*cmdline)) {
                        num = num * 10 + *cmdline - '0';
                        continue;
                }
                if (*cmdline == '*')
                        coeff = num;
                if (!*cmdline || *cmdline == ',') {
                        /*
                         * Round down to the nearest FAKE_NODE_MIN_SIZE.
                         * Command-line coefficients are in megabytes.
                         */
                        size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                        if (size) {
                                for (i = 0; i < coeff; i++, num_nodes++)
                                        if (setup_node_range(num_nodes, nodes,
                                                &addr, size, max_addr) < 0)
                                                goto done;
                                coeff = 1;
                        }
                }
                if (!*cmdline)
                        break;
                num = 0;
        }
done:
        if (!num_nodes)
                return -1;
        /* Fill remainder of system RAM with a final node, if appropriate. */
        if (addr < max_addr) {
                setup_node_range(num_nodes, nodes, &addr, max_addr - addr,
                                 max_addr);
                num_nodes++;
        }
out:
        memnode_shift = compute_hash_shift(nodes, num_nodes);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
                       "disabled.\n");
                return -1;
        }

        /*
         * We need to vacate all active ranges that may have been registered by
         * SRAT.
         */
        remove_all_active_ranges();
        for_each_online_node(i) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                             nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        numa_init_array();
        return 0;
}
#undef E820_ADDR_HOLE_SIZE
#endif /* CONFIG_NUMA_EMU */
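
/*
 * Example command lines accepted by the parser above (node sizes are given
 * in MB and rounded down to a multiple of FAKE_NODE_MIN_SIZE; values are
 * illustrative):
 *
 *   numa=fake=4           split all system RAM into 4 equally sized nodes
 *   numa=fake=2*512,1024  two 512MB nodes, one 1024MB node, plus a final
 *                         node covering whatever RAM remains
 */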
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

#ifdef CONFIG_NUMA_EMU
        if (cmdline && !numa_emulation(start_pfn, end_pfn))
                return;
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
                return;
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
        memnode_shift = 63;
        memnodemap = memnode.embedded_map;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
        e820_register_active_regions(0, start_pfn, end_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
__cpuinit void numa_add_cpu(int cpu)
{
        set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
        cpu_pda(cpu)->nodenumber = node;
        cpu_to_node[cpu] = node;
}
unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;

        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}
void __init paging_init(void)
{
        int i;
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();

        for_each_online_node(i) {
                setup_node_zones(i);
        }

        free_area_init_nodes(max_zone_pfns);
}
static __init int numa_setup(char *opt)
{
        if (!opt)
                return -EINVAL;
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5))
                cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
        if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
        return 0;
}

early_param("numa", numa_setup);
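
/*
 * Boot options handled by numa_setup() above (illustrative summary):
 *   numa=off               set numa_off and skip NUMA node detection
 *   numa=fake=<N>|<sizes>  enable NUMA emulation (see numa_emulation())
 *   numa=noacpi            set acpi_numa = -1 so SRAT information is ignored
 *   numa=hotadd=<percent>  set hotadd_percent for SRAT hotplug area handling
 */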
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the faked node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
        int i;

        for (i = 0; i < NR_CPUS; i++) {
                u8 apicid = x86_cpu_to_apicid[i];
                if (apicid == BAD_APICID)
                        continue;
                if (apicid_to_node[apicid] == NUMA_NO_NODE)
                        continue;
                numa_set_node(i, apicid_to_node[apicid]);
        }
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */
int pfn_valid(unsigned long pfn)
{
        unsigned nid;

        if (pfn >= num_physpages)
                return 0;
        nid = pfn_to_nid(pfn);
        if (nid == 0xff)
                return 0;
        return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif