/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;
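
/*
 * memnodemap[] is a physical-address-to-node-id hash: phys_to_nid()
 * indexes it with (addr >> memnode_shift), so each byte-sized entry
 * covers a (1UL << memnode_shift) chunk of the physical address space,
 * with 0xff marking a slot that belongs to no node.
 */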
/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost RAM (shift too big)
 */
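/*
 * Illustrative example: with shift = 24 (16MB granularity), a node
 * spanning 0x1000000-0x3000000 fills memnodemap[1] and memnodemap[2]
 * with its node id; a second node whose range hashed into either slot
 * would make this return -1.
 */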
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
        int i;
        int res = -1;
        unsigned long addr, end;

        memset(memnodemap, 0xff, memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != 0xff)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}
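
/*
 * The map is kept in memnode.embedded_map when it is small enough to
 * live alongside memnode_shift in the same cache line; larger maps get
 * a cache-aligned buffer carved out of e820 memory below.
 */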
static int __init allocate_cachealigned_memnodemap(void)
{
        unsigned long pad, pad_addr;

        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= 48)
                return 0;

        pad = L1_CACHE_BYTES - 1;
        pad_addr = 0x8000;
        nodemap_size = pad + memnodemapsize;
        nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
                                      nodemap_size);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
        pad_addr = (nodemap_addr + pad) & ~pad;
        memnodemap = phys_to_virt(pad_addr);

        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);
        return 0;
}
/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
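/*
 * Illustrative example: node starts of 0x40000000 and 0x80000000 OR to
 * 0xc0000000, whose lowest set bit is bit 30, so shift = 30 (1GB
 * granularity) is the coarsest hash that still keeps the nodes in
 * distinct memnodemap[] slots.
 */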
static int __init
extract_lsb_from_nodes(const struct bootnode *nodes, int numnodes)
{
        int i, nodes_used = 0;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;

        for (i = 0; i < numnodes; i++) {
                start = nodes[i].start;
                end = nodes[i].end;
                if (start >= end)
                        continue;
                bitfield |= start;
                nodes_used++;
                if (end > memtop)
                        memtop = end;
        }
        if (nodes_used <= 1)
                i = 63;
        else
                i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
        memnodemapsize = (memtop >> i) + 1;
        return i;
}
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
        int shift;

        shift = extract_lsb_from_nodes(nodes, numnodes);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
                printk(KERN_INFO
                       "Your memory is not aligned; you need to rebuild your "
                       "kernel with a bigger NODEMAPSIZE, shift=%d\n", shift);
                return -1;
        }
        return shift;
}
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
               unsigned long size)
{
        unsigned long mem = find_e820_area(start, end, size);
        void *ptr;

        if (mem != -1L)
                return __va(mem);
        ptr = __alloc_bootmem_nopanic(size,
                                      SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        void *bootmap;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n",
               nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages<<PAGE_SHIFT);
        if (bootmap == NULL) {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem((unsigned long)node_data[nodeid],
                                     pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        free_bootmem_with_active_regions(nodeid, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
                             bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn, memmapsize, limit;

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);

        /* Try to allocate mem_map at the end to not fill up precious <4GB
           memory. */
        memmapsize = sizeof(struct page) * (end_pfn - start_pfn);
        limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
        NODE_DATA(nodeid)->node_mem_map =
                __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
                                     memmapsize, SMP_CACHE_BYTES,
                                     round_down(limit - memmapsize, PAGE_SIZE),
                                     limit);
#endif
}
void __init numa_init_array(void)
{
        int rr, i;

        /*
         * There are unfortunately some poorly designed mainboards around
         * that only connect memory to a single CPU. This breaks the 1:1
         * cpu->node mapping. To avoid this fill in the mapping for all
         * possible CPUs, as the number of CPUs is not known yet. We round
         * robin the existing nodes (e.g. with nodes 0 and 1 online, CPUs
         * 0,1,2,3 end up on nodes 0,1,0,1).
         */
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node(i) != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}
#ifdef CONFIG_NUMA_EMU
char *cmdline __initdata;
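
/*
 * numa=fake command-line examples (illustrative, derived from the parser
 * below):
 *   numa=fake=4           split RAM into 4 equally sized nodes
 *   numa=fake=2*512,1024  two 512MB nodes, one 1024MB node, plus one
 *                         final node holding whatever RAM remains
 *   numa=fake=1024,       one 1024MB node; the trailing ',' leaves the
 *                         remaining RAM unallocated
 */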
/*
 * Sets up nid to range from addr to addr + size.  If the end boundary is
 * greater than max_addr, then max_addr is used instead.  The return value is 0
 * if there is additional memory left for allocation past addr and -1 otherwise.
 * addr is adjusted to be at the end of the node.
 */
static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
                                   u64 size, u64 max_addr)
{
        int ret = 0;

        nodes[nid].start = *addr;
        *addr += size;
        if (*addr >= max_addr) {
                *addr = max_addr;
                ret = -1;
        }
        nodes[nid].end = *addr;
        node_set(nid, node_possible_map);
        printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
               nodes[nid].start, nodes[nid].end,
               (nodes[nid].end - nodes[nid].start) >> 20);
        return ret;
}
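
/*
 * Both split helpers below lean on this return convention: they keep
 * carving out nodes while setup_node_range() returns 0 and stop once it
 * returns -1, i.e. once *addr has hit max_addr.
 */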
/*
 * Splits num_nodes nodes up equally starting at node_start.  The return value
 * is the number of nodes split up and addr is adjusted to be at the end of the
 * last node allocated.
 */
static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start,
                                      int num_nodes)
{
        unsigned int big;
        u64 size;
        int i;

        if (num_nodes <= 0)
                return -1;
        if (num_nodes > MAX_NUMNODES)
                num_nodes = MAX_NUMNODES;
        size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
               num_nodes;
        /*
         * Calculate the number of big nodes that can be allocated as a result
         * of consolidating the leftovers.
         */
        big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
              FAKE_NODE_MIN_SIZE;

        /* Round down to nearest FAKE_NODE_MIN_SIZE. */
        size &= FAKE_NODE_MIN_HASH_MASK;
        if (!size) {
                printk(KERN_ERR "Not enough memory for each node.  "
                       "NUMA emulation disabled.\n");
                return -1;
        }

        for (i = node_start; i < num_nodes + node_start; i++) {
                u64 end = *addr + size;

                if (i < big)
                        end += FAKE_NODE_MIN_SIZE;
                /*
                 * The final node can have the remaining system RAM.  Other
                 * nodes receive roughly the same amount of available pages.
                 */
                if (i == num_nodes + node_start - 1)
                        end = max_addr;
                else
                        while (end - *addr - e820_hole_size(*addr, end) <
                               size) {
                                end += FAKE_NODE_MIN_SIZE;
                                if (end > max_addr) {
                                        end = max_addr;
                                        break;
                                }
                        }
                if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
                        break;
        }
        return i - node_start + 1;
}
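
/*
 * Illustrative arithmetic, assuming the usual 64MB FAKE_NODE_MIN_SIZE:
 * splitting 10304MB across 4 nodes gives size = 2576MB, which rounds
 * down to 2560MB and strands 4 * 16MB = 64MB of leftovers, so big = 1
 * node is grown by one extra 64MB chunk.
 */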
/*
 * Splits the remaining system RAM into chunks of size.  The remaining memory
 * is always assigned to a final node and can be asymmetric.  Returns the
 * number of nodes split.
 */
static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
                                      u64 max_addr, int node_start, u64 size)
{
        int i = node_start;

        size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
        while (!setup_node_range(i++, nodes, addr, size, max_addr))
                ;
        return i - node_start;
}
/*
 * Sets up the system RAM area from start_pfn to end_pfn according to the
 * numa=fake command-line option.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        struct bootnode nodes[MAX_NUMNODES];
        u64 addr = start_pfn << PAGE_SHIFT;
        u64 max_addr = end_pfn << PAGE_SHIFT;
        int num_nodes = 0;
        int coeff_flag;
        int coeff = -1;
        int num = 0;
        u64 size;
        int i;

        memset(&nodes, 0, sizeof(nodes));
        /*
         * If the numa=fake command-line is just a single number N, split the
         * system RAM into N fake nodes.
         */
        if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
                num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
                                                simple_strtol(cmdline, NULL, 0));
                if (num_nodes < 0)
                        return num_nodes;
                goto out;
        }
        /* Parse the command line. */
        for (coeff_flag = 0; ; cmdline++) {
                if (*cmdline && isdigit(*cmdline)) {
                        num = num * 10 + *cmdline - '0';
                        continue;
                }
                if (*cmdline == '*') {
                        if (num > 0)
                                coeff = num;
                        coeff_flag = 1;
                }
                if (!*cmdline || *cmdline == ',') {
                        if (!coeff_flag)
                                coeff = 1;
                        /*
                         * Round down to the nearest FAKE_NODE_MIN_SIZE.
                         * Command-line coefficients are in megabytes.
                         */
                        size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
                        if (size)
                                for (i = 0; i < coeff; i++, num_nodes++)
                                        if (setup_node_range(num_nodes, nodes,
                                                &addr, size, max_addr) < 0)
                                                goto done;
                        if (!*cmdline)
                                break;
                        coeff_flag = 0;
                        coeff = -1;
                }
                num = 0;
        }
done:
        if (!num_nodes)
                return -1;
        /* Fill remainder of system RAM, if appropriate. */
        if (addr < max_addr) {
                if (coeff_flag && coeff < 0) {
                        /* Split remaining nodes into num-sized chunks */
                        num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
                                                         num_nodes, num);
                        goto out;
                }
                switch (*(cmdline - 1)) {
                case '*':
                        /* Split remaining nodes into coeff chunks */
                        if (coeff <= 0)
                                break;
                        num_nodes += split_nodes_equally(nodes, &addr, max_addr,
                                                         num_nodes, coeff);
                        break;
                case ',':
                        /* Do not allocate remaining system RAM */
                        break;
                default:
                        /* Give one final node */
                        setup_node_range(num_nodes, nodes, &addr,
                                         max_addr - addr, max_addr);
                        num_nodes++;
                }
        }
out:
        memnode_shift = compute_hash_shift(nodes, num_nodes);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found.  NUMA emulation "
                       "disabled.\n");
                return -1;
        }

        /*
         * We need to vacate all active ranges that may have been registered by
         * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
         * true.  NUMA emulation has succeeded so we will not scan ACPI nodes.
         */
        remove_all_active_ranges();
#ifdef CONFIG_ACPI_NUMA
        acpi_numa = -1;
#endif
        for_each_node_mask(i, node_possible_map) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                             nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        acpi_fake_nodes(nodes, num_nodes);
        numa_init_array();
        return 0;
}
#endif /* CONFIG_NUMA_EMU */
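
/*
 * Node detection falls through in order: numa=fake emulation, then the
 * ACPI SRAT, then the K8 northbridge registers, and finally a single
 * dummy node covering all memory.
 */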
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

        nodes_clear(node_possible_map);

#ifdef CONFIG_NUMA_EMU
        if (cmdline && !numa_emulation(start_pfn, end_pfn))
                return;
        nodes_clear(node_possible_map);
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
        nodes_clear(node_possible_map);
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT,
                                        end_pfn<<PAGE_SHIFT))
                return;
        nodes_clear(node_possible_map);
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
        memnode_shift = 63;
        memnodemap = memnode.embedded_map;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        node_set(0, node_possible_map);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
        e820_register_active_regions(0, start_pfn, end_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
__cpuinit void numa_add_cpu(int cpu)
{
        set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}
void __cpuinit numa_set_node(int cpu, int node)
{
        cpu_pda(cpu)->nodenumber = node;
        cpu_to_node(cpu) = node;
}
unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;

        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}
void __init paging_init(void)
{
        int i;
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();

        for_each_online_node(i) {
                setup_node_zones(i);
        }

        free_area_init_nodes(max_zone_pfns);
}
static __init int numa_setup(char *opt)
{
        if (!opt)
                return -EINVAL;
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5))
                cmdline = opt + 5;
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
        if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt + 7, NULL, 10);
#endif
        return 0;
}

early_param("numa", numa_setup);
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and apicid_to_node[]
 * have valid entries for a CPU.  This means we skip cpu_to_node[]
 * initialisation for NUMA emulation and the faked node case (when running
 * a kernel compiled for NUMA on a non-NUMA box), which is OK as
 * cpu_to_node[] is already initialized in a round-robin manner at
 * numa_init_array(), prior to this call, and that initialization is good
 * enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
        int i;

        for (i = 0; i < NR_CPUS; i++) {
                u8 apicid = x86_cpu_to_apicid_init[i];

                if (apicid == BAD_APICID)
                        continue;
                if (apicid_to_node[apicid] == NUMA_NO_NODE)
                        continue;
                numa_set_node(i, apicid_to_node[apicid]);
        }
}
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 */
int pfn_valid(unsigned long pfn)
{
        unsigned nid;

        if (pfn >= num_physpages)
                return 0;
        nid = pfn_to_nid(pfn);
        if (nid == 0xff)
                return 0;
        return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif