[PATCH] x86_64: Handle empty PXMs that only contain hotplug memory
arch/x86_64/mm/numa.c
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

int numa_off __initdata;
/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 *  1 if OK
 *  0 if memnodemap[] too small (or shift too small)
 * -1 if nodes overlap or RAM is lost (shift too big)
 */
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
	int i;
	int res = -1;
	unsigned long addr, end;

	if (shift >= 64)
		return -1;
	memset(memnodemap, 0xff, sizeof(memnodemap));
	for (i = 0; i < numnodes; i++) {
		addr = nodes[i].start;
		end = nodes[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= NODEMAPSIZE)
			return 0;
		do {
			if (memnodemap[addr >> shift] != 0xff)
				return -1;
			memnodemap[addr >> shift] = i;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
	int shift = 20;

	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
		shift++;

	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
		shift);

	if (populate_memnodemap(nodes, numnodes, shift) != 1) {
		printk(KERN_INFO
		       "Your memory is not aligned; rebuild your kernel "
		       "with a bigger NODEMAPSIZE, shift=%d\n",
		       shift);
		return -1;
	}
	return shift;
}
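
/*
 * Sketch of how the table is consumed, assuming phys_to_nid() in
 * <asm/mmzone.h> keeps its usual form:
 *
 *	nid = memnodemap[addr >> memnode_shift];
 *
 * Every (1UL << memnode_shift) sized chunk of physical address space
 * must therefore belong to at most one node; compute_hash_shift()
 * above picks the largest shift for which no chunk spans two nodes.
 */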
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
	       unsigned long size)
{
	unsigned long mem = find_e820_area(start, end, size);
	void *ptr;

	if (mem != -1L)
		return __va(mem);
	ptr = __alloc_bootmem_nopanic(size,
				      SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
	if (ptr == NULL) {
		printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
		       size, nodeid);
		return NULL;
	}
	return ptr;
}
/* Initialize bootmem allocator for a node */
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
	unsigned long nodedata_phys;
	void *bootmap;
	const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

	start = round_up(start, ZONE_ALIGN);

	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

	start_pfn = start >> PAGE_SHIFT;
	end_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;

	/* Find a place for the bootmem map */
	bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
	bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
	bootmap = early_node_mem(nodeid, bootmap_start, end,
				 bootmap_pages << PAGE_SHIFT);
	if (bootmap == NULL) {
		if (nodedata_phys < start || nodedata_phys >= end)
			free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
		node_data[nodeid] = NULL;
		return;
	}
	bootmap_start = __pa(bootmap);
	Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

	bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
					 bootmap_start >> PAGE_SHIFT,
					 start_pfn, end_pfn);

	e820_bootmem_free(NODE_DATA(nodeid), start, end);

	reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
	reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages << PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
	srat_reserve_add_area(nodeid);
#endif
	node_set_online(nodeid);
}
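
/*
 * Rough picture of the node-local layout set up above (a sketch only;
 * the exact placement depends on what early_node_mem() finds):
 *
 *	start .. [pg_data_t][padding][bootmem bitmap] .. end
 *	          ^nodedata_phys      ^bootmap_start
 *
 * Both early allocations are then marked reserved in the node's own
 * bootmem allocator so they are never handed out again.
 */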
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
	unsigned long start_pfn, end_pfn, memmapsize, limit;
	unsigned long zones[MAX_NR_ZONES];
	unsigned long holes[MAX_NR_ZONES];

	start_pfn = node_start_pfn(nodeid);
	end_pfn = node_end_pfn(nodeid);

	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
		nodeid, start_pfn, end_pfn);

	/* Try to allocate the mem_map at the end of the node to avoid
	   filling up precious <4GB memory. */
	memmapsize = sizeof(struct page) * (end_pfn - start_pfn);
	limit = end_pfn << PAGE_SHIFT;
	NODE_DATA(nodeid)->node_mem_map =
		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
				     memmapsize, SMP_CACHE_BYTES,
				     round_down(limit - memmapsize, PAGE_SIZE),
				     limit);

	size_zones(zones, holes, start_pfn, end_pfn);
	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
			    start_pfn, holes);
}
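
/*
 * Sketch of the __alloc_bootmem_core() call above: the goal is
 * round_down(limit - memmapsize, PAGE_SIZE) and the hard limit is the
 * node's end address, so for a node spanning pfns [S, E) the mem_map
 * ends up as close as possible to E << PAGE_SHIFT, leaving low memory
 * free for DMA-limited users.
 */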
void __init numa_init_array(void)
{
	int rr, i;

	/* There are unfortunately some poorly designed mainboards around
	   that only connect memory to a single CPU. This breaks the 1:1
	   cpu->node mapping. To avoid this, fill in the mapping for all
	   possible CPUs, as the number of CPUs is not known yet.
	   We round robin the existing nodes. */
	rr = first_node(node_online_map);
	for (i = 0; i < NR_CPUS; i++) {
		if (cpu_to_node[i] != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node(rr, node_online_map);
		if (rr == MAX_NUMNODES)
			rr = first_node(node_online_map);
	}
}
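
/*
 * Illustrative example: with nodes 0 and 1 online and four possible
 * CPUs that have no node assigned yet, the loop above maps CPUs
 * 0, 1, 2, 3 to nodes 0, 1, 0, 1 respectively.
 */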
#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* NUMA emulation */
static int numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct bootnode nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn) << PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function: round the node size down
	   to a power of two. */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes, 0, sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn << PAGE_SHIFT) + i*sz;
		if (i == numa_fake - 1)
			sz = (end_pfn << PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
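
/*
 * Example (sketch): booting a machine with roughly 4 GB of RAM with
 * "numa=fake=4" gives sz = 1 GB, already a power of two, so four
 * equally sized fake nodes are created; the last node is stretched to
 * end_pfn so no memory is lost to rounding.
 */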
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;

#ifdef CONFIG_NUMA_EMU
	if (numa_fake && !numa_emulation(start_pfn, end_pfn))
		return;
#endif

#ifdef CONFIG_ACPI_NUMA
	if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
					  end_pfn << PAGE_SHIFT))
		return;
#endif

#ifdef CONFIG_K8_NUMA
	if (!numa_off && !k8_scan_nodes(start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT))
		return;
#endif
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");

	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       start_pfn << PAGE_SHIFT,
	       end_pfn << PAGE_SHIFT);
	/* setup dummy node covering all memory */
	memnode_shift = 63;
	memnodemap[0] = 0;
	nodes_clear(node_online_map);
	node_set_online(0);
	for (i = 0; i < NR_CPUS; i++)
		numa_set_node(i, 0);
	node_to_cpumask[0] = cpumask_of_cpu(0);
	setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
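
/*
 * Note on the dummy-node fallback above: with memnode_shift set to 63
 * every physical address shifts down to index 0, and memnodemap[0] is
 * 0, so the phys-to-node lookup resolves everything to the single
 * dummy node 0.
 */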
__cpuinit void numa_add_cpu(int cpu)
{
	set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
}

void __cpuinit numa_set_node(int cpu, int node)
{
	cpu_pda(cpu)->nodenumber = node;
	cpu_to_node[cpu] = node;
}

unsigned long __init numa_free_all_bootmem(void)
{
	int i;
	unsigned long pages = 0;

	for_each_online_node(i) {
		pages += free_all_bootmem_node(NODE_DATA(i));
	}
	return pages;
}
#ifdef CONFIG_SPARSEMEM
static void __init arch_sparse_init(void)
{
	int i;

	for_each_online_node(i)
		memory_present(i, node_start_pfn(i), node_end_pfn(i));

	sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif

void __init paging_init(void)
{
	int i;

	arch_sparse_init();

	for_each_online_node(i) {
		setup_node_zones(i);
	}
}
/* [numa=off] */
__init int numa_setup(char *opt)
{
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
#ifdef CONFIG_NUMA_EMU
	if (!strncmp(opt, "fake=", 5)) {
		numa_fake = simple_strtoul(opt+5, NULL, 0);
		if (numa_fake >= MAX_NUMNODES)
			numa_fake = MAX_NUMNODES;
	}
#endif
#ifdef CONFIG_ACPI_NUMA
	if (!strncmp(opt, "noacpi", 6))
		acpi_numa = -1;
	if (!strncmp(opt, "hotadd=", 7))
		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
	return 1;
}
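
/*
 * Example command lines handled above (a sketch, assuming the usual
 * "numa=" prefix stripping done by early command-line parsing before
 * numa_setup() is called):
 *
 *	numa=off	disable NUMA setup entirely
 *	numa=fake=4	emulate 4 nodes (CONFIG_NUMA_EMU)
 *	numa=noacpi	ignore the ACPI SRAT (CONFIG_ACPI_NUMA)
 *	numa=hotadd=10	percentage of a node that may be reserved for
 *			hot-add memory (CONFIG_ACPI_NUMA)
 */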
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[] and
 * apicid_to_node[] have valid entries for a CPU.  This means we skip
 * cpu_to_node[] initialisation for NUMA emulation and the fake node
 * case (running a kernel compiled for NUMA on a non-NUMA box), which
 * is OK because cpu_to_node[] was already filled in round-robin
 * fashion by numa_init_array() before this call, and that is good
 * enough for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		u8 apicid = x86_cpu_to_apicid[i];

		if (apicid == BAD_APICID)
			continue;
		if (apicid_to_node[apicid] == NUMA_NO_NODE)
			continue;
		numa_set_node(i, apicid_to_node[apicid]);
	}
}

EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by caching more state.
 * Should do that.
 */

int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;
	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;
	return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif