[PATCH] x86_64: Early initialization of cpu_to_node
[linux-2.6/linux-acpi-2.6/ibm-acpi-2.6.git] / arch / x86_64 / mm / numa.c
blob876e343744128f57d10b6ea2c70165da3cf62e64
1 /*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5 #include <linux/kernel.h>
6 #include <linux/mm.h>
7 #include <linux/string.h>
8 #include <linux/init.h>
9 #include <linux/bootmem.h>
10 #include <linux/mmzone.h>
11 #include <linux/ctype.h>
12 #include <linux/module.h>
13 #include <linux/nodemask.h>
15 #include <asm/e820.h>
16 #include <asm/proto.h>
17 #include <asm/dma.h>
18 #include <asm/numa.h>
19 #include <asm/acpi.h>
21 #ifndef Dprintk
22 #define Dprintk(x...)
23 #endif
25 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
26 bootmem_data_t plat_node_bdata[MAX_NUMNODES];
28 int memnode_shift;
29 u8 memnodemap[NODEMAPSIZE];
31 unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
34 unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
37 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
39 int numa_off __initdata;
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
49 static int __init
50 populate_memnodemap(const struct node *nodes, int numnodes, int shift)
52 int i;
53 int res = -1;
54 unsigned long addr, end;
56 if (shift >= 64)
57 return -1;
58 memset(memnodemap, 0xff, sizeof(memnodemap));
59 for (i = 0; i < numnodes; i++) {
60 addr = nodes[i].start;
61 end = nodes[i].end;
62 if (addr >= end)
63 continue;
64 if ((end >> shift) >= NODEMAPSIZE)
65 return 0;
66 do {
67 if (memnodemap[addr >> shift] != 0xff)
68 return -1;
69 memnodemap[addr >> shift] = i;
70 addr += (1UL << shift);
71 } while (addr < end);
72 res = 1;
74 return res;
77 int __init compute_hash_shift(struct node *nodes, int numnodes)
79 int shift = 20;
81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
82 shift++;
84 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
85 shift);
87 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
88 printk(KERN_INFO
89 "Your memory is not aligned you need to rebuild your kernel "
90 "with a bigger NODEMAPSIZE shift=%d\n",
91 shift);
92 return -1;
94 return shift;
#ifdef CONFIG_SPARSEMEM
/* Return the node id for a page frame number, looked up by physical address. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
104 /* Initialize bootmem allocator for a node */
105 void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
107 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
108 unsigned long nodedata_phys;
109 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
111 start = round_up(start, ZONE_ALIGN);
113 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
115 start_pfn = start >> PAGE_SHIFT;
116 end_pfn = end >> PAGE_SHIFT;
118 nodedata_phys = find_e820_area(start, end, pgdat_size);
119 if (nodedata_phys == -1L)
120 panic("Cannot find memory pgdat in node %d\n", nodeid);
122 Dprintk("nodedata_phys %lx\n", nodedata_phys);
124 node_data[nodeid] = phys_to_virt(nodedata_phys);
125 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
126 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
127 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
128 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
130 /* Find a place for the bootmem map */
131 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
132 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
133 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
134 if (bootmap_start == -1L)
135 panic("Not enough continuous space for bootmap on node %d", nodeid);
136 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
138 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
139 bootmap_start >> PAGE_SHIFT,
140 start_pfn, end_pfn);
142 e820_bootmem_free(NODE_DATA(nodeid), start, end);
144 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
145 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
146 node_set_online(nodeid);
149 /* Initialize final allocator for a zone */
150 void __init setup_node_zones(int nodeid)
152 unsigned long start_pfn, end_pfn;
153 unsigned long zones[MAX_NR_ZONES];
154 unsigned long holes[MAX_NR_ZONES];
156 start_pfn = node_start_pfn(nodeid);
157 end_pfn = node_end_pfn(nodeid);
159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
160 nodeid, start_pfn, end_pfn);
162 size_zones(zones, holes, start_pfn, end_pfn);
163 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
164 start_pfn, holes);
167 void __init numa_init_array(void)
169 int rr, i;
170 /* There are unfortunately some poorly designed mainboards around
171 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
172 mapping. To avoid this fill in the mapping for all possible
173 CPUs, as the number of CPUs is not known yet.
174 We round robin the existing nodes. */
175 rr = first_node(node_online_map);
176 for (i = 0; i < NR_CPUS; i++) {
177 if (cpu_to_node[i] != NUMA_NO_NODE)
178 continue;
179 numa_set_node(i, rr);
180 rr = next_node(rr, node_online_map);
181 if (rr == MAX_NUMNODES)
182 rr = first_node(node_online_map);
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested on the command line; 0 leaves emulation off. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * The caller must make sure numa_fake is non-zero, as it is the divisor
 * for the per-node size. The last node absorbs whatever memory remains.
 *
 * Marked __init because it reads the __initdata variable numa_fake and
 * calls only __init helpers; its only caller, numa_initmem_init(), is
 * __init as well.
 *
 * Returns 0 on success, or -1 if no hash shift could be found (in which
 * case memnode_shift is reset to 0).
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/*
	 * Kludge needed for the hash function: shrink the per-node size
	 * to a power of two below it.
	 */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The final node takes everything up to end_pfn. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
232 void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
234 int i;
236 #ifdef CONFIG_NUMA_EMU
237 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
238 return;
239 #endif
241 #ifdef CONFIG_ACPI_NUMA
242 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
243 end_pfn << PAGE_SHIFT))
244 return;
245 #endif
247 #ifdef CONFIG_K8_NUMA
248 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
249 return;
250 #endif
251 printk(KERN_INFO "%s\n",
252 numa_off ? "NUMA turned off" : "No NUMA configuration found");
254 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
255 start_pfn << PAGE_SHIFT,
256 end_pfn << PAGE_SHIFT);
257 /* setup dummy node covering all memory */
258 memnode_shift = 63;
259 memnodemap[0] = 0;
260 nodes_clear(node_online_map);
261 node_set_online(0);
262 for (i = 0; i < NR_CPUS; i++)
263 numa_set_node(i, 0);
264 node_to_cpumask[0] = cpumask_of_cpu(0);
265 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
268 __cpuinit void numa_add_cpu(int cpu)
270 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
273 void __cpuinit numa_set_node(int cpu, int node)
275 cpu_pda[cpu].nodenumber = node;
276 cpu_to_node[cpu] = node;
279 unsigned long __init numa_free_all_bootmem(void)
281 int i;
282 unsigned long pages = 0;
283 for_each_online_node(i) {
284 pages += free_all_bootmem_node(NODE_DATA(i));
286 return pages;
#ifdef CONFIG_SPARSEMEM
/*
 * Report the pfn range of every online node with memory_present(),
 * then run sparse_init().
 */
static void __init arch_sparse_init(void)
{
	int nid;

	for_each_online_node(nid) {
		unsigned long first_pfn = node_start_pfn(nid);
		unsigned long last_pfn = node_end_pfn(nid);

		memory_present(nid, first_pfn, last_pfn);
	}

	sparse_init();
}
#else
/* Without CONFIG_SPARSEMEM there is nothing to do. */
#define arch_sparse_init() do {} while (0)
#endif
303 void __init paging_init(void)
305 int i;
307 arch_sparse_init();
309 for_each_online_node(i) {
310 setup_node_zones(i);
314 /* [numa=off] */
315 __init int numa_setup(char *opt)
317 if (!strncmp(opt,"off",3))
318 numa_off = 1;
319 #ifdef CONFIG_NUMA_EMU
320 if(!strncmp(opt, "fake=", 5)) {
321 numa_fake = simple_strtoul(opt+5,NULL,0); ;
322 if (numa_fake >= MAX_NUMNODES)
323 numa_fake = MAX_NUMNODES;
325 #endif
326 #ifdef CONFIG_ACPI_NUMA
327 if (!strncmp(opt,"noacpi",6))
328 acpi_numa = -1;
329 #endif
330 return 1;
334 * Setup early cpu_to_node.
336 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
337 * and apicid_to_node[] tables have valid entries for a CPU.
338 * This means we skip cpu_to_node[] initialisation for NUMA
339 * emulation and faking node case (when running a kernel compiled
340 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
341 * is already initialized in a round robin manner at numa_init_array,
342 * prior to this call, and this initialization is good enough
343 * for the fake NUMA cases.
345 void __init init_cpu_to_node(void)
347 int i;
348 for (i = 0; i < NR_CPUS; i++) {
349 u8 apicid = x86_cpu_to_apicid[i];
350 if (apicid == BAD_APICID)
351 continue;
352 if (apicid_to_node[apicid] == NUMA_NO_NODE)
353 continue;
354 cpu_to_node[i] = apicid_to_node[apicid];
/* NUMA lookup tables exported for use outside this file. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(node_to_cpumask);