arch/x86_64/mm/srat.c

/*
 * ACPI 3.0 based NUMA setup
 * Copyright 2004 Andi Kleen, SuSE Labs.
 *
 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
 *
 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
 * Assumes all memory regions belonging to a single proximity domain
 * are in one chunk. Holes between them will be included in the node.
 */
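
/*
 * Main entry points implemented here: acpi_numa_slit_init(),
 * acpi_numa_processor_affinity_init() and acpi_numa_memory_affinity_init()
 * are the ACPI parser callbacks; acpi_scan_nodes() turns the collected data
 * into bootmem node setup; srat_reserve_add_area() reserves space for
 * hot-add areas; __node_distance() answers SLIT distance queries.
 */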

#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/mmzone.h>
#include <linux/bitmap.h>
#include <linux/module.h>
#include <linux/topology.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <asm/proto.h>
#include <asm/numa.h>
#include <asm/e820.h>

#if (defined(CONFIG_ACPI_HOTPLUG_MEMORY) || \
        defined(CONFIG_ACPI_HOTPLUG_MEMORY_MODULE)) \
                && !defined(CONFIG_MEMORY_HOTPLUG)
#define RESERVE_HOTADD 1
#endif
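
/*
 * With RESERVE_HOTADD the hot-pluggable ranges described by the SRAT are
 * not brought up at runtime; reserve_hotadd() below records them in
 * nodes_add[] and srat_reserve_add_area() later reserves that space from
 * bootmem (see the "pre-allocated memory" message there).
 */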

static struct acpi_table_slit *acpi_slit;

static nodemask_t nodes_parsed __initdata;
static nodemask_t nodes_found __initdata;
static struct bootnode nodes[MAX_NUMNODES] __initdata;
static struct bootnode nodes_add[MAX_NUMNODES] __initdata;
static int found_add_area __initdata;
int hotadd_percent __initdata = 10;
static u8 pxm2node[256] = { [0 ... 255] = 0xff };

/* Too small nodes confuse the VM badly. Usually they result
   from BIOS bugs. */
#define NODE_MIN_SIZE (4*1024*1024)

static int node_to_pxm(int n);
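
/*
 * pxm2node[] maps an ACPI proximity domain (PXM) number to a Linux node id.
 * 0xff means "no node assigned yet"; pxm_to_node() relies on the signed-char
 * cast below to turn that into -1.
 */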
int pxm_to_node(int pxm)
{
        if ((unsigned)pxm >= 256)
                return -1;
        /* Extend 0xff to (int)-1 */
        return (signed char)pxm2node[pxm];
}
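
/*
 * Allocate a node id for a proximity domain on first use. Node ids are
 * handed out in order of first appearance: the first PXM seen becomes
 * node 0, the next new PXM becomes node 1, and so on.
 */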
static __init int setup_node(int pxm)
{
        unsigned node = pxm2node[pxm];
        if (node == 0xff) {
                if (nodes_weight(nodes_found) >= MAX_NUMNODES)
                        return -1;
                node = first_unset_node(nodes_found);
                node_set(node, nodes_found);
                pxm2node[pxm] = node;
        }
        return pxm2node[pxm];
}

static __init int conflicting_nodes(unsigned long start, unsigned long end)
{
        int i;
        for_each_node_mask(i, nodes_parsed) {
                struct bootnode *nd = &nodes[i];
                if (nd->start == nd->end)
                        continue;
                if (nd->end > start && nd->start < end)
                        return i;
                if (nd->end == end && nd->start == start)
                        return i;
        }
        return -1;
}

static __init void cutoff_node(int i, unsigned long start, unsigned long end)
{
        struct bootnode *nd = &nodes[i];

        if (found_add_area)
                return;

        if (nd->start < start) {
                nd->start = start;
                if (nd->end < nd->start)
                        nd->start = nd->end;
        }
        if (nd->end > end) {
                nd->end = end;
                if (nd->start > nd->end)
                        nd->start = nd->end;
        }
}

static __init void bad_srat(void)
{
        int i;
        printk(KERN_ERR "SRAT: SRAT not used.\n");
        acpi_numa = -1;
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                apicid_to_node[i] = NUMA_NO_NODE;
        for (i = 0; i < MAX_NUMNODES; i++)
                nodes_add[i].start = nodes[i].end = 0;
}

static __init inline int srat_disabled(void)
{
        return numa_off || acpi_numa < 0;
}

/*
 * A lot of BIOSes fill in 10 (= no distance) everywhere. This messes
 * up the NUMA heuristics, which want the local node to have a smaller
 * distance than the others.
 * Do some quick checks here and only use the SLIT if it passes.
 */
static __init int slit_valid(struct acpi_table_slit *slit)
{
        int i, j;
        int d = slit->localities;
        for (i = 0; i < d; i++) {
                for (j = 0; j < d; j++) {
                        u8 val = slit->entry[d*i + j];
                        if (i == j) {
                                if (val != 10)
                                        return 0;
                        } else if (val <= 10)
                                return 0;
                }
        }
        return 1;
}
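
/*
 * Example of a SLIT that passes the check for two localities (d == 2):
 *
 *      entry[] = { 10, 21,
 *                  21, 10 }
 *
 * entry[d*i + j] is the distance from locality i to locality j; 10 means
 * local, and every remote entry must be larger than 10.
 */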

/* Callback for SLIT parsing */
void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
{
        if (!slit_valid(slit)) {
                printk(KERN_INFO "ACPI: SLIT table looks invalid. Not used.\n");
                return;
        }
        acpi_slit = slit;
}

/* Callback for Proximity Domain -> LAPIC mapping */
void __init
acpi_numa_processor_affinity_init(struct acpi_table_processor_affinity *pa)
{
        int pxm, node;
        if (srat_disabled())
                return;
        if (pa->header.length != sizeof(struct acpi_table_processor_affinity)) {
                bad_srat();
                return;
        }
        if (pa->flags.enabled == 0)
                return;
        pxm = pa->proximity_domain;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
                bad_srat();
                return;
        }
        apicid_to_node[pa->apic_id] = node;
        acpi_numa = 1;
        printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
               pxm, pa->apic_id, node);
}

#ifdef RESERVE_HOTADD
/*
 * Protect against too large hotadd areas that would fill up memory.
 */
static int hotadd_enough_memory(struct bootnode *nd)
{
        static unsigned long allocated;
        static unsigned long last_area_end;
        unsigned long pages = (nd->end - nd->start) >> PAGE_SHIFT;
        long mem = pages * sizeof(struct page);
        unsigned long addr;
        unsigned long allowed;
        unsigned long oldpages = pages;

        if (mem < 0)
                return 0;
        allowed = (end_pfn - e820_hole_size(0, end_pfn)) * PAGE_SIZE;
        allowed = (allowed / 100) * hotadd_percent;
        if (allocated + mem > allowed) {
                /* Give them at least part of their hotadd memory, up to
                   hotadd_percent. It would be better to spread the limit out
                   over multiple hotplug areas, but that is too complicated
                   right now. */
                if (allocated >= allowed)
                        return 0;
                pages = (allowed - allocated + mem) / sizeof(struct page);
                mem = pages * sizeof(struct page);
                nd->end = nd->start + pages*PAGE_SIZE;
        }
        /* Not completely foolproof, but a good sanity check */
        addr = find_e820_area(last_area_end, end_pfn<<PAGE_SHIFT, mem);
        if (addr == -1UL)
                return 0;
        if (pages != oldpages)
                printk(KERN_NOTICE "SRAT: Hotadd area limited to %lu bytes\n",
                        pages << PAGE_SHIFT);
        last_area_end = addr + mem;
        allocated += mem;
        return 1;
}
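
/*
 * The budget above caps the total struct page space committed to hot-add
 * areas at hotadd_percent of usable e820 RAM. For example, with the default
 * hotadd_percent of 10 and 8 GB of usable RAM, at most 0.8 GB of struct page
 * arrays will be set aside across all hot-add areas combined.
 */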

/*
 * It is fine to add this area to the nodes data; it will be used later.
 * This code supports one contiguous hot add area per node.
 */
static int reserve_hotadd(int node, unsigned long start, unsigned long end)
{
        unsigned long s_pfn = start >> PAGE_SHIFT;
        unsigned long e_pfn = end >> PAGE_SHIFT;
        int changed = 0;
        struct bootnode *nd = &nodes_add[node];

        /* I had some trouble with strange memory hotadd regions breaking
           the boot. Be very strict here and reject anything unexpected.
           If you want working memory hotadd write correct SRATs.

           The node size check is a basic sanity check to guard against
           mistakes */
        if ((signed long)(end - start) < NODE_MIN_SIZE) {
                printk(KERN_ERR "SRAT: Hotplug area too small\n");
                return -1;
        }

        /* This check might be a bit too strict, but I'm keeping it for now. */
        if (e820_hole_size(s_pfn, e_pfn) != e_pfn - s_pfn) {
                printk(KERN_ERR "SRAT: Hotplug area has existing memory\n");
                return -1;
        }

        if (!hotadd_enough_memory(&nodes_add[node])) {
                printk(KERN_ERR "SRAT: Hotplug area too large\n");
                return -1;
        }

        /* Looks good */

        found_add_area = 1;
        if (nd->start == nd->end) {
                nd->start = start;
                nd->end = end;
                changed = 1;
        } else {
                if (nd->start == end) {
                        nd->start = start;
                        changed = 1;
                }
                if (nd->end == start) {
                        nd->end = end;
                        changed = 1;
                }
                if (!changed)
                        printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
        }

        if ((nd->end >> PAGE_SHIFT) > end_pfn)
                end_pfn = nd->end >> PAGE_SHIFT;

        if (changed)
                printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
        return 0;
}
#endif

/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
void __init
acpi_numa_memory_affinity_init(struct acpi_table_memory_affinity *ma)
{
        struct bootnode *nd, oldnode;
        unsigned long start, end;
        int node, pxm;
        int i;

        if (srat_disabled())
                return;
        if (ma->header.length != sizeof(struct acpi_table_memory_affinity)) {
                bad_srat();
                return;
        }
        if (ma->flags.enabled == 0)
                return;
        if (ma->flags.hot_pluggable && hotadd_percent == 0)
                return;
        start = ma->base_addr_lo | ((u64)ma->base_addr_hi << 32);
        end = start + (ma->length_lo | ((u64)ma->length_hi << 32));
        pxm = ma->proximity_domain;
        node = setup_node(pxm);
        if (node < 0) {
                printk(KERN_ERR "SRAT: Too many proximity domains.\n");
                bad_srat();
                return;
        }
        i = conflicting_nodes(start, end);
        if (i == node) {
                printk(KERN_WARNING
                "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
                        pxm, start, end, nodes[i].start, nodes[i].end);
        } else if (i >= 0) {
                printk(KERN_ERR
                       "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
                        pxm, start, end, node_to_pxm(i),
                        nodes[i].start, nodes[i].end);
                bad_srat();
                return;
        }
        nd = &nodes[node];
        oldnode = *nd;
        if (!node_test_and_set(node, nodes_parsed)) {
                nd->start = start;
                nd->end = end;
        } else {
                if (start < nd->start)
                        nd->start = start;
                if (nd->end < end)
                        nd->end = end;
        }

        printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
               nd->start, nd->end);

#ifdef RESERVE_HOTADD
        if (ma->flags.hot_pluggable && reserve_hotadd(node, start, end) < 0) {
                /* Ignore hotadd region. Undo damage */
                printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
                *nd = oldnode;
                if ((nd->start | nd->end) == 0)
                        node_clear(node, nodes_parsed);
        }
#endif
}

/* Sanity check to catch more bad SRATs (they are amazingly common).
   Make sure the PXMs cover all memory. */
static int nodes_cover_memory(void)
{
        int i;
        unsigned long pxmram, e820ram;

        pxmram = 0;
        for_each_node_mask(i, nodes_parsed) {
                unsigned long s = nodes[i].start >> PAGE_SHIFT;
                unsigned long e = nodes[i].end >> PAGE_SHIFT;
                pxmram += e - s;
                pxmram -= e820_hole_size(s, e);
                /* nodes_add[] holds byte addresses; convert to pages */
                pxmram -= (nodes_add[i].end - nodes_add[i].start) >> PAGE_SHIFT;
                if ((long)pxmram < 0)
                        pxmram = 0;
        }

        e820ram = end_pfn - e820_hole_size(0, end_pfn);
        /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
        if ((long)(e820ram - pxmram) >= 1*1024*1024) {
                printk(KERN_ERR
        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
                        (pxmram << PAGE_SHIFT) >> 20,
                        (e820ram << PAGE_SHIFT) >> 20);
                return 0;
        }
        return 1;
}
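
/*
 * Note: pxmram and e820ram above are page counts, so the tolerance in the
 * check is 1M pages (4 GB with 4 KB pages).
 */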

static void unparse_node(int node)
{
        int i;
        node_clear(node, nodes_parsed);
        for (i = 0; i < MAX_LOCAL_APIC; i++) {
                if (apicid_to_node[i] == node)
                        apicid_to_node[i] = NUMA_NO_NODE;
        }
}

void __init acpi_numa_arch_fixup(void) {}

/* Use the information discovered above to actually set up the nodes. */
int __init acpi_scan_nodes(unsigned long start, unsigned long end)
{
        int i;

        /* First clean up the node list */
        for (i = 0; i < MAX_NUMNODES; i++) {
                cutoff_node(i, start, end);
                if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE)
                        unparse_node(i);
        }

        if (acpi_numa <= 0)
                return -1;

        if (!nodes_cover_memory()) {
                bad_srat();
                return -1;
        }

        memnode_shift = compute_hash_shift(nodes, MAX_NUMNODES);
        if (memnode_shift < 0) {
                printk(KERN_ERR
                     "SRAT: No NUMA node hash function found. Contact maintainer\n");
                bad_srat();
                return -1;
        }

        /* Finally register nodes */
        for_each_node_mask(i, nodes_parsed)
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        /* Try again in case setup_node_bootmem missed one due
           to missing bootmem */
        for_each_node_mask(i, nodes_parsed)
                if (!node_online(i))
                        setup_node_bootmem(i, nodes[i].start, nodes[i].end);

        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] == NUMA_NO_NODE)
                        continue;
                if (!node_isset(cpu_to_node[i], nodes_parsed))
                        numa_set_node(i, NUMA_NO_NODE);
        }
        numa_init_array();
        return 0;
}

static int node_to_pxm(int n)
{
        int i;
        if (pxm2node[n] == n)
                return n;
        for (i = 0; i < 256; i++)
                if (pxm2node[i] == n)
                        return i;
        return 0;
}

void __init srat_reserve_add_area(int nodeid)
{
        if (found_add_area && nodes_add[nodeid].end) {
                u64 total_mb;

                printk(KERN_INFO "SRAT: Reserving hot-add memory space "
                                "for node %d at %Lx-%Lx\n",
                        nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
                total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
                                        >> PAGE_SHIFT;
                total_mb *= sizeof(struct page);
                total_mb >>= 20;
                printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
                        "pre-allocated memory.\n", (unsigned long long)total_mb);
                reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
                               nodes_add[nodeid].end - nodes_add[nodeid].start);
        }
}
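
/*
 * NUMA distance between two nodes. Without a SLIT the ACPI defaults are
 * used: 10 for a node's distance to itself, 20 for any other node. With a
 * SLIT, the distance from node a to node b is
 * entry[localities * pxm(a) + pxm(b)].
 */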
int __node_distance(int a, int b)
{
        int index;

        if (!acpi_slit)
                return a == b ? 10 : 20;
        index = acpi_slit->localities * node_to_pxm(a);
        return acpi_slit->entry[index + node_to_pxm(b)];
}

EXPORT_SYMBOL(__node_distance);