/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/acpi.h>

#include <asm/proto.h>
#include <asm/amd_nb.h>

#include "numa_internal.h"
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

nodemask_t numa_nodes_parsed __initdata;

struct memnode memnode;

static unsigned long __initdata nodemap_addr;
static unsigned long __initdata nodemap_size;

static struct numa_meminfo numa_meminfo __initdata;

static int numa_distance_cnt;
static u8 *numa_distance;
/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
{
	unsigned long addr, end;
	int i, res = -1;

	memset(memnodemap, 0xff, sizeof(s16) * memnodemapsize);
	for (i = 0; i < mi->nr_blks; i++) {
		addr = mi->blk[i].start;
		end = mi->blk[i].end;
		if (addr >= end)
			continue;
		if ((end >> shift) >= memnodemapsize)
			return 0;
		do {
			if (memnodemap[addr >> shift] != NUMA_NO_NODE)
				return -1;
			memnodemap[addr >> shift] = mi->blk[i].nid;
			addr += (1UL << shift);
		} while (addr < end);
		res = 1;
	}
	return res;
}
static int __init allocate_cachealigned_memnodemap(void)
{
	unsigned long addr;

	memnodemap = memnode.embedded_map;
	if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
		return 0;

	addr = 0x8000;
	nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
	nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
					      nodemap_size, L1_CACHE_BYTES);
	if (nodemap_addr == MEMBLOCK_ERROR) {
		printk(KERN_ERR
		       "NUMA: Unable to allocate Memory to Node hash map\n");
		nodemap_addr = nodemap_size = 0;
		return -1;
	}
	memnodemap = phys_to_virt(nodemap_addr);
	memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");

	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
	       nodemap_addr, nodemap_addr + nodemap_size);
	return 0;
}
/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
{
	int i, nodes_used = 0;
	unsigned long start, end;
	unsigned long bitfield = 0, memtop = 0;

	for (i = 0; i < mi->nr_blks; i++) {
		start = mi->blk[i].start;
		end = mi->blk[i].end;
		if (start >= end)
			continue;
		bitfield |= start;
		nodes_used++;
		if (end > memtop)
			memtop = end;
	}
	if (nodes_used <= 1)
		i = 63;
	else
		i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
	memnodemapsize = (memtop >> i) + 1;
	return i;
}
static int __init compute_hash_shift(const struct numa_meminfo *mi)
{
	int shift;

	shift = extract_lsb_from_nodes(mi);
	if (allocate_cachealigned_memnodemap())
		return -1;
	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

	if (populate_memnodemap(mi, shift) != 1) {
		printk(KERN_INFO "Your memory is not aligned; you need to "
		       "rebuild your kernel with a bigger NODEMAPSIZE, "
		       "shift=%d\n", shift);
		return -1;
	}
	return shift;
}
int __meminit __early_pfn_to_nid(unsigned long pfn)
{
	return phys_to_nid(pfn << PAGE_SHIFT);
}
static void * __init early_node_mem(int nodeid, unsigned long start,
				    unsigned long end, unsigned long size,
				    unsigned long align)
{
	unsigned long mem;

	/*
	 * Put it as high as possible; whatever goes with NODE_DATA
	 * will end up here.
	 */
	if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
		start = MAX_DMA_PFN<<PAGE_SHIFT;
	if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
	    end > (MAX_DMA32_PFN<<PAGE_SHIFT))
		start = MAX_DMA32_PFN<<PAGE_SHIFT;
	mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	/* extend the search scope */
	end = max_pfn_mapped << PAGE_SHIFT;
	start = MAX_DMA_PFN << PAGE_SHIFT;
	mem = memblock_find_in_range(start, end, size, align);
	if (mem != MEMBLOCK_ERROR)
		return __va(mem);

	printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
	       size, nodeid);

	return NULL;
}
static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
				     struct numa_meminfo *mi)
{
	/* ignore zero length blks */
	if (start == end)
		return 0;

	/* whine about and ignore invalid blks */
	if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
		pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
			   nid, start, end);
		return 0;
	}

	if (mi->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: too many memblk ranges\n");
		return -EINVAL;
	}

	mi->blk[mi->nr_blks].start = start;
	mi->blk[mi->nr_blks].end = end;
	mi->blk[mi->nr_blks].nid = nid;
	mi->nr_blks++;
	return 0;
}
/**
 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
 * @idx: Index of memblk to remove
 * @mi: numa_meminfo to remove memblk from
 *
 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
 * decrementing @mi->nr_blks.
 */
void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
{
	mi->nr_blks--;
	memmove(&mi->blk[idx], &mi->blk[idx + 1],
		(mi->nr_blks - idx) * sizeof(mi->blk[0]));
}
/**
 * numa_add_memblk - Add one numa_memblk to numa_meminfo
 * @nid: NUMA node ID of the new memblk
 * @start: Start address of the new memblk
 * @end: End address of the new memblk
 *
 * Add a new memblk to the default numa_meminfo.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_add_memblk(int nid, u64 start, u64 end)
{
	return numa_add_memblk_to(nid, start, end, &numa_meminfo);
}
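/*
 * Illustrative only: a platform NUMA parser is expected to mark each parsed
 * node in numa_nodes_parsed and feed its memory ranges through
 * numa_add_memblk().  A hypothetical parser callback might look roughly like:
 *
 *	static int __init fake_parse_one_range(int nid, u64 start, u64 end)
 *	{
 *		node_set(nid, numa_nodes_parsed);
 *		return numa_add_memblk(nid, start, end);
 *	}
 *
 * The real parsers live elsewhere in arch/x86/mm; this is just a sketch of
 * the contract they follow.
 */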
/* Initialize bootmem allocator for a node */
void __init
setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
	unsigned long start_pfn, last_pfn, nodedata_phys;
	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
	int nid;

	if (!end)
		return;

	/*
	 * Don't confuse VM with a node that doesn't have the
	 * minimum amount of memory:
	 */
	if (end && (end - start) < NODE_MIN_SIZE)
		return;

	start = roundup(start, ZONE_ALIGN);

	printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
	       start, end);

	start_pfn = start >> PAGE_SHIFT;
	last_pfn = end >> PAGE_SHIFT;

	node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
					   SMP_CACHE_BYTES);
	if (node_data[nodeid] == NULL)
		return;
	nodedata_phys = __pa(node_data[nodeid]);
	memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
	printk(KERN_INFO "  NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
	       nodedata_phys + pgdat_size - 1);
	nid = phys_to_nid(nodedata_phys);
	if (nid != nodeid)
		printk(KERN_INFO "    NODE_DATA(%d) on node %d\n", nodeid, nid);

	memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
	NODE_DATA(nodeid)->node_id = nodeid;
	NODE_DATA(nodeid)->node_start_pfn = start_pfn;
	NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;

	node_set_online(nodeid);
}
/**
 * numa_cleanup_meminfo - Cleanup a numa_meminfo
 * @mi: numa_meminfo to clean up
 *
 * Sanitize @mi by merging and removing unnecessary memblks.  Also check for
 * conflicts and clear unused memblks.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
{
	const u64 low = 0;
	const u64 high = (u64)max_pfn << PAGE_SHIFT;
	int i, j, k;

	for (i = 0; i < mi->nr_blks; i++) {
		struct numa_memblk *bi = &mi->blk[i];

		/* make sure all blocks are inside the limits */
		bi->start = max(bi->start, low);
		bi->end = min(bi->end, high);

		/* and there's no empty block */
		if (bi->start == bi->end) {
			numa_remove_memblk_from(i--, mi);
			continue;
		}

		for (j = i + 1; j < mi->nr_blks; j++) {
			struct numa_memblk *bj = &mi->blk[j];
			unsigned long start, end;

			/*
			 * See whether there are overlapping blocks.  Whine
			 * about but allow overlaps of the same nid.  They
			 * will be merged below.
			 */
			if (bi->end > bj->start && bi->start < bj->end) {
				if (bi->nid != bj->nid) {
					pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
					       bi->nid, bi->start, bi->end,
					       bj->nid, bj->start, bj->end);
					return -EINVAL;
				}
				pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
					   bi->nid, bi->start, bi->end,
					   bj->start, bj->end);
			}

			/*
			 * Join together blocks on the same node, holes
			 * between which don't overlap with memory on other
			 * nodes.
			 */
			if (bi->nid != bj->nid)
				continue;
			start = max(min(bi->start, bj->start), low);
			end = min(max(bi->end, bj->end), high);
			for (k = 0; k < mi->nr_blks; k++) {
				struct numa_memblk *bk = &mi->blk[k];

				if (bi->nid == bk->nid)
					continue;
				if (start < bk->end && end > bk->start)
					break;
			}
			if (k < mi->nr_blks)
				continue;
			printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
			       bi->nid, bi->start, bi->end, bj->start, bj->end,
			       start, end);
			bi->start = start;
			bi->end = end;
			numa_remove_memblk_from(j--, mi);
		}
	}

	for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
		mi->blk[i].start = mi->blk[i].end = 0;
		mi->blk[i].nid = NUMA_NO_NODE;
	}

	return 0;
}
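/*
 * Worked example (illustrative values, not from any real table): given two
 * parsed blocks on the same node, say nid 0 [0x0,0x80000000) and
 * nid 0 [0x80000000,0x100000000), with no other node's memory in between,
 * the merge loop above rewrites the first block to [0x0,0x100000000) and
 * removes the second, so the meminfo ends up with one block per contiguous
 * per-node range.
 */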
/*
 * Set nodes, which have memory in @mi, in *@nodemask.
 */
static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
					      const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
		if (mi->blk[i].start != mi->blk[i].end &&
		    mi->blk[i].nid != NUMA_NO_NODE)
			node_set(mi->blk[i].nid, *nodemask);
}
/**
 * numa_reset_distance - Reset NUMA distance table
 *
 * The current table is freed.  The next numa_set_distance() call will
 * create a new one.
 */
void __init numa_reset_distance(void)
{
	size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);

	/* numa_distance could be 1LU marking allocation failure, test cnt */
	if (numa_distance_cnt)
		memblock_x86_free_range(__pa(numa_distance),
					__pa(numa_distance) + size);
	numa_distance_cnt = 0;
	numa_distance = NULL;	/* enable table creation */
}
static int __init numa_alloc_distance(void)
{
	nodemask_t nodes_parsed;
	size_t size;
	int i, j, cnt = 0;
	u64 phys;

	/* size the new table and allocate it */
	nodes_parsed = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);

	for_each_node_mask(i, nodes_parsed)
		cnt = i;
	cnt++;
	size = cnt * cnt * sizeof(numa_distance[0]);

	phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
				      size, PAGE_SIZE);
	if (phys == MEMBLOCK_ERROR) {
		pr_warning("NUMA: Warning: can't allocate distance table!\n");
		/* don't retry until explicitly reset */
		numa_distance = (void *)1LU;
		return -ENOMEM;
	}
	memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");

	numa_distance = __va(phys);
	numa_distance_cnt = cnt;

	/* fill with the default distances */
	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			numa_distance[i * cnt + j] = i == j ?
				LOCAL_DISTANCE : REMOTE_DISTANCE;
	printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);

	return 0;
}
/**
 * numa_set_distance - Set NUMA distance from one NUMA node to another
 * @from: the 'from' node to set distance
 * @to: the 'to' node to set distance
 * @distance: NUMA distance
 *
 * Set the distance from node @from to @to to @distance.  If the distance
 * table doesn't exist, one large enough to accommodate all the currently
 * known nodes will be created.
 *
 * If such a table cannot be allocated, a warning is printed and further
 * calls are ignored until the distance table is reset with
 * numa_reset_distance().
 *
 * If @from or @to is higher than the highest known node at the time of
 * table creation or @distance doesn't make sense, the call is ignored.
 * This is to allow simplification of specific NUMA config implementations.
 */
void __init numa_set_distance(int from, int to, int distance)
{
	if (!numa_distance && numa_alloc_distance() < 0)
		return;

	if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
		printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
			    from, to, distance);
		return;
	}

	if ((u8)distance != distance ||
	    (from == to && distance != LOCAL_DISTANCE)) {
		pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
			     from, to, distance);
		return;
	}

	numa_distance[from * numa_distance_cnt + to] = distance;
}
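/*
 * Illustrative only: a firmware parser that has read an N x N locality
 * matrix (e.g. the ACPI SLIT) would typically feed it in with a pair of
 * nested loops; hypothetical sketch, where slit_entry() stands in for
 * whatever accessor the parser uses:
 *
 *	for (i = 0; i < nr_nodes; i++)
 *		for (j = 0; j < nr_nodes; j++)
 *			numa_set_distance(i, j, slit_entry(i, j));
 *
 * The first call allocates the table via numa_alloc_distance(); bogus
 * entries are silently ignored as described above, so the parser does not
 * need to validate the matrix itself.
 */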
int __node_distance(int from, int to)
{
	if (from >= numa_distance_cnt || to >= numa_distance_cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return numa_distance[from * numa_distance_cnt + to];
}
EXPORT_SYMBOL(__node_distance);
/*
 * Sanity check to catch more bad NUMA configurations (they are amazingly
 * common).  Make sure the nodes cover all memory.
 */
static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
{
	unsigned long numaram, e820ram;
	int i;

	numaram = 0;
	for (i = 0; i < mi->nr_blks; i++) {
		unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
		unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
		numaram += e - s;
		numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
		if ((long)numaram < 0)
			numaram = 0;
	}

	e820ram = max_pfn - (memblock_x86_hole_size(0,
					max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
	if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
		printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
		       (numaram << PAGE_SHIFT) >> 20,
		       (e820ram << PAGE_SHIFT) >> 20);
		return false;
	}
	return true;
}
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
	int i, nid;

	/* Account for nodes with cpus and no memory */
	node_possible_map = numa_nodes_parsed;
	numa_nodemask_from_meminfo(&node_possible_map, mi);
	if (WARN_ON(nodes_empty(node_possible_map)))
		return -EINVAL;

	memnode_shift = compute_hash_shift(mi);
	if (memnode_shift < 0) {
		printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
		return -EINVAL;
	}

	for (i = 0; i < mi->nr_blks; i++)
		memblock_x86_register_active_regions(mi->blk[i].nid,
					mi->blk[i].start >> PAGE_SHIFT,
					mi->blk[i].end >> PAGE_SHIFT);

	/* for out of order entries */
	sort_node_map();
	if (!numa_meminfo_cover_memory(mi))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		u64 start = (u64)max_pfn << PAGE_SHIFT;
		u64 end = 0;

		for (i = 0; i < mi->nr_blks; i++) {
			if (nid != mi->blk[i].nid)
				continue;
			start = min(mi->blk[i].start, start);
			end = max(mi->blk[i].end, end);
		}

		if (start < end)
			setup_node_bootmem(nid, start, end);
	}

	return 0;
}
/**
 * dummy_numa_init - Fallback dummy NUMA init
 *
 * Used if there's no underlying NUMA architecture, NUMA initialization
 * fails, or NUMA is disabled on the command line.
 *
 * Must online at least one node and add memory blocks that cover all
 * allowed memory.  This function must not fail.
 */
static int __init dummy_numa_init(void)
{
	printk(KERN_INFO "%s\n",
	       numa_off ? "NUMA turned off" : "No NUMA configuration found");
	printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
	       0LU, max_pfn << PAGE_SHIFT);

	node_set(0, numa_nodes_parsed);
	numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);

	return 0;
}
static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	nodes_clear(numa_nodes_parsed);
	nodes_clear(node_possible_map);
	nodes_clear(node_online_map);
	memset(&numa_meminfo, 0, sizeof(numa_meminfo));
	remove_all_active_ranges();
	numa_reset_distance();

	ret = init_func();
	if (ret < 0)
		return ret;
	ret = numa_cleanup_meminfo(&numa_meminfo);
	if (ret < 0)
		return ret;

	numa_emulation(&numa_meminfo, numa_distance_cnt);

	ret = numa_register_memblks(&numa_meminfo);
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();
	return 0;
}
void __init initmem_init(void)
{
	int ret;

	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		ret = numa_init(x86_acpi_numa_init);
		if (!ret)
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		ret = numa_init(amd_numa_init);
		if (!ret)
			return;
#endif
	}

	numa_init(dummy_numa_init);
}
unsigned long __init numa_free_all_bootmem(void)
{
	unsigned long pages = 0;
	int i;

	for_each_online_node(i)
		pages += free_all_bootmem_node(NODE_DATA(i));

	pages += free_all_memory_core_early(MAX_NUMNODES);

	return pages;
}
int __cpuinit numa_cpu_node(int cpu)
{
	int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}