/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/dma-mapping.h>
#include <asm/swiotlb.h>
#ifndef Dprintk
#define Dprintk(x...)
#endif

struct dma_mapping_ops *dma_ops;
EXPORT_SYMBOL(dma_ops);

static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
void show_mem(void)
{
	long i, total = 0, reserved = 0;
	long shared = 0, cached = 0;
	pg_data_t *pgdat;
	struct page *page;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

	for_each_pgdat(pgdat) {
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			page = pfn_to_page(pgdat->node_start_pfn + i);
			total++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
	}
	printk(KERN_INFO "%lu pages of RAM\n", total);
	printk(KERN_INFO "%lu reserved pages\n", reserved);
	printk(KERN_INFO "%lu pages shared\n", shared);
	printk(KERN_INFO "%lu pages swap cached\n", cached);
}
/* References to section boundaries */
extern char _text, _etext, _edata, __bss_start, _end[];
extern char __init_begin, __init_end;
/*
 * Allocate a page-table page: from the page allocator once bootmem has
 * been torn down, from bootmem before that.
 */
static void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);
	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
		panic("set_pte_phys: cannot allocate page data %s\n",
		      after_bootmem ? "after bootmem" : "");

	Dprintk("spp_getpage %p\n", ptr);
	return ptr;
}
static void set_pte_phys(unsigned long vaddr,
			 unsigned long phys, pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte, new_pte;

	Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		pmd = (pmd_t *) spp_getpage();
		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
		if (pmd != pmd_offset(pud, 0)) {
			printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
			return;
		}
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		pte = (pte_t *) spp_getpage();
		set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
		if (pte != pte_offset_kernel(pmd, 0)) {
			printk("PAGETABLE BUG #02!\n");
			return;
		}
	}
	new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

	pte = pte_offset_kernel(pmd, vaddr);
	if (!pte_none(*pte) &&
	    pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
		pte_ERROR(*pte);
	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		printk("Invalid __set_fixmap\n");
		return;
	}
	set_pte_phys(address, phys, prot);
}
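/*
 * Illustrative caller (a sketch from outside this file, so treat the
 * exact names as an assumption): the 64-bit vsyscall page is installed
 * through a fixmap slot roughly like
 *
 *	__set_fixmap(VSYSCALL_FIRST_PAGE, __pa_symbol(&__vsyscall_0),
 *		     PAGE_KERNEL_VSYSCALL);
 *
 * so every boot-time fixmap user funnels through set_pte_phys() above.
 */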
unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
	pmd_t *pmd;
	void  *address;
	int    allocated;
} temp_mappings[] __initdata = {
	{ &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
	{ &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
	{}
};
static __meminit void *alloc_low_page(int *index, unsigned long *phys)
{
	struct temp_map *ti;
	int i;
	unsigned long pfn = table_end++, paddr;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC);
		*phys = __pa(adr);
		return adr;
	}

	if (pfn >= end_pfn)
		panic("alloc_low_page: ran out of memory");
	for (i = 0; temp_mappings[i].allocated; i++) {
		if (!temp_mappings[i].pmd)
			panic("alloc_low_page: ran out of temp mappings");
	}
	ti = &temp_mappings[i];
	paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
	set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
	ti->allocated = 1;
	__flush_tlb();
	adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
	memset(adr, 0, PAGE_SIZE);
	*index = i;
	*phys = pfn * PAGE_SIZE;
	return adr;
}
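/*
 * Why the temp mappings: before bootmem is up, the direct mapping that
 * would normally let us touch a freshly allocated page-table page is
 * exactly what is still being built.  So each new page is reached
 * through one of the temp_boot_pmds windows above (40MB/42MB virtual),
 * mapped as a 2MB large page, zeroed, and handed back to the caller.
 */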
static __meminit void unmap_low_page(int i)
{
	struct temp_map *ti;

	if (after_bootmem)
		return;

	ti = &temp_mappings[i];
	set_pmd(ti->pmd, __pmd(0));
	ti->allocated = 0;
}
static void __meminit
phys_pmd_init(pmd_t *pmd, unsigned long address, unsigned long end)
{
	int i;

	for (i = 0; i < PTRS_PER_PMD; pmd++, i++, address += PMD_SIZE) {
		unsigned long entry;

		if (address >= end) {
			if (!after_bootmem)
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			break;
		}
		entry = _PAGE_NX|_PAGE_PSE|_KERNPG_TABLE|_PAGE_GLOBAL|address;
		entry &= __supported_pte_mask;
		set_pmd(pmd, __pmd(entry));
	}
}
static void __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, (unsigned long)__va(address));

	if (pmd_none(*pmd)) {
		spin_lock(&init_mm.page_table_lock);
		phys_pmd_init(pmd, address, end);
		spin_unlock(&init_mm.page_table_lock);
		__flush_tlb_all();
	}
}
static void __meminit phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
	long i = pud_index(address);

	pud = pud + i;

	if (after_bootmem && pud_val(*pud)) {
		phys_pmd_update(pud, address, end);
		return;
	}

	for (; i < PTRS_PER_PUD; pud++, i++) {
		int map;
		unsigned long paddr, pmd_phys;
		pmd_t *pmd;

		paddr = (address & PGDIR_MASK) + i*PUD_SIZE;
		if (paddr >= end)
			break;

		if (!after_bootmem && !e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		pmd = alloc_low_page(&map, &pmd_phys);
		spin_lock(&init_mm.page_table_lock);
		set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
		phys_pmd_init(pmd, paddr, end);
		spin_unlock(&init_mm.page_table_lock);
		unmap_low_page(map);
	}
	__flush_tlb();
}
static void __init find_early_table_space(unsigned long end)
{
	unsigned long puds, pmds, tables, start;

	puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
	pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
	tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
		 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

	/* RED-PEN putting page tables only on node 0 could
	   cause a hotspot and fill up ZONE_DMA. The page tables
	   need roughly 4KB per GB (see the worked example after
	   this function). */
	start = 0x8000;
	table_start = find_e820_area(start, end, tables);
	if (table_start == -1UL)
		panic("Cannot find space for the kernel page tables");

	table_start >>= PAGE_SHIFT;
	table_end = table_start;

	early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
		end, table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
}
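/*
 * A worked example of the sizing arithmetic above, assuming 8-byte
 * pud_t/pmd_t entries: for a 4GB direct mapping, puds = 4 and
 * pmds = 2048, so
 *
 *	tables = round_up(4 * 8, PAGE_SIZE) + round_up(2048 * 8, PAGE_SIZE)
 *	       = 4KB + 16KB = 20KB
 *
 * carved out of the e820 map for the early page tables.  The dominant
 * term is the pmd space: 8 bytes per 2MB mapped, about 4KB per GB.
 */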
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __meminit init_memory_mapping(unsigned long start, unsigned long end)
{
	unsigned long next;

	Dprintk("init_memory_mapping\n");

	/*
	 * Find space for the kernel direct mapping tables.
	 * Later we should allocate these tables in the local node of the
	 * memory mapped. Unfortunately this is done currently before the
	 * nodes are discovered.
	 */
	if (!after_bootmem)
		find_early_table_space(end);

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		int map;
		unsigned long pud_phys;
		pgd_t *pgd = pgd_offset_k(start);
		pud_t *pud;

		if (after_bootmem)
			pud = pud_offset_k(pgd, __PAGE_OFFSET);
		else
			pud = alloc_low_page(&map, &pud_phys);

		next = start + PGDIR_SIZE;
		if (next > end)
			next = end;
		phys_pud_init(pud, __pa(start), __pa(next));
		if (!after_bootmem)
			set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
		unmap_low_page(map);
	}

	if (!after_bootmem)
		asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
	__flush_tlb_all();
}
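/*
 * Note the two paths through init_memory_mapping() above: at boot
 * (!after_bootmem) the pud pages come from alloc_low_page() and a fresh
 * pgd entry is written; for memory hotplug (after_bootmem) the existing
 * kernel pud is extended in place and no pgd update is needed.
 */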
void __cpuinit zap_low_mappings(int cpu)
{
	if (cpu == 0) {
		pgd_t *pgd = pgd_offset_k(0UL);
		pgd_clear(pgd);
	} else {
		/*
		 * For AP's, zap the low identity mappings by changing the cr3
		 * to init_level4_pgt and doing local flush tlb all
		 */
		asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
	}
	__flush_tlb_all();
}
/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
	   unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	unsigned long w;

	for (i = 0; i < MAX_NR_ZONES; i++)
		z[i] = 0;

	if (start_pfn < MAX_DMA_PFN)
		z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
	if (start_pfn < MAX_DMA32_PFN) {
		unsigned long dma32_pfn = MAX_DMA32_PFN;
		if (dma32_pfn > end_pfn)
			dma32_pfn = end_pfn;
		z[ZONE_DMA32] = dma32_pfn - start_pfn;
	}
	z[ZONE_NORMAL] = end_pfn - start_pfn;

	/* Remove lower zones from higher ones. */
	w = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		if (z[i])
			z[i] -= w;
		w += z[i];
	}

	/* Compute holes */
	w = start_pfn;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long s = w;
		w += z[i];
		h[i] = e820_hole_size(s, w);
	}

	/* Add the space needed for mem_map to the holes too. */
	for (i = 0; i < MAX_NR_ZONES; i++)
		h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

	/* The 16MB DMA zone has the kernel and other misc mappings.
	   Account them too */
	if (h[ZONE_DMA]) {
		h[ZONE_DMA] += dma_reserve;
		if (h[ZONE_DMA] >= z[ZONE_DMA]) {
			printk(KERN_WARNING
				"Kernel too large and filling up ZONE_DMA?\n");
			h[ZONE_DMA] = z[ZONE_DMA];
		}
	}
}
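/*
 * Worked example (flat machine, start_pfn = 0, 8GB of RAM so
 * end_pfn = 2 * MAX_DMA32_PFN): before the subtraction pass,
 * z[ZONE_DMA] = MAX_DMA_PFN (0-16MB), z[ZONE_DMA32] = MAX_DMA32_PFN
 * (0-4GB) and z[ZONE_NORMAL] = end_pfn (0-8GB).  Removing lower zones
 * from higher ones leaves DMA = 16MB, DMA32 = 16MB-4GB and
 * NORMAL = 4GB-8GB, which is the per-zone size layout that
 * free_area_init_node() expects.
 */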
void __init paging_init(void)
{
	unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];

	memory_present(0, 0, end_pfn);
	sparse_init();
	size_zones(zones, holes, 0, end_pfn);
	free_area_init_node(0, NODE_DATA(0), zones,
			    __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
	unsigned long end = address + size;

	BUG_ON(address & ~LARGE_PAGE_MASK);
	BUG_ON(size & ~LARGE_PAGE_MASK);

	for (; address < end; address += LARGE_PAGE_SIZE) {
		pgd_t *pgd = pgd_offset_k(address);
		pud_t *pud;
		pmd_t *pmd;

		if (pgd_none(*pgd))
			continue;
		pud = pud_offset(pgd, address);
		if (pud_none(*pud))
			continue;
		pmd = pmd_offset(pud, address);
		if (!pmd || pmd_none(*pmd))
			continue;
		if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
			/* Could handle this, but it should not happen currently. */
			printk(KERN_ERR
	       "clear_kernel_mapping: mapping has been split. will leak memory\n");
			pmd_ERROR(*pmd);
		}
		set_pmd(pmd, __pmd(0));
	}
	__flush_tlb_all();
}
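/*
 * A sketch of the intended use, assuming the usual caller of this era
 * (the GART/IOMMU setup): the aperture range is punched out of the
 * direct mapping with clear_kernel_mapping() so speculative CPU
 * prefetches cannot create cached lines aliasing the uncached aperture.
 */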
/*
 * Memory hotplug specific functions
 * These are only for non-NUMA machines right now.
 */
#ifdef CONFIG_MEMORY_HOTPLUG

void online_page(struct page *page)
{
	ClearPageReserved(page);
	init_page_count(page);
	__free_page(page);
	totalram_pages++;
	num_physpages++;
}

int add_memory(u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(0);
	struct zone *zone = pgdat->node_zones + MAX_NR_ZONES-2;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	ret = __add_pages(zone, start_pfn, nr_pages);
	if (ret)
		goto error;

	init_memory_mapping(start, (start + size-1));

	return ret;
error:
	printk("%s: Problem encountered in __add_pages!\n", __func__);
	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);

#endif /* CONFIG_MEMORY_HOTPLUG */
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
			 kcore_vsyscall;
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
	pci_swiotlb_init();
#endif
	no_iommu_init();

	/* How many end-of-memory variables you have, grandma! */
	max_low_pfn = end_pfn;
	max_pfn = end_pfn;
	num_physpages = end_pfn;
	high_memory = (void *) __va(end_pfn * PAGE_SIZE);

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif
	reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

	after_bootmem = 1;

	codesize = (unsigned long) &_etext - (unsigned long) &_text;
	datasize = (unsigned long) &_edata - (unsigned long) &_etext;
	initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
	kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
		   VMALLOC_END-VMALLOC_START);
	kclist_add(&kcore_kernel, &_stext, _end - _stext);
	kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
		   VSYSCALL_END - VSYSCALL_START);

	printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
		(unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
		end_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);

#ifdef CONFIG_SMP
	/*
	 * Sync boot_level4_pgt mappings with the init_level4_pgt
	 * except for the low identity mappings which are already zapped
	 * in init_level4_pgt. This sync-up is essential for AP's bringup
	 */
	memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}
void free_initmem(void)
{
	unsigned long addr;

	addr = (unsigned long)(&__init_begin);
	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(addr));
		init_page_count(virt_to_page(addr));
		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
		free_page(addr);
		totalram_pages++;
	}
	memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
	printk("Freeing unused kernel memory: %luk freed\n",
	       (__init_end - __init_begin) >> 10);
}
#ifdef CONFIG_DEBUG_RODATA

extern char __start_rodata, __end_rodata;
void mark_rodata_ro(void)
{
	unsigned long addr = (unsigned long)&__start_rodata;

	for (; addr < (unsigned long)&__end_rodata; addr += PAGE_SIZE)
		change_page_attr_addr(addr, 1, PAGE_KERNEL_RO);

	printk("Write protecting the kernel read-only data: %luk\n",
	       (&__end_rodata - &__start_rodata) >> 10);

	/*
	 * change_page_attr_addr() requires a global_flush_tlb() call after it.
	 * We do this after the printk so that if something went wrong in the
	 * change, the printk gets out at least to give a better debug hint
	 * of who is the culprit.
	 */
	global_flush_tlb();
}
#endif
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
	if (start >= end)
		return;
	printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
	for (; start < end; start += PAGE_SIZE) {
		ClearPageReserved(virt_to_page(start));
		init_page_count(virt_to_page(start));
		free_page(start);
		totalram_pages++;
	}
}
#endif
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	int nid = phys_to_nid(phys);
	reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
	reserve_bootmem(phys, len);
#endif
	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
		dma_reserve += len / PAGE_SIZE;
}
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	/* Reject non-canonical addresses: the bits above the virtual
	   address width must be all zeroes or all ones. */
	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;
	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;
	return pfn_valid(pte_pfn(*pte));
}
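/*
 * Note the pmd_large() short-circuit in the walk above: a 2MB
 * direct-mapping page has no pte level underneath it, so validity is
 * decided at the pmd and the walk stops there.
 */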
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
	{ 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
	  proc_dointvec },
	{ 0, }
};

static ctl_table debug_root_table2[] = {
	{ .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
	  .child = debug_table2 },
	{ 0 },
};

static __init int x8664_sysctl_init(void)
{
	register_sysctl_table(debug_root_table2, 1);
	return 0;
}
__initcall(x8664_sysctl_init);
#endif
/* A pseudo VMA to allow ptrace access for the vsyscall page.  This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
	.vm_start = VSYSCALL_START,
	.vm_end = VSYSCALL_END,
	.vm_page_prot = PAGE_READONLY
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;
	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/* Use this when you have no reliable task/vma, typically from interrupt
 * context.  It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}
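/*
 * The vsyscall page has no real VMA on an mm's list, so generic code
 * (ptrace and ELF core dumping are the usual callers) asks these
 * helpers instead of walking the VMA list when it touches that page;
 * the pseudo VMA above stands in for the missing mapping.
 */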