[PATCH] x86_64: Replace swiotlb extern with include
arch/x86_64/mm/init.c (blob 854a41b8372bc9b345fb061357bd293502e75150)
/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
static unsigned long dma_reserve __initdata;

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously in
 * physical space so we can cache the place of the first one and move
 * around without checking the pgd every time.
 */
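/* Dump a per-node summary of total, reserved, shared and swap-cached pages. */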
void show_mem(void)
{
        long i, total = 0, reserved = 0;
        long shared = 0, cached = 0;
        pg_data_t *pgdat;
        struct page *page;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));

        for_each_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        page = pfn_to_page(pgdat->node_start_pfn + i);
                        total++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
        }
        printk(KERN_INFO "%lu pages of RAM\n", total);
        printk(KERN_INFO "%lu reserved pages\n", reserved);
        printk(KERN_INFO "%lu pages shared\n", shared);
        printk(KERN_INFO "%lu pages swap cached\n", cached);
}

/* References to section boundaries */

int after_bootmem;
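/*
 * Allocate a zeroed page for a kernel page table: from the bootmem
 * allocator before after_bootmem is set, from the page allocator after.
 */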
static void *spp_getpage(void)
{
        void *ptr;
        if (after_bootmem)
                ptr = (void *) get_zeroed_page(GFP_ATOMIC);
        else
                ptr = alloc_bootmem_pages(PAGE_SIZE);
        if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
                panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem ? "after bootmem" : "");

        Dprintk("spp_getpage %p\n", ptr);
        return ptr;
}
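/*
 * Map a single kernel page at vaddr to the given physical address,
 * allocating any missing intermediate page tables with spp_getpage().
 */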
static void set_pte_phys(unsigned long vaddr,
                         unsigned long phys, pgprot_t prot)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte, new_pte;

        Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);

        pgd = pgd_offset_k(vaddr);
        if (pgd_none(*pgd)) {
                printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                pmd = (pmd_t *) spp_getpage();
                set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
                if (pmd != pmd_offset(pud, 0)) {
                        printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud, 0));
                        return;
                }
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                pte = (pte_t *) spp_getpage();
                set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
                if (pte != pte_offset_kernel(pmd, 0)) {
                        printk("PAGETABLE BUG #02!\n");
                        return;
                }
        }
        new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);

        pte = pte_offset_kernel(pmd, vaddr);
        if (!pte_none(*pte) &&
            pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
                pte_ERROR(*pte);
        set_pte(pte, new_pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
/* NOTE: this is meant to be run only at boot */
void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                printk("Invalid __set_fixmap\n");
                return;
        }
        set_pte_phys(address, phys, prot);
}
unsigned long __initdata table_start, table_end;

extern pmd_t temp_boot_pmds[];

static struct temp_map {
        pmd_t *pmd;
        void *address;
        int allocated;
} temp_mappings[] __initdata = {
        { &temp_boot_pmds[0], (void *)(40UL * 1024 * 1024) },
        { &temp_boot_pmds[1], (void *)(42UL * 1024 * 1024) },
        {}
};
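/*
 * Hand out the next page of the early page-table area (pfn table_end++)
 * and map it through one of the temporary boot PMDs so it can be written
 * before the direct mapping exists.  Paired with unmap_low_page().
 */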
static __init void *alloc_low_page(int *index, unsigned long *phys)
{
        struct temp_map *ti;
        int i;
        unsigned long pfn = table_end++, paddr;
        void *adr;

        if (pfn >= end_pfn)
                panic("alloc_low_page: ran out of memory");
        for (i = 0; temp_mappings[i].allocated; i++) {
                if (!temp_mappings[i].pmd)
                        panic("alloc_low_page: ran out of temp mappings");
        }
        ti = &temp_mappings[i];
        paddr = (pfn << PAGE_SHIFT) & PMD_MASK;
        set_pmd(ti->pmd, __pmd(paddr | _KERNPG_TABLE | _PAGE_PSE));
        ti->allocated = 1;
        __flush_tlb();
        adr = ti->address + ((pfn << PAGE_SHIFT) & ~PMD_MASK);
        *index = i;
        *phys = pfn * PAGE_SIZE;
        return adr;
}
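/* Release the temporary mapping set up by alloc_low_page(). */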
static __init void unmap_low_page(int i)
{
        struct temp_map *ti = &temp_mappings[i];
        set_pmd(ti->pmd, __pmd(0));
        ti->allocated = 0;
}
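/*
 * Fill one PUD's worth of the direct mapping with 2MB pages, skipping
 * 1GB slots that contain no e820-mapped memory.
 */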
static void __init phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
{
        long i, j;

        i = pud_index(address);
        pud = pud + i;
        for (; i < PTRS_PER_PUD; pud++, i++) {
                int map;
                unsigned long paddr, pmd_phys;
                pmd_t *pmd;

                paddr = address + i*PUD_SIZE;
                if (paddr >= end) {
                        for (; i < PTRS_PER_PUD; i++, pud++)
                                set_pud(pud, __pud(0));
                        break;
                }

                if (!e820_mapped(paddr, paddr+PUD_SIZE, 0)) {
                        set_pud(pud, __pud(0));
                        continue;
                }

                pmd = alloc_low_page(&map, &pmd_phys);
                set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
                for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
                        unsigned long pe;

                        if (paddr >= end) {
                                for (; j < PTRS_PER_PMD; j++, pmd++)
                                        set_pmd(pmd, __pmd(0));
                                break;
                        }
                        pe = _PAGE_NX|_PAGE_PSE | _KERNPG_TABLE | _PAGE_GLOBAL | paddr;
                        pe &= __supported_pte_mask;
                        set_pmd(pmd, __pmd(pe));
                }
                unmap_low_page(map);
        }
        __flush_tlb();
}
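/*
 * Find room below the kernel text for the worst-case number of PUD and
 * PMD pages needed to map memory up to 'end', and initialize
 * table_start/table_end to that window.
 */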
static void __init find_early_table_space(unsigned long end)
{
        unsigned long puds, pmds, tables;

        puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
        pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
        tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
                 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);

        table_start = find_e820_area(0x8000, __pa_symbol(&_text), tables);
        if (table_start == -1UL)
                panic("Cannot find space for the kernel page tables");

        table_start >>= PAGE_SHIFT;
        table_end = table_start;
}
/* Setup the direct mapping of the physical memory at PAGE_OFFSET.
   This runs before bootmem is initialized and gets pages directly from the
   physical memory. To access them they are temporarily mapped. */
void __init init_memory_mapping(unsigned long start, unsigned long end)
{
        unsigned long next;

        Dprintk("init_memory_mapping\n");

        /*
         * Find space for the kernel direct mapping tables.
         * Later we should allocate these tables in the local node of the memory
         * mapped. Unfortunately this is done currently before the nodes are
         * discovered.
         */
        find_early_table_space(end);

        start = (unsigned long)__va(start);
        end = (unsigned long)__va(end);

        for (; start < end; start = next) {
                int map;
                unsigned long pud_phys;
                pud_t *pud = alloc_low_page(&map, &pud_phys);
                next = start + PGDIR_SIZE;
                if (next > end)
                        next = end;
                phys_pud_init(pud, __pa(start), __pa(next));
                set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
                unmap_low_page(map);
        }

        asm volatile("movq %%cr4,%0" : "=r" (mmu_cr4_features));
        __flush_tlb_all();
        early_printk("kernel direct mapping tables upto %lx @ %lx-%lx\n", end,
                     table_start<<PAGE_SHIFT,
                     table_end<<PAGE_SHIFT);
}
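/*
 * Remove the low identity mapping used during boot: the boot CPU clears
 * the first kernel PGD entry, APs simply switch cr3 to init_level4_pgt.
 */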
void __cpuinit zap_low_mappings(int cpu)
{
        if (cpu == 0) {
                pgd_t *pgd = pgd_offset_k(0UL);
                pgd_clear(pgd);
        } else {
                /*
                 * For AP's, zap the low identity mappings by changing the cr3
                 * to init_level4_pgt and doing local flush tlb all
                 */
                asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
        }
        __flush_tlb_all();
}
/* Compute zone sizes for the DMA and DMA32 zones in a node. */
__init void
size_zones(unsigned long *z, unsigned long *h,
           unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        unsigned long w;

        for (i = 0; i < MAX_NR_ZONES; i++)
                z[i] = 0;

        if (start_pfn < MAX_DMA_PFN)
                z[ZONE_DMA] = MAX_DMA_PFN - start_pfn;
        if (start_pfn < MAX_DMA32_PFN) {
                unsigned long dma32_pfn = MAX_DMA32_PFN;
                if (dma32_pfn > end_pfn)
                        dma32_pfn = end_pfn;
                z[ZONE_DMA32] = dma32_pfn - start_pfn;
        }
        z[ZONE_NORMAL] = end_pfn - start_pfn;

        /* Remove lower zones from higher ones. */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (z[i])
                        z[i] -= w;
                w += z[i];
        }

        /* Compute holes */
        w = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long s = w;
                w += z[i];
                h[i] = e820_hole_size(s, w);
        }

        /* Add the space needed for mem_map to the holes too. */
        for (i = 0; i < MAX_NR_ZONES; i++)
                h[i] += (z[i] * sizeof(struct page)) / PAGE_SIZE;

        /* The 16MB DMA zone has the kernel and other misc mappings.
           Account them too. */
        if (h[ZONE_DMA]) {
                h[ZONE_DMA] += dma_reserve;
                if (h[ZONE_DMA] >= z[ZONE_DMA]) {
                        printk(KERN_WARNING
                                "Kernel too large and filling up ZONE_DMA?\n");
                        h[ZONE_DMA] = z[ZONE_DMA];
                }
        }
}
#ifndef CONFIG_NUMA
void __init paging_init(void)
{
        unsigned long zones[MAX_NR_ZONES], holes[MAX_NR_ZONES];
        size_zones(zones, holes, 0, end_pfn);
        free_area_init_node(0, NODE_DATA(0), zones,
                            __pa(PAGE_OFFSET) >> PAGE_SHIFT, holes);
}
#endif
/* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
   from the CPU leading to inconsistent cache lines. address and size
   must be aligned to 2MB boundaries.
   Does nothing when the mapping doesn't exist. */
void __init clear_kernel_mapping(unsigned long address, unsigned long size)
{
        unsigned long end = address + size;

        BUG_ON(address & ~LARGE_PAGE_MASK);
        BUG_ON(size & ~LARGE_PAGE_MASK);

        for (; address < end; address += LARGE_PAGE_SIZE) {
                pgd_t *pgd = pgd_offset_k(address);
                pud_t *pud;
                pmd_t *pmd;
                if (pgd_none(*pgd))
                        continue;
                pud = pud_offset(pgd, address);
                if (pud_none(*pud))
                        continue;
                pmd = pmd_offset(pud, address);
                if (!pmd || pmd_none(*pmd))
                        continue;
                if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
                        /* Could handle this, but it should not happen currently. */
                        printk(KERN_ERR
                               "clear_kernel_mapping: mapping has been split. will leak memory\n");
                        pmd_ERROR(*pmd);
                }
                set_pmd(pmd, __pmd(0));
        }
        __flush_tlb_all();
}
static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
                         kcore_vsyscall;
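/*
 * Late memory setup: enable swiotlb when there is no IOMMU aperture and
 * memory extends beyond 4GB (or force_iommu is set), hand all bootmem
 * pages to the buddy allocator, register /proc/kcore areas and print the
 * memory summary.
 */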
void __init mem_init(void)
{
        long codesize, reservedpages, datasize, initsize;

#ifdef CONFIG_SWIOTLB
        if (!iommu_aperture &&
            (end_pfn >= 0xffffffff>>PAGE_SHIFT || force_iommu))
                swiotlb = 1;
        if (swiotlb)
                swiotlb_init();
#endif

        /* How many end-of-memory variables you have, grandma! */
        max_low_pfn = end_pfn;
        max_pfn = end_pfn;
        num_physpages = end_pfn;
        high_memory = (void *) __va(end_pfn * PAGE_SIZE);

        /* clear the zero-page */
        memset(empty_zero_page, 0, PAGE_SIZE);

        reservedpages = 0;

        /* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
        totalram_pages = numa_free_all_bootmem();
#else
        totalram_pages = free_all_bootmem();
#endif
        reservedpages = end_pfn - totalram_pages - e820_hole_size(0, end_pfn);

        after_bootmem = 1;

        codesize = (unsigned long) &_etext - (unsigned long) &_text;
        datasize = (unsigned long) &_edata - (unsigned long) &_etext;
        initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;

        /* Register memory areas for /proc/kcore */
        kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
        kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
                   VMALLOC_END-VMALLOC_START);
        kclist_add(&kcore_kernel, &_stext, _end - _stext);
        kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
        kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
                   VSYSCALL_END - VSYSCALL_START);

        printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
                (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
                end_pfn << (PAGE_SHIFT-10),
                codesize >> 10,
                reservedpages << (PAGE_SHIFT-10),
                datasize >> 10,
                initsize >> 10);

#ifdef CONFIG_SMP
        /*
         * Sync boot_level4_pgt mappings with the init_level4_pgt
         * except for the low identity mappings which are already zapped
         * in init_level4_pgt. This sync-up is essential for AP's bringup
         */
        memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t));
#endif
}
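/*
 * Free the pages between __init_begin and __init_end back to the page
 * allocator, poisoning them to catch late references to __init code/data.
 */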
void free_initmem(void)
{
        unsigned long addr;

        addr = (unsigned long)(&__init_begin);
        for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(addr));
                set_page_count(virt_to_page(addr), 1);
                memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE);
                free_page(addr);
                totalram_pages++;
        }
        memset(__initdata_begin, 0xba, __initdata_end - __initdata_begin);
        printk("Freeing unused kernel memory: %luk freed\n", (__init_end - __init_begin) >> 10);
}
#ifdef CONFIG_BLK_DEV_INITRD
void free_initrd_mem(unsigned long start, unsigned long end)
{
        if (start < (unsigned long)&_end)
                return;
        printk("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
        for (; start < end; start += PAGE_SIZE) {
                ClearPageReserved(virt_to_page(start));
                set_page_count(virt_to_page(start), 1);
                free_page(start);
                totalram_pages++;
        }
}
#endif
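/*
 * Reserve a physical range with the (node-aware, if NUMA) bootmem
 * allocator; ranges entirely below 16MB are also accounted in dma_reserve.
 */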
void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
{
        /* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
        int nid = phys_to_nid(phys);
        reserve_bootmem_node(NODE_DATA(nid), phys, len);
#else
        reserve_bootmem(phys, len);
#endif
        if (phys+len <= MAX_DMA_PFN*PAGE_SIZE)
                dma_reserve += len / PAGE_SIZE;
}
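/*
 * Walk the kernel page tables to check whether a (canonical) kernel
 * virtual address is backed by a valid pfn; handles 2MB large pages.
 */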
int kern_addr_valid(unsigned long addr)
{
        unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        if (above != 0 && above != -1UL)
                return 0;

        pgd = pgd_offset_k(addr);
        if (pgd_none(*pgd))
                return 0;

        pud = pud_offset(pgd, addr);
        if (pud_none(*pud))
                return 0;

        pmd = pmd_offset(pud, addr);
        if (pmd_none(*pmd))
                return 0;
        if (pmd_large(*pmd))
                return pfn_valid(pmd_pfn(*pmd));

        pte = pte_offset_kernel(pmd, addr);
        if (pte_none(*pte))
                return 0;
        return pfn_valid(pte_pfn(*pte));
}
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>

extern int exception_trace, page_fault_trace;

static ctl_table debug_table2[] = {
        { 99, "exception-trace", &exception_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#ifdef CONFIG_CHECKING
        { 100, "page-fault-trace", &page_fault_trace, sizeof(int), 0644, NULL,
          proc_dointvec },
#endif
        { 0, }
};

static ctl_table debug_root_table2[] = {
        { .ctl_name = CTL_DEBUG, .procname = "debug", .mode = 0555,
          .child = debug_table2 },
        { 0 },
};

static __init int x8664_sysctl_init(void)
{
        register_sysctl_table(debug_root_table2, 1);
        return 0;
}
__initcall(x8664_sysctl_init);
#endif
/* A pseudo VMA to allow ptrace access for the vsyscall page. This only
   covers the 64bit vsyscall page now. 32bit has a real VMA now and does
   not need special handling anymore. */

static struct vm_area_struct gate_vma = {
        .vm_start = VSYSCALL_START,
        .vm_end = VSYSCALL_END,
        .vm_page_prot = PAGE_READONLY
};
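/*
 * Return the vsyscall gate VMA. ia32 tasks get NULL since they have a
 * real vsyscall VMA and need no special handling here.
 */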
struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
        if (test_tsk_thread_flag(tsk, TIF_IA32))
                return NULL;
#endif
        return &gate_vma;
}
int in_gate_area(struct task_struct *task, unsigned long addr)
{
        struct vm_area_struct *vma = get_gate_vma(task);
        if (!vma)
                return 0;
        return (addr >= vma->vm_start) && (addr < vma->vm_end);
}
/* Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives.
 */
int in_gate_area_no_task(unsigned long addr)
{
        return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}