/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>

#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
#define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)

#define MAX_NUMBER_GPAGES	1024
/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready.  */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
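
/*
 * Rough lifecycle of the array above: the early device-tree scan hands
 * gigantic page addresses to add_gpage() below, which stashes them here;
 * later, alloc_bootmem_huge_page() pops them off one at a time and moves
 * them onto the generic huge_boot_pages list.
 */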
/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
static inline int shift_to_mmu_psize(unsigned int shift)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
		if (mmu_psize_defs[psize].shift == shift)
			return psize;
	return -1;
}
static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}
#define hugepd_none(hpd)	((hpd).pd == 0)
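
/*
 * Sketch of the hugepd encoding used by the helpers below (as implied by
 * hugepd_page() and __hugepte_alloc(), not a separate specification): the
 * low bits of hpd.pd hold the page shift of the hugepages covered
 * (HUGEPD_SHIFT_MASK), the rest is the kernel address of the hugepte table
 * with the top address bit stripped so that pmd_bad()/pud_bad() trip over
 * it; hugepd_page() ORs the kernel linear-mapping prefix back in.  For
 * example, a hugepd covering 16M pages would carry a page shift of 24 in
 * those low bits.
 */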
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!hugepd_ok(hpd));
	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
}
static inline unsigned int hugepd_shift(hugepd_t hpd)
{
	return hpd.pd & HUGEPD_SHIFT_MASK;
}
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
				    unsigned pdshift)
{
	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	pg = pgdir + pgd_index(ea);
	if (is_hugepd(pg)) {
		hpdp = (hugepd_t *)pg;
	} else if (!pgd_none(*pg)) {
		pdshift = PUD_SHIFT;
		pu = pud_offset(pg, ea);
		if (is_hugepd(pu))
			hpdp = (hugepd_t *)pu;
		else if (!pud_none(*pu)) {
			pdshift = PMD_SHIFT;
			pm = pmd_offset(pu, ea);
			if (is_hugepd(pm))
				hpdp = (hugepd_t *)pm;
			else if (!pmd_none(*pm)) {
				return pte_offset_map(pm, ea);
			}
		}
	}

	if (!hpdp)
		return NULL;

	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}
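
/*
 * huge_pte_offset() below is just the walk above applied to a given mm's
 * pgd with the page-size result discarded; callers that also need the page
 * size use find_linux_pte_or_hugepte() directly and pass a non-NULL shift
 * pointer.
 */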
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
				       GFP_KERNEL|__GFP_REPEAT);

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
	else
		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
	spin_unlock(&mm->page_table_lock);
	return 0;
}
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);
	if (pshift >= PUD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= PMD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(hpdp, addr, pdshift);
}
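
/*
 * The allocation path above is the one place that decides at which level a
 * hugepd is linked: a page shift at or above PUD_SHIFT links it straight
 * off the PGD entry, one at or above PMD_SHIFT hangs it off a PUD entry,
 * and anything smaller hangs it off a PMD entry.  Which real page sizes
 * land at which level depends on the base page size configuration.
 */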
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	       unsigned long number_of_pages)
{
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		addr += page_size;
		number_of_pages--;
	}
}
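
/*
 * A hypothetical caller sketch (the actual call sites live in the early
 * memory setup code, not in this file): on finding a 16G page reservation
 * in the device tree, boot code would call something like
 *
 *	add_gpage(phys_addr, 1UL << PAGE_SHIFT_16G, nr_pages);
 *
 * so that alloc_bootmem_huge_page() below can hand those pages to the
 * generic hugetlb boot allocator later.
 */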
/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;

	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	unsigned shift = hugepd_shift(*hpdp);
	unsigned long pdmask = ~((1UL << pdshift) - 1);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (!is_hugepd(pmd))
			continue;
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (!is_hugepd(pud)) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!is_hugepd(pgd)) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pgd++, addr = next, addr != end);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_need_flush (huge / !huge). Might not be
		 * necessary anymore if we make hpte_need_flush() get the
		 * page size from the slices
		 */
		pte_update(mm, addr, ptep, ~0UL, 1);
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}
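
/*
 * Note on set_huge_pte_at() above: the trailing "1" is the "huge" argument
 * of pte_update(), which is what lets hpte_need_flush() treat the entry as
 * a huge mapping rather than assuming the normal page size.
 */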
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
	return __pte(old);
}
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned shift;
	unsigned long mask;

	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

	/* Verify it is a huge page else bail. */
	if (!ptep || !shift)
		return ERR_PTR(-EINVAL);

	mask = (1UL << shift) - 1;
	page = pte_page(*ptep);
	if (page)
		page += (address & mask) / PAGE_SIZE;

	return page;
}
int pmd_huge(pmd_t pmd)
{
	return 0;
}

int pud_huge(pud_t pud)
{
	return 0;
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
				unsigned long end, int write, struct page **pages,
				int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		while (*nr) {
			put_page(page);
			(*nr)--;
		}
	}

	return 1;
}
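
/*
 * The speculative reference and the pte re-check in gup_hugepte() above are
 * what make the lockless fast path safe: the head page's refcount is only
 * taken via page_cache_add_speculative(), and if the hugepte changed while
 * the sub-pages were being collected, the references are dropped again and
 * the slow path is left to sort it out.
 */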
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr += sz, addr != end);

	return 1;
}
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

	return 1UL << mmu_psize_to_shift(psize);
}
/*
 * Called by asm hashtable.S for doing lazy icache flush
 */
static unsigned int hash_huge_page_do_lazy_icache(unsigned long rflags,
						  pte_t pte, int trap,
						  unsigned long sz)
{
	struct page *page;
	int i;

	if (!pfn_valid(pte_pfn(pte)))
		return rflags;

	page = pte_page(pte);

	/* page is dirty */
	if (!test_bit(PG_arch_1, &page->flags) && !PageReserved(page)) {
		if (trap == 0x400) {
			for (i = 0; i < (sz / PAGE_SIZE); i++)
				__flush_dcache_icache(page_address(page+i));
			set_bit(PG_arch_1, &page->flags);
		} else {
			rflags |= HPTE_R_N;
		}
	}
	return rflags;
}
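
/*
 * PG_arch_1 is used here in the usual powerpc sense: it records that the
 * icache has been flushed for (all sub-pages of) this page, so the flush
 * above happens lazily and at most once per page while the bit stays set.
 */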
int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
		     pte_t *ptep, unsigned long trap, int local, int ssize,
		     unsigned int shift, unsigned int mmu_psize)
{
	unsigned long old_pte, new_pte;
	unsigned long va, rflags, pa, sz;
	long slot;
	int err = 1;

	BUG_ON(shift != mmu_psize_defs[mmu_psize].shift);

	/* Search the Linux page table for a match with va */
	va = hpt_va(ea, vsid, ssize);

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	do {
		old_pte = pte_val(*ptep);
		if (old_pte & _PAGE_BUSY)
			goto out;
		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
	} while(old_pte != __cmpxchg_u64((unsigned long *)ptep,
					 old_pte, new_pte));

	rflags = 0x2 | (!(new_pte & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((new_pte & _PAGE_EXEC) ? 0 : HPTE_R_N);
	sz = ((1UL) << shift);
	if (!cpu_has_feature(CPU_FTR_COHERENT_ICACHE))
		/* No CPU has hugepages but lacks no execute, so we
		 * don't need to worry about that case */
		rflags = hash_huge_page_do_lazy_icache(rflags, __pte(old_pte),
						       trap, sz);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(old_pte & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(va, shift, ssize);
		if (old_pte & _PAGE_F_SECOND)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (old_pte & _PAGE_F_GIX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, mmu_psize,
					 ssize, local) == -1)
			old_pte &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(old_pte & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(va, shift, ssize);
		unsigned long hpte_group;

		pa = pte_pfn(__pte(old_pte)) << PAGE_SHIFT;

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* clear HPTE slot informations in new PTE */
#ifdef CONFIG_PPC_64K_PAGES
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HPTE_SUB0;
#else
		new_pte = (new_pte & ~_PAGE_HPTEFLAGS) | _PAGE_HASHPTE;
#endif
		/* Add in WIMG bits */
		rflags |= (new_pte & (_PAGE_WRITETHRU | _PAGE_NO_CACHE |
				      _PAGE_COHERENT | _PAGE_GUARDED));

		/* Insert into the hash table, primary slot */
		slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags, 0,
					  mmu_psize, ssize);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, pa, rflags,
						  HPTE_V_SECONDARY,
						  mmu_psize, ssize);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) *
						      HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
	}

	/*
	 * No need to use ldarx/stdcx here
	 */
	*ptep = __pte(new_pte & ~_PAGE_BUSY);

	err = 0;

out:
	return err;
}
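
/*
 * Worked example for the permission encoding in __hash_page_huge() above:
 * rflags starts as 0x2 | (!(new_pte & _PAGE_RW)), i.e. pp = 0x2 for a
 * writable pte and pp = 0x3 (read-only) otherwise, and HPTE_R_N is ORed in
 * when the pte lacks _PAGE_EXEC, since the hardware bit means "no execute"
 * while the Linux bit means "execute allowed".
 */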
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (!is_power_of_2(size)
	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}
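
/*
 * For instance, assuming the MMU reports a 16M page size, a 16M request
 * passes the checks above (16M is a power of two with shift 24, above
 * PAGE_SHIFT and within the slice limit) and then registers an hstate of
 * order 24 - PAGE_SHIFT with the generic hugetlb code.
 */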
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0)
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n",
		       size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);
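
/*
 * In practice this pairs with the generic hugetlb command line handling:
 * e.g. booting with "hugepagesz=16M hugepages=N" has the handler above
 * register the 16M hstate, while the pool size itself is filled in by the
 * core hugetlb code.
 */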
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}

	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

	return 0;
}

module_init(hugetlbpage_init);