/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
#include <linux/mm.h>
#include <linux/io.h>
#include <linux/hugetlb.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#define PAGE_SHIFT_64K	16
#define PAGE_SHIFT_16M	24
#define PAGE_SHIFT_16G	34

#define MAX_NUMBER_GPAGES	1024
/* Tracks the 16G pages after the device tree is scanned and before the
 * huge_boot_pages list is ready. */
static unsigned long gpage_freearray[MAX_NUMBER_GPAGES];
static unsigned nr_gpages;
/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
 * will choke on pointers to hugepte tables, which is handy for
 * catching screwups early. */
static inline int shift_to_mmu_psize(unsigned int shift)
{
	int psize;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize)
		if (mmu_psize_defs[psize].shift == shift)
			return psize;
	return -1;
}
static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
{
	if (mmu_psize_defs[mmu_psize].shift)
		return mmu_psize_defs[mmu_psize].shift;
	BUG();
}
#define hugepd_none(hpd)	((hpd).pd == 0)
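/* A hugepd entry packs two values into one word: the low
 * HUGEPD_SHIFT_MASK bits hold the shift of the hugepage size mapped
 * below it, and the remaining bits hold the address of the hugepte
 * table.  hugepd_page() recovers a usable pointer by OR-ing the ppc64
 * kernel linear-mapping offset (0xc000000000000000) back in. */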
static inline pte_t *hugepd_page(hugepd_t hpd)
{
	BUG_ON(!hugepd_ok(hpd));
	return (pte_t *)((hpd.pd & ~HUGEPD_SHIFT_MASK) | 0xc000000000000000);
}
static inline unsigned int hugepd_shift(hugepd_t hpd)
{
	return hpd.pd & HUGEPD_SHIFT_MASK;
}
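/* Locate the hugepte entry for an address within the table a hugepd
 * points to: the address bits below the span of one entry at this
 * pagetable level (pdshift), divided by the hugepage size, give the
 * index into the hugepte table. */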
static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr,
				    unsigned pdshift)
{
	unsigned long idx = (addr & ((1UL << pdshift) - 1)) >> hugepd_shift(*hpdp);
	pte_t *dir = hugepd_page(*hpdp);

	return dir + idx;
}
pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, unsigned *shift)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pdshift = PGDIR_SHIFT;

	if (shift)
		*shift = 0;

	pg = pgdir + pgd_index(ea);
	if (is_hugepd(pg)) {
		hpdp = (hugepd_t *)pg;
	} else if (!pgd_none(*pg)) {
		pdshift = PUD_SHIFT;
		pu = pud_offset(pg, ea);
		if (is_hugepd(pu))
			hpdp = (hugepd_t *)pu;
		else if (!pud_none(*pu)) {
			pdshift = PMD_SHIFT;
			pm = pmd_offset(pu, ea);
			if (is_hugepd(pm))
				hpdp = (hugepd_t *)pm;
			else if (!pmd_none(*pm)) {
				return pte_offset_map(pm, ea);
			}
		}
	}

	if (!hpdp)
		return NULL;

	if (shift)
		*shift = hugepd_shift(*hpdp);
	return hugepte_offset(hpdp, ea, pdshift);
}
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	return find_linux_pte_or_hugepte(mm->pgd, addr, NULL);
}
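/* Allocate a hugepte table and install it behind a hugepd entry.  The
 * re-check under mm->page_table_lock handles the race where another
 * thread installed a table first: the losing thread simply frees its
 * freshly allocated table again. */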
static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
			   unsigned long address, unsigned pdshift, unsigned pshift)
{
	pte_t *new = kmem_cache_zalloc(PGT_CACHE(pdshift - pshift),
				       GFP_KERNEL|__GFP_REPEAT);

	BUG_ON(pshift > HUGEPD_SHIFT_MASK);
	BUG_ON((unsigned long)new & HUGEPD_SHIFT_MASK);

	if (!new)
		return -ENOMEM;

	spin_lock(&mm->page_table_lock);
	if (!hugepd_none(*hpdp))
		kmem_cache_free(PGT_CACHE(pdshift - pshift), new);
	else
		hpdp->pd = ((unsigned long)new & ~0x8000000000000000) | pshift;
	spin_unlock(&mm->page_table_lock);
	return 0;
}
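/* Which pagetable level the hugepd sits at depends on the hugepage
 * size: pages spanning at least a PUD's range hook in at the PGD,
 * pages spanning at least a PMD's range at the PUD, and anything
 * smaller at the PMD. */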
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz)
{
	pgd_t *pg;
	pud_t *pu;
	pmd_t *pm;
	hugepd_t *hpdp = NULL;
	unsigned pshift = __ffs(sz);
	unsigned pdshift = PGDIR_SHIFT;

	addr &= ~(sz-1);

	pg = pgd_offset(mm, addr);
	if (pshift >= PUD_SHIFT) {
		hpdp = (hugepd_t *)pg;
	} else {
		pdshift = PUD_SHIFT;
		pu = pud_alloc(mm, pg, addr);
		if (pshift >= PMD_SHIFT) {
			hpdp = (hugepd_t *)pu;
		} else {
			pdshift = PMD_SHIFT;
			pm = pmd_alloc(mm, pu, addr);
			hpdp = (hugepd_t *)pm;
		}
	}

	if (!hpdp)
		return NULL;

	BUG_ON(!hugepd_none(*hpdp) && !hugepd_ok(*hpdp));

	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr, pdshift, pshift))
		return NULL;

	return hugepte_offset(hpdp, addr, pdshift);
}
/* Build list of addresses of gigantic pages.  This function is used in early
 * boot before the buddy or bootmem allocator is setup.
 */
void add_gpage(unsigned long addr, unsigned long page_size,
	       unsigned long number_of_pages)
{
	if (!addr)
		return;
	while (number_of_pages > 0) {
		gpage_freearray[nr_gpages] = addr;
		nr_gpages++;
		number_of_pages--;
		addr += page_size;
	}
}
/* Moves the gigantic page addresses from the temporary list to the
 * huge_boot_pages list.
 */
int alloc_bootmem_huge_page(struct hstate *hstate)
{
	struct huge_bootmem_page *m;
	if (nr_gpages == 0)
		return 0;
	m = phys_to_virt(gpage_freearray[--nr_gpages]);
	gpage_freearray[nr_gpages] = 0;
	list_add(&m->list, &huge_boot_pages);
	m->hstate = hstate;
	return 1;
}
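/* Note that no separate struct is allocated above: the start of the
 * gigantic page itself temporarily holds the struct huge_bootmem_page
 * that gets linked onto huge_boot_pages, which is why phys_to_virt()
 * on the page's address is all that is needed. */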
int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
{
	return 0;
}
static void free_hugepd_range(struct mmu_gather *tlb, hugepd_t *hpdp, int pdshift,
			      unsigned long start, unsigned long end,
			      unsigned long floor, unsigned long ceiling)
{
	pte_t *hugepte = hugepd_page(*hpdp);
	unsigned shift = hugepd_shift(*hpdp);
	unsigned long pdmask = ~((1UL << pdshift) - 1);

	start &= pdmask;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= pdmask;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	hpdp->pd = 0;
	tlb->need_flush = 1;
	pgtable_free_tlb(tlb, hugepte, pdshift - shift);
}
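/* As in the generic free_pgd_range(), the floor/ceiling checks above
 * ensure a hugepte table is only freed once the entire region it maps
 * lies inside the range the caller is tearing down. */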
static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		free_hugepd_range(tlb, (hugepd_t *)pmd, PMD_SHIFT,
				  addr, next, floor, ceiling);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
}
static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(pgd, addr);
	do {
		next = pud_addr_end(addr, end);
		if (!is_hugepd(pud)) {
			if (pud_none_or_clear_bad(pud))
				continue;
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pud, PUD_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(pgd, start);
	pgd_clear(pgd);
	pud_free_tlb(tlb, pud, start);
}
/*
 * This function frees user-level page tables of a process.
 *
 * Must be called with pagetable lock held.
 */
void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	unsigned long next;

	/*
	 * Because there are a number of different possible pagetable
	 * layouts for hugepage ranges, we limit knowledge of how
	 * things should be laid out to the allocation path
	 * (huge_pte_alloc(), above).  Everything else works out the
	 * structure as it goes from information in the hugepd
	 * pointers.  That means that we can't here use the
	 * optimization used in the normal page free_pgd_range(), of
	 * checking whether we're actually covering a large enough
	 * range to have to do anything at the top level of the walk
	 * instead of at the bottom.
	 *
	 * To make sense of this, you should probably go read the big
	 * block comment at the top of the normal free_pgd_range(),
	 * too.
	 */

	pgd = pgd_offset(tlb->mm, addr);
	do {
		next = pgd_addr_end(addr, end);
		if (!is_hugepd(pgd)) {
			if (pgd_none_or_clear_bad(pgd))
				continue;
			hugetlb_free_pud_range(tlb, pgd, addr, next, floor, ceiling);
		} else {
			free_hugepd_range(tlb, (hugepd_t *)pgd, PGDIR_SHIFT,
					  addr, next, floor, ceiling);
		}
	} while (pgd++, addr = next, addr != end);
}
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t pte)
{
	if (pte_present(*ptep)) {
		/* We open-code pte_clear because we need to pass the right
		 * argument to hpte_need_flush (huge / !huge). Might not be
		 * necessary anymore if we make hpte_need_flush() get the
		 * page size from the slices.
		 */
		pte_update(mm, addr, ptep, ~0UL, 1);
	}
	*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
}
pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned long old = pte_update(mm, addr, ptep, ~0UL, 1);
	return __pte(old);
}
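/* pte_update() with a ~0UL clear mask atomically clears the pte and
 * returns its old value; the final argument flags the entry as huge so
 * the hash-table flush code uses the right page size. */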
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;
	unsigned shift;
	unsigned long mask;

	ptep = find_linux_pte_or_hugepte(mm->pgd, address, &shift);

	/* Verify it is a huge page else bail. */
	if (!ptep || !shift)
		return ERR_PTR(-EINVAL);

	mask = (1UL << shift) - 1;
	page = pte_page(*ptep);
	if (page)
		page += (address & mask) / PAGE_SIZE;

	return page;
}
int pmd_huge(pmd_t pmd)
{
	return 0;
}
int pud_huge(pud_t pud)
{
	return 0;
}
struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}
static noinline int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
				unsigned long end, int write,
				struct page **pages, int *nr)
{
	unsigned long mask;
	unsigned long pte_end;
	struct page *head, *page;
	pte_t pte;
	int refs;

	pte_end = (addr + sz) & ~(sz-1);
	if (pte_end < end)
		end = pte_end;

	pte = *ptep;
	mask = _PAGE_PRESENT | _PAGE_USER;
	if (write)
		mask |= _PAGE_RW;

	if ((pte_val(pte) & mask) != mask)
		return 0;

	/* hugepages are never "special" */
	VM_BUG_ON(!pfn_valid(pte_pfn(pte)));

	refs = 0;
	head = pte_page(pte);

	page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
	do {
		VM_BUG_ON(compound_head(page) != head);
		pages[*nr] = page;
		(*nr)++;
		page++;
		refs++;
	} while (addr += PAGE_SIZE, addr != end);

	if (!page_cache_add_speculative(head, refs)) {
		*nr -= refs;
		return 0;
	}

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		/* Could be optimized better */
		*nr -= refs;
		while (refs--)
			put_page(head);
		return 0;
	}

	return 1;
}
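/* gup_hugepte() follows the lockless get_user_pages_fast() pattern:
 * read the pte once, collect the constituent 4K pages, take the page
 * references speculatively, then re-check the pte and back everything
 * out if it changed underneath us. */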
int gup_hugepd(hugepd_t *hugepd, unsigned pdshift,
	       unsigned long addr, unsigned long end,
	       int write, struct page **pages, int *nr)
{
	pte_t *ptep;
	unsigned long sz = 1UL << hugepd_shift(*hugepd);

	ptep = hugepte_offset(hugepd, addr, pdshift);
	do {
		if (!gup_hugepte(ptep, sz, addr, end, write, pages, nr))
			return 0;
	} while (ptep++, addr += sz, addr != end);

	return 1;
}
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	struct hstate *hstate = hstate_file(file);
	int mmu_psize = shift_to_mmu_psize(huge_page_shift(hstate));

	return slice_get_unmapped_area(addr, len, flags, mmu_psize, 1, 0);
}
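/* The ppc64 user address space is carved into slices (256MB each in
 * the low range, 1TB each in the high range), and each slice supports
 * only one page size, so finding a free area means finding (or
 * converting) a slice of the right psize rather than walking the VMA
 * tree directly. */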
unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
{
	unsigned int psize = get_slice_psize(vma->vm_mm, vma->vm_start);

	return 1UL << mmu_psize_to_shift(psize);
}
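/* For a power-of-2 size, __ffs(size) equals log2(size), so the shift
 * derived below both validates the size and selects the matching
 * mmu_psize_defs[] slot. */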
static int __init add_huge_page_size(unsigned long long size)
{
	int shift = __ffs(size);
	int mmu_psize;

	/* Check that it is a page size supported by the hardware and
	 * that it fits within pagetable and slice limits. */
	if (!is_power_of_2(size)
	    || (shift > SLICE_HIGH_SHIFT) || (shift <= PAGE_SHIFT))
		return -EINVAL;

	if ((mmu_psize = shift_to_mmu_psize(shift)) < 0)
		return -EINVAL;

#ifdef CONFIG_SPU_FS_64K_LS
	/* Disable support for 64K huge pages when 64K SPU local store
	 * support is enabled as the current implementation conflicts.
	 */
	if (shift == PAGE_SHIFT_64K)
		return -EINVAL;
#endif /* CONFIG_SPU_FS_64K_LS */

	BUG_ON(mmu_psize_defs[mmu_psize].shift != shift);

	/* Return if huge page size has already been setup */
	if (size_to_hstate(size))
		return 0;

	hugetlb_add_hstate(shift - PAGE_SHIFT);

	return 0;
}
static int __init hugepage_setup_sz(char *str)
{
	unsigned long long size;

	size = memparse(str, &str);

	if (add_huge_page_size(size) != 0)
		printk(KERN_WARNING "Invalid huge page size specified(%llu)\n",
		       size);

	return 1;
}
__setup("hugepagesz=", hugepage_setup_sz);
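/* Each supported hugepage geometry needs its own kmem cache for its
 * hugepte tables: a table hanging off a level-pdshift entry and mapping
 * pages of size 1 << shift holds 2^(pdshift - shift) pointers, which is
 * the value PGT_CACHE() is keyed on.  hugetlbpage_init() creates one
 * cache per size advertised by the hardware. */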
static int __init hugetlbpage_init(void)
{
	int psize;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -ENODEV;

	for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
		unsigned shift;
		unsigned pdshift;

		if (!mmu_psize_defs[psize].shift)
			continue;

		shift = mmu_psize_to_shift(psize);

		if (add_huge_page_size(1ULL << shift) < 0)
			continue;

		if (shift < PMD_SHIFT)
			pdshift = PMD_SHIFT;
		else if (shift < PUD_SHIFT)
			pdshift = PUD_SHIFT;
		else
			pdshift = PGDIR_SHIFT;

		pgtable_cache_add(pdshift - shift, NULL);
		if (!PGT_CACHE(pdshift - shift))
			panic("hugetlbpage_init(): could not create "
			      "pgtable cache for %d bit pagesize\n", shift);
	}
	/* Set default large page size. Currently, we pick 16M or 1M
	 * depending on what is available
	 */
	if (mmu_psize_defs[MMU_PAGE_16M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_16M].shift;
	else if (mmu_psize_defs[MMU_PAGE_1M].shift)
		HPAGE_SHIFT = mmu_psize_defs[MMU_PAGE_1M].shift;

	return 0;
}
module_init(hugetlbpage_init);