/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>

#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#define HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))

#define HUGEPTE_INDEX_SIZE	9
#define HUGEPGD_INDEX_SIZE	10

#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
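
/*
 * Geometry note: each huge-PTE page holds PAGE_SIZE / sizeof(pte_t)
 * = 2^(PAGE_SHIFT - 3) entries (assuming an 8-byte pte_t), and each
 * entry maps 2^HPAGE_SHIFT bytes, so one huge_pgdir slot covers
 * 2^HUGEPGDIR_SHIFT bytes.  For example, with 4K base pages and 16M
 * huge pages that is 512 PTEs per page and 8G per huge_pgdir slot.
 */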
static inline int hugepgd_index(unsigned long addr)
{
        return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
}
static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
{
        int index;

        if (! mm->context.huge_pgdir)
                return NULL;

        index = hugepgd_index(addr);
        BUG_ON(index >= PTRS_PER_HUGEPGD);
        return (pud_t *)(mm->context.huge_pgdir + index);
}
static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
{
        int index;

        if (pud_none(*dir))
                return NULL;

        index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
        return (pte_t *)pud_page(*dir) + index;
}
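
/*
 * Hugepage PTEs live in a separate two-level tree hung off
 * mm->context.huge_pgdir: a directory of PTRS_PER_HUGEPGD slots, each
 * of which points at a page holding PTRS_PER_HUGEPTE hugepage PTEs.
 * hugepgd_offset()/hugepte_offset() only walk that tree; the *_alloc()
 * variants further down populate it on demand.
 */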
static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
{
        BUG_ON(! in_hugepage_area(mm->context, addr));

        if (! mm->context.huge_pgdir) {
                pgd_t *new;

                spin_unlock(&mm->page_table_lock);
                /* Don't use pgd_alloc(), because we want __GFP_REPEAT */
                new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
                BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
                spin_lock(&mm->page_table_lock);

                /*
                 * Because we dropped the lock, we should re-check the
                 * entry, as somebody else could have populated it..
                 */
                if (mm->context.huge_pgdir)
                        pgd_free(new);
                else
                        mm->context.huge_pgdir = new;
        }
        return hugepgd_offset(mm, addr);
}
static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir,
                            unsigned long addr)
{
        if (! pud_present(*dir)) {
                pte_t *new;

                spin_unlock(&mm->page_table_lock);
                new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
                BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
                spin_lock(&mm->page_table_lock);

                /*
                 * Because we dropped the lock, we should re-check the
                 * entry, as somebody else could have populated it..
                 */
                if (pud_present(*dir)) {
                        kmem_cache_free(zero_cache, new);
                } else {
                        struct page *ptepage;

                        ptepage = virt_to_page(new);
                        ptepage->mapping = (void *) mm;
                        ptepage->index = addr & HUGEPGDIR_MASK;
                        pud_populate(mm, dir, new);
                }
        }

        return hugepte_offset(dir, addr);
}
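
/*
 * Note on the allocation pattern above: both allocators are entered
 * with mm->page_table_lock held, but kmem_cache_alloc(GFP_KERNEL) may
 * sleep, so the lock is dropped around the allocation and retaken
 * afterwards.  That is why each one re-checks whether another thread
 * populated the entry in the meantime and frees its copy if so.
 */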
static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
        pud_t *pud;

        BUG_ON(! in_hugepage_area(mm->context, addr));

        pud = hugepgd_offset(mm, addr);
        if (! pud)
                return NULL;

        return hugepte_offset(pud, addr);
}
static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
        pud_t *pud;

        BUG_ON(! in_hugepage_area(mm->context, addr));

        pud = hugepgd_alloc(mm, addr);
        if (! pud)
                return NULL;

        return hugepte_alloc(mm, pud, addr);
}
static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long addr, struct page *page,
                         pte_t *ptep, int write_access)
{
        pte_t entry;

        add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
        if (write_access) {
                entry =
                    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
        } else {
                entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);

        set_pte_at(mm, addr, ptep, entry);
}
/*
 * This function checks for proper alignment of input addr and len parameters.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
        if (len & ~HPAGE_MASK)
                return -EINVAL;
        if (addr & ~HPAGE_MASK)
                return -EINVAL;
        if (! (within_hugepage_low_range(addr, len)
               || within_hugepage_high_range(addr, len)))
                return -EINVAL;
        return 0;
}
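
/*
 * For example, assuming 16M huge pages: len must be a multiple of 16M,
 * addr must be 16M-aligned, and [addr, addr+len) must fall entirely
 * within either the low or the high hugepage range; anything else is
 * rejected with -EINVAL.
 */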
static void flush_segments(void *parm)
{
        u16 segs = (unsigned long) parm;
        unsigned long i;

        asm volatile("isync" : : : "memory");

        for (i = 0; i < 16; i++) {
                if (! (segs & (1U << i)))
                        continue;
                asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
        }

        asm volatile("isync" : : : "memory");
}
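
/*
 * Each bit in the mask passed to flush_segments() names one low
 * address-space segment (1UL << SID_SHIFT bytes, i.e. 256M segments on
 * this platform); slbie evicts that segment's SLB entry so the next
 * access refaults and picks up the new hugepage segment attributes.
 */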
static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
        unsigned long start = seg << SID_SHIFT;
        unsigned long end = (seg+1) << SID_SHIFT;
        struct vm_area_struct *vma;

        /* Check no VMAs are in the region */
        vma = find_vma(mm, start);
        if (vma && (vma->vm_start < end))
                return -EBUSY;

        return 0;
}
static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
{
        unsigned long i;

        newsegs &= ~(mm->context.htlb_segs);
        if (! newsegs)
                return 0; /* The segments we want are already open */

        for (i = 0; i < 16; i++)
                if ((1 << i) & newsegs)
                        if (prepare_low_seg_for_htlb(mm, i) != 0)
                                return -EBUSY;

        mm->context.htlb_segs |= newsegs;

        /* update the paca copy of the context struct */
        get_paca()->context = mm->context;

        /* the context change must make it to memory before the flush,
         * so that further SLB misses do the right thing. */
        mb();

        on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);

        return 0;
}
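
/*
 * Illustrative example (not from the original source): a 32-bit task
 * asking for a hugepage mapping at 0x30000000 of length 0x20000000
 * touches 256M segments 3 and 4, so LOW_ESID_MASK() yields 0x0018 and
 * open_low_hpage_segs() must verify both segments are free of normal
 * VMAs before marking them as hugepage segments and flushing the SLB.
 */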
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
        if (within_hugepage_high_range(addr, len))
                return 0;
        else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
                int err;
                /* Yes, we need both tests, in case addr+len overflows
                 * 64-bit arithmetic */
                err = open_low_hpage_segs(current->mm,
                                          LOW_ESID_MASK(addr, len));
                if (err)
                        printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
                               " failed (segs: 0x%04hx)\n", addr, len,
                               LOW_ESID_MASK(addr, len));
                return err;
        }

        return -EINVAL;
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
{
        pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr = vma->vm_start;
        unsigned long end = vma->vm_end;
        int err = -ENOMEM;

        while (addr < end) {
                dst_pte = huge_pte_alloc(dst, addr);
                if (!dst_pte)
                        goto out;

                src_pte = huge_pte_offset(src, addr);
                entry = *src_pte;

                ptepage = pte_page(entry);
                get_page(ptepage);
                add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
                set_pte_at(dst, addr, dst_pte, entry);

                addr += HPAGE_SIZE;
        }

        err = 0;

 out:
        return err;
}
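
/*
 * Note: the copy above does not duplicate the hugepages themselves.
 * The destination mm gets the same pte value as the source, so parent
 * and child share the physical hugepage; get_page() accounts for the
 * extra reference.
 */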
int
follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                    struct page **pages, struct vm_area_struct **vmas,
                    unsigned long *position, int *length, int i)
{
        unsigned long vpfn, vaddr = *position;
        int remainder = *length;

        WARN_ON(!is_vm_hugetlb_page(vma));

        vpfn = vaddr/PAGE_SIZE;
        while (vaddr < vma->vm_end && remainder) {
                if (pages) {
                        pte_t *pte;
                        struct page *page;

                        pte = huge_pte_offset(mm, vaddr);

                        /* hugetlb should be locked, and hence, prefaulted */
                        WARN_ON(!pte || pte_none(*pte));

                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];

                        WARN_ON(!PageCompound(page));

                        get_page(page);
                        pages[i] = page;
                }

                if (vmas)
                        vmas[i] = vma;

                vaddr += PAGE_SIZE;
                ++vpfn;
                --remainder;
                ++i;
        }

        *length = remainder;
        *position = vaddr;

        return i;
}
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
        pte_t *ptep;
        struct page *page;

        if (! in_hugepage_area(mm->context, address))
                return ERR_PTR(-EINVAL);

        ptep = huge_pte_offset(mm, address);
        page = pte_page(*ptep);
        if (page)
                page += (address % HPAGE_SIZE) / PAGE_SIZE;

        return page;
}
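
/*
 * The pointer returned above is the individual PAGE_SIZE sub-page of
 * the compound hugepage that covers 'address', not the head page --
 * hence the (address % HPAGE_SIZE) / PAGE_SIZE offset.
 */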
int pmd_huge(pmd_t pmd)
{
        return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
                pmd_t *pmd, int write)
{
        BUG();
        return NULL;
}
void unmap_hugepage_range(struct vm_area_struct *vma,
                          unsigned long start, unsigned long end)
{
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr;
        pte_t *ptep;
        struct page *page;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON((start % HPAGE_SIZE) != 0);
        BUG_ON((end % HPAGE_SIZE) != 0);

        for (addr = start; addr < end; addr += HPAGE_SIZE) {
                pte_t pte;

                ptep = huge_pte_offset(mm, addr);
                if (!ptep || pte_none(*ptep))
                        continue;

                pte = *ptep;
                page = pte_page(pte);
                pte_clear(mm, addr, ptep);

                put_page(page);
        }
        add_mm_counter(mm, rss, -((end - start) >> PAGE_SHIFT));
        flush_tlb_pending();
}
int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
{
        struct mm_struct *mm = current->mm;
        unsigned long addr;
        int ret = 0;

        WARN_ON(!is_vm_hugetlb_page(vma));
        BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
        BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);

        spin_lock(&mm->page_table_lock);
        for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
                unsigned long idx;
                pte_t *pte = huge_pte_alloc(mm, addr);
                struct page *page;

                if (!pte) {
                        ret = -ENOMEM;
                        goto out;
                }
                if (! pte_none(*pte))
                        continue;

                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
                page = find_get_page(mapping, idx);
                if (!page) {
                        /* charge the fs quota first */
                        if (hugetlb_get_quota(mapping)) {
                                ret = -ENOMEM;
                                goto out;
                        }
                        page = alloc_huge_page();
                        if (!page) {
                                hugetlb_put_quota(mapping);
                                ret = -ENOMEM;
                                goto out;
                        }
                        ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
                        if (! ret) {
                                unlock_page(page);
                        } else {
                                hugetlb_put_quota(mapping);
                                free_huge_page(page);
                                goto out;
                        }
                }
                set_huge_pte(mm, vma, addr, page, pte, vma->vm_flags & VM_WRITE);
        }
 out:
        spin_unlock(&mm->page_table_lock);
        return ret;
}
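
/*
 * hugetlb_prefault() populates the whole VMA up front, under
 * mm->page_table_lock: idx is the offset into the hugetlb file in
 * hugepage-sized units, the filesystem quota is charged before a new
 * hugepage is allocated, and the page is inserted into the page cache
 * so later mappings of the same offset find and share it.
 */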
/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
                                     unsigned long len, unsigned long pgoff,
                                     unsigned long flags)
{
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long start_addr;

        if (len > TASK_SIZE)
                return -ENOMEM;

        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (((TASK_SIZE - len) >= addr)
                    && (!vma || (addr+len) <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr,len))
                        return addr;
        }
        start_addr = addr = mm->free_area_cache;

full_search:
        vma = find_vma(mm, addr);
        while (TASK_SIZE - len >= addr) {
                BUG_ON(vma && (addr >= vma->vm_end));

                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = ALIGN(addr+1, 1<<SID_SHIFT);
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (touches_hugepage_high_range(addr, len)) {
                        addr = TASK_HPAGE_END;
                        vma = find_vma(mm, addr);
                        continue;
                }
                if (!vma || addr + len <= vma->vm_start) {
                        /*
                         * Remember the place where we stopped the search:
                         */
                        mm->free_area_cache = addr + len;
                        return addr;
                }
                addr = vma->vm_end;
                vma = vma->vm_next;
        }

        /* Make sure we didn't miss any holes */
        if (start_addr != TASK_UNMAPPED_BASE) {
                start_addr = addr = TASK_UNMAPPED_BASE;
                goto full_search;
        }
        return -ENOMEM;
}
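
/*
 * The allocator above, and its top-down twin below, treat the hugepage
 * reserved ranges as holes: when a candidate range would touch a low
 * hugepage segment the search skips to the next segment boundary, and
 * when it would touch the high hugepage range it skips past
 * TASK_HPAGE_END, so ordinary mmap()s never land inside either region.
 */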
/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
                               const unsigned long len, const unsigned long pgoff,
                               const unsigned long flags)
{
        struct vm_area_struct *vma, *prev_vma;
        struct mm_struct *mm = current->mm;
        unsigned long base = mm->mmap_base, addr = addr0;
        int first_time = 1;

        /* requested length too big for entire address space */
        if (len > TASK_SIZE)
                goto fail;

        /* dont allow allocations above current base */
        if (mm->free_area_cache > base)
                mm->free_area_cache = base;

        /* requesting a specific address */
        if (addr) {
                addr = PAGE_ALIGN(addr);
                vma = find_vma(mm, addr);
                if (TASK_SIZE - len >= addr &&
                    (!vma || addr + len <= vma->vm_start)
                    && !is_hugepage_only_range(mm, addr,len))
                        return addr;
        }

try_again:
        /* make sure it can fit in the remaining address space */
        if (mm->free_area_cache < len)
                goto fail;

        /* either no address requested or cant fit in requested address hole */
        addr = (mm->free_area_cache - len) & PAGE_MASK;
        do {
hugepage_recheck:
                if (touches_hugepage_low_range(mm, addr, len)) {
                        addr = (addr & ((~0) << SID_SHIFT)) - len;
                        goto hugepage_recheck;
                } else if (touches_hugepage_high_range(addr, len)) {
                        addr = TASK_HPAGE_BASE - len;
                }

                /*
                 * Lookup failure means no vma is above this address,
                 * i.e. return with success:
                 */
                if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
                        return addr;

                /*
                 * new region fits between prev_vma->vm_end and
                 * vma->vm_start, use it:
                 */
                if (addr+len <= vma->vm_start &&
                    (!prev_vma || (addr >= prev_vma->vm_end)))
                        /* remember the address as a hint for next time */
                        return (mm->free_area_cache = addr);

                /* pull free_area_cache down to the first hole */
                if (mm->free_area_cache == vma->vm_end)
                        mm->free_area_cache = vma->vm_start;

                /* try just below the current vma->vm_start */
                addr = vma->vm_start-len;
        } while (len <= vma->vm_start);

fail:
        /*
         * if hint left us with no space for the requested
         * mapping then try again:
         */
        if (first_time) {
                mm->free_area_cache = base;
                first_time = 0;
                goto try_again;
        }
        /*
         * A failed mmap() very likely causes application failure,
         * so fall back to the bottom-up function here. This scenario
         * can happen with large stack limits and large mmap()
         * allocations.
         */
        mm->free_area_cache = TASK_UNMAPPED_BASE;
        addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);

        /*
         * Restore the topdown base:
         */
        mm->free_area_cache = base;

        return addr;
}
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
        unsigned long addr = 0;
        struct vm_area_struct *vma;

        vma = find_vma(current->mm, addr);
        while (addr + len <= 0x100000000UL) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

                if (! __within_hugepage_low_range(addr, len, segmask)) {
                        addr = ALIGN(addr+1, 1<<SID_SHIFT);
                        vma = find_vma(current->mm, addr);
                        continue;
                }

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Depending on segmask this might not be a confirmed
                 * hugepage region, so the ALIGN could have skipped
                 * some VMAs */
                vma = find_vma(current->mm, addr);
        }

        return -ENOMEM;
}
static unsigned long htlb_get_high_area(unsigned long len)
{
        unsigned long addr = TASK_HPAGE_BASE;
        struct vm_area_struct *vma;

        for (vma = find_vma(current->mm, addr);
             addr + len <= TASK_HPAGE_END;
             vma = vma->vm_next) {
                BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
                BUG_ON(! within_hugepage_high_range(addr, len));

                if (!vma || (addr + len) <= vma->vm_start)
                        return addr;
                addr = ALIGN(vma->vm_end, HPAGE_SIZE);
                /* Because we're in a hugepage region, this alignment
                 * should not skip us over any VMAs */
        }

        return -ENOMEM;
}
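
/*
 * The two search helpers above differ only in their bounds: the low
 * search walks candidate 256M segments below 4G (and may propose
 * segments that still need to be opened, hence the segmask check),
 * while the high search stays inside the dedicated
 * TASK_HPAGE_BASE..TASK_HPAGE_END window, which is reserved for
 * hugepages and therefore never needs opening.
 */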
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
                                        unsigned long len, unsigned long pgoff,
                                        unsigned long flags)
{
        if (len & ~HPAGE_MASK)
                return -EINVAL;

        if (!cpu_has_feature(CPU_FTR_16M_PAGE))
                return -EINVAL;

        if (test_thread_flag(TIF_32BIT)) {
                int lastshift = 0;
                u16 segmask, cursegs = current->mm->context.htlb_segs;

                /* First see if we can do the mapping in the existing
                 * low hpage segments */
                addr = htlb_get_low_area(len, cursegs);
                if (addr != -ENOMEM)
                        return addr;

                for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
                     ! lastshift; segmask >>= 1) {
                        if (segmask & 1)
                                lastshift = 1;

                        addr = htlb_get_low_area(len, cursegs | segmask);
                        if ((addr != -ENOMEM)
                            && open_low_hpage_segs(current->mm, segmask) == 0)
                                return addr;
                }
                printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
                       " enough segments\n");
                return -ENOMEM;
        } else {
                return htlb_get_high_area(len);
        }
}
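
/*
 * 32-bit tasks can only use the low hugepage segments, so the search
 * first tries the segments already opened for this mm and then
 * progressively widens the candidate segment mask (starting from the
 * highest segments that could hold len and shifting down) until a
 * placement both fits and can be opened.  64-bit tasks simply allocate
 * from the dedicated high hugepage range.
 */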
void hugetlb_mm_free_pgd(struct mm_struct *mm)
{
        int i;
        pgd_t *pgdir;

        spin_lock(&mm->page_table_lock);

        pgdir = mm->context.huge_pgdir;
        if (! pgdir)
                goto out;

        mm->context.huge_pgdir = NULL;

        /* cleanup any hugepte pages leftover */
        for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
                pud_t *pud = (pud_t *)(pgdir + i);

                if (! pud_none(*pud)) {
                        pte_t *pte = (pte_t *)pud_page(*pud);
                        struct page *ptepage = virt_to_page(pte);

                        ptepage->mapping = NULL;

                        BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
                        kmem_cache_free(zero_cache, pte);
                }
                pud_clear(pud);
        }

        BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
        kmem_cache_free(zero_cache, pgdir);

 out:
        spin_unlock(&mm->page_table_lock);
}
int hash_huge_page(struct mm_struct *mm, unsigned long access,
                   unsigned long ea, unsigned long vsid, int local)
{
        pte_t *ptep;
        unsigned long va, vpn;
        pte_t old_pte, new_pte;
        unsigned long hpteflags, prpn;
        long slot;
        int err = 1;

        spin_lock(&mm->page_table_lock);

        ptep = huge_pte_offset(mm, ea);

        /* Search the Linux page table for a match with va */
        va = (vsid << 28) | (ea & 0x0fffffff);
        vpn = va >> HPAGE_SHIFT;

        /*
         * If no pte found or not present, send the problem up to
         * do_page_fault
         */
        if (unlikely(!ptep || pte_none(*ptep)))
                goto out;
        /* BUG_ON(pte_bad(*ptep)); */

        /*
         * Check the user's access rights to the page.  If access should be
         * prevented then send the problem up to do_page_fault.
         */
        if (unlikely(access & ~pte_val(*ptep)))
                goto out;
        /*
         * At this point, we have a pte (old_pte) which can be used to build
         * or update an HPTE. There are 2 cases:
         *
         * 1. There is a valid (present) pte with no associated HPTE (this is
         *    the most common case)
         * 2. There is a valid (present) pte with an associated HPTE. The
         *    current values of the pp bits in the HPTE prevent access
         *    because we are doing software DIRTY bit management and the
         *    page is currently not DIRTY.
         */

        old_pte = *ptep;
        new_pte = old_pte;

        hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
        /* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
        hpteflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);
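
        /*
         * Protection-bit note: on this hash PTE format a pp value of 2
         * grants user read/write and 3 grants read-only, so the
         * expression above yields 0x2 when the Linux pte is writable
         * and 0x3 when it is not; HW_NO_EXEC is then ORed in for
         * non-executable ptes because the hardware bit has the
         * opposite sense to _PAGE_EXEC.
         */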
        /* Check if pte already has an hpte (case 2) */
        if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
                /* There MIGHT be an HPTE for this pte */
                unsigned long hash, slot;

                hash = hpt_hash(vpn, 1);
                if (pte_val(old_pte) & _PAGE_SECONDARY)
                        hash = ~hash;
                slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
                slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;

                if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
                        pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
        }
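
        /*
         * If hpte_updatepp() reports that no matching HPTE was found
         * (it may have been evicted), the stale hash flags are cleared
         * from old_pte so that the block below falls through and
         * inserts a fresh HPTE instead.
         */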
        if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
                unsigned long hash = hpt_hash(vpn, 1);
                unsigned long hpte_group;

                prpn = pte_pfn(old_pte);

repeat:
                hpte_group = ((hash & htab_hash_mask) *
                              HPTES_PER_GROUP) & ~0x7UL;

                /* Update the linux pte with the HPTE slot */
                pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
                pte_val(new_pte) |= _PAGE_HASHPTE;

                /* Add in WIMG bits */
                /* XXX We should store these in the pte */
                hpteflags |= _PAGE_COHERENT;

                slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
                                          hpteflags, 0, 1);

                /* Primary is full, try the secondary */
                if (unlikely(slot == -1)) {
                        pte_val(new_pte) |= _PAGE_SECONDARY;
                        hpte_group = ((~hash & htab_hash_mask) *
                                      HPTES_PER_GROUP) & ~0x7UL;
                        slot = ppc_md.hpte_insert(hpte_group, va, prpn, 1,
                                                  hpteflags, 0, 1);
                        if (slot == -1) {
                                if (mftb() & 0x1)
                                        hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

                                ppc_md.hpte_remove(hpte_group);
                                goto repeat;
                        }
                }

                if (unlikely(slot == -2))
                        panic("hash_huge_page: pte_insert failed\n");

                pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;

                /*
                 * No need to use ldarx/stdcx here because all who
                 * might be updating the pte will hold the
                 * page_table_lock
                 */
                *ptep = new_pte;
        }

        err = 0;

 out:
        spin_unlock(&mm->page_table_lock);

        return err;
}