/*
 * PPC64 (POWER4) Huge TLB Page Support for Kernel.
 *
 * Copyright (C) 2003 David Gibson, IBM Corporation.
 *
 * Based on the IA-32 version:
 * Copyright (C) 2002, Rohit Seth <rohit.seth@intel.com>
 */
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/sysctl.h>

#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/machdep.h>
#include <asm/cputable.h>
#define HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))

#define HUGEPTE_INDEX_SIZE	9
#define HUGEPGD_INDEX_SIZE	10

#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
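
/*
 * Rough arithmetic behind these constants, assuming the 16MB huge pages
 * (HPAGE_SHIFT == 24) and 4KB base pages (PAGE_SHIFT == 12) this code was
 * written for: one base page holds PAGE_SIZE / sizeof(pte_t) == 2^(12-3)
 * == 512 huge ptes, so a single hugepte page maps 512 * 16MB == 8GB, which
 * is exactly HUGEPGDIR_SIZE == 1UL << (24 + 12 - 3).  The huge_pgdir in
 * turn holds PTRS_PER_HUGEPGD == 1024 such entries.
 */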
static inline int hugepgd_index(unsigned long addr)
{
	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
}
static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
{
	int index;

	if (! mm->context.huge_pgdir)
		return NULL;

	index = hugepgd_index(addr);
	BUG_ON(index >= PTRS_PER_HUGEPGD);
	return (pud_t *)(mm->context.huge_pgdir + index);
}
static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
{
	int index;

	if (pud_none(*dir))
		return NULL;

	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
	return (pte_t *)pud_page(*dir) + index;
}
static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
{
	BUG_ON(! in_hugepage_area(mm->context, addr));

	if (! mm->context.huge_pgdir) {
		pgd_t *new;

		spin_unlock(&mm->page_table_lock);
		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);

		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (mm->context.huge_pgdir)
			kmem_cache_free(zero_cache, new);
		else
			mm->context.huge_pgdir = new;
	}
	return hugepgd_offset(mm, addr);
}
static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
{
	if (! pud_present(*dir)) {
		pte_t *new;

		spin_unlock(&mm->page_table_lock);
		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
		spin_lock(&mm->page_table_lock);

		/*
		 * Because we dropped the lock, we should re-check the
		 * entry, as somebody else could have populated it..
		 */
		if (pud_present(*dir)) {
			kmem_cache_free(zero_cache, new);
		} else {
			struct page *ptepage;

			ptepage = virt_to_page(new);
			ptepage->mapping = (void *) mm;
			ptepage->index = addr & HUGEPGDIR_MASK;
			pud_populate(mm, dir, new);
		}
	}

	return hugepte_offset(dir, addr);
}
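
/*
 * Note on the two allocators above: kmem_cache_alloc() with GFP_KERNEL may
 * sleep, so the page_table_lock (a spinlock) has to be dropped around the
 * allocation.  Once the lock has been reacquired, the entry is re-checked
 * and the freshly allocated zeroed page is returned to the cache if somebody
 * else populated the entry in the meantime, so both paths tolerate losing
 * the race.
 */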
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	pud = hugepgd_offset(mm, addr);
	if (! pud)
		return NULL;

	return hugepte_offset(pud, addr);
}
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
{
	pud_t *pud;

	BUG_ON(! in_hugepage_area(mm->context, addr));

	pud = hugepgd_alloc(mm, addr);
	if (! pud)
		return NULL;

	return hugepte_alloc(mm, pud, addr);
}
/*
 * This function checks for proper alignment of input addr and len parameters.
 */
int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;
	if (addr & ~HPAGE_MASK)
		return -EINVAL;
	if (! (within_hugepage_low_range(addr, len)
	       || within_hugepage_high_range(addr, len)))
		return -EINVAL;

	return 0;
}
static void flush_segments(void *parm)
{
	u16 segs = (unsigned long) parm;
	unsigned long i;

	asm volatile("isync" : : : "memory");

	for (i = 0; i < 16; i++) {
		if (! (segs & (1U << i)))
			continue;
		asm volatile("slbie %0" : : "r" (i << SID_SHIFT));
	}

	asm volatile("isync" : : : "memory");
}
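
/*
 * flush_segments() runs on each CPU (via on_each_cpu() below).  For every
 * bit set in the segment mask it issues "slbie" to invalidate the SLB entry
 * covering that 256MB segment (given the usual SID_SHIFT of 28), bracketed
 * by "isync" so the invalidations are context-synchronized on both sides.
 */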
static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
{
	unsigned long start = seg << SID_SHIFT;
	unsigned long end = (seg+1) << SID_SHIFT;
	struct vm_area_struct *vma;

	BUG_ON(seg >= 16);

	/* Check no VMAs are in the region */
	vma = find_vma(mm, start);
	if (vma && (vma->vm_start < end))
		return -EBUSY;

	return 0;
}
static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
{
	unsigned long i;

	newsegs &= ~(mm->context.htlb_segs);
	if (! newsegs)
		return 0; /* The segments we want are already open */

	for (i = 0; i < 16; i++)
		if ((1 << i) & newsegs)
			if (prepare_low_seg_for_htlb(mm, i) != 0)
				return -EBUSY;

	mm->context.htlb_segs |= newsegs;

	/* update the paca copy of the context struct */
	get_paca()->context = mm->context;

	/* the context change must make it to memory before the flush,
	 * so that further SLB misses do the right thing. */
	mb();

	on_each_cpu(flush_segments, (void *)(unsigned long)newsegs, 0, 1);

	return 0;
}
int prepare_hugepage_range(unsigned long addr, unsigned long len)
{
	if (within_hugepage_high_range(addr, len))
		return 0;
	else if ((addr < 0x100000000UL) && ((addr+len) < 0x100000000UL)) {
		int err;

		/* Yes, we need both tests, in case addr+len overflows
		 * 64-bit arithmetic */
		err = open_low_hpage_segs(current->mm,
					  LOW_ESID_MASK(addr, len));
		if (err)
			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
			       " failed (segs: 0x%04hx)\n", addr, len,
			       LOW_ESID_MASK(addr, len));
		return err;
	}

	return -EINVAL;
}
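
/*
 * Two distinct hugepage areas are handled here, reflecting the address
 * space layout this file assumes: a "high" region (TASK_HPAGE_BASE to
 * TASK_HPAGE_END) reserved outright for hugepages, and the "low" range
 * below 4GB, where individual 256MB segments are opened on demand for
 * 32-bit processes via open_low_hpage_segs().
 */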
struct page *
follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
{
	pte_t *ptep;
	struct page *page;

	if (! in_hugepage_area(mm->context, address))
		return ERR_PTR(-EINVAL);

	ptep = huge_pte_offset(mm, address);
	page = pte_page(*ptep);
	if (page)
		page += (address % HPAGE_SIZE) / PAGE_SIZE;

	return page;
}
int pmd_huge(pmd_t pmd)
{
	return 0;
}

struct page *
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
		pmd_t *pmd, int write)
{
	BUG();
	return NULL;
}
/* Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions. */
unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
				     unsigned long len, unsigned long pgoff,
				     unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long start_addr;

	if (len > TASK_SIZE)
		return -ENOMEM;

	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (((TASK_SIZE - len) >= addr)
		    && (!vma || (addr+len) <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}
	if (len > mm->cached_hole_size) {
		start_addr = addr = mm->free_area_cache;
	} else {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
	}

full_search:
	vma = find_vma(mm, addr);
	while (TASK_SIZE - len >= addr) {
		BUG_ON(vma && (addr >= vma->vm_end));

		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(mm, addr);
			continue;
		}
		if (touches_hugepage_high_range(addr, len)) {
			addr = TASK_HPAGE_END;
			vma = find_vma(mm, addr);
			continue;
		}
		if (!vma || addr + len <= vma->vm_start) {
			/*
			 * Remember the place where we stopped the search:
			 */
			mm->free_area_cache = addr + len;
			return addr;
		}
		if (addr + mm->cached_hole_size < vma->vm_start)
			mm->cached_hole_size = vma->vm_start - addr;
		addr = vma->vm_end;
		vma = vma->vm_next;
	}

	/* Make sure we didn't miss any holes */
	if (start_addr != TASK_UNMAPPED_BASE) {
		start_addr = addr = TASK_UNMAPPED_BASE;
		mm->cached_hole_size = 0;
		goto full_search;
	}
	return -ENOMEM;
}
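
/*
 * Rough sketch of the bottom-up search above: candidate addresses walk up
 * from free_area_cache (or TASK_UNMAPPED_BASE), and a candidate that would
 * touch a reserved low segment is bumped to the next 256MB boundary, while
 * one that would touch the high hugepage region is bumped past
 * TASK_HPAGE_END.  If a first pass that started from the cached hint finds
 * nothing, the search restarts once from TASK_UNMAPPED_BASE so that holes
 * below the hint are not missed.
 */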
/*
 * This mmap-allocator allocates new areas top-down from below the
 * stack's low limit (the base):
 *
 * Because we have an exclusive hugepage region which lies within the
 * normal user address space, we have to take special measures to make
 * non-huge mmap()s evade the hugepage reserved regions.
 */
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
			       const unsigned long len, const unsigned long pgoff,
			       const unsigned long flags)
{
	struct vm_area_struct *vma, *prev_vma;
	struct mm_struct *mm = current->mm;
	unsigned long base = mm->mmap_base, addr = addr0;
	unsigned long largest_hole = mm->cached_hole_size;
	int first_time = 1;

	/* requested length too big for entire address space */
	if (len > TASK_SIZE)
		return -ENOMEM;

	/* dont allow allocations above current base */
	if (mm->free_area_cache > base)
		mm->free_area_cache = base;

	/* requesting a specific address */
	if (addr) {
		addr = PAGE_ALIGN(addr);
		vma = find_vma(mm, addr);
		if (TASK_SIZE - len >= addr &&
		    (!vma || addr + len <= vma->vm_start)
		    && !is_hugepage_only_range(mm, addr, len))
			return addr;
	}

	if (len <= largest_hole) {
		largest_hole = 0;
		mm->free_area_cache = base;
	}
try_again:
	/* make sure it can fit in the remaining address space */
	if (mm->free_area_cache < len)
		goto fail;

	/* either no address requested or cant fit in requested address hole */
	addr = (mm->free_area_cache - len) & PAGE_MASK;
	do {
hugepage_recheck:
		if (touches_hugepage_low_range(mm, addr, len)) {
			addr = (addr & ((~0) << SID_SHIFT)) - len;
			goto hugepage_recheck;
		} else if (touches_hugepage_high_range(addr, len)) {
			addr = TASK_HPAGE_BASE - len;
		}

		/*
		 * Lookup failure means no vma is above this address,
		 * i.e. return with success:
		 */
		if (!(vma = find_vma_prev(mm, addr, &prev_vma)))
			return addr;

		/*
		 * new region fits between prev_vma->vm_end and
		 * vma->vm_start, use it:
		 */
		if (addr+len <= vma->vm_start &&
		    (!prev_vma || (addr >= prev_vma->vm_end))) {
			/* remember the address as a hint for next time */
			mm->cached_hole_size = largest_hole;
			return (mm->free_area_cache = addr);
		} else {
			/* pull free_area_cache down to the first hole */
			if (mm->free_area_cache == vma->vm_end) {
				mm->free_area_cache = vma->vm_start;
				mm->cached_hole_size = largest_hole;
			}
		}

		/* remember the largest hole we saw so far */
		if (addr + largest_hole < vma->vm_start)
			largest_hole = vma->vm_start - addr;

		/* try just below the current vma->vm_start */
		addr = vma->vm_start-len;
	} while (len <= vma->vm_start);

fail:
	/*
	 * if hint left us with no space for the requested
	 * mapping then try again:
	 */
	if (first_time) {
		mm->free_area_cache = base;
		largest_hole = 0;
		first_time = 0;
		goto try_again;
	}
	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	mm->free_area_cache = TASK_UNMAPPED_BASE;
	mm->cached_hole_size = ~0UL;
	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
	/*
	 * Restore the topdown base:
	 */
	mm->free_area_cache = base;
	mm->cached_hole_size = ~0UL;

	return addr;
}
static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
{
	unsigned long addr = 0;
	struct vm_area_struct *vma;

	vma = find_vma(current->mm, addr);
	while (addr + len <= 0x100000000UL) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */

		if (! __within_hugepage_low_range(addr, len, segmask)) {
			addr = ALIGN(addr+1, 1<<SID_SHIFT);
			vma = find_vma(current->mm, addr);
			continue;
		}

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Depending on segmask this might not be a confirmed
		 * hugepage region, so the ALIGN could have skipped
		 * some VMAs */
		vma = find_vma(current->mm, addr);
	}

	return -ENOMEM;
}
static unsigned long htlb_get_high_area(unsigned long len)
{
	unsigned long addr = TASK_HPAGE_BASE;
	struct vm_area_struct *vma;

	for (vma = find_vma(current->mm, addr);
	     addr + len <= TASK_HPAGE_END;
	     vma = vma->vm_next) {
		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
		BUG_ON(! within_hugepage_high_range(addr, len));

		if (!vma || (addr + len) <= vma->vm_start)
			return addr;
		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
		/* Because we're in a hugepage region, this alignment
		 * should not skip us over any VMAs */
	}

	return -ENOMEM;
}
unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
					unsigned long len, unsigned long pgoff,
					unsigned long flags)
{
	if (len & ~HPAGE_MASK)
		return -EINVAL;

	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
		return -EINVAL;

	if (test_thread_flag(TIF_32BIT)) {
		int lastshift = 0;
		u16 segmask, cursegs = current->mm->context.htlb_segs;

		/* First see if we can do the mapping in the existing
		 * low hpage segments */
		addr = htlb_get_low_area(len, cursegs);
		if (addr != -ENOMEM)
			return addr;

		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
		     ! lastshift; segmask >>= 1) {
			if (segmask & 1)
				lastshift = 1;

			addr = htlb_get_low_area(len, cursegs | segmask);
			if ((addr != -ENOMEM)
			    && open_low_hpage_segs(current->mm, segmask) == 0)
				return addr;
		}
		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
		       " enough segments\n");
		return -ENOMEM;
	} else {
		return htlb_get_high_area(len);
	}
}
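
/*
 * The segment-widening loop in hugetlb_get_unmapped_area() above is worth
 * spelling out: LOW_ESID_MASK(0x100000000UL - len, len) is the set of 256MB
 * segments such a mapping would occupy if placed at the very top of the
 * 32-bit range.  Each iteration shifts that mask one segment lower, sliding
 * the candidate window down the address space until the lowest segment has
 * been considered (lastshift).  The candidate segments are only actually
 * opened with open_low_hpage_segs() once htlb_get_low_area() has found room
 * within the already-open segments plus that window.
 */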
void hugetlb_mm_free_pgd(struct mm_struct *mm)
{
	int i;
	pgd_t *pgdir;

	spin_lock(&mm->page_table_lock);

	pgdir = mm->context.huge_pgdir;
	if (! pgdir)
		goto out;

	mm->context.huge_pgdir = NULL;

	/* cleanup any hugepte pages leftover */
	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
		pud_t *pud = (pud_t *)(pgdir + i);

		if (! pud_none(*pud)) {
			pte_t *pte = (pte_t *)pud_page(*pud);
			struct page *ptepage = virt_to_page(pte);

			ptepage->mapping = NULL;

			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
			kmem_cache_free(zero_cache, pte);
		}
		pud_clear(pud);
	}

	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
	kmem_cache_free(zero_cache, pgdir);

 out:
	spin_unlock(&mm->page_table_lock);
}
int hash_huge_page(struct mm_struct *mm, unsigned long access,
		   unsigned long ea, unsigned long vsid, int local)
{
	pte_t *ptep;
	unsigned long va, vpn;
	pte_t old_pte, new_pte;
	unsigned long rflags, prpn;
	long slot;
	int err = 1;

	spin_lock(&mm->page_table_lock);

	ptep = huge_pte_offset(mm, ea);

	/* Search the Linux page table for a match with va */
	va = (vsid << 28) | (ea & 0x0fffffff);
	vpn = va >> HPAGE_SHIFT;

	/*
	 * If no pte found or not present, send the problem up to
	 * do_page_fault
	 */
	if (unlikely(!ptep || pte_none(*ptep)))
		goto out;

/*	BUG_ON(pte_bad(*ptep)); */

	/*
	 * Check the user's access rights to the page.  If access should be
	 * prevented then send the problem up to do_page_fault.
	 */
	if (unlikely(access & ~pte_val(*ptep)))
		goto out;
	/*
	 * At this point, we have a pte (old_pte) which can be used to build
	 * or update an HPTE. There are 2 cases:
	 *
	 * 1. There is a valid (present) pte with no associated HPTE (this is
	 *	the most common case)
	 * 2. There is a valid (present) pte with an associated HPTE. The
	 *	current values of the pp bits in the HPTE prevent access
	 *	because we are doing software DIRTY bit management and the
	 *	page is currently not DIRTY.
	 */

	old_pte = *ptep;
	new_pte = old_pte;

	rflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
	/* _PAGE_EXEC -> HW_NO_EXEC since it's inverted */
	rflags |= ((pte_val(new_pte) & _PAGE_EXEC) ? 0 : HW_NO_EXEC);

	/* Check if pte already has an hpte (case 2) */
	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
		/* There MIGHT be an HPTE for this pte */
		unsigned long hash, slot;

		hash = hpt_hash(vpn, 1);
		if (pte_val(old_pte) & _PAGE_SECONDARY)
			hash = ~hash;
		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;

		if (ppc_md.hpte_updatepp(slot, rflags, va, 1, local) == -1)
			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
	}

	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
		unsigned long hash = hpt_hash(vpn, 1);
		unsigned long hpte_group;

		prpn = pte_pfn(old_pte);

repeat:
		hpte_group = ((hash & htab_hash_mask) *
			      HPTES_PER_GROUP) & ~0x7UL;

		/* Update the linux pte with the HPTE slot */
		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
		pte_val(new_pte) |= _PAGE_HASHPTE;

		/* Add in WIMG bits */
		/* XXX We should store these in the pte */
		rflags |= _PAGE_COHERENT;

		slot = ppc_md.hpte_insert(hpte_group, va, prpn,
					  HPTE_V_LARGE, rflags);

		/* Primary is full, try the secondary */
		if (unlikely(slot == -1)) {
			pte_val(new_pte) |= _PAGE_SECONDARY;
			hpte_group = ((~hash & htab_hash_mask) *
				      HPTES_PER_GROUP) & ~0x7UL;
			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
						  HPTE_V_LARGE, rflags);
			if (slot == -1) {
				if (mftb() & 0x1)
					hpte_group = ((hash & htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;

				ppc_md.hpte_remove(hpte_group);
				goto repeat;
			}
		}

		if (unlikely(slot == -2))
			panic("hash_huge_page: pte_insert failed\n");

		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;

		/*
		 * No need to use ldarx/stdcx here because all who
		 * might be updating the pte will hold the
		 * page_table_lock
		 */
		*ptep = new_pte;
	}

	err = 0;

 out:
	spin_unlock(&mm->page_table_lock);

	return err;
}
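
/*
 * Insertion strategy in hash_huge_page(), in brief: compute the primary
 * hash group for the huge page's VPN and try ppc_md.hpte_insert() there;
 * if that group of HPTES_PER_GROUP entries is full, retry in the secondary
 * group (~hash) with _PAGE_SECONDARY noted in the Linux pte; if both are
 * full, evict an entry from one of the two groups (picked pseudo-randomly
 * from the timebase) with ppc_md.hpte_remove() and start over.  The slot
 * number finally used is recorded in the pte's _PAGE_GROUP_IX bits so a
 * later fault can find and update the HPTE instead of inserting a new one.
 */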