/*
 *    Copyright IBM Corp. 2007, 2011
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#else
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#endif
unsigned long *crst_table_alloc(struct mm_struct *mm)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	return (unsigned long *) page_to_phys(page);
}
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long) table, ALLOC_ORDER);
}
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#ifdef CONFIG_PGSTE

/**
 * gmap_alloc - allocate a guest address space
 * @mm: pointer to the parent mm_struct
 *
 * Returns a guest address space structure.
 */
struct gmap *gmap_alloc(struct mm_struct *mm)
{
	struct gmap *gmap;
	struct page *page;
	unsigned long *table;

	gmap = kzalloc(sizeof(struct gmap), GFP_KERNEL);
	if (!gmap)
		goto out;
	INIT_LIST_HEAD(&gmap->crst_list);
	gmap->mm = mm;
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	if (!page)
		goto out_free;
	list_add(&page->lru, &gmap->crst_list);
	table = (unsigned long *) page_to_phys(page);
	crst_table_init(table, _REGION1_ENTRY_EMPTY);
	gmap->table = table;
	gmap->asce = _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH |
		     _ASCE_USER_BITS | __pa(table);
	list_add(&gmap->list, &mm->context.gmap_list);
	return gmap;

out_free:
	kfree(gmap);
out:
	return NULL;
}
EXPORT_SYMBOL_GPL(gmap_alloc);
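
/*
 * Illustrative sketch (not part of the original source): how a KVM-style
 * caller would typically pair gmap_alloc() with gmap_free().  The helper
 * name below is hypothetical.
 *
 *	static struct gmap *example_create_guest_space(struct mm_struct *mm)
 *	{
 *		struct gmap *gmap;
 *
 *		gmap = gmap_alloc(mm);
 *		if (!gmap)
 *			return NULL;	(gmap or root table allocation failed)
 *		return gmap;		(released later with gmap_free(gmap))
 *	}
 */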
static int gmap_unlink_segment(struct gmap *gmap, unsigned long *table)
{
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	if (*table & _SEGMENT_ENTRY_INV)
		return 0;
	page = pfn_to_page(*table >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry(rmap, &mp->mapper, list) {
		if (rmap->entry != table)
			continue;
		list_del(&rmap->list);
		kfree(rmap);
		break;
	}
	*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
	return 1;
}
static void gmap_flush_tlb(struct gmap *gmap)
{
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();
}
/**
 * gmap_free - free a guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_free(struct gmap *gmap)
{
	struct page *page, *next;
	unsigned long *table;
	int i;

	/* Flush tlb. */
	if (MACHINE_HAS_IDTE)
		__tlb_flush_idte((unsigned long) gmap->table |
				 _ASCE_TYPE_REGION1);
	else
		__tlb_flush_global();

	/* Free all segment & region tables. */
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	list_for_each_entry_safe(page, next, &gmap->crst_list, lru) {
		table = (unsigned long *) page_to_phys(page);
		if ((*table & _REGION_ENTRY_TYPE_MASK) == 0)
			/* Remove gmap rmap structures for segment table. */
			for (i = 0; i < PTRS_PER_PMD; i++, table++)
				gmap_unlink_segment(gmap, table);
		__free_pages(page, ALLOC_ORDER);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	list_del(&gmap->list);
	kfree(gmap);
}
EXPORT_SYMBOL_GPL(gmap_free);
/**
 * gmap_enable - switch primary space to the guest address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_enable(struct gmap *gmap)
{
	S390_lowcore.gmap = (unsigned long) gmap;
}
EXPORT_SYMBOL_GPL(gmap_enable);
/**
 * gmap_disable - switch back to the standard primary address space
 * @gmap: pointer to the guest address space structure
 */
void gmap_disable(struct gmap *gmap)
{
	S390_lowcore.gmap = 0UL;
}
EXPORT_SYMBOL_GPL(gmap_disable);
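
/*
 * Illustrative sketch (assumption, not from this file): gmap_enable() and
 * gmap_disable() bracket guest execution, roughly as a hypothetical caller
 * would do around entering SIE:
 *
 *	gmap_enable(gmap);
 *	(run the guest)
 *	gmap_disable(gmap);
 *
 * Only the lowcore gmap pointer is switched here; no TLB flush is involved.
 */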
/*
 * gmap_alloc_table is assumed to be called with mmap_sem held
 */
static int gmap_alloc_table(struct gmap *gmap,
			    unsigned long *table, unsigned long init)
{
	struct page *page;
	unsigned long *new;

	/* since we don't free the gmap table until gmap_free we can unlock */
	spin_unlock(&gmap->mm->page_table_lock);
	page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
	spin_lock(&gmap->mm->page_table_lock);
	if (!page)
		return -ENOMEM;
	new = (unsigned long *) page_to_phys(page);
	crst_table_init(new, init);
	if (*table & _REGION_ENTRY_INV) {
		list_add(&page->lru, &gmap->crst_list);
		*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
			(*table & _REGION_ENTRY_TYPE_MASK);
	} else
		__free_pages(page, ALLOC_ORDER);
	return 0;
}
/**
 * gmap_unmap_segment - unmap segment from the guest address space
 * @gmap: pointer to the guest address space structure
 * @to: address in the guest address space
 * @len: length of the memory area to unmap
 *
 * Returns 0 if the unmap succeeded, -EINVAL if not.
 */
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the guest addr space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if (*table & _REGION_ENTRY_INV)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Clear segment table entry in guest address space. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV;
	}
out:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;
}
EXPORT_SYMBOL_GPL(gmap_unmap_segment);
/**
 * gmap_map_segment - map a segment to the guest address space
 * @gmap: pointer to the guest address space structure
 * @from: source address in the parent address space
 * @to: target address in the guest address space
 * @len: length of the memory area to map
 *
 * Returns 0 if the mmap succeeded, -EINVAL or -ENOMEM if not.
 */
int gmap_map_segment(struct gmap *gmap, unsigned long from,
		     unsigned long to, unsigned long len)
{
	unsigned long *table;
	unsigned long off;
	int flush;

	if ((from | to | len) & (PMD_SIZE - 1))
		return -EINVAL;
	if (len == 0 || from + len > PGDIR_SIZE ||
	    from + len < from || to + len < to)
		return -EINVAL;

	flush = 0;
	down_read(&gmap->mm->mmap_sem);
	spin_lock(&gmap->mm->page_table_lock);
	for (off = 0; off < len; off += PMD_SIZE) {
		/* Walk the gmap address space page table */
		table = gmap->table + (((to + off) >> 53) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION2_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 42) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _REGION3_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 31) & 0x7ff);
		if ((*table & _REGION_ENTRY_INV) &&
		    gmap_alloc_table(gmap, table, _SEGMENT_ENTRY_EMPTY))
			goto out_unmap;
		table = (unsigned long *) (*table & _REGION_ENTRY_ORIGIN);
		table = table + (((to + off) >> 20) & 0x7ff);

		/* Store 'from' address in an invalid segment table entry. */
		flush |= gmap_unlink_segment(gmap, table);
		*table = _SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | (from + off);
	}
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	if (flush)
		gmap_flush_tlb(gmap);
	return 0;

out_unmap:
	spin_unlock(&gmap->mm->page_table_lock);
	up_read(&gmap->mm->mmap_sem);
	gmap_unmap_segment(gmap, to, len);
	return -ENOMEM;
}
EXPORT_SYMBOL_GPL(gmap_map_segment);
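
/*
 * Illustrative sketch (hypothetical caller, not from the original source):
 * mapping a chunk of the parent address space into the guest and tearing
 * it down again.  All three values must be PMD_SIZE (1 MB segment) aligned,
 * otherwise -EINVAL is returned.  The variable names are assumptions.
 *
 *	unsigned long from = 0x10000000UL;	(parent / host address)
 *	unsigned long to   = 0x00000000UL;	(guest address)
 *	unsigned long len  = 16 * PMD_SIZE;
 *	int rc;
 *
 *	rc = gmap_map_segment(gmap, from, to, len);
 *	if (rc)
 *		return rc;
 *	(...)
 *	gmap_unmap_segment(gmap, to, len);
 */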
static unsigned long *gmap_table_walk(unsigned long address, struct gmap *gmap)
{
	unsigned long *table;

	table = gmap->table + ((address >> 53) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 42) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 31) & 0x7ff);
	if (unlikely(*table & _REGION_ENTRY_INV))
		return ERR_PTR(-EFAULT);
	table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
	table = table + ((address >> 20) & 0x7ff);
	return table;
}
/**
 * __gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 * The mmap_sem of the mm that belongs to the address space must be held
 * when this function gets called.
 */
unsigned long __gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, vmaddr, segment;
	struct gmap_pgtable *mp;
	struct page *page;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return PTR_ERR(segment_ptr);
	/* Convert the gmap address to an mm address. */
	segment = *segment_ptr;
	if (!(segment & _SEGMENT_ENTRY_INV)) {
		page = pfn_to_page(segment >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		return mp->vmaddr | (address & ~PMD_MASK);
	} else if (segment & _SEGMENT_ENTRY_RO) {
		vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
		return vmaddr | (address & ~PMD_MASK);
	}
	return -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
/**
 * gmap_translate - translate a guest address to a user space address
 * @address: guest address
 * @gmap: pointer to guest mapping meta data structure
 *
 * Returns user space address which corresponds to the guest address or
 * -EFAULT if no such mapping exists.
 * This function does not establish potentially missing page table entries.
 */
unsigned long gmap_translate(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_translate(address, gmap);
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_translate);
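
/*
 * Illustrative sketch (assumption, not from this file): translating a guest
 * address into the corresponding user space address of the parent mm.
 *
 *	unsigned long uaddr;
 *
 *	uaddr = gmap_translate(gaddr, gmap);
 *	if (IS_ERR_VALUE(uaddr))
 *		return uaddr;	(-EFAULT, no mapping exists)
 *
 * __gmap_translate() is the variant for callers that already hold the
 * mmap_sem of gmap->mm.
 */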
static int gmap_connect_pgtable(unsigned long address, unsigned long segment,
				unsigned long *segment_ptr, struct gmap *gmap)
{
	unsigned long vmaddr;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct mm_struct *mm;
	struct page *page;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	mm = gmap->mm;
	vmaddr = segment & _SEGMENT_ENTRY_ORIGIN;
	vma = find_vma(mm, vmaddr);
	if (!vma || vma->vm_start > vmaddr)
		return -EFAULT;
	/* Walk the parent mm page table */
	pgd = pgd_offset(mm, vmaddr);
	pud = pud_alloc(mm, pgd, vmaddr);
	if (!pud)
		return -ENOMEM;
	pmd = pmd_alloc(mm, pud, vmaddr);
	if (!pmd)
		return -ENOMEM;
	if (!pmd_present(*pmd) &&
	    __pte_alloc(mm, vma, pmd, vmaddr))
		return -ENOMEM;
	/* pmd now points to a valid segment table entry. */
	rmap = kmalloc(sizeof(*rmap), GFP_KERNEL|__GFP_REPEAT);
	if (!rmap)
		return -ENOMEM;
	/* Link gmap segment table entry location to page table. */
	page = pmd_page(*pmd);
	mp = (struct gmap_pgtable *) page->index;
	rmap->gmap = gmap;
	rmap->entry = segment_ptr;
	rmap->vmaddr = address & PMD_MASK;
	spin_lock(&mm->page_table_lock);
	if (*segment_ptr == segment) {
		list_add(&rmap->list, &mp->mapper);
		/* Set gmap segment table entry to page table. */
		*segment_ptr = pmd_val(*pmd) & PAGE_MASK;
		rmap = NULL;
	}
	spin_unlock(&mm->page_table_lock);
	kfree(rmap);
	return 0;
}
static void gmap_disconnect_pgtable(struct mm_struct *mm, unsigned long *table)
{
	struct gmap_rmap *rmap, *next;
	struct gmap_pgtable *mp;
	struct page *page;
	int flush;

	flush = 0;
	spin_lock(&mm->page_table_lock);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	list_for_each_entry_safe(rmap, next, &mp->mapper, list) {
		*rmap->entry =
			_SEGMENT_ENTRY_INV | _SEGMENT_ENTRY_RO | mp->vmaddr;
		list_del(&rmap->list);
		kfree(rmap);
		flush = 1;
	}
	spin_unlock(&mm->page_table_lock);
	if (flush)
		__tlb_flush_global();
}
/*
 * this function is assumed to be called with mmap_sem held
 */
unsigned long __gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long *segment_ptr, segment;
	struct gmap_pgtable *mp;
	struct page *page;
	int rc;

	current->thread.gmap_addr = address;
	segment_ptr = gmap_table_walk(address, gmap);
	if (IS_ERR(segment_ptr))
		return -EFAULT;
	/* Convert the gmap address to an mm address. */
	while (1) {
		segment = *segment_ptr;
		if (!(segment & _SEGMENT_ENTRY_INV)) {
			/* Page table is present */
			page = pfn_to_page(segment >> PAGE_SHIFT);
			mp = (struct gmap_pgtable *) page->index;
			return mp->vmaddr | (address & ~PMD_MASK);
		}
		if (!(segment & _SEGMENT_ENTRY_RO))
			/* Nothing mapped in the gmap address space. */
			break;
		rc = gmap_connect_pgtable(address, segment, segment_ptr, gmap);
		if (rc)
			return rc;
	}
	return -EFAULT;
}
unsigned long gmap_fault(unsigned long address, struct gmap *gmap)
{
	unsigned long rc;

	down_read(&gmap->mm->mmap_sem);
	rc = __gmap_fault(address, gmap);
	up_read(&gmap->mm->mmap_sem);

	return rc;
}
EXPORT_SYMBOL_GPL(gmap_fault);
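
/*
 * Illustrative sketch (hypothetical fault path, not from the original
 * source): gmap_fault() connects the guest segment to the parent page
 * table where possible and returns the user space address, which the
 * caller can then fault in on the parent mm:
 *
 *	uaddr = gmap_fault(gaddr, gmap);
 *	if (IS_ERR_VALUE(uaddr))
 *		return -EFAULT;
 *	if (fixup_user_fault(current, gmap->mm, uaddr, FAULT_FLAG_WRITE))
 *		return -EFAULT;
 */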
void gmap_discard(unsigned long from, unsigned long to, struct gmap *gmap)
{
	struct page *page;
	unsigned long *table, address, size;
	struct vm_area_struct *vma;
	struct gmap_pgtable *mp;

	down_read(&gmap->mm->mmap_sem);
	address = from;
	while (address < to) {
		/* Walk the gmap address space page table */
		table = gmap->table + ((address >> 53) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 42) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 31) & 0x7ff);
		if (unlikely(*table & _REGION_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		table = table + ((address >> 20) & 0x7ff);
		if (unlikely(*table & _SEGMENT_ENTRY_INV)) {
			address = (address + PMD_SIZE) & PMD_MASK;
			continue;
		}
		page = pfn_to_page(*table >> PAGE_SHIFT);
		mp = (struct gmap_pgtable *) page->index;
		vma = find_vma(gmap->mm, mp->vmaddr);
		size = min(to - address, PMD_SIZE - (address & ~PMD_MASK));
		zap_page_range(vma, mp->vmaddr | (address & ~PMD_MASK),
			       size, NULL);
		address = (address + PMD_SIZE) & PMD_MASK;
	}
	up_read(&gmap->mm->mmap_sem);
}
EXPORT_SYMBOL_GPL(gmap_discard);
static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
 * gmap_register_ipte_notifier - register a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_register_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_add(&nb->list, &gmap_notifier_list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
/**
 * gmap_unregister_ipte_notifier - remove a pte invalidation callback
 * @nb: pointer to the gmap notifier block
 */
void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
{
	spin_lock(&gmap_notifier_lock);
	list_del_init(&nb->list);
	spin_unlock(&gmap_notifier_lock);
}
EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
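
/*
 * Illustrative sketch (assumption, not from this file): a notifier block as
 * a KVM-like user would register it.  The callback and variable names are
 * hypothetical.
 *
 *	static void example_ipte_notifier(struct gmap *gmap, unsigned long gaddr)
 *	{
 *		(react to the invalidation of the pte backing gaddr)
 *	}
 *
 *	static struct gmap_notifier example_notifier = {
 *		.notifier_call = example_ipte_notifier,
 *	};
 *
 *	gmap_register_ipte_notifier(&example_notifier);
 *	(...)
 *	gmap_unregister_ipte_notifier(&example_notifier);
 */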
/**
 * gmap_ipte_notify - mark a range of ptes for invalidation notification
 * @gmap: pointer to guest mapping meta data structure
 * @start: virtual address in the guest address space
 * @len: size of area
 *
 * Returns 0 if for each page in the given range a gmap mapping exists and
 * the invalidation notification could be set. If the gmap mapping is missing
 * for one or more pages -EFAULT is returned. If no memory could be allocated
 * -ENOMEM is returned. This function establishes missing page table entries.
 */
int gmap_ipte_notify(struct gmap *gmap, unsigned long start, unsigned long len)
{
	unsigned long addr;
	spinlock_t *ptl;
	pte_t *ptep, entry;
	pgste_t pgste;
	int rc = 0;

	if ((start & ~PAGE_MASK) || (len & ~PAGE_MASK))
		return -EINVAL;
	down_read(&gmap->mm->mmap_sem);
	while (len) {
		/* Convert gmap address and connect the page tables */
		addr = __gmap_fault(start, gmap);
		if (IS_ERR_VALUE(addr)) {
			rc = addr;
			break;
		}
		/* Get the page mapped */
		if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
			rc = -EFAULT;
			break;
		}
		/* Walk the process page table, lock and get pte pointer */
		ptep = get_locked_pte(gmap->mm, addr, &ptl);
		if (unlikely(!ptep))
			continue;
		/* Set notification bit in the pgste of the pte */
		entry = *ptep;
		if ((pte_val(entry) & (_PAGE_INVALID | _PAGE_RO)) == 0) {
			pgste = pgste_get_lock(ptep);
			pgste_val(pgste) |= PGSTE_IN_BIT;
			pgste_set_unlock(ptep, pgste);
			start += PAGE_SIZE;
			len -= PAGE_SIZE;
		}
		spin_unlock(ptl);
	}
	up_read(&gmap->mm->mmap_sem);
	return rc;
}
EXPORT_SYMBOL_GPL(gmap_ipte_notify);
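
/*
 * Illustrative sketch (hypothetical caller, not from the original source):
 * requesting notification before a pte backing a guest page is invalidated.
 *
 *	rc = gmap_ipte_notify(gmap, gaddr & PAGE_MASK, PAGE_SIZE);
 *	if (rc)
 *		return rc;	(-EFAULT or -ENOMEM, see above)
 *
 * The notification itself is delivered through gmap_do_ipte_notify() below
 * to every registered gmap_notifier.
 */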
/**
 * gmap_do_ipte_notify - call all invalidation callbacks for a specific pte.
 * @mm: pointer to the process mm_struct
 * @addr: virtual address in the process address space
 * @pte: pointer to the page table entry
 *
 * This function is assumed to be called with the page table lock held
 * for the pte to notify.
 */
void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
{
	unsigned long segment_offset;
	struct gmap_notifier *nb;
	struct gmap_pgtable *mp;
	struct gmap_rmap *rmap;
	struct page *page;

	segment_offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
	segment_offset = segment_offset * (4096 / sizeof(pte_t));
	page = pfn_to_page(__pa(pte) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	spin_lock(&gmap_notifier_lock);
	list_for_each_entry(rmap, &mp->mapper, list) {
		list_for_each_entry(nb, &gmap_notifier_list, list)
			nb->notifier_call(rmap->gmap,
					  rmap->vmaddr + segment_offset);
	}
	spin_unlock(&gmap_notifier_lock);
}
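
/*
 * Worked example for the offset arithmetic above (illustration only):
 * a page table holds 256 ptes of 8 bytes, so "pte & (255 * sizeof(pte_t))"
 * is the byte offset of the pte within its table, e.g. pte index 5 gives
 * byte offset 40.  Multiplying by 4096 / sizeof(pte_t) = 512 turns that
 * into 5 * PAGE_SIZE, the offset of the page within its 1 MB segment,
 * which is added to the guest segment address rmap->vmaddr.
 */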
static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	struct page *page;
	unsigned long *table;
	struct gmap_pgtable *mp;

	page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
	if (!page)
		return NULL;
	mp = kmalloc(sizeof(*mp), GFP_KERNEL|__GFP_REPEAT);
	if (!mp) {
		__free_page(page);
		return NULL;
	}
	pgtable_page_ctor(page);
	mp->vmaddr = vmaddr & PMD_MASK;
	INIT_LIST_HEAD(&mp->mapper);
	page->index = (unsigned long) mp;
	atomic_set(&page->_mapcount, 3);
	table = (unsigned long *) page_to_phys(page);
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
	return table;
}
static inline void page_table_free_pgste(unsigned long *table)
{
	struct page *page;
	struct gmap_pgtable *mp;

	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	mp = (struct gmap_pgtable *) page->index;
	BUG_ON(!list_empty(&mp->mapper));
	pgtable_page_dtor(page);
	atomic_set(&page->_mapcount, -1);
	kfree(mp);
	__free_page(page);
}
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
			  unsigned long key, bool nq)
{
	spinlock_t *ptl;
	pgste_t old, new;
	pte_t *ptep;

	down_read(&mm->mmap_sem);
	ptep = get_locked_pte(current->mm, addr, &ptl);
	if (unlikely(!ptep)) {
		up_read(&mm->mmap_sem);
		return -EFAULT;
	}

	new = old = pgste_get_lock(ptep);
	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
			    PGSTE_ACC_BITS | PGSTE_FP_BIT);
	pgste_val(new) |= (key & (_PAGE_CHANGED | _PAGE_REFERENCED)) << 48;
	pgste_val(new) |= (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)) << 56;
	if (!(pte_val(*ptep) & _PAGE_INVALID)) {
		unsigned long address, bits;
		unsigned char skey;

		address = pte_val(*ptep) & PAGE_MASK;
		skey = page_get_storage_key(address);
		bits = skey & (_PAGE_CHANGED | _PAGE_REFERENCED);
		/* Set storage key ACC and FP */
		page_set_storage_key(address,
				     (key & (_PAGE_ACC_BITS | _PAGE_FP_BIT)),
				     nq);
		/* Merge host changed & referenced into pgste */
		pgste_val(new) |= bits << 52;
		/* Transfer skey changed & referenced bit to kvm user bits */
		pgste_val(new) |= bits << 45;	/* PGSTE_UR_BIT & PGSTE_UC_BIT */
	}
	/* changing the guest storage key is considered a change of the page */
	if ((pgste_val(new) ^ pgste_val(old)) &
	    (PGSTE_ACC_BITS | PGSTE_FP_BIT | PGSTE_GR_BIT | PGSTE_GC_BIT))
		pgste_val(new) |= PGSTE_UC_BIT;

	pgste_set_unlock(ptep, new);
	pte_unmap_unlock(*ptep, ptl);
	up_read(&mm->mmap_sem);
	return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
#else /* CONFIG_PGSTE */

static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
						    unsigned long vmaddr)
{
	return NULL;
}

static inline void page_table_free_pgste(unsigned long *table)
{
}

static inline void gmap_disconnect_pgtable(struct mm_struct *mm,
					   unsigned long *table)
{
}

#endif /* CONFIG_PGSTE */
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int old, new;

	do {
		old = atomic_read(v);
		new = old ^ bits;
	} while (atomic_cmpxchg(v, old, new) != old);
	return new;
}
/*
 * page table entry allocation/free routines.
 */
unsigned long *page_table_alloc(struct mm_struct *mm, unsigned long vmaddr)
{
	unsigned long *uninitialized_var(table);
	struct page *uninitialized_var(page);
	unsigned int mask, bit;

	if (mm_has_pgste(mm))
		return page_table_alloc_pgste(mm, vmaddr);
	/* Allocate fragments of a 4K page as 1K/2K page table */
	spin_lock_bh(&mm->context.list_lock);
	mask = FRAG_MASK;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		table = (unsigned long *) page_to_phys(page);
		mask = atomic_read(&page->_mapcount);
		mask = mask | (mask >> 4);
	}
	if ((mask & FRAG_MASK) == FRAG_MASK) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		atomic_set(&page->_mapcount, 1);
		table = (unsigned long *) page_to_phys(page);
		clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	} else {
		for (bit = 1; mask & bit; bit <<= 1)
			table += PTRS_PER_PTE;
		mask = atomic_xor_bits(&page->_mapcount, bit);
		if ((mask & FRAG_MASK) == FRAG_MASK)
			list_del(&page->lru);
	}
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}
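
/*
 * Worked example for the fragment bookkeeping above (illustration only):
 * on 64 bit a 4K page holds two 2K page tables (four 1K tables on 31 bit),
 * tracked by the low bits of page->_mapcount with FRAG_MASK = 0x03 (0x0f
 * on 31 bit).  A table at page offset 0x800 is fragment
 * 0x800 / (PTRS_PER_PTE * sizeof(pte_t)) = 1, i.e. bit 1 << 1; once
 * (mask & FRAG_MASK) == FRAG_MASK the page is full and is removed from
 * mm->context.pgtable_list.
 */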
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned int bit, mask;

	if (mm_has_pgste(mm)) {
		gmap_disconnect_pgtable(mm, table);
		return page_table_free_pgste(table);
	}
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit);
	if (mask & FRAG_MASK)
		list_add(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	if (mask == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
static void __page_table_free_rcu(void *table, unsigned bit)
{
	struct page *page;

	if (bit == FRAG_MASK)
		return page_table_free_pgste(table);
	/* Free 1K/2K page table fragment of a 4K page */
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
		pgtable_page_dtor(page);
		atomic_set(&page->_mapcount, -1);
		__free_page(page);
	}
}
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	if (mm_has_pgste(mm)) {
		gmap_disconnect_pgtable(mm, table);
		table = (unsigned long *) (__pa(table) | FRAG_MASK);
		tlb_remove_table(tlb, table);
		return;
	}
	bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
		list_del(&page->lru);
	mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
	if (mask & FRAG_MASK)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *) (__pa(table) | (bit << 4));
	tlb_remove_table(tlb, table);
}
void __tlb_remove_table(void *_table)
{
	const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK;
	void *table = (void *)((unsigned long) _table & ~mask);
	unsigned type = (unsigned long) _table & mask;

	if (type)
		__page_table_free_rcu(table, type);
	else
		free_pages((unsigned long) table, ALLOC_ORDER);
}
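
/*
 * Worked example for the pointer tagging decoded above (illustration only):
 * page_table_free_rcu() hands over __pa(table) | (bit << 4) for a 1K/2K
 * fragment, or __pa(table) | FRAG_MASK for a pgste page table, while an
 * untagged address stands for a full crst table.  With FRAG_MASK = 0x03
 * the mask is 0x33, so a non-zero type selects __page_table_free_rcu()
 * and a zero type frees an ALLOC_ORDER sized region/segment table.
 */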
static void tlb_remove_table_smp_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
static void tlb_remove_table_one(void *table)
{
	/*
	 * This isn't an RCU grace period and hence the page-tables cannot be
	 * assumed to be actually RCU-freed.
	 *
	 * It is however sufficient for software page-table walkers that rely
	 * on IRQ disabling. See the comment near struct mmu_table_batch.
	 */
	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
	__tlb_remove_table(table);
}
static void tlb_remove_table_rcu(struct rcu_head *head)
{
	struct mmu_table_batch *batch;
	int i;

	batch = container_of(head, struct mmu_table_batch, rcu);

	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	free_page((unsigned long)batch);
}
void tlb_table_flush(struct mmu_gather *tlb)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch) {
		__tlb_flush_mm(tlb->mm);
		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
		*batch = NULL;
	}
}
void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			__tlb_flush_mm(tlb->mm);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}
	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void thp_split_vma(struct vm_area_struct *vma)
{
	unsigned long addr;
	struct page *page;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
		page = follow_page(vma, addr, FOLL_SPLIT);
	}
}
void thp_split_mm(struct mm_struct *mm)
{
	struct vm_area_struct *vma = mm->mmap;

	while (vma != NULL) {
		thp_split_vma(vma);
		vma->vm_flags &= ~VM_HUGEPAGE;
		vma->vm_flags |= VM_NOHUGEPAGE;
		vma = vma->vm_next;
	}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
 * switch on pgstes for its userspace process (for kvm)
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If no, we cannot do sie */
	if (s390_user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? if yes, we are done */
	if (mm_has_pgste(tsk->mm))
		return 0;

	/* lets check if we are allowed to replace the mm */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* we copy the mm and let dup_mm create the page tables with_pgstes */
	tsk->mm->context.alloc_pgste = 1;
	/* make sure that both mms have a correct rss state */
	sync_mm_rss(tsk->mm);
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* split thp mappings and disable thp for future mappings */
	thp_split_mm(mm);
	mm->def_flags |= VM_NOHUGEPAGE;
#endif

	/* Now lets check again if something happened */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* ok, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);
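
/*
 * Illustrative sketch (assumption, not from this file): s390_enable_sie()
 * is called once per process before any SIE based virtualization is used,
 * e.g. early in a KVM-style setup path:
 *
 *	rc = s390_enable_sie();
 *	if (rc)
 *		return rc;	(-EINVAL or -ENOMEM, see above)
 *
 * On success the task runs on a duplicated mm whose page tables carry
 * pgstes, so mm_has_pgste() makes later calls return immediately.
 */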
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address,
			   pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	/* No need to flush TLB
	 * On s390 reference bits are in storage key and never in TLB */
	return pmdp_test_and_clear_young(vma, address, pmdp);
}
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);

	if (pmd_same(*pmdp, entry))
		return 0;
	pmdp_invalidate(vma, address, pmdp);
	set_pmd_at(vma->vm_mm, address, pmdp, entry);
	return 1;
}
static void pmdp_splitting_flush_sync(void *arg)
{
	/* Simply deliver the interrupt */
}
void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (!test_and_set_bit(_SEGMENT_ENTRY_SPLIT_BIT,
			      (unsigned long *) pmdp)) {
		/* need to serialize against gup-fast (IRQ disabled) */
		smp_call_function(pmdp_splitting_flush_sync, NULL, 1);
	}
}
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	struct list_head *lh = (struct list_head *) pgtable;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	if (!mm->pmd_huge_pte)
		INIT_LIST_HEAD(lh);
	else
		list_add(lh, (struct list_head *) mm->pmd_huge_pte);
	mm->pmd_huge_pte = pgtable;
}
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	struct list_head *lh;
	pgtable_t pgtable;
	pte_t *ptep;

	assert_spin_locked(&mm->page_table_lock);

	/* FIFO */
	pgtable = mm->pmd_huge_pte;
	lh = (struct list_head *) pgtable;
	if (list_empty(lh))
		mm->pmd_huge_pte = NULL;
	else {
		mm->pmd_huge_pte = (pgtable_t) lh->next;
		list_del(lh);
	}
	ptep = (pte_t *) pgtable;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	ptep++;
	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
	return pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */