From b443039b5d8fa7246ce7be3e04756121508c48eb Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 31 Jan 2017 17:10:44 -0800
Subject: [PATCH] kernel - Augment vm_fault_page() for vkernel operations

* Augment vm_fault_page(), adding required elements from vm_fault() that
  were missing.

* In particular, this fixes a bug in any copyout to a growable stack
  segment (including the copyouts the exec code does), where the stack
  was not being properly grown and the copyout/suword64/etc calls were
  failing when they shouldn't have.

* Note optimization in pmap_clean_pte().  When turning off VPTE_RW, we
  do not have to MADV_INVAL to the real host if we determine that VPTE_M
  is not set.  Due to the way the vkernel and the host work, the host
  will utilize a read-only real PTE for read faults on VPTE_RW entries,
  in order to be able to detect writes by forcing a write fault when the
  write occurs.

* Fix a race between pmap_enter() and pmap_page_protect().  The vkernel
  doesn't have a convenient pv-lock like the real kernel pmap code, so
  use the vm_page spinlock instead.

* Generally use atomic ops for other operations on ptes that should be
  using atomic ops.

* Only clear PG_WRITEABLE while holding the vm_page spinlock.
---
 sys/platform/pc64/x86_64/pmap.c              |   9 +-
 sys/platform/vkernel64/platform/pmap.c       |  49 +++++++----
 sys/platform/vkernel64/platform/pmap_inval.c |  37 ++++++--
 sys/vm/vm_fault.c                            | 124 ++++++++++++++++++++-------
 4 files changed, 157 insertions(+), 62 deletions(-)

diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index 73b1439eab..204d0f0a72 100644
--- a/sys/platform/pc64/x86_64/pmap.c
+++ b/sys/platform/pc64/x86_64/pmap.c
@@ -5368,10 +5368,11 @@ pmap_clearbit(vm_page_t m, int bit_index)
         pt_entry_t pbits;
         pmap_t pmap;
 
-        if (bit_index == PG_RW_IDX)
-                vm_page_flag_clear(m, PG_WRITEABLE);
-        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
+        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
+                if (bit_index == PG_RW_IDX)
+                        vm_page_flag_clear(m, PG_WRITEABLE);
                 return;
+        }
 
         /*
          * PG_M or PG_A case
@@ -5490,6 +5491,8 @@ restart:
                         pv_put(pv);
                         goto restart;
                 }
+                if (bit_index == PG_RW_IDX)
+                        vm_page_flag_clear(m, PG_WRITEABLE);
                 vm_page_spin_unlock(m);
         }
 
diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c
index b01a054de3..175625c76e 100644
--- a/sys/platform/vkernel64/platform/pmap.c
+++ b/sys/platform/vkernel64/platform/pmap.c
@@ -1122,7 +1122,7 @@ pmap_unwire_pgtable(pmap_t pmap, vm_offset_t va, vm_page_t m)
                 KKASSERT(m->wire_count == 0);
                 vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
                 vm_page_flash(m);
-                vm_page_free_zero(m);
+                vm_page_free(m);
                 return 1;
         } else {
                 /* XXX SMP race to 1 if not holding vmobj */
@@ -1215,7 +1215,7 @@ pmap_pinit(struct pmap *pmap)
                                      VM_ALLOC_NORMAL | VM_ALLOC_RETRY |
                                      VM_ALLOC_ZERO);
                 pmap->pm_pdirm = ptdpg;
-                vm_page_flag_clear(ptdpg, PG_MAPPED);
+                vm_page_flag_clear(ptdpg, PG_MAPPED | PG_WRITEABLE);
                 vm_page_wire(ptdpg);
                 vm_page_wakeup(ptdpg);
                 pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg));
@@ -1248,9 +1248,9 @@ pmap_puninit(pmap_t pmap)
                 KKASSERT(pmap->pm_pml4 != NULL);
                 pmap_kremove((vm_offset_t)pmap->pm_pml4);
                 vm_page_busy_wait(p, FALSE, "pgpun");
-                atomic_add_int(&p->wire_count, -1);
-                atomic_add_int(&vmstats.v_wire_count, -1);
-                vm_page_free_zero(p);
+                vm_page_unwire(p, 0);
+                vm_page_flag_clear(p, PG_MAPPED | PG_WRITEABLE);
+                vm_page_free(p);
                 pmap->pm_pdirm = NULL;
                 atomic_add_long(&pmap->pm_stats.wired_count, -1);
                 KKASSERT(pmap->pm_stats.wired_count == 0);
@@ -1415,6 +1415,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
          * Map the page table page into its parent, giving it 1 wire count.
          */
         vm_page_wire(m);
+        vm_page_unmanage(m);
         atomic_add_long(&pmap->pm_stats.resident_count, 1);
         vm_page_flag_set(m, PG_MAPPED | PG_WRITEABLE);
 
@@ -1818,10 +1819,10 @@ pmap_remove_entry(struct pmap *pmap, vm_page_t m, vm_offset_t va)
         rtval = 0;
         if (pv) {
                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-                m->md.pv_list_count--;
-                KKASSERT(m->md.pv_list_count >= 0);
                 if (TAILQ_EMPTY(&m->md.pv_list))
                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+                m->md.pv_list_count--;
+                KKASSERT(m->md.pv_list_count >= 0);
                 pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
                 atomic_add_int(&pmap->pm_generation, 1);
                 vm_page_spin_unlock(m);
@@ -2128,16 +2129,17 @@ restart:
                         vm_page_dirty(m);
                 }
                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
-                pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
-                atomic_add_int(&pmap->pm_generation, 1);
-                m->md.pv_list_count--;
-                KKASSERT(m->md.pv_list_count >= 0);
                 if (TAILQ_EMPTY(&m->md.pv_list))
                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
+                m->md.pv_list_count--;
+                KKASSERT(m->md.pv_list_count >= 0);
+                pv_entry_rb_tree_RB_REMOVE(&pmap->pm_pvroot, pv);
+                atomic_add_int(&pmap->pm_generation, 1);
                 vm_page_spin_unlock(m);
                 pmap_unuse_pt(pmap, pv->pv_va, pv->pv_ptem);
-                vm_object_drop(pmobj);
                 free_pv_entry(pv);
+
+                vm_object_drop(pmobj);
                 vm_page_spin_lock(m);
         }
         KKASSERT((m->flags & (PG_MAPPED|PG_WRITEABLE)) == 0);
@@ -2338,6 +2340,7 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
         pt_entry_t origpte, newpte;
         vm_paddr_t opa;
         vm_page_t mpte;
+        int spun = 0;
 
         if (pmap == NULL)
                 return;
@@ -2431,12 +2434,20 @@ pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
          */
         if (pmap_initialized) {
                 if ((m->flags & (PG_FICTITIOUS|PG_UNMANAGED)) == 0) {
+                        /*
+                         * WARNING! We are using m's spin-lock as a poor
+                         *          man's pte lock to interlock against
+                         *          pmap_page_protect() operations.
+                         *
+                         *          This is a bad hack (obviously).
+                         */
                         pv = get_pv_entry();
                         vm_page_spin_lock(m);
                         pmap_insert_entry(pmap, va, mpte, m, pv);
                         pa |= VPTE_MANAGED;
                         vm_page_flag_set(m, PG_MAPPED);
-                        vm_page_spin_unlock(m);
+                        spun = 1;
+                        /* vm_page_spin_unlock(m); */
                 }
         }
 
@@ -2468,6 +2479,8 @@ validate:
                         kprintf("pmap [M] race @ %016jx\n", va);
                 atomic_set_long(pte, VPTE_M);
         }
+        if (spun)
+                vm_page_spin_unlock(m);
         if (mpte)
                 vm_page_wakeup(mpte);
 
@@ -2867,7 +2880,6 @@ pmap_remove_pages(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
                 TAILQ_REMOVE(&pmap->pm_pvlist, pv, pv_plist);
                 atomic_add_int(&pmap->pm_generation, 1);
                 save_generation = pmap->pm_generation;
-                m->md.pv_list_count--;
                 TAILQ_REMOVE(&m->md.pv_list, pv, pv_list);
                 if (TAILQ_EMPTY(&m->md.pv_list))
                         vm_page_flag_clear(m, PG_MAPPED | PG_WRITEABLE);
@@ -2948,10 +2960,11 @@ pmap_clearbit(vm_page_t m, int bit)
         pt_entry_t *pte;
         pt_entry_t pbits;
 
-        if (bit == VPTE_RW)
-                vm_page_flag_clear(m, PG_WRITEABLE);
-        if (!pmap_initialized || (m->flags & PG_FICTITIOUS))
+        if (!pmap_initialized || (m->flags & PG_FICTITIOUS)) {
+                if (bit == VPTE_RW)
+                        vm_page_flag_clear(m, PG_WRITEABLE);
                 return;
+        }
 
         /*
          * Loop over all current mappings setting/clearing as appropos If
@@ -3033,6 +3046,8 @@ restart:
                         }
                 }
         }
+        if (bit == VPTE_RW)
+                vm_page_flag_clear(m, PG_WRITEABLE);
         vm_page_spin_unlock(m);
 }
 
diff --git a/sys/platform/vkernel64/platform/pmap_inval.c b/sys/platform/vkernel64/platform/pmap_inval.c
index 68c6d8c606..7242026663 100644
--- a/sys/platform/vkernel64/platform/pmap_inval.c
+++ b/sys/platform/vkernel64/platform/pmap_inval.c
@@ -272,9 +272,14 @@ pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
 /*
  * These carefully handle interactions with other cpus and return
  * the original vpte.  Clearing VPTE_RW prevents us from racing the
- * setting of VPTE_M, allowing us to invalidate the tlb (the real cpu's
+ * setting of VPTE_M, allowing us to invalidate the TLB (the real cpu's
  * pmap) and get good status for VPTE_M.
  *
+ * By using an atomic op we can detect if the real PTE is writable by
+ * testing whether VPTE_M was set.  If it wasn't set, the real PTE is
+ * already read-only and we do not have to waste time invalidating it
+ * further.
+ *
  * clean: clear VPTE_M and VPTE_RW
  * setro: clear VPTE_RW
  * load&clear: clear entire field
@@ -285,9 +290,18 @@
         vpte_t pte;
 
         if (vmm_enabled == 0) {
-                pte = atomic_swap_long(ptep, *ptep & ~(VPTE_RW | VPTE_M));
-                if (pte & VPTE_RW)
-                        pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                for (;;) {
+                        pte = *ptep;
+                        cpu_ccfence();
+                        if ((pte & VPTE_RW) == 0)
+                                break;
+                        if (atomic_cmpset_long(ptep,
+                                               pte,
+                                               pte & ~(VPTE_RW | VPTE_M))) {
+                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                                break;
+                        }
+                }
         } else {
                 pte = *ptep & ~(VPTE_RW | VPTE_M);
                 guest_sync_addr(pmap, ptep, &pte);
@@ -335,11 +349,16 @@ pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
         vpte_t pte;
 
         if (vmm_enabled == 0) {
-                pte = atomic_swap_long(ptep, *ptep & ~VPTE_RW);
-                if (pte & VPTE_M)
-                        atomic_set_long(ptep, VPTE_M);
-                if (pte & VPTE_RW)
-                        pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                for (;;) {
+                        pte = *ptep;
+                        cpu_ccfence();
+                        if ((pte & VPTE_RW) == 0)
+                                break;
+                        if (atomic_cmpset_long(ptep, pte, pte & ~VPTE_RW)) {
+                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
+                                break;
+                        }
+                }
         } else {
                 pte = *ptep & ~(VPTE_RW | VPTE_M);
                 guest_sync_addr(pmap, ptep, &pte);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index a749bd36d7..e8a5ac6ac8 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -369,12 +369,12 @@ RetryFault:
                 }
 
                 /*
-                 * If we are user-wiring a r/w segment, and it is COW, then
-                 * we need to do the COW operation.  Note that we don't
+                 * If we are user-wiring a r/w segment, and it is COW, then
+                 * we need to do the COW operation.  Note that we don't
                  * currently COW RO sections now, because it is NOT desirable
-                 * to COW .text.  We simply keep .text from ever being COW'ed
-                 * and take the heat that one cannot debug wired .text sections.
-                 */
+                 * to COW .text.  We simply keep .text from ever being COW'ed
+                 * and take the heat that one cannot debug wired .text sections.
+                 */
                 result = vm_map_lookup(&fs.map, vaddr,
                                        VM_PROT_READ|VM_PROT_WRITE|
                                        VM_PROT_OVERRIDE_WRITE,
@@ -395,8 +395,10 @@ RetryFault:
                  * XXX We have a shared lock, this will have a MP race but
                  * I don't see how it can hurt anything.
                  */
-                if ((fs.entry->protection & VM_PROT_WRITE) == 0)
-                        fs.entry->max_protection &= ~VM_PROT_WRITE;
+                if ((fs.entry->protection & VM_PROT_WRITE) == 0) {
+                        atomic_clear_char(&fs.entry->max_protection,
+                                          VM_PROT_WRITE);
+                }
         }
 
         /*
@@ -746,6 +748,7 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
         struct faultstate fs;
         int result;
         int retry;
+        int growstack;
         vm_prot_t orig_fault_type = fault_type;
 
         retry = 0;
@@ -762,6 +765,8 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
          */
         fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp);
         if (fs.m) {
+                if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE))
+                        vm_page_dirty(fs.m);
                 *errorp = 0;
                 return(fs.m);
         }
@@ -770,9 +775,10 @@
          * Otherwise take a concurrency hit and do a formal page
          * fault.
          */
+        fs.vp = NULL;
         fs.shared = vm_shared_fault;
         fs.first_shared = vm_shared_fault;
-        fs.vp = NULL;
+        growstack = 1;
         lwkt_gettoken(&map->token);
 
         /*
@@ -805,9 +811,61 @@ RetryFault:
                                &first_pindex, &fs.first_prot, &fs.wired);
 
         if (result != KERN_SUCCESS) {
-                *errorp = result;
-                fs.m = NULL;
-                goto done;
+                if (result == KERN_FAILURE_NOFAULT) {
+                        *errorp = KERN_FAILURE;
+                        fs.m = NULL;
+                        goto done;
+                }
+                if (result != KERN_PROTECTION_FAILURE ||
+                    (fs.fault_flags & VM_FAULT_WIRE_MASK) != VM_FAULT_USER_WIRE)
+                {
+                        if (result == KERN_INVALID_ADDRESS && growstack &&
+                            map != &kernel_map && curproc != NULL) {
+                                result = vm_map_growstack(curproc, vaddr);
+                                if (result == KERN_SUCCESS) {
+                                        growstack = 0;
+                                        ++retry;
+                                        goto RetryFault;
+                                }
+                                result = KERN_FAILURE;
+                        }
+                        fs.m = NULL;
+                        *errorp = result;
+                        goto done;
+                }
+
+                /*
+                 * If we are user-wiring a r/w segment, and it is COW, then
+                 * we need to do the COW operation.  Note that we don't
+                 * currently COW RO sections now, because it is NOT desirable
+                 * to COW .text.  We simply keep .text from ever being COW'ed
+                 * and take the heat that one cannot debug wired .text sections.
+                 */
+                result = vm_map_lookup(&fs.map, vaddr,
+                                       VM_PROT_READ|VM_PROT_WRITE|
+                                       VM_PROT_OVERRIDE_WRITE,
+                                       &fs.entry, &fs.first_object,
+                                       &first_pindex, &fs.first_prot,
+                                       &fs.wired);
+                if (result != KERN_SUCCESS) {
+                        /* could also be KERN_FAILURE_NOFAULT */
+                        *errorp = KERN_FAILURE;
+                        fs.m = NULL;
+                        goto done;
+                }
+
+                /*
+                 * If we don't COW now, on a user wire, the user will never
+                 * be able to write to the mapping.  If we don't make this
+                 * restriction, the bookkeeping would be nearly impossible.
+                 *
+                 * XXX We have a shared lock, this will have a MP race but
+                 * I don't see how it can hurt anything.
+                 */
+                if ((fs.entry->protection & VM_PROT_WRITE) == 0) {
+                        atomic_clear_char(&fs.entry->max_protection,
+                                          VM_PROT_WRITE);
+                }
         }
 
         /*
@@ -1017,7 +1075,8 @@ RetryFault:
          * Unlock everything, and return the held or busied page.
          */
         if (busyp) {
-                if (fault_type & VM_PROT_WRITE) {
+                if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE)) {
+                        vm_page_dirty(fs.m);
                         *busyp = 1;
                 } else {
                         *busyp = 0;
@@ -1245,7 +1304,6 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                 }
                 if ((vpte & VPTE_PS) || vshift == 0)
                         break;
-                KKASSERT(vshift >= VPTE_PAGE_BITS);
 
                 /*
                  * Get the page table page.  Nominally we only read the page
@@ -1285,17 +1343,21 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                 for (;;) {
                         vpte_t nvpte;
 
+                        /*
+                         * Reload for the cmpset, but make sure the pte is
+                         * still valid.
+                         */
                         vpte = *ptep;
                         cpu_ccfence();
                         nvpte = vpte;
 
-                        if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) &&
-                            (vpte & VPTE_RW)) {
+                        if ((vpte & VPTE_V) == 0)
+                                break;
+
+                        if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_RW))
                                 nvpte |= VPTE_M | VPTE_A;
-                        }
-                        if ((fault_type & VM_PROT_READ) && (vpte & VPTE_V)) {
+                        if (fault_type & VM_PROT_READ)
                                 nvpte |= VPTE_A;
-                        }
                         if (vpte == nvpte)
                                 break;
                         if (atomic_cmpset_long(ptep, vpte, nvpte)) {
@@ -1311,20 +1373,16 @@
         }
 
         /*
-         * We can't reconcile the real page table's M bit back to the
-         * virtual page table, so instead we force the real page table pte
-         * to be read-only on a read-fault for a VPTE_RW vpagetable
-         * fault, and set the VPTE_M bit manually (above) for a write-fault
-         * on a VPTE_RW vpagetable fault.
-         *
-         * The virtual kernel must MADV_INVAL areas that it clears the
-         * VPTE_M bit on to ensure that the real kernel is able to take
-         * a write fault to set VPTE_M again.
-         *
-         * If we find the VPTE_M bit already set above, we don't have to
-         * downgrade here.
+         * When the vkernel sets VPTE_RW it expects the real kernel to
+         * reflect VPTE_M back when the page is modified via the mapping.
+         * In order to accomplish this the real kernel must map the page
+         * read-only for read faults and use write faults to reflect VPTE_M
+         * back.
          *
-         * This guarantees that the M bit will be set.
+         * Once VPTE_M has been set, the real kernel's pte allows writing.
+         * If the vkernel clears VPTE_M the vkernel must be sure to
+         * MADV_INVAL the real kernel's mappings to force the real kernel
+         * to re-fault on the next write so it can set VPTE_M again.
          */
         if ((fault_type & VM_PROT_WRITE) == 0 &&
             (vpte & (VPTE_RW | VPTE_M)) != (VPTE_RW | VPTE_M)) {
@@ -2306,8 +2364,8 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map,
         vm_object_hold(src_object);
         vm_object_hold(dst_object);
         for (vaddr = dst_entry->start, dst_offset = 0;
-             vaddr < dst_entry->end;
-             vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
+             vaddr < dst_entry->end;
+             vaddr += PAGE_SIZE, dst_offset += PAGE_SIZE) {
 
                 /*
                  * Allocate a page in the destination object
-- 
2.11.4.GIT
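Illustrative sketch (not part of the patch, and not DragonFly kernel code): the pmap_clean_pte()/pmap_setro_pte() changes above replace an unconditional atomic swap with a compare-and-set loop that skips the expensive invalidation entirely when VPTE_RW is already clear.  The stand-alone C11 program below mimics that pattern with user-space atomics; the VPTE_RW/VPTE_M bit values and the invalidate() stub are made up for the demo and do not match the real vpte layout.

/* cmpset_clean_pte.c -- build with: cc -std=c11 -o demo cmpset_clean_pte.c */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define VPTE_RW         0x0000000000000002UL    /* made-up bit positions */
#define VPTE_M          0x0000000000000400UL

static void
invalidate(void)
{
        /* stands in for pmap_inval_cpu() / the MADV_INVAL round trip */
        printf("  (invalidate)\n");
}

/*
 * Clear VPTE_RW and VPTE_M atomically.  If the entry is already
 * read-only there is nothing to downgrade, so the invalidation is
 * skipped entirely -- the optimization the commit message describes
 * for pmap_clean_pte().
 */
static uint64_t
clean_pte(_Atomic uint64_t *ptep)
{
        uint64_t pte;

        for (;;) {
                pte = atomic_load(ptep);
                if ((pte & VPTE_RW) == 0)
                        break;                  /* already read-only */
                if (atomic_compare_exchange_weak(ptep, &pte,
                    pte & ~(VPTE_RW | VPTE_M))) {
                        invalidate();
                        break;
                }
                /* lost a race with another updater, retry */
        }
        return pte;
}

int
main(void)
{
        _Atomic uint64_t pte = 0x1000 | VPTE_RW | VPTE_M;

        printf("before %#jx\n", (uintmax_t)atomic_load(&pte));
        clean_pte(&pte);        /* clears RW|M, invalidates once */
        clean_pte(&pte);        /* no-op, no invalidation */
        printf("after  %#jx\n", (uintmax_t)atomic_load(&pte));
        return 0;
}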
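A second sketch, also hypothetical and user-space only: it models the "poor man's pte lock" idea behind the pmap_enter()/pmap_page_protect() race fix, where the code that installs a writable mapping and the code that revokes write access serialize on the same per-page spin lock, and the writeable flag is only cleared while that lock is held.  The struct vpage type, the flag values, and the function names are invented for the illustration.

/* vpage_spin.c -- build with: cc -std=c11 -pthread -o demo vpage_spin.c */
#include <pthread.h>
#include <stdio.h>

#define PG_MAPPED       0x0001u         /* made-up flag values */
#define PG_WRITEABLE    0x0002u

struct vpage {                          /* stand-in for a vm_page */
        pthread_spinlock_t spin;
        unsigned int    flags;
        int             writable_mappings;
};

static struct vpage page;

/* analogue of pmap_enter() installing a managed, writable mapping */
static void
enter_mapping(struct vpage *m)
{
        pthread_spin_lock(&m->spin);
        m->writable_mappings++;
        m->flags |= PG_MAPPED | PG_WRITEABLE;
        pthread_spin_unlock(&m->spin);
}

/* analogue of pmap_page_protect() downgrading the page to read-only */
static void
protect_page(struct vpage *m)
{
        pthread_spin_lock(&m->spin);
        m->writable_mappings = 0;       /* strip write access ... */
        m->flags &= ~PG_WRITEABLE;      /* ... clearing the flag only
                                         * while the lock is held */
        pthread_spin_unlock(&m->spin);
}

static void *
enter_loop(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++)
                enter_mapping(&page);
        return NULL;
}

int
main(void)
{
        pthread_t t;

        pthread_spin_init(&page.spin, PTHREAD_PROCESS_PRIVATE);
        pthread_create(&t, NULL, enter_loop, NULL);
        for (int i = 0; i < 100000; i++)
                protect_page(&page);
        pthread_join(t, NULL);

        /* The flag and the mapping count can never disagree. */
        printf("PG_WRITEABLE=%d writable_mappings=%d\n",
            (page.flags & PG_WRITEABLE) != 0, page.writable_mappings);
        return 0;
}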