[dragonfly.git] sys/platform/vkernel64/platform/pmap_inval.c
/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */
/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit setting races, particularly when we are trying to clean
 * a page and test the modified bit to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>
#include <pthread.h>

extern int vmm_enabled;
/*
 * Invalidate the TLB on the current cpu
 *
 * (VMM enabled only)
 */
static __inline
void
vmm_cpu_invltlb(void)
{
#if 0
        /* not directly supported */
        cpu_invltlb();
#else
        /* vmm_guest_sync_addr(NULL, NULL); */
        /* For VMM mode forces vmmexit/resume */
        uint64_t rax = -1;
        __asm __volatile("syscall;"
                        :
                        : "a" (rax)
                        :);
#endif
}
static __inline
void
vmm_cpu_invlpg(void *addr __unused)
{
        vmm_cpu_invltlb();
}
/*
 * Invalidate va in the TLB on the current cpu
 *
 * (VMM disabled only)
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
        if (pmap == &kernel_pmap) {
                madvise((void *)va, bytes, MADV_INVAL);
        } else {
                vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
        }
}
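
/*
 * Illustrative usage sketch (hypothetical, kept under #if 0): for the
 * kernel pmap the invalidation reduces to an madvise(MADV_INVAL) on the
 * vkernel's own mapping, while user pmaps are pushed to their backing
 * vmspace via vmspace_mcontrol().  The helper name below is not part of
 * this file and only shows how pmap_inval_cpu() is typically driven.
 */
#if 0
static void
example_invalidate_page(struct pmap *pmap, vm_offset_t va)
{
        /* for pmap == &kernel_pmap this is equivalent to
         * madvise((void *)va, PAGE_SIZE, MADV_INVAL) */
        pmap_inval_cpu(pmap, va, PAGE_SIZE);
}
#endif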
/*
 * This is a bit of a mess because we don't know what virtual cpus are
 * mapped to real cpus.  Basically try to optimize the degenerate cases
 * (primarily related to user processes with only one thread or only one
 * running thread), and shunt all the rest to the host cpu.  The host cpu
 * will invalidate all real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And probably will be faster anyway since there's no round-trip signaling
 * overhead).
 *
 * NOTE: The critical section protects against preemption while the pmap
 *       is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap, volatile vpte_t *ptep, vpte_t *srcv)
{
        globaldata_t gd = mycpu;
        cpulock_t olock;
        cpulock_t nlock;

        /*
         * Lock the pmap
         */
        crit_enter();
        for (;;) {
                olock = pmap->pm_active_lock;
                cpu_ccfence();
                if ((olock & CPULOCK_EXCL) == 0) {
                        nlock = olock | CPULOCK_EXCL;
                        if (atomic_cmpset_int(&pmap->pm_active_lock,
                                              olock, nlock)) {
                                break;
                        }
                }
                cpu_pause();
                lwkt_process_ipiq();
                pthread_yield();
        }

        /*
         * Update the pte and synchronize with other cpus.  If we can update
         * it trivially, do so.
         */
        if (CPUMASK_TESTZERO(pmap->pm_active) ||
            CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
                if (ptep)
                        *srcv = atomic_swap_long(ptep, *srcv);
                vmm_cpu_invltlb();
        } else {
                vmm_guest_sync_addr(__DEVOLATILE(void *, ptep), srcv);
        }

        /*
         * Unlock the pmap
         */
        atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
        crit_exit();
}
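
/*
 * Illustrative usage sketch (hypothetical, kept under #if 0): callers of
 * guest_sync_addr() preload *srcv with the new pte contents they want
 * installed; on the trivial single-cpu path the pte's previous contents
 * are swapped back into *srcv, so the caller can inspect the prior
 * VPTE_M/VPTE_A state.  This mirrors how pmap_inval_pte() below drives
 * the VMM path; the helper name is not part of this file.
 */
#if 0
static vpte_t
example_zero_pte_vmm(struct pmap *pmap, volatile vpte_t *ptep)
{
        vpte_t pte = 0;                 /* new value to install */

        guest_sync_addr(pmap, ptep, &pte);
        return pte;                     /* previous pte contents */
}
#endif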
/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu), then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
}
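
/*
 * Illustrative usage sketch (hypothetical, kept under #if 0): a teardown
 * path zaps the pte and invalidates the real cpu's TLB before the backing
 * page is released, so no cpu can keep writing through a stale mapping.
 * The helper name is not part of this file.
 */
#if 0
static void
example_remove_mapping(struct pmap *pmap, volatile vpte_t *ptep,
                       vm_offset_t va)
{
        pmap_inval_pte(ptep, pmap, va);
        /* ... the caller would now free or reuse the underlying page ... */
}
#endif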
/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it's the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        atomic_swap_long(ptep, 0);
        if (vmm_enabled == 0)
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        else
                vmm_cpu_invltlb();
}
/*
 * Invalidate the tlb for a range of virtual addresses across all cpus
 * belonging to the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        if (vmm_enabled == 0) {
                pmap_inval_cpu(pmap, sva, eva - sva);
        } else {
                guest_sync_addr(pmap, NULL, NULL);
        }
}
/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, SEG_SIZE);
        } else if (CPUMASK_TESTMASK(pmap->pm_active,
                                    mycpu->gd_other_cpus) == 0) {
                atomic_swap_long(ptep, 0);
                vmm_cpu_invltlb();
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
}

void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        pmap_inval_pde(ptep, pmap, va);
}
/*
 * These carefully handle interactions with other cpus and return
 * the original vpte.  Clearing VPTE_RW prevents us from racing the
 * setting of VPTE_M, allowing us to invalidate the TLB (the real cpu's
 * pmap) and get good status for VPTE_M.
 *
 * By using an atomic op we can detect if the real PTE is writable by
 * testing whether VPTE_M was set.  If it wasn't set, the real PTE is
 * already read-only and we do not have to waste time invalidating it
 * further.
 *
 * clean:       clear VPTE_M and VPTE_RW
 * setro:       clear VPTE_RW
 * load&clear:  clear entire field
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                for (;;) {
                        pte = *ptep;
                        cpu_ccfence();
                        if ((pte & VPTE_RW) == 0)
                                break;
                        if (atomic_cmpset_long(ptep,
                                               pte,
                                               pte & ~(VPTE_RW | VPTE_M))) {
                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
                                break;
                        }
                }
        } else {
                pte = *ptep & ~(VPTE_RW | VPTE_M);
                guest_sync_addr(pmap, ptep, &pte);
        }
        return pte;
}
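
/*
 * Illustrative usage sketch (hypothetical, kept under #if 0): the
 * clean-and-test pattern described above.  Because VPTE_RW is cleared
 * before the real cpu's TLB is invalidated, the VPTE_M bit in the
 * returned value gives good dirty status for the page.  The helper name
 * is not part of this file.
 */
#if 0
static int
example_page_is_dirty(struct pmap *pmap, volatile vpte_t *ptep,
                      vm_offset_t va)
{
        vpte_t pte;

        pte = pmap_clean_pte(ptep, pmap, va);
        return ((pte & VPTE_M) != 0);   /* modified since last clean? */
}
#endif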
#if 0

vpte_t
pmap_clean_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        pte = *ptep;
        if (pte & VPTE_V) {
                atomic_clear_long(ptep, VPTE_RW);
                if (vmm_enabled == 0) {
                        atomic_clear_long(ptep, VPTE_RW);
                        pmap_inval_cpu(pmap, va, PAGE_SIZE);
                        pte = *ptep | (pte & VPTE_RW);
                        atomic_clear_long(ptep, VPTE_M);
                } else {
                        pte &= ~(VPTE_RW | VPTE_M);
                        guest_sync_addr(pmap, ptep, &pte);
                }
        }
        return(pte);
}

#endif
/*
 * This is an odd case and I'm not sure whether it even occurs in normal
 * operation.  Turn off write access to the page, clean out the tlb
 * (the real cpu's pmap), and deal with any VPTE_M race that may have
 * occurred.
 *
 * VPTE_M is not cleared.  If we accidentally removed it due to the swap
 * we throw it back into the pte.
 */
vpte_t
pmap_setro_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                for (;;) {
                        pte = *ptep;
                        cpu_ccfence();
                        if ((pte & VPTE_RW) == 0)
                                break;
                        if (atomic_cmpset_long(ptep, pte, pte & ~VPTE_RW)) {
                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
                                break;
                        }
                }
        } else {
                pte = *ptep & ~(VPTE_RW | VPTE_M);
                guest_sync_addr(pmap, ptep, &pte);
        }
        return pte;
}
/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the tlb (the real cpu's pmap), then incorporate any races that
 * may have occurred in the mean time, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
                        vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                pte = atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
        return(pte);
}
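
/*
 * Illustrative usage sketch (hypothetical, kept under #if 0):
 * load-and-clear hands back the pte's prior contents in one shot, so a
 * teardown path can harvest the modified bit after the entry has already
 * been zeroed and invalidated.  The helper name is not part of this file.
 */
#if 0
static void
example_teardown_pte(struct pmap *pmap, volatile vpte_t *ptep,
                     vm_offset_t va)
{
        vpte_t opte;

        opte = pmap_inval_loadandclear(ptep, pmap, va);
        if (opte & VPTE_M) {
                /* page was dirtied; mark the backing vm_page dirty here */
        }
}
#endif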
void
cpu_invlpg(void *addr)
{
        if (vmm_enabled)
                vmm_cpu_invlpg(addr);
        else
                madvise(addr, PAGE_SIZE, MADV_INVAL);
}

void
cpu_invltlb(void)
{
        if (vmm_enabled)
                vmm_cpu_invltlb();      /* For VMM mode forces vmmexit/resume */
        else
                madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

void
smp_invltlb(void)
{
        /* XXX must invalidate the tlb on all cpus */
        /* at the moment pmap_inval_pte_quick */
        /* do nothing */
}

void
smp_sniff(void)
{
        /* not implemented */
}