vkernel - Sync to recent API changes
sys/platform/vkernel64/platform/pmap_inval.c

/*
 * Copyright (c) 2003-2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/platform/vkernel/platform/pmap_inval.c,v 1.4 2007/07/02 02:22:58 dillon Exp $
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 *
 * For our virtual page tables, the real kernel will handle SMP interactions
 * with pmaps that may be active on other cpus.  Even so, we have to be
 * careful about bit-setting races, particularly when we are trying to clean
 * a page and test the modified bit, to avoid races where the modified bit
 * might get set after our poll but before we clear the field.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/cdefs.h>
#include <sys/mman.h>
#include <sys/vmspace.h>
#include <sys/vmm.h>

#include <vm/vm.h>
#include <vm/pmap.h>
#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>

#include <unistd.h>
#include <pthread.h>

#include <vm/vm_page2.h>

extern int vmm_enabled;

/*
 * Invalidate the TLB on the current cpu.
 *
 * (VMM enabled only)
 */
static __inline
void
vmm_cpu_invltlb(void)
{
#if 0
        /* not directly supported */
        cpu_invltlb();
#else
        /* vmm_guest_sync_addr(NULL, NULL); */
        /* For VMM mode forces vmmexit/resume */
        uint64_t rax = -1;
        __asm __volatile("syscall;"
                        :
                        : "a" (rax)
                        :);
#endif
}
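
/*
 * Invalidate a single page on the current cpu (VMM enabled only).
 * A per-page invalidation is not available here, so simply fall
 * back to a full TLB invalidation.
 */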
static __inline
void
vmm_cpu_invlpg(void *addr __unused)
{
        vmm_cpu_invltlb();
}

/*
 * Invalidate va in the TLB on the current cpu.
 *
 * (VMM disabled only)
 */
static __inline
void
pmap_inval_cpu(struct pmap *pmap, vm_offset_t va, size_t bytes)
{
        if (pmap == &kernel_pmap) {
                madvise((void *)va, bytes, MADV_INVAL);
        } else {
                vmspace_mcontrol(pmap, (void *)va, bytes, MADV_INVAL, 0);
        }
}

/*
 * This is a bit of a mess because we don't know what virtual cpus are
 * mapped to real cpus.  Basically try to optimize the degenerate cases
 * (primarily related to user processes with only one thread or only one
 * running thread), and shunt all the rest to the host cpu.  The host cpu
 * will invalidate all real cpus the vkernel is running on.
 *
 * This can't optimize situations where a pmap is only mapped to some of
 * the virtual cpus, though shunting to the real host will still be faster
 * if the virtual kernel processes are running on fewer real-host cpus.
 * (And it will probably be faster anyway, since there is no round-trip
 * signaling overhead.)
 *
 * NOTE: The critical section protects against preemption while the pmap
 *       is locked, which could otherwise result in a deadlock.
 */
static __inline
void
guest_sync_addr(struct pmap *pmap, volatile vpte_t *ptep, vpte_t *srcv)
{
        globaldata_t gd = mycpu;
        cpulock_t olock;
        cpulock_t nlock;

        /*
         * Lock the pmap
         */
        crit_enter();
        for (;;) {
                olock = pmap->pm_active_lock;
                cpu_ccfence();
                if ((olock & CPULOCK_EXCL) == 0) {
                        nlock = olock | CPULOCK_EXCL;
                        if (atomic_cmpset_int(&pmap->pm_active_lock,
                                              olock, nlock)) {
                                break;
                        }
                }
                cpu_pause();
                lwkt_process_ipiq();
                pthread_yield();
        }

        /*
         * Update the pte and synchronize with other cpus.  If we can update
         * it trivially, do so.
         */
        if (CPUMASK_TESTZERO(pmap->pm_active) ||
            CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
                if (ptep)
                        *srcv = atomic_swap_long(ptep, *srcv);
                vmm_cpu_invltlb();
        } else {
                vmm_guest_sync_addr(__DEVOLATILE(void *, ptep), srcv);
        }

        /*
         * Unlock the pmap
         */
        atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
        crit_exit();
}

/*
 * Invalidate a pte in a pmap and synchronize with target cpus
 * as required.  Throw away the modified and access bits.  Use
 * pmap_clean_pte() to do the same thing but also get an interlocked
 * modified/access status.
 *
 * Clearing the field first (basically clearing VPTE_V) prevents any
 * new races from occurring while we invalidate the TLB (i.e. the pmap
 * on the real cpu); we then clear it again to clean out any race that
 * might have occurred before the invalidation completed.
 */
void
pmap_inval_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
}

/*
 * Same as pmap_inval_pte() but only synchronize with the current
 * cpu.  For the moment it is the same as the non-quick version.
 */
void
pmap_inval_pte_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        atomic_swap_long(ptep, 0);
        if (vmm_enabled == 0)
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        else
                vmm_cpu_invltlb();
}

/*
 * Invalidate the TLB for a range of virtual addresses across all cpus
 * belonging to the pmap.
 */
void
pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
        if (vmm_enabled == 0) {
                pmap_inval_cpu(pmap, sva, eva - sva);
        } else {
                guest_sync_addr(pmap, NULL, NULL);
        }
}

/*
 * Invalidating page directory entries requires some additional
 * sophistication.  The cachemask must be cleared so the kernel
 * resynchronizes its temporary page table mappings cache.
 */
void
pmap_inval_pde(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, SEG_SIZE);
        } else if (CPUMASK_TESTMASK(pmap->pm_active,
                                    mycpu->gd_other_cpus) == 0) {
                atomic_swap_long(ptep, 0);
                vmm_cpu_invltlb();
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
}
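
/*
 * Quick version of pmap_inval_pde().  For the moment it simply
 * falls through to the full version.
 */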
void
pmap_inval_pde_quick(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va)
{
        pmap_inval_pde(ptep, pmap, va);
}

/*
 * This is really nasty.
 *
 * (1) The vkernel interlocks pte operations with the related vm_page_t
 *     spin-lock (and doesn't handle unmanaged page races).
 *
 * (2) The vkernel must also issue an invalidation to the real cpu.  It
 *     (nastily) does this while holding the spin-lock too.
 *
 * In addition, atomic ops must be used to properly interlock against
 * other cpus and the real kernel (which could be taking a fault on another
 * cpu and will adjust VPTE_M and VPTE_A appropriately).
 *
 * The atomic ops do a good job of interlocking against other cpus, but
 * we still need to lock the pte location (which we use the vm_page
 * spin-lock for) to avoid races against PG_WRITEABLE and other tests.
 *
 * Cleaning the pte involves clearing VPTE_M and VPTE_RW, synchronizing with
 * the real host, and updating the vm_page appropriately.
 *
 * If the caller passes a non-NULL (m), the caller holds the spin-lock,
 * otherwise we must acquire and release the spin-lock.  (m) is only
 * applicable to managed pages.
 */
vpte_t
pmap_clean_pte(volatile vpte_t *ptep, struct pmap *pmap, vm_offset_t va,
               vm_page_t m)
{
        vpte_t pte;
        int spin = 0;

        /*
         * Acquire (m) and spin-lock it.
         */
        while (m == NULL) {
                pte = *ptep;
                if ((pte & VPTE_V) == 0)
                        return pte;
                if ((pte & VPTE_MANAGED) == 0)
                        break;
                m = PHYS_TO_VM_PAGE(pte & VPTE_FRAME);
                vm_page_spin_lock(m);
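
                /*
                 * Re-check the pte now that (m) is spin-locked.  If it
                 * was invalidated, became unmanaged, or no longer points
                 * at (m), drop the lock and retry.
                 */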
                pte = *ptep;
                if ((pte & VPTE_V) == 0) {
                        vm_page_spin_unlock(m);
                        m = NULL;
                        continue;
                }
                if ((pte & VPTE_MANAGED) == 0) {
                        vm_page_spin_unlock(m);
                        m = NULL;
                        continue;
                }
                if (m != PHYS_TO_VM_PAGE(pte & VPTE_FRAME)) {
                        vm_page_spin_unlock(m);
                        m = NULL;
                        continue;
                }
                spin = 1;
                break;
        }
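
        /*
         * Clear VPTE_RW and VPTE_M.  Without VMM we loop on a cmpset so
         * the host-side invalidation only occurs if we actually
         * transitioned the pte; with VMM the update is pushed through
         * guest_sync_addr().
         */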
        if (vmm_enabled == 0) {
                for (;;) {
                        pte = *ptep;
                        cpu_ccfence();
                        if ((pte & VPTE_RW) == 0)
                                break;
                        if (atomic_cmpset_long(ptep,
                                               pte,
                                               pte & ~(VPTE_RW | VPTE_M))) {
                                pmap_inval_cpu(pmap, va, PAGE_SIZE);
                                break;
                        }
                }
        } else {
                pte = *ptep & ~(VPTE_RW | VPTE_M);
                guest_sync_addr(pmap, ptep, &pte);
        }

        if (m) {
                if (pte & VPTE_A) {
                        vm_page_flag_set(m, PG_REFERENCED);
                        atomic_clear_long(ptep, VPTE_A);
                }
                if (pte & VPTE_M) {
                        if (pmap_track_modified(pmap, va))
                                vm_page_dirty(m);
                }
                if (spin)
                        vm_page_spin_unlock(m);
        }
        return pte;
}

/*
 * This is a combination of pmap_inval_pte() and pmap_clean_pte().
 * First prevent races with the 'A' and 'M' bits, then clean out
 * the TLB (the real cpu's pmap), then incorporate any races that
 * may have occurred in the meantime, and finally zero out the pte.
 */
vpte_t
pmap_inval_loadandclear(volatile vpte_t *ptep, struct pmap *pmap,
                        vm_offset_t va)
{
        vpte_t pte;

        if (vmm_enabled == 0) {
                pte = atomic_swap_long(ptep, 0);
                pmap_inval_cpu(pmap, va, PAGE_SIZE);
        } else {
                pte = 0;
                guest_sync_addr(pmap, ptep, &pte);
        }
        return(pte);
}
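
/*
 * Invalidate a single page in the current cpu's TLB.  With VMM disabled
 * this asks the real host kernel to invalidate the mapping via
 * madvise(MADV_INVAL).
 */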
void
cpu_invlpg(void *addr)
{
        if (vmm_enabled)
                vmm_cpu_invlpg(addr);
        else
                madvise(addr, PAGE_SIZE, MADV_INVAL);
}
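
/*
 * Invalidate the current cpu's TLB for the entire kernel virtual
 * address range.
 */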
void
cpu_invltlb(void)
{
        if (vmm_enabled)
                vmm_cpu_invltlb();      /* For VMM mode forces vmmexit/resume */
        else
                madvise((void *)KvaStart, KvaEnd - KvaStart, MADV_INVAL);
}

/*
 * Invalidate the TLB on all cpus.  Instead of doing this, the vkernel
 * simply ignores VM_PROT_NOSYNC on pmap_enter() calls.
 */
void
smp_invltlb(void)
{
        /* do nothing */
}

void
smp_sniff(void)
{
        /* not implemented */
}

void
cpu_sniff(int dcpu __unused)
{
        /* not implemented */
}