 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

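/*
 * Overview of the mechanism used below: the originating cpu builds a
 * command in its per-cpu invinfo[] slot, publishes the target cpu set
 * in info->mask and info->done, and generates Xinvltlb IPIs on the
 * targets via smp_invlpg().  Target cpus quiesce, the originator
 * performs the pte update, then every cpu invalidates its own TLB and
 * clears its done bit.  A TSC-based watchdog (LOOPRECOVER) attempts to
 * recover from lost IPIs.
 */
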
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#define LOOPRECOVER			/* enable watchdog */

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */

#define MAX_INVAL_PAGES		128

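/*
 * The timeouts above are converted to TSC deadlines when a command is
 * armed, e.g.:
 *
 *	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
 *
 * MAX_INVAL_PAGES bounds the number of individual invlpg's we are
 * willing to issue; larger ranges are converted into a full cpu_invltlb().
 */
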
struct pmap_inval_info {
	vm_offset_t	va;		/* page va, or -1 for a full invltlb */
	pt_entry_t	*ptep;		/* pte to modify, NULL if semi-sync */
	pt_entry_t	opte;		/* previous/expected pte contents */
	pt_entry_t	npte;		/* new pte contents */
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;	/* result of the INVCMPSET compare */
	vm_pindex_t	npgs;		/* number of pages in the range */
	cpumask_t	done;		/* per-cpu completion bits */
	cpumask_t	mask;		/* quiesce/execute handshake bits */
	cpumask_t	sigmask;	/* debug copy of the original mask */
	int64_t		tsc_target;	/* watchdog deadline, in TSC ticks */
};

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
extern cpumask_t		smp_in_mask;
extern cpumask_t		smp_smurf_mask;

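/*
 * Debug/override knobs, exported as machdep.* sysctls below:
 * watchdog_print enables loopdebug() output when the IPI watchdog trips,
 * force_allcpus expands the target set to smp_active_mask, and
 * force_nonopt disables the single-cpu shortcut paths.
 */
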
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
	    &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
	    &pmap_inval_force_nonopt, 0, "");

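/*
 * Initialize invalidation for the pmap and enter a critical section.
 * For non-kernel pmaps this also acquires CPULOCK_EXCL on pm_active_lock
 * and bumps pm_invgen.
 */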
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		/*
		 * Spin until we can acquire exclusive access to
		 * pm_active_lock.
		 */
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}

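/*
 * Finish invalidation, releasing CPULOCK_EXCL (non-kernel pmaps) and
 * exiting the critical section entered by pmap_inval_init().
 */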
static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_add_acq_long(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}

/*
 * Debugging and lost IPI recovery code.
 */
static int
loopwdog(struct pmap_inval_info *info)
{
	int64_t tsc;

	tsc = rdtsc();
	if (info->tsc_target - tsc < 0 && tsc_frequency) {
		/*
		 * Deadline passed.  Re-arm for the (shorter) repeat
		 * interval and tell the caller to attempt recovery.
		 */
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}

static void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * By default don't kprintf() anything when the pmap inval watchdog
	 * gets hit.  DRM can cause an occasional watchdog hit (at least
	 * with a 1/16 second watchdog), and attempting to kprintf to the
	 * KVM frame buffer from Xinvltlb, which ignores critical sections,
	 * can implode the system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	/* dummy locked op to synchronize our view of the smurf mask */
	atomic_add_long(&smp_smurf_mask.ary[0], 0);

	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
		"s=%08jx smurf=%08jx\n",
		msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0],
		info->sigmask.ary[0],
		smp_smurf_mask.ary[0]);
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

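/*
 * Optional sanity check: every cpu in info->mask must also be present
 * in info->sigmask (the copy of the mask snapshotted when the command
 * was armed).
 */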
#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
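/*
 * Typical usage (see pmap_inval_bulk() below for the non-kernel-pmap
 * path):
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 */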
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for the pmap.  This enters a critical
	 * section for us.
	 */
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs == 1) {
			if (ptep)
				opte = atomic_swap_long(ptep, npte);
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
		} else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
			if (ptep) {
				while (npgs) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
					--npgs;
				}
			}
			cpu_invltlb();
		} else {
			while (npgs) {
				if (ptep) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
				}
				cpu_invlpg((void *)va);
				va += PAGE_SIZE;
				--npgs;
			}
		}
		pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI (see LOOPRECOVER_TIMEOUT1).
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
		cpu_pause();
		if (loopwdog(info)) {
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable the current cpu's interrupts to prevent the 'done' field
	 * from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);

	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here on other cpus due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (the caller typically retries).
 */
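/*
 * Illustrative caller pattern (the actual callers live in the pmap code
 * proper); retrying on failure is typical:
 *
 *	do {
 *		opte = *ptep;
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */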
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		}
		pmap_inval_done(pmap);
		return 0;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
		cpu_pause();
		if (loopwdog(info)) {
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable the current cpu's interrupts to prevent the 'done' field
	 * from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);

	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
}

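/*
 * Perform a pte replacement through the bulk accumulator.  Non-kernel
 * pmaps are invalidated synchronously; for the kernel_pmap a contiguous
 * virtual range is accumulated and flushed once by pmap_inval_bulk_flush().
 */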
pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in-use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		/*
		 * Discontiguous va, degrade to a full invltlb at flush time.
		 */
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	vm_pindex_t n;

	if (bulk == NULL)
		return;
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
}

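/*
 * Illustrative use of the bulk API when tearing down a run of kernel
 * ptes (hypothetical caller, for exposition only):
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, &kernel_pmap);
 *	for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
 *		pmap_inval_bulk(&bulk, va, ptep, 0);
 *	pmap_inval_bulk_flush(&bulk);
 */
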
/*
 * Called from Xinvl with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

		KKASSERT(n >= 0 && n < MAXCPU);

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Checkout cpu (cpu) for work in the target cpu info (n)
		 *
		 * if (n == cpu) - check our cpu for a master operation
		 * if (n != cpu) - check other cpus for a slave operation
		 *
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt in other cpus.
		 *
		 * A fence is needed once we detect the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the original,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits
		 * have been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to the originator that
				 * they are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				vm_pindex_t npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out the IPI may have been lost,
				 * try to reissue by resetting our own
				 * reentrancy bit and clearing the smurf mask
				 * for the cpus that did not respond, then
				 * reissuing the IPI.
				 */
				loopme = 1;
				if (loopwdog(info)) {
					loopdebug("C", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					mdcpu->gd_xinvaltlb = 2;
					cpu_enable_intr();
				}
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep) {
						info->opte = atomic_swap_long(
						    info->ptep, info->npte);
					}
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				} else {
					if (atomic_cmpset_long(info->ptep,
							       info->opte,
							       info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			vm_pindex_t npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);