sys/platform/pc64/x86_64/pmap_inval.c

   1 /*
   2  * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34
  35 /*
  36  * pmap invalidation support code.  Certain hardware requirements must
  37  * be dealt with when manipulating page table entries and page directory
  38  * entries within a pmap.  In particular, we cannot safely manipulate
  39  * page tables which are in active use by another cpu (even if it is
  40  * running in userland) for two reasons: First, TLB writebacks will
  41  * race against our own modifications and tests.  Second, even if we
  42  * were to use bus-locked instruction we can still screw up the
  43  * target cpu's instruction pipeline due to Intel cpu errata.
  44  */
  45
  46 #include <sys/param.h>
  47 #include <sys/systm.h>
  48 #include <sys/kernel.h>
  49 #include <sys/proc.h>
  50 #include <sys/vmmeter.h>
  51 #include <sys/thread2.h>
  52 #include <sys/sysctl.h>
  53
  54 #include <vm/vm.h>
  55 #include <vm/pmap.h>
  56 #include <vm/vm_object.h>
  57
  58 #include <machine/cputypes.h>
  59 #include <machine/md_var.h>
  60 #include <machine/specialreg.h>
  61 #include <machine/smp.h>
  62 #include <machine/globaldata.h>
  63 #include <machine/pmap.h>
  64 #include <machine/pmap_inval.h>
  65 #include <machine/clock.h>
  66
  67 #if 1   /* DEBUGGING */
  68 #define LOOPRECOVER                     /* enable watchdog */
  69 #endif
  70
  71 /*
  72  * Watchdog recovery interval = 1.0 / (1 << radix), or 1/16 second
  73  * for the initial watchdog.  If the initial watchdog fails, further
  74  * instances occur at 1/2 second intervals.
  75  *
  76  * The watchdog value is generous for two reasons.  First, because the
  77  * situaation is not supposed to happen at all (but does), and second,
  78  * because VMs could be very slow at handling IPIs.
  79  */
  80 #define LOOPRECOVER_RADIX1      4       /* initial recovery */
  81 #define LOOPRECOVER_RADIX2      1       /* repeated recoveries */
  82
  83 #define MAX_INVAL_PAGES         128
  84
  85 struct pmap_inval_info {
  86         vm_offset_t     va;
  87         pt_entry_t      *ptep;
  88         pt_entry_t      opte;
  89         pt_entry_t      npte;
  90         enum { INVDONE, INVSTORE, INVCMPSET } mode;
  91         int             success;
  92         int             npgs;
  93         cpumask_t       done;
  94         cpumask_t       mask;
  95 #ifdef LOOPRECOVER
  96         cpumask_t       sigmask;
  97         int             failed;
  98         int64_t         tsc_target;
  99 #endif
 100 } __cachealign;
 101
 102 typedef struct pmap_inval_info pmap_inval_info_t;
 103
 104 static pmap_inval_info_t        invinfo[MAXCPU];
 105 extern cpumask_t                smp_invmask;
 106 #ifdef LOOPRECOVER
 107 #ifdef LOOPMASK_IN
 108 extern cpumask_t                smp_in_mask;
 109 #endif
 110 extern cpumask_t                smp_smurf_mask;
 111 #endif
 112 static long pmap_inval_bulk_count;
 113
 114 SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
 115             &pmap_inval_bulk_count, 0, "");
 116
 117 static void
 118 pmap_inval_init(pmap_t pmap)
 119 {
 120         cpulock_t olock;
 121         cpulock_t nlock;
 122
 123         crit_enter_id("inval");
 124
 125         if (pmap != &kernel_pmap) {
 126                 for (;;) {
 127                         olock = pmap->pm_active_lock;
 128                         cpu_ccfence();
 129                         nlock = olock | CPULOCK_EXCL;
 130                         if (olock != nlock &&
 131                             atomic_cmpset_int(&pmap->pm_active_lock,
 132                                               olock, nlock)) {
 133                                 break;
 134                         }
 135                         lwkt_process_ipiq();
 136                         cpu_pause();
 137                 }
 138                 atomic_add_acq_long(&pmap->pm_invgen, 1);
 139         }
 140 }
 141
 142 static void
 143 pmap_inval_done(pmap_t pmap)
 144 {
 145         if (pmap != &kernel_pmap) {
 146                 atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
 147                 atomic_add_acq_long(&pmap->pm_invgen, 1);
 148         }
 149         crit_exit_id("inval");
 150 }
 151
 152 #ifdef LOOPRECOVER
 153
 154 /*
 155  * Debugging and lost IPI recovery code.
 156  */
 157 static
 158 __inline
 159 int
 160 loopwdog(struct pmap_inval_info *info)
 161 {
 162         int64_t tsc;
 163
 164         tsc = rdtsc();
 165         if (info->tsc_target - tsc < 0 && tsc_frequency) {
 166                 info->tsc_target = tsc + (tsc_frequency >> LOOPRECOVER_RADIX2);
 167                 return 1;
 168         }
 169         return 0;
 170 }
 171
 172 static
 173 void
 174 loopdebug(const char *msg, pmap_inval_info_t *info)
 175 {
 176         int p;
 177         int cpu = mycpu->gd_cpuid;
 178
 179         cpu_lfence();
 180 #ifdef LOOPRECOVER
 181         atomic_add_long(&smp_smurf_mask.ary[0], 0);
 182 #endif
 183         kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
 184 #ifdef LOOPRECOVER
 185                 "s=%08jx "
 186 #endif
 187 #ifdef LOOPMASK_IN
 188                 "in=%08jx "
 189 #endif
 190 #ifdef LOOPRECOVER
 191                 "smurf=%08jx\n"
 192 #endif
 193                 , msg, cpu, info->mode,
 194                 info->mask.ary[0],
 195                 info->done.ary[0]
 196 #ifdef LOOPRECOVER
 197                 , info->sigmask.ary[0]
 198 #endif
 199 #ifdef LOOPMASK_IN
 200                 , smp_in_mask.ary[0]
 201 #endif
 202 #ifdef LOOPRECOVER
 203                 , smp_smurf_mask.ary[0]
 204 #endif
 205                 );
 206         kprintf("mdglob ");
 207         for (p = 0; p < ncpus; ++p)
 208                 kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
 209         kprintf("\n");
 210 }
 211
 212 #endif
 213
 214 #ifdef CHECKSIG
 215
 216 #define CHECKSIGMASK(info)      _checksigmask(info, __FILE__, __LINE__)
 217
 218 static
 219 void
 220 _checksigmask(pmap_inval_info_t *info, const char *file, int line)
 221 {
 222         cpumask_t tmp;
 223
 224         tmp = info->mask;
 225         CPUMASK_ANDMASK(tmp, info->sigmask);
 226         if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
 227                 kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
 228                         file, line, info->sigmask.ary[0], info->mask.ary[0]);
 229         }
 230 }
 231
 232 #else
 233
 234 #define CHECKSIGMASK(info)
 235
 236 #endif
 237
 238 /*
 239  * Invalidate the specified va across all cpus associated with the pmap.
 240  * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 241  * will be done fully synchronously with storing npte into *ptep and returning
 242  * opte.
 243  *
 244  * If ptep is NULL the operation will execute semi-synchronously.
 245  * ptep must be NULL if npgs > 1
 246  */
 247 pt_entry_t
 248 pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
 249                pt_entry_t *ptep, pt_entry_t npte)
 250 {
 251         globaldata_t gd = mycpu;
 252         pmap_inval_info_t *info;
 253         pt_entry_t opte = 0;
 254         int cpu = gd->gd_cpuid;
 255         cpumask_t tmpmask;
 256         unsigned long rflags;
 257
 258         /*
 259          * Initialize invalidation for pmap and enter critical section.
 260          */
 261         if (pmap == NULL)
 262                 pmap = &kernel_pmap;
 263         pmap_inval_init(pmap);
 264
 265         /*
 266          * Shortcut single-cpu case if possible.
 267          */
 268         if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
 269                 /*
 270                  * Convert to invltlb if there are too many pages to
 271                  * invlpg on.
 272                  */
 273                 if (npgs > MAX_INVAL_PAGES) {
 274                         npgs = 0;
 275                         va = (vm_offset_t)-1;
 276                 }
 277
 278                 /*
 279                  * Invalidate the specified pages, handle invltlb if requested.
 280                  */
 281                 while (npgs) {
 282                         --npgs;
 283                         if (ptep) {
 284                                 opte = atomic_swap_long(ptep, npte);
 285                                 ++ptep;
 286                         }
 287                         if (va == (vm_offset_t)-1)
 288                                 break;
 289                         cpu_invlpg((void *)va);
 290                         va += PAGE_SIZE;
 291                 }
 292                 if (va == (vm_offset_t)-1)
 293                         cpu_invltlb();
 294                 pmap_inval_done(pmap);
 295
 296                 return opte;
 297         }
 298
 299         /*
 300          * We need a critical section to prevent getting preempted while
 301          * we setup our command.  A preemption might execute its own
 302          * pmap_inval*() command and create confusion below.
 303          *
 304          * tsc_target is our watchdog timeout that will attempt to recover
 305          * from a lost IPI.  Set to 1/16 second for now.
 306          */
 307         info = &invinfo[cpu];
 308         info->tsc_target = rdtsc() + (tsc_frequency >> LOOPRECOVER_RADIX1);
 309
 310         /*
 311          * We must wait for other cpus which may still be finishing up a
 312          * prior operation that we requested.
 313          *
 314          * We do not have to disable interrupts here.  An Xinvltlb can occur
 315          * at any time (even within a critical section), but it will not
 316          * act on our command until we set our done bits.
 317          */
 318         while (CPUMASK_TESTNZERO(info->done)) {
 319 #ifdef LOOPRECOVER
 320                 if (loopwdog(info)) {
 321                         info->failed = 1;
 322                         loopdebug("A", info);
 323                         /* XXX recover from possible bug */
 324                         CPUMASK_ASSZERO(info->done);
 325                 }
 326 #endif
 327                 cpu_pause();
 328         }
 329         KKASSERT(info->mode == INVDONE);
 330
 331         /*
 332          * Must set our cpu in the invalidation scan mask before
 333          * any possibility of [partial] execution (remember, XINVLTLB
 334          * can interrupt a critical section).
 335          */
 336         ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);
 337
 338         info->va = va;
 339         info->npgs = npgs;
 340         info->ptep = ptep;
 341         info->npte = npte;
 342         info->opte = 0;
 343 #ifdef LOOPRECOVER
 344         info->failed = 0;
 345 #endif
 346         info->mode = INVSTORE;
 347
 348         tmpmask = pmap->pm_active;      /* volatile (bits may be cleared) */
 349         cpu_ccfence();
 350         CPUMASK_ANDMASK(tmpmask, smp_active_mask);
 351
 352         /*
 353          * If ptep is NULL the operation can be semi-synchronous, which means
 354          * we can improve performance by flagging and removing idle cpus
 355          * (see the idleinvlclr function in mp_machdep.c).
 356          *
 357          * Typically kernel page table operation is semi-synchronous.
 358          */
 359         if (ptep == NULL)
 360                 smp_smurf_idleinvlclr(&tmpmask);
 361         CPUMASK_ORBIT(tmpmask, cpu);
 362         info->mask = tmpmask;
 363
 364         /*
 365          * Command may start executing the moment 'done' is initialized,
 366          * disable current cpu interrupt to prevent 'done' field from
 367          * changing (other cpus can't clear done bits until the originating
 368          * cpu clears its mask bit, but other cpus CAN start clearing their
 369          * mask bits).
 370          */
 371 #ifdef LOOPRECOVER
 372         info->sigmask = tmpmask;
 373         CHECKSIGMASK(info);
 374 #endif
 375         cpu_sfence();
 376         rflags = read_rflags();
 377         cpu_disable_intr();
 378
 379         ATOMIC_CPUMASK_COPY(info->done, tmpmask);
 380         /* execution can begin here due to races */
 381
 382         /*
 383          * Pass our copy of the done bits (so they don't change out from
 384          * under us) to generate the Xinvltlb interrupt on the targets.
 385          */
 386         smp_invlpg(&tmpmask);
 387         opte = info->opte;
 388         KKASSERT(info->mode == INVDONE);
 389
 390         /*
 391          * Target cpus will be in their loop exiting concurrently with our
 392          * cleanup.  They will not lose the bitmask they obtained before so
 393          * we can safely clear this bit.
 394          */
 395         ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
 396         write_rflags(rflags);
 397         pmap_inval_done(pmap);
 398
 399         return opte;
 400 }
 401
 402 /*
 403  * API function - invalidate the pte at (va) and replace *ptep with npte
 404  * atomically only if *ptep equals opte, across the pmap's active cpus.
 405  *
 406  * Returns 1 on success, 0 on failure (caller typically retries).
 407  */
 408 int
 409 pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
 410                       pt_entry_t opte, pt_entry_t npte)
 411 {
 412         globaldata_t gd = mycpu;
 413         pmap_inval_info_t *info;
 414         int success;
 415         int cpu = gd->gd_cpuid;
 416         cpumask_t tmpmask;
 417         unsigned long rflags;
 418
 419         /*
 420          * Initialize invalidation for pmap and enter critical section.
 421          */
 422         if (pmap == NULL)
 423                 pmap = &kernel_pmap;
 424         pmap_inval_init(pmap);
 425
 426         /*
 427          * Shortcut single-cpu case if possible.
 428          */
 429         if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
 430                 if (atomic_cmpset_long(ptep, opte, npte)) {
 431                         if (va == (vm_offset_t)-1)
 432                                 cpu_invltlb();
 433                         else
 434                                 cpu_invlpg((void *)va);
 435                         pmap_inval_done(pmap);
 436                         return 1;
 437                 } else {
 438                         pmap_inval_done(pmap);
 439                         return 0;
 440                 }
 441         }
 442
 443         /*
 444          * We need a critical section to prevent getting preempted while
 445          * we setup our command.  A preemption might execute its own
 446          * pmap_inval*() command and create confusion below.
 447          */
 448         info = &invinfo[cpu];
 449
 450         /*
 451          * We must wait for other cpus which may still be finishing
 452          * up a prior operation.
 453          */
 454         while (CPUMASK_TESTNZERO(info->done)) {
 455 #ifdef LOOPRECOVER
 456                 if (loopwdog(info)) {
 457                         info->failed = 1;
 458                         loopdebug("B", info);
 459                         /* XXX recover from possible bug */
 460                         CPUMASK_ASSZERO(info->done);
 461                 }
 462 #endif
 463                 cpu_pause();
 464         }
 465         KKASSERT(info->mode == INVDONE);
 466
 467         /*
 468          * Must set our cpu in the invalidation scan mask before
 469          * any possibility of [partial] execution (remember, XINVLTLB
 470          * can interrupt a critical section).
 471          */
 472         ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);
 473
 474         info->va = va;
 475         info->npgs = 1;                 /* unused */
 476         info->ptep = ptep;
 477         info->npte = npte;
 478         info->opte = opte;
 479 #ifdef LOOPRECOVER
 480         info->failed = 0;
 481 #endif
 482         info->mode = INVCMPSET;
 483         info->success = 0;
 484
 485         tmpmask = pmap->pm_active;      /* volatile */
 486         cpu_ccfence();
 487         CPUMASK_ANDMASK(tmpmask, smp_active_mask);
 488         CPUMASK_ORBIT(tmpmask, cpu);
 489         info->mask = tmpmask;
 490
 491         /*
 492          * Command may start executing the moment 'done' is initialized,
 493          * disable current cpu interrupt to prevent 'done' field from
 494          * changing (other cpus can't clear done bits until the originating
 495          * cpu clears its mask bit).
 496          */
 497 #ifdef LOOPRECOVER
 498         info->sigmask = tmpmask;
 499         CHECKSIGMASK(info);
 500 #endif
 501         cpu_sfence();
 502         rflags = read_rflags();
 503         cpu_disable_intr();
 504
 505         ATOMIC_CPUMASK_COPY(info->done, tmpmask);
 506
 507         /*
 508          * Pass our copy of the done bits (so they don't change out from
 509          * under us) to generate the Xinvltlb interrupt on the targets.
 510          */
 511         smp_invlpg(&tmpmask);
 512         success = info->success;
 513         KKASSERT(info->mode == INVDONE);
 514
 515         ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
 516         write_rflags(rflags);
 517         pmap_inval_done(pmap);
 518
 519         return success;
 520 }
 521
 522 void
 523 pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
 524 {
 525         bulk->pmap = pmap;
 526         bulk->va_beg = 0;
 527         bulk->va_end = 0;
 528         bulk->count = 0;
 529 }
 530
 531 pt_entry_t
 532 pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
 533                 pt_entry_t *ptep, pt_entry_t npte)
 534 {
 535         pt_entry_t pte;
 536
 537         /*
 538          * Degenerate case, localized or we don't care (e.g. because we
 539          * are jacking the entire page table) or the pmap is not in-use
 540          * by anyone.  No invalidations are done on any cpu.
 541          */
 542         if (bulk == NULL) {
 543                 pte = atomic_swap_long(ptep, npte);
 544                 return pte;
 545         }
 546
 547         /*
 548          * If it isn't the kernel pmap we execute the operation synchronously
 549          * on all cpus belonging to the pmap, which avoids concurrency bugs in
 550          * the hw related to changing pte's out from under threads.
 551          *
 552          * Eventually I would like to implement streaming pmap invalidation
 553          * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
 554          * threaded programs.
 555          */
 556         if (bulk->pmap != &kernel_pmap) {
 557                 pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
 558                 return pte;
 559         }
 560
 561         /*
 562          * This is the kernel_pmap.  All unmap operations presume that there
 563          * are no other cpus accessing the addresses in question.  Implement
 564          * the bulking algorithm.  collect the required information and
 565          * synchronize once at the end.
 566          */
 567         pte = atomic_swap_long(ptep, npte);
 568         if (va == (vm_offset_t)-1) {
 569                 bulk->va_beg = va;
 570         } else if (bulk->va_beg == bulk->va_end) {
 571                 bulk->va_beg = va;
 572                 bulk->va_end = va + PAGE_SIZE;
 573         } else if (va == bulk->va_end) {
 574                 bulk->va_end = va + PAGE_SIZE;
 575         } else {
 576                 bulk->va_beg = (vm_offset_t)-1;
 577                 bulk->va_end = 0;
 578 #if 0
 579                 pmap_inval_bulk_flush(bulk);
 580                 bulk->count = 1;
 581                 if (va == (vm_offset_t)-1) {
 582                         bulk->va_beg = va;
 583                         bulk->va_end = 0;
 584                 } else {
 585                         bulk->va_beg = va;
 586                         bulk->va_end = va + PAGE_SIZE;
 587                 }
 588 #endif
 589         }
 590         ++bulk->count;
 591
 592         return pte;
 593 }
 594
 595 void
 596 pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
 597 {
 598         if (bulk == NULL)
 599                 return;
 600         if (bulk->count > 0)
 601                 pmap_inval_bulk_count += (bulk->count - 1);
 602         if (bulk->va_beg != bulk->va_end) {
 603                 if (bulk->va_beg == (vm_offset_t)-1) {
 604                         pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
 605                 } else {
 606                         long n;
 607
 608                         n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
 609                         pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
 610                 }
 611         }
 612         bulk->va_beg = 0;
 613         bulk->va_end = 0;
 614         bulk->count = 0;
 615 }
 616
 617 /*
 618  * Called with a critical section held and interrupts enabled.
 619  */
 620 int
 621 pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
 622 {
 623         globaldata_t gd = mycpu;
 624         pmap_inval_info_t *info;
 625         int loopme = 0;
 626         int cpu;
 627         cpumask_t cpumask;
 628
 629         /*
 630          * Check all cpus for invalidations we may need to service.
 631          */
 632         cpu_ccfence();
 633         cpu = gd->gd_cpuid;
 634         cpumask = *cpumaskp;
 635
 636         while (CPUMASK_TESTNZERO(cpumask)) {
 637                 int n = BSFCPUMASK(cpumask);
 638
 639 #ifdef LOOPRECOVER
 640                 KKASSERT(n >= 0 && n < MAXCPU);
 641 #endif
 642
 643                 CPUMASK_NANDBIT(cpumask, n);
 644                 info = &invinfo[n];
 645
 646                 /*
 647                  * Due to interrupts/races we can catch a new operation
 648                  * in an older interrupt.  A fence is needed once we detect
 649                  * the (not) done bit.
 650                  */
 651                 if (!CPUMASK_TESTBIT(info->done, cpu))
 652                         continue;
 653                 cpu_lfence();
 654 #ifdef LOOPRECOVER
 655                 if (toolong) {
 656                         kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
 657                                 cpu, n, info->done.ary[0], info->mask.ary[0],
 658                                 info->mode);
 659                 }
 660 #endif
 661
 662                 /*
 663                  * info->mask and info->done always contain the originating
 664                  * cpu until the originator is done.  Targets may still be
 665                  * present in info->done after the originator is done (they
 666                  * will be finishing up their loops).
 667                  *
 668                  * Clear info->mask bits on other cpus to indicate that they
 669                  * have quiesced (entered the loop).  Once the other mask bits
 670                  * are clear we can execute the operation on the original,
 671                  * then clear the mask and done bits on the originator.  The
 672                  * targets will then finish up their side and clear their
 673                  * done bits.
 674                  *
 675                  * The command is considered 100% done when all done bits have
 676                  * been cleared.
 677                  */
 678                 if (n != cpu) {
 679                         /*
 680                          * Command state machine for 'other' cpus.
 681                          */
 682                         if (CPUMASK_TESTBIT(info->mask, cpu)) {
 683                                 /*
 684                                  * Other cpu indicate to originator that they
 685                                  * are quiesced.
 686                                  */
 687                                 ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
 688                                 loopme = 1;
 689                         } else if (info->ptep &&
 690                                    CPUMASK_TESTBIT(info->mask, n)) {
 691                                 /*
 692                                  * Other cpu must wait for the originator (n)
 693                                  * to complete its command if ptep is not NULL.
 694                                  */
 695                                 loopme = 1;
 696                         } else {
 697                                 /*
 698                                  * Other cpu detects that the originator has
 699                                  * completed its command, or there was no
 700                                  * command.
 701                                  *
 702                                  * Now that the page table entry has changed,
 703                                  * we can follow up with our own invalidation.
 704                                  */
 705                                 vm_offset_t va = info->va;
 706                                 int npgs;
 707
 708                                 if (va == (vm_offset_t)-1 ||
 709                                     info->npgs > MAX_INVAL_PAGES) {
 710                                         cpu_invltlb();
 711                                 } else {
 712                                         for (npgs = info->npgs; npgs; --npgs) {
 713                                                 cpu_invlpg((void *)va);
 714                                                 va += PAGE_SIZE;
 715                                         }
 716                                 }
 717                                 ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
 718                                 /* info invalid now */
 719                                 /* loopme left alone */
 720                         }
 721                 } else if (CPUMASK_TESTBIT(info->mask, cpu)) {
 722                         /*
 723                          * Originator is waiting for other cpus
 724                          */
 725                         if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
 726                                 /*
 727                                  * Originator waits for other cpus to enter
 728                                  * their loop (aka quiesce).
 729                                  *
 730                                  * If this bugs out the IPI may have been lost,
 731                                  * try to reissue by resetting our own
 732                                  * reentrancy bit and clearing the smurf mask
 733                                  * for the cpus that did not respond, then
 734                                  * reissuing the IPI.
 735                                  */
 736                                 loopme = 1;
 737 #ifdef LOOPRECOVER
 738                                 if (loopwdog(info)) {
 739                                         info->failed = 1;
 740                                         loopdebug("C", info);
 741                                         /* XXX recover from possible bug */
 742                                         mdcpu->gd_xinvaltlb = 0;
 743                                         ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
 744                                                                 info->mask);
 745                                         cpu_disable_intr();
 746                                         smp_invlpg(&smp_active_mask);
 747                                         cpu_enable_intr();
 748                                 }
 749 #endif
 750                         } else {
 751                                 /*
 752                                  * Originator executes operation and clears
 753                                  * mask to allow other cpus to finish.
 754                                  */
 755                                 KKASSERT(info->mode != INVDONE);
 756                                 if (info->mode == INVSTORE) {
 757                                         if (info->ptep)
 758                                                 info->opte = atomic_swap_long(info->ptep, info->npte);
 759                                         CHECKSIGMASK(info);
 760                                         ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
 761                                         CHECKSIGMASK(info);
 762                                 } else {
 763                                         if (atomic_cmpset_long(info->ptep,
 764                                                               info->opte, info->npte)) {
 765                                                 info->success = 1;
 766                                         } else {
 767                                                 info->success = 0;
 768                                         }
 769                                         CHECKSIGMASK(info);
 770                                         ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
 771                                         CHECKSIGMASK(info);
 772                                 }
 773                                 loopme = 1;
 774                         }
 775                 } else {
 776                         /*
 777                          * Originator does not have to wait for the other
 778                          * cpus to finish.  It clears its done bit.  A new
 779                          * command will not be initiated by the originator
 780                          * until the other cpus have cleared their done bits
 781                          * (asynchronously).
 782                          */
 783                         vm_offset_t va = info->va;
 784                         int npgs;
 785
 786                         if (va == (vm_offset_t)-1 ||
 787                             info->npgs > MAX_INVAL_PAGES) {
 788                                 cpu_invltlb();
 789                         } else {
 790                                 for (npgs = info->npgs; npgs; --npgs) {
 791                                         cpu_invlpg((void *)va);
 792                                         va += PAGE_SIZE;
 793                                 }
 794                         }
 795
 796                         /* leave loopme alone */
 797                         /* other cpus may still be finishing up */
 798                         /* can't race originator since that's us */
 799                         info->mode = INVDONE;
 800                         ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
 801                 }
 802         }
 803         return loopme;
 804 }