 * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we could still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */

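/*
 * Overview of the mechanism used below: the originating cpu builds a
 * command in its per-cpu invinfo[] slot, publishes the target cpu set
 * in info->mask and info->done, and generates Xinvltlb IPIs on the
 * targets via smp_invlpg().  Target cpus quiesce, the originator
 * performs the pte update, then every cpu invalidates its own TLB and
 * clears its done bit.  A TSC-based watchdog (LOOPRECOVER) attempts to
 * recover from lost IPIs.
 */
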
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vmmeter.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>

#include <vm/vm_object.h>

#include <machine/cputypes.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
#include <machine/smp.h>
#include <machine/globaldata.h>
#include <machine/pmap.h>
#include <machine/pmap_inval.h>
#include <machine/clock.h>

#define LOOPRECOVER			/* enable watchdog */

/*
 * Watchdog recovery interval, in seconds.
 *
 * The watchdog value is generous for two reasons.  First, because the
 * situation is not supposed to happen at all (but does), and second,
 * because VMs could be very slow at handling IPIs.
 */
#define LOOPRECOVER_TIMEOUT1	2	/* initial recovery */
#define LOOPRECOVER_TIMEOUT2	1	/* repeated recoveries */

#define MAX_INVAL_PAGES		128

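/*
 * The timeouts above are converted to TSC deadlines when a command is
 * armed, e.g.:
 *
 *	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
 *
 * MAX_INVAL_PAGES bounds the number of individual invlpg's we are
 * willing to issue; larger ranges are converted into a full cpu_invltlb().
 */
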
struct pmap_inval_info {
	vm_offset_t	va;		/* page va, or -1 for a full invltlb */
	pt_entry_t	*ptep;		/* pte to modify, NULL if semi-sync */
	pt_entry_t	opte;		/* previous/expected pte contents */
	pt_entry_t	npte;		/* new pte contents */
	enum { INVDONE, INVSTORE, INVCMPSET } mode;
	int		success;	/* result of the INVCMPSET compare */
	vm_pindex_t	npgs;		/* number of pages in the range */
	cpumask_t	done;		/* per-cpu completion bits */
	cpumask_t	mask;		/* quiesce/execute handshake bits */
	cpumask_t	sigmask;	/* debug copy of the original mask */
	int64_t		tsc_target;	/* watchdog deadline, in TSC ticks */
};

typedef struct pmap_inval_info pmap_inval_info_t;

static pmap_inval_info_t	invinfo[MAXCPU];
extern cpumask_t		smp_invmask;
extern cpumask_t		smp_in_mask;
extern cpumask_t		smp_smurf_mask;

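/*
 * Debug/override knobs, exported as machdep.* sysctls below:
 * watchdog_print enables loopdebug() output when the IPI watchdog trips,
 * force_allcpus expands the target set to smp_active_mask, and
 * force_nonopt disables the single-cpu shortcut paths.
 */
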
static int pmap_inval_watchdog_print;	/* must always default off */
static int pmap_inval_force_allcpus;
static int pmap_inval_force_nonopt;

SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_watchdog_print, CTLFLAG_RW,
	    &pmap_inval_watchdog_print, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_allcpus, CTLFLAG_RW,
	    &pmap_inval_force_allcpus, 0, "");
SYSCTL_INT(_machdep, OID_AUTO, pmap_inval_force_nonopt, CTLFLAG_RW,
	    &pmap_inval_force_nonopt, 0, "");

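/*
 * Initialize invalidation for the pmap and enter a critical section.
 * For non-kernel pmaps this also acquires CPULOCK_EXCL on pm_active_lock
 * and bumps pm_invgen.
 */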
static void
pmap_inval_init(pmap_t pmap)
{
	cpulock_t olock;
	cpulock_t nlock;

	crit_enter_id("inval");

	if (pmap != &kernel_pmap) {
		/*
		 * Spin until we can acquire exclusive access to
		 * pm_active_lock.
		 */
		for (;;) {
			olock = pmap->pm_active_lock;
			cpu_ccfence();
			nlock = olock | CPULOCK_EXCL;
			if (olock != nlock &&
			    atomic_cmpset_int(&pmap->pm_active_lock,
					      olock, nlock)) {
				break;
			}
			cpu_pause();
		}
		atomic_add_acq_long(&pmap->pm_invgen, 1);
	}
}

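/*
 * Finish invalidation, releasing CPULOCK_EXCL (non-kernel pmaps) and
 * exiting the critical section entered by pmap_inval_init().
 */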
static void
pmap_inval_done(pmap_t pmap)
{
	if (pmap != &kernel_pmap) {
		atomic_add_acq_long(&pmap->pm_invgen, 1);
		atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
	}
	crit_exit_id("inval");
}

/*
 * Debugging and lost IPI recovery code.
 */
static int
loopwdog(struct pmap_inval_info *info)
{
	int64_t tsc;

	tsc = rdtsc();
	if (info->tsc_target - tsc < 0 && tsc_frequency) {
		/*
		 * Deadline passed.  Re-arm for the (shorter) repeat
		 * interval and tell the caller to attempt recovery.
		 */
		info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2);
		return 1;
	}
	return 0;
}

static void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	/*
	 * By default don't kprintf() anything when the pmap inval watchdog
	 * gets hit.  DRM can cause an occasional watchdog hit (at least
	 * with a 1/16 second watchdog), and attempting to kprintf to the
	 * KVM frame buffer from Xinvltlb, which ignores critical sections,
	 * can implode the system.
	 */
	if (pmap_inval_watchdog_print == 0)
		return;

	/* dummy locked op to synchronize our view of the smurf mask */
	atomic_add_long(&smp_smurf_mask.ary[0], 0);

	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
		"s=%08jx smurf=%08jx\n",
		msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0],
		info->sigmask.ary[0],
		smp_smurf_mask.ary[0]);
	for (p = 0; p < ncpus; ++p)
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	kprintf("\n");
}

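/*
 * Optional sanity check: every cpu in info->mask must also be present
 * in info->sigmask (the copy of the mask snapshotted when the command
 * was armed).
 */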
#ifdef CHECKSIG

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

static void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t tmp;

	tmp = info->mask;
	CPUMASK_ANDMASK(tmp, info->sigmask);
	if (CPUMASK_CMPMASKNEQ(tmp, info->mask)) {
		kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
			file, line, info->sigmask.ary[0], info->mask.ary[0]);
	}
}

#else

#define CHECKSIGMASK(info)

#endif

/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1.
 */
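/*
 * Typical usage (see pmap_inval_bulk() below for the non-kernel-pmap
 * path):
 *
 *	opte = pmap_inval_smp(pmap, va, 1, ptep, npte);
 */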
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, vm_pindex_t npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for the pmap.  This enters a critical
	 * section for us.
	 */
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs == 1) {
			if (ptep)
				opte = atomic_swap_long(ptep, npte);
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
		} else if (va == (vm_offset_t)-1 || npgs > MAX_INVAL_PAGES) {
			if (ptep) {
				while (npgs) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
					--npgs;
				}
			}
			cpu_invltlb();
		} else {
			while (npgs) {
				if (ptep) {
					opte = atomic_swap_long(ptep, npte);
					++ptep;
				}
				cpu_invlpg((void *)va);
				va += PAGE_SIZE;
				--npgs;
			}
		}
		pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * tsc_target is our watchdog timeout that will attempt to recover
	 * from a lost IPI (see LOOPRECOVER_TIMEOUT1).
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
		cpu_pause();
		if (loopwdog(info)) {
			loopdebug("A", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operations are semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable the current cpu's interrupts to prevent the 'done' field
	 * from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit, but other cpus CAN start
	 * clearing their mask bits).
	 */
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);

	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here on other cpus due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return opte;
}

/*
 * API function - invalidate the pte at (va) and replace *ptep with npte
 * atomically only if *ptep equals opte, across the pmap's active cpus.
 *
 * Returns 1 on success, 0 on failure (the caller typically retries).
 */
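/*
 * Illustrative caller pattern (the actual callers live in the pmap code
 * proper); retrying on failure is typical:
 *
 *	do {
 *		opte = *ptep;
 *	} while (pmap_inval_smp_cmpset(pmap, va, ptep, opte, npte) == 0);
 */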
int
pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
		      pt_entry_t opte, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int success;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask) &&
	    pmap_inval_force_nonopt == 0) {
		if (atomic_cmpset_long(ptep, opte, npte)) {
			if (va == (vm_offset_t)-1)
				cpu_invltlb();
			else
				cpu_invlpg((void *)va);
			pmap_inval_done(pmap);
			return 1;
		}
		pmap_inval_done(pmap);
		return 0;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we set up our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing
	 * up a prior operation.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
		cpu_pause();
		if (loopwdog(info)) {
			loopdebug("B", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	info->tsc_target = rdtsc() + (tsc_frequency * LOOPRECOVER_TIMEOUT1);
	info->va = va;
	info->npgs = 1;			/* unused */
	info->ptep = ptep;
	info->npte = npte;
	info->opte = opte;
	info->mode = INVCMPSET;
	info->success = 0;

	tmpmask = pmap->pm_active;	/* volatile */
	if (pmap_inval_force_allcpus)
		tmpmask = smp_active_mask;
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);
	CPUMASK_ORBIT(tmpmask, cpu);
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable the current cpu's interrupts to prevent the 'done' field
	 * from changing (other cpus can't clear done bits until the
	 * originating cpu clears its mask bit).
	 */
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);

	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 */
	smp_invlpg(&tmpmask);
	success = info->success;
	KKASSERT(info->mode == INVDONE);

	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);
	pmap_inval_done(pmap);

	return success;
}

void
pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
{
	bulk->pmap = pmap;
	bulk->va_beg = 0;
	bulk->va_end = 0;
}

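/*
 * Perform a pte replacement through the bulk accumulator.  Non-kernel
 * pmaps are invalidated synchronously; for the kernel_pmap a contiguous
 * virtual range is accumulated and flushed once by pmap_inval_bulk_flush().
 */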
pt_entry_t
pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
		pt_entry_t *ptep, pt_entry_t npte)
{
	pt_entry_t pte;

	/*
	 * Degenerate case, localized or we don't care (e.g. because we
	 * are jacking the entire page table) or the pmap is not in-use
	 * by anyone.  No invalidations are done on any cpu.
	 */
	if (bulk == NULL) {
		pte = atomic_swap_long(ptep, npte);
		return pte;
	}

	/*
	 * If it isn't the kernel pmap we execute the operation synchronously
	 * on all cpus belonging to the pmap, which avoids concurrency bugs in
	 * the hw related to changing pte's out from under threads.
	 *
	 * Eventually I would like to implement streaming pmap invalidation
	 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
	 * threaded programs.
	 */
	if (bulk->pmap != &kernel_pmap) {
		pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
		return pte;
	}

	/*
	 * This is the kernel_pmap.  All unmap operations presume that there
	 * are no other cpus accessing the addresses in question.  Implement
	 * the bulking algorithm: collect the required information and
	 * synchronize once at the end.
	 */
	pte = atomic_swap_long(ptep, npte);
	if (va == (vm_offset_t)-1) {
		bulk->va_beg = va;
	} else if (bulk->va_beg == bulk->va_end) {
		bulk->va_beg = va;
		bulk->va_end = va + PAGE_SIZE;
	} else if (va == bulk->va_end) {
		bulk->va_end = va + PAGE_SIZE;
	} else {
		/*
		 * Discontiguous va, degrade to a full invltlb at flush time.
		 */
		bulk->va_beg = (vm_offset_t)-1;
		bulk->va_end = 0;
#if 0
		pmap_inval_bulk_flush(bulk);
		if (va == (vm_offset_t)-1) {
			bulk->va_beg = va;
			bulk->va_end = 0;
		} else {
			bulk->va_beg = va;
			bulk->va_end = va + PAGE_SIZE;
		}
#endif
	}
	return pte;
}

void
pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
{
	vm_pindex_t n;

	if (bulk == NULL)
		return;
	if (bulk->va_beg != bulk->va_end) {
		if (bulk->va_beg == (vm_offset_t)-1) {
			pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
		} else {
			n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
			pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
		}
	}
	bulk->va_beg = 0;
	bulk->va_end = 0;
}

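/*
 * Illustrative use of the bulk API when tearing down a run of kernel
 * ptes (hypothetical caller, for exposition only):
 *
 *	pmap_inval_bulk_t bulk;
 *
 *	pmap_inval_bulk_init(&bulk, &kernel_pmap);
 *	for (va = sva; va < eva; va += PAGE_SIZE, ++ptep)
 *		pmap_inval_bulk(&bulk, va, ptep, 0);
 *	pmap_inval_bulk_flush(&bulk);
 */
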
/*
 * Called from Xinvl with a critical section held and interrupts enabled.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t cpumask;

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		int n = BSFCPUMASK(cpumask);

		KKASSERT(n >= 0 && n < MAXCPU);

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Checkout cpu (cpu) for work in the target cpu info (n)
		 *
		 * if (n == cpu) - check our cpu for a master operation
		 * if (n != cpu) - check other cpus for a slave operation
		 *
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt in other cpus.
		 *
		 * A fence is needed once we detect the (not) done bit.
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the original,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits
		 * have been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpus indicate to the originator that
				 * they are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				vm_pindex_t npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 *
				 * If this bugs out the IPI may have been lost,
				 * try to reissue by resetting our own
				 * reentrancy bit and clearing the smurf mask
				 * for the cpus that did not respond, then
				 * reissuing the IPI.
				 */
				loopme = 1;
				if (loopwdog(info)) {
					loopdebug("C", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
								info->mask);
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);

					/*
					 * Force outer-loop retest of Xinvltlb
					 * requests (see mp_machdep.c).
					 */
					mdcpu->gd_xinvaltlb = 2;
					cpu_enable_intr();
				}
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep) {
						info->opte = atomic_swap_long(
						    info->ptep, info->npte);
					}
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				} else {
					if (atomic_cmpset_long(info->ptep,
							       info->opte,
							       info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				}
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			vm_pindex_t npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}

			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);