kernel - Refactor Xinvltlb a little, turn off the idle-thread invltlb opt
[dragonfly.git] / sys / platform / pc64 / x86_64 / pmap_inval.c
blobc5b6cbd086c75f0882b7ff2c099485e1cb286c66
1 /*
2 * Copyright (c) 2003-2011 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
/*
 * pmap invalidation support code.  Certain hardware requirements must
 * be dealt with when manipulating page table entries and page directory
 * entries within a pmap.  In particular, we cannot safely manipulate
 * page tables which are in active use by another cpu (even if it is
 * running in userland) for two reasons: First, TLB writebacks will
 * race against our own modifications and tests.  Second, even if we
 * were to use bus-locked instructions we can still screw up the
 * target cpu's instruction pipeline due to Intel cpu errata.
 */
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kernel.h>
49 #include <sys/proc.h>
50 #include <sys/vmmeter.h>
51 #include <sys/thread2.h>
52 #include <sys/sysctl.h>
54 #include <vm/vm.h>
55 #include <vm/pmap.h>
56 #include <vm/vm_object.h>
58 #include <machine/cputypes.h>
59 #include <machine/md_var.h>
60 #include <machine/specialreg.h>
61 #include <machine/smp.h>
62 #include <machine/globaldata.h>
63 #include <machine/pmap.h>
64 #include <machine/pmap_inval.h>
/*
 * Debugging support.  While LOOPMASK is defined, the wait loops in this
 * file count their iterations in pmap_inval_info.xloops and report via
 * loopdebug() each time (count & LOOPMASK) reaches zero.
 */
#if 1	/* DEBUGGING */
#define LOOPMASK	(/* 32 * */ 16 * 128 * 1024 - 1)
#endif

/*
 * Page-count threshold: a request covering more pages than this is
 * serviced with cpu_invltlb() instead of one cpu_invlpg() per page.
 */
#define MAX_INVAL_PAGES		128
/*
 * One invalidation command.  The originating cpu fills in its own slot
 * of invinfo[] and the cpus named in (mask)/(done) act on the command
 * from pmap_inval_intr().
 */
struct pmap_inval_info {
	vm_offset_t	va;	/* first va, (vm_offset_t)-1 means invltlb */
	pt_entry_t	*ptep;	/* pte to modify, NULL for invalidate-only */
	pt_entry_t	opte;	/* INVSTORE: old pte (out) */
				/* INVCMPSET: expected pte (in) */
	pt_entry_t	npte;	/* replacement pte */
	enum { INVDONE, INVSTORE, INVCMPSET } mode; /* INVDONE == slot idle */
	int		success; /* INVCMPSET: 1 if the cmpset stored npte */
	int		npgs;	/* number of pages to invlpg from va */
	cpumask_t	done;	/* cpus that have not finished the command */
	cpumask_t	mask;	/* cpus that have not yet quiesced/executed */
#ifdef LOOPMASK
	cpumask_t	sigmask; /* debug: copy of the initial target mask */
	int		failed;	/* debug: set when a wait loop trips */
	int		xloops;	/* debug: wait loop iteration counter */
#endif
} __cachealign;

typedef struct pmap_inval_info pmap_inval_info_t;
/* One command slot per cpu, indexed by the originating cpu's id. */
static pmap_inval_info_t	invinfo[MAXCPU];

/*
 * Invalidation scan mask.  An originator sets its bit before publishing
 * a command and clears it again once the command has completed.
 */
extern cpumask_t		smp_invmask;
#ifdef LOOPMASK
#ifdef LOOPMASK_IN
extern cpumask_t		smp_in_mask;	/* only printed by loopdebug() */
#endif
extern cpumask_t		smp_smurf_mask;	/* only printed by loopdebug() */
#endif

/*
 * Statistics: each pmap_inval_bulk_flush() adds (bulk->count - 1) when
 * the bulk count is positive.  Exported read/write as
 * machdep.pmap_inval_bulk_count.
 */
static long pmap_inval_bulk_count;

SYSCTL_LONG(_machdep, OID_AUTO, pmap_inval_bulk_count, CTLFLAG_RW,
	    &pmap_inval_bulk_count, 0, "");
104 static void
105 pmap_inval_init(pmap_t pmap)
107 cpulock_t olock;
108 cpulock_t nlock;
110 crit_enter_id("inval");
112 if (pmap != &kernel_pmap) {
113 for (;;) {
114 olock = pmap->pm_active_lock;
115 cpu_ccfence();
116 nlock = olock | CPULOCK_EXCL;
117 if (olock != nlock &&
118 atomic_cmpset_int(&pmap->pm_active_lock,
119 olock, nlock)) {
120 break;
122 lwkt_process_ipiq();
123 cpu_pause();
125 atomic_add_acq_long(&pmap->pm_invgen, 1);
129 static void
130 pmap_inval_done(pmap_t pmap)
132 if (pmap != &kernel_pmap) {
133 atomic_clear_int(&pmap->pm_active_lock, CPULOCK_EXCL);
134 atomic_add_acq_long(&pmap->pm_invgen, 1);
136 crit_exit_id("inval");
/*
 * API function - invalidate the pte at (va) and replace *ptep with
 * npte atomically across the pmap's active cpus.
 *
 * This is a holy mess.
 *
 * Returns the previous contents of *ptep.
 *
 * (This describes pmap_inval_smp(), which is defined after the debug
 * helpers that immediately follow.)
 */
#ifdef LOOPMASK

/*
 * Debug helper: dump the state of an invalidation command to the console.
 *
 * msg	- tag identifying the wait loop that tripped.
 * info	- command slot to dump.
 *
 * Prints the calling cpu, the command mode and the low 64 bits of the
 * mask/done/sigmask cpumasks and of smp_smurf_mask (plus smp_in_mask
 * when LOOPMASK_IN is defined), followed by gd_xinvaltlb for every cpu.
 *
 * The function reads fields and globals that only exist when LOOPMASK is
 * defined, and all of its callers are compiled under LOOPMASK, so it is
 * compiled under the same guard.
 */
static
void
loopdebug(const char *msg, pmap_inval_info_t *info)
{
	int p;
	int cpu = mycpu->gd_cpuid;

	cpu_lfence();
	/*
	 * NOTE(review): atomic add of zero; presumably used as a locked
	 * no-op so a current value of smp_smurf_mask is printed - confirm.
	 */
	atomic_add_long(&smp_smurf_mask.ary[0], 0);
	kprintf("%s %d mode=%d m=%08jx d=%08jx s=%08jx "
#ifdef LOOPMASK_IN
		"in=%08jx "
#endif
		"smurf=%08jx\n",
		msg, cpu, info->mode,
		info->mask.ary[0],
		info->done.ary[0],
		info->sigmask.ary[0],
#ifdef LOOPMASK_IN
		smp_in_mask.ary[0],
#endif
		smp_smurf_mask.ary[0]);
	kprintf("mdglob ");
	for (p = 0; p < ncpus; ++p) {
		kprintf(" %d", CPU_prvspace[p]->mdglobaldata.gd_xinvaltlb);
	}
	kprintf("\n");
}

#endif	/* LOOPMASK */
#ifdef CHECKSIG

/*
 * Consistency check used via CHECKSIGMASK(): complain on the console,
 * naming the call site, if info->mask contains any cpu that is not also
 * present in info->sigmask.
 */
static
void
_checksigmask(pmap_inval_info_t *info, const char *file, int line)
{
	cpumask_t common = info->mask;

	CPUMASK_ANDMASK(common, info->sigmask);
	if (!CPUMASK_CMPMASKNEQ(common, info->mask))
		return;

	kprintf("\"%s\" line %d: bad sig/mask %08jx %08jx\n",
		file, line, info->sigmask.ary[0], info->mask.ary[0]);
}

#define CHECKSIGMASK(info)	_checksigmask(info, __FILE__, __LINE__)

#else	/* !CHECKSIG */

/* Check compiled out: expands to nothing. */
#define CHECKSIGMASK(info)

#endif	/* CHECKSIG */
/*
 * Invalidate the specified va across all cpus associated with the pmap.
 * If va == (vm_offset_t)-1, we invltlb() instead of invlpg().  The operation
 * will be done fully synchronously with storing npte into *ptep and returning
 * opte.
 *
 * If ptep is NULL the operation will execute semi-synchronously.
 * ptep must be NULL if npgs > 1
 *
 * pmap	- target pmap; NULL selects &kernel_pmap.
 * va	- first virtual address, or (vm_offset_t)-1 for a full TLB flush.
 * npgs	- number of consecutive pages starting at va.
 * ptep	- pte to replace with npte, or NULL to invalidate only.
 * npte	- replacement pte contents (ignored when ptep is NULL).
 *
 * Returns the previous contents of *ptep when ptep is non-NULL, else 0.
 */
pt_entry_t
pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
	       pt_entry_t *ptep, pt_entry_t npte)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	pt_entry_t opte = 0;
	int cpu = gd->gd_cpuid;
	cpumask_t tmpmask;
	unsigned long rflags;

	/*
	 * Initialize invalidation for pmap and enter critical section.
	 */
	if (pmap == NULL)
		pmap = &kernel_pmap;
	pmap_inval_init(pmap);

	/*
	 * Shortcut single-cpu case if possible.  When the pmap is active
	 * only on this cpu no cross-cpu handshake is needed.
	 */
	if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
		/*
		 * Convert to invltlb if there are too many pages to
		 * invlpg on.
		 */
		if (npgs > MAX_INVAL_PAGES) {
			npgs = 0;
			va = (vm_offset_t)-1;
		}

		/*
		 * Invalidate the specified pages, handle invltlb if requested.
		 * For an invltlb request the loop body runs at most once
		 * (to perform the pte swap) and then breaks out.
		 */
		while (npgs) {
			--npgs;
			if (ptep) {
				opte = atomic_swap_long(ptep, npte);
				++ptep;
			}
			if (va == (vm_offset_t)-1)
				break;
			cpu_invlpg((void *)va);
			va += PAGE_SIZE;
		}
		if (va == (vm_offset_t)-1)
			cpu_invltlb();
		pmap_inval_done(pmap);

		return opte;
	}

	/*
	 * We need a critical section to prevent getting preempted while
	 * we setup our command.  A preemption might execute its own
	 * pmap_inval*() command and create confusion below.
	 *
	 * (The critical section was entered by pmap_inval_init() above.)
	 */
	info = &invinfo[cpu];

	/*
	 * We must wait for other cpus which may still be finishing up a
	 * prior operation that we requested.
	 *
	 * We do not have to disable interrupts here.  An Xinvltlb can occur
	 * at any time (even within a critical section), but it will not
	 * act on our command until we set our done bits.
	 */
	while (CPUMASK_TESTNZERO(info->done)) {
#ifdef LOOPMASK
		int loops;

		/* debug watchdog: report and force-clear a stuck done mask */
		loops = ++info->xloops;
		if ((loops & LOOPMASK) == 0) {
			info->failed = 1;
			loopdebug("orig_waitA", info);
			/* XXX recover from possible bug */
			CPUMASK_ASSZERO(info->done);
		}
#endif
		cpu_pause();
	}
	KKASSERT(info->mode == INVDONE);

	/*
	 * Must set our cpu in the invalidation scan mask before
	 * any possibility of [partial] execution (remember, XINVLTLB
	 * can interrupt a critical section).
	 */
	ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);

	/*
	 * Fill in the command.  opte starts out as 0 and is only replaced
	 * if the originator side of pmap_inval_intr() performs the swap.
	 */
	info->va = va;
	info->npgs = npgs;
	info->ptep = ptep;
	info->npte = npte;
	info->opte = 0;
#ifdef LOOPMASK
	info->failed = 0;
#endif
	info->mode = INVSTORE;

	tmpmask = pmap->pm_active;	/* volatile (bits may be cleared) */
	cpu_ccfence();
	CPUMASK_ANDMASK(tmpmask, smp_active_mask);

	/*
	 * If ptep is NULL the operation can be semi-synchronous, which means
	 * we can improve performance by flagging and removing idle cpus
	 * (see the idleinvlclr function in mp_machdep.c).
	 *
	 * Typically kernel page table operation is semi-synchronous.
	 */
	if (ptep == NULL)
		smp_smurf_idleinvlclr(&tmpmask);
	CPUMASK_ORBIT(tmpmask, cpu);	/* originator is always a participant */
	info->mask = tmpmask;

	/*
	 * Command may start executing the moment 'done' is initialized,
	 * disable current cpu interrupt to prevent 'done' field from
	 * changing (other cpus can't clear done bits until the originating
	 * cpu clears its mask bit, but other cpus CAN start clearing their
	 * mask bits).
	 */
#ifdef LOOPMASK
	info->sigmask = tmpmask;
	CHECKSIGMASK(info);
#endif
	cpu_sfence();
	rflags = read_rflags();
	cpu_disable_intr();

	ATOMIC_CPUMASK_COPY(info->done, tmpmask);
	/* execution can begin here due to races */

	/*
	 * Pass our copy of the done bits (so they don't change out from
	 * under us) to generate the Xinvltlb interrupt on the targets.
	 *
	 * The assertion below relies on the originator's part of the
	 * command having completed (mode reset to INVDONE) by the time
	 * smp_invlpg() returns.
	 */
	smp_invlpg(&tmpmask);
	opte = info->opte;
	KKASSERT(info->mode == INVDONE);

	/*
	 * Target cpus will be in their loop exiting concurrently with our
	 * cleanup.  They will not lose the bitmask they obtained before so
	 * we can safely clear this bit.
	 */
	ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
	write_rflags(rflags);		/* restore saved interrupt state */
	pmap_inval_done(pmap);

	return opte;
}
363 * API function - invalidate the pte at (va) and replace *ptep with npte
364 * atomically only if *ptep equals opte, across the pmap's active cpus.
366 * Returns 1 on success, 0 on failure (caller typically retries).
369 pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
370 pt_entry_t opte, pt_entry_t npte)
372 globaldata_t gd = mycpu;
373 pmap_inval_info_t *info;
374 int success;
375 int cpu = gd->gd_cpuid;
376 cpumask_t tmpmask;
377 unsigned long rflags;
380 * Initialize invalidation for pmap and enter critical section.
382 if (pmap == NULL)
383 pmap = &kernel_pmap;
384 pmap_inval_init(pmap);
387 * Shortcut single-cpu case if possible.
389 if (CPUMASK_CMPMASKEQ(pmap->pm_active, gd->gd_cpumask)) {
390 if (atomic_cmpset_long(ptep, opte, npte)) {
391 if (va == (vm_offset_t)-1)
392 cpu_invltlb();
393 else
394 cpu_invlpg((void *)va);
395 pmap_inval_done(pmap);
396 return 1;
397 } else {
398 pmap_inval_done(pmap);
399 return 0;
404 * We need a critical section to prevent getting preempted while
405 * we setup our command. A preemption might execute its own
406 * pmap_inval*() command and create confusion below.
408 info = &invinfo[cpu];
411 * We must wait for other cpus which may still be finishing
412 * up a prior operation.
414 while (CPUMASK_TESTNZERO(info->done)) {
415 #ifdef LOOPMASK
416 int loops;
418 loops = ++info->xloops;
419 if ((loops & LOOPMASK) == 0) {
420 info->failed = 1;
421 loopdebug("orig_waitB", info);
422 /* XXX recover from possible bug */
423 CPUMASK_ASSZERO(info->done);
425 #endif
426 cpu_pause();
428 KKASSERT(info->mode == INVDONE);
431 * Must set our cpu in the invalidation scan mask before
432 * any possibility of [partial] execution (remember, XINVLTLB
433 * can interrupt a critical section).
435 ATOMIC_CPUMASK_ORBIT(smp_invmask, cpu);
437 info->va = va;
438 info->npgs = 1; /* unused */
439 info->ptep = ptep;
440 info->npte = npte;
441 info->opte = opte;
442 info->failed = 0;
443 info->mode = INVCMPSET;
444 info->success = 0;
446 tmpmask = pmap->pm_active; /* volatile */
447 cpu_ccfence();
448 CPUMASK_ANDMASK(tmpmask, smp_active_mask);
449 CPUMASK_ORBIT(tmpmask, cpu);
450 info->mask = tmpmask;
453 * Command may start executing the moment 'done' is initialized,
454 * disable current cpu interrupt to prevent 'done' field from
455 * changing (other cpus can't clear done bits until the originating
456 * cpu clears its mask bit).
458 #ifdef LOOPMASK
459 info->sigmask = tmpmask;
460 CHECKSIGMASK(info);
461 #endif
462 cpu_sfence();
463 rflags = read_rflags();
464 cpu_disable_intr();
466 ATOMIC_CPUMASK_COPY(info->done, tmpmask);
469 * Pass our copy of the done bits (so they don't change out from
470 * under us) to generate the Xinvltlb interrupt on the targets.
472 smp_invlpg(&tmpmask);
473 success = info->success;
474 KKASSERT(info->mode == INVDONE);
476 ATOMIC_CPUMASK_NANDBIT(smp_invmask, cpu);
477 write_rflags(rflags);
478 pmap_inval_done(pmap);
480 return success;
483 void
484 pmap_inval_bulk_init(pmap_inval_bulk_t *bulk, struct pmap *pmap)
486 bulk->pmap = pmap;
487 bulk->va_beg = 0;
488 bulk->va_end = 0;
489 bulk->count = 0;
492 pt_entry_t
493 pmap_inval_bulk(pmap_inval_bulk_t *bulk, vm_offset_t va,
494 pt_entry_t *ptep, pt_entry_t npte)
496 pt_entry_t pte;
499 * Degenerate case, localized or we don't care (e.g. because we
500 * are jacking the entire page table) or the pmap is not in-use
501 * by anyone. No invalidations are done on any cpu.
503 if (bulk == NULL) {
504 pte = atomic_swap_long(ptep, npte);
505 return pte;
509 * If it isn't the kernel pmap we execute the operation synchronously
510 * on all cpus belonging to the pmap, which avoids concurrency bugs in
511 * the hw related to changing pte's out from under threads.
513 * Eventually I would like to implement streaming pmap invalidation
514 * for user pmaps to reduce mmap/munmap overheads for heavily-loaded
515 * threaded programs.
517 if (bulk->pmap != &kernel_pmap) {
518 pte = pmap_inval_smp(bulk->pmap, va, 1, ptep, npte);
519 return pte;
523 * This is the kernel_pmap. All unmap operations presume that there
524 * are no other cpus accessing the addresses in question. Implement
525 * the bulking algorithm. collect the required information and
526 * synchronize once at the end.
528 pte = atomic_swap_long(ptep, npte);
529 if (va == (vm_offset_t)-1) {
530 bulk->va_beg = va;
531 } else if (bulk->va_beg == bulk->va_end) {
532 bulk->va_beg = va;
533 bulk->va_end = va + PAGE_SIZE;
534 } else if (va == bulk->va_end) {
535 bulk->va_end = va + PAGE_SIZE;
536 } else {
537 bulk->va_beg = (vm_offset_t)-1;
538 bulk->va_end = 0;
539 #if 0
540 pmap_inval_bulk_flush(bulk);
541 bulk->count = 1;
542 if (va == (vm_offset_t)-1) {
543 bulk->va_beg = va;
544 bulk->va_end = 0;
545 } else {
546 bulk->va_beg = va;
547 bulk->va_end = va + PAGE_SIZE;
549 #endif
551 ++bulk->count;
553 return pte;
556 void
557 pmap_inval_bulk_flush(pmap_inval_bulk_t *bulk)
559 if (bulk == NULL)
560 return;
561 if (bulk->count > 0)
562 pmap_inval_bulk_count += (bulk->count - 1);
563 if (bulk->va_beg != bulk->va_end) {
564 if (bulk->va_beg == (vm_offset_t)-1) {
565 pmap_inval_smp(bulk->pmap, bulk->va_beg, 1, NULL, 0);
566 } else {
567 long n;
569 n = (bulk->va_end - bulk->va_beg) >> PAGE_SHIFT;
570 pmap_inval_smp(bulk->pmap, bulk->va_beg, n, NULL, 0);
573 bulk->va_beg = 0;
574 bulk->va_end = 0;
575 bulk->count = 0;
/*
 * Called with a critical section held and interrupts enabled.
 *
 * Service invalidation commands on behalf of the current cpu, both as a
 * target of commands issued by other cpus and as the originator of its
 * own command.
 *
 * cpumaskp - set of originating cpus whose command slots are examined;
 *	      the mask is copied and the caller's copy is not modified.
 * toolong  - when non-zero (and LOOPMASK is defined) a diagnostic line is
 *	      printed for every slot that still involves this cpu.
 *
 * Returns non-zero if at least one examined command still needs further
 * processing by this cpu.  NOTE(review): presumably the caller keeps
 * calling while the result is non-zero - confirm against the caller.
 */
int
pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
{
	globaldata_t gd = mycpu;
	pmap_inval_info_t *info;
	int loopme = 0;
	int cpu;
	cpumask_t cpumask;
#ifdef LOOPMASK
	int loops;
#endif

	/*
	 * Check all cpus for invalidations we may need to service.
	 */
	cpu_ccfence();
	cpu = gd->gd_cpuid;
	cpumask = *cpumaskp;

	while (CPUMASK_TESTNZERO(cpumask)) {
		/* next originating cpu (n) to examine */
		int n = BSFCPUMASK(cpumask);

#ifdef LOOPMASK
		KKASSERT(n >= 0 && n < MAXCPU);
#endif

		CPUMASK_NANDBIT(cpumask, n);
		info = &invinfo[n];

		/*
		 * Due to interrupts/races we can catch a new operation
		 * in an older interrupt.  A fence is needed once we detect
		 * the (not) done bit.
		 *
		 * If our bit is not set in info->done this command does
		 * not involve us (or we already finished it).
		 */
		if (!CPUMASK_TESTBIT(info->done, cpu))
			continue;
		cpu_lfence();
#ifdef LOOPMASK
		if (toolong) {
			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
				cpu, n, info->done.ary[0], info->mask.ary[0],
				info->mode);
		}
#endif

		/*
		 * info->mask and info->done always contain the originating
		 * cpu until the originator is done.  Targets may still be
		 * present in info->done after the originator is done (they
		 * will be finishing up their loops).
		 *
		 * Clear info->mask bits on other cpus to indicate that they
		 * have quiesced (entered the loop).  Once the other mask bits
		 * are clear we can execute the operation on the original,
		 * then clear the mask and done bits on the originator.  The
		 * targets will then finish up their side and clear their
		 * done bits.
		 *
		 * The command is considered 100% done when all done bits have
		 * been cleared.
		 */
		if (n != cpu) {
			/*
			 * Command state machine for 'other' cpus.
			 */
			if (CPUMASK_TESTBIT(info->mask, cpu)) {
				/*
				 * Other cpu indicate to originator that they
				 * are quiesced.
				 */
				ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
				loopme = 1;
			} else if (info->ptep &&
				   CPUMASK_TESTBIT(info->mask, n)) {
				/*
				 * Other cpu must wait for the originator (n)
				 * to complete its command if ptep is not NULL.
				 */
				loopme = 1;
			} else {
				/*
				 * Other cpu detects that the originator has
				 * completed its command, or there was no
				 * command.
				 *
				 * Now that the page table entry has changed,
				 * we can follow up with our own invalidation.
				 */
				vm_offset_t va = info->va;
				int npgs;

				if (va == (vm_offset_t)-1 ||
				    info->npgs > MAX_INVAL_PAGES) {
					cpu_invltlb();
				} else {
					for (npgs = info->npgs; npgs; --npgs) {
						cpu_invlpg((void *)va);
						va += PAGE_SIZE;
					}
				}
				ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
				/* info invalid now */
				/* loopme left alone */
			}
		} else if (CPUMASK_TESTBIT(info->mask, cpu)) {
			/*
			 * Originator is waiting for other cpus
			 */
			if (CPUMASK_CMPMASKNEQ(info->mask, gd->gd_cpumask)) {
				/*
				 * Originator waits for other cpus to enter
				 * their loop (aka quiesce).
				 */
				loopme = 1;
#ifdef LOOPMASK
				/*
				 * Debug watchdog: report the stall and
				 * re-send the invalidation request to all
				 * active cpus.
				 */
				loops = ++info->xloops;
				if ((loops & LOOPMASK) == 0) {
					info->failed = 1;
					loopdebug("orig_waitC", info);
					/* XXX recover from possible bug */
					mdcpu->gd_xinvaltlb = 0;
					cpu_disable_intr();
					smp_invlpg(&smp_active_mask);
					cpu_enable_intr();
				}
#endif
			} else {
				/*
				 * Originator executes operation and clears
				 * mask to allow other cpus to finish.
				 *
				 * INVSTORE swaps in npte (when there is a
				 * pte) and records the old value in opte;
				 * otherwise (INVCMPSET) npte is stored only
				 * if *ptep still equals opte and the outcome
				 * is recorded in success.
				 */
				KKASSERT(info->mode != INVDONE);
				if (info->mode == INVSTORE) {
					if (info->ptep)
						info->opte = atomic_swap_long(info->ptep, info->npte);
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				} else {
					if (atomic_cmpset_long(info->ptep,
							      info->opte, info->npte)) {
						info->success = 1;
					} else {
						info->success = 0;
					}
					CHECKSIGMASK(info);
					ATOMIC_CPUMASK_NANDBIT(info->mask, cpu);
					CHECKSIGMASK(info);
				}
				/* come back to run the local invalidation */
				loopme = 1;
			}
		} else {
			/*
			 * Originator does not have to wait for the other
			 * cpus to finish.  It clears its done bit.  A new
			 * command will not be initiated by the originator
			 * until the other cpus have cleared their done bits
			 * (asynchronously).
			 */
			vm_offset_t va = info->va;
			int npgs;

			if (va == (vm_offset_t)-1 ||
			    info->npgs > MAX_INVAL_PAGES) {
				cpu_invltlb();
			} else {
				for (npgs = info->npgs; npgs; --npgs) {
					cpu_invlpg((void *)va);
					va += PAGE_SIZE;
				}
			}
#ifdef LOOPMASK
			info->xloops = 0;
#endif
			/* leave loopme alone */
			/* other cpus may still be finishing up */
			/* can't race originator since that's us */
			info->mode = INVDONE;
			ATOMIC_CPUMASK_NANDBIT(info->done, cpu);
		}
	}
	return loopme;
}