From bba35d66575dc3caadf7839245299cedfc7027b8 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Mon, 5 Sep 2016 17:11:05 -0700
Subject: [PATCH] kernel - Deal with lost IPIs (VM related)

* Some (all?) VMs appear to be able to lose IPIs.  Hopefully the same
  can't be said for device interrupts!  Add some recovery code for lost
  Xinvltlb IPIs for now.

  For synchronizing invalidations we use the TSC and run a recovery
  attempt after 1/16 second, and every 1 second thereafter, if an
  Xinvltlb is not responded to (smp_invltlb() and smp_invlpg()).  The
  IPI will be re-issued.

* Some basic testing shows that a VM can stall out a cpu thread for an
  indefinite period of time, potentially causing the above watchdog to
  trigger.  Even so, re-issuing the IPI should not have been necessary,
  but it seems it is, so the VM appears to be losing the IPI(!) when a
  cpu thread stalls out on the host!  At least with the VM we tested
  under (type unknown).

* IPIQ IPIs currently do not have any specific recovery, but I think
  each cpu will poll for IPIQs slowly in the idle thread, so they might
  automatically recover anyway.

Reported-by: zach
---
 sys/platform/pc64/x86_64/mp_machdep.c |  24 +++++---
 sys/platform/pc64/x86_64/pmap_inval.c | 113 ++++++++++++++++++++--------------
 2 files changed, 85 insertions(+), 52 deletions(-)

diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c
index a9328b1811..476b989359 100644
--- a/sys/platform/pc64/x86_64/mp_machdep.c
+++ b/sys/platform/pc64/x86_64/mp_machdep.c
@@ -189,6 +189,9 @@ SYSCTL_INT(_machdep, OID_AUTO, report_invltlb_src, CTLFLAG_RW,
 static int optimized_invltlb;
 SYSCTL_INT(_machdep, OID_AUTO, optimized_invltlb, CTLFLAG_RW,
 	&optimized_invltlb, 0, "");
+static int all_but_self_ipi_enable = 1;
+SYSCTL_INT(_machdep, OID_AUTO, all_but_self_ipi_enable, CTLFLAG_RW,
+	&all_but_self_ipi_enable, 0, "");
 
 /* Local data for detecting CPU TOPOLOGY */
 static int core_bits = 0;
@@ -815,7 +818,7 @@ smitest(void)
 
 cpumask_t smp_smurf_mask;
 static cpumask_t smp_invltlb_mask;
-#define LOOPMASK	(/* 32 * */ 16 * 128 * 1024 - 1)
+#define LOOPRECOVER
 #ifdef LOOPMASK_IN
 cpumask_t smp_in_mask;
 #endif
@@ -887,7 +890,7 @@ smp_invltlb(void)
 	struct mdglobaldata *md = mdcpu;
 	cpumask_t mask;
 	unsigned long rflags;
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	uint64_t tsc_base = rdtsc();
 	int repeats = 0;
 #endif
@@ -936,7 +939,8 @@ smp_invltlb(void)
 	 * the critical section count on the target cpus.
 	 */
 	CPUMASK_ORMASK(mask, md->mi.gd_cpumask);
-	if (CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
+	if (all_but_self_ipi_enable &&
+	    CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
 		all_but_self_ipi(XINVLTLB_OFFSET);
 	} else {
 		CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
@@ -961,7 +965,7 @@ smp_invltlb(void)
 	while (CPUMASK_CMPMASKNEQ(smp_invltlb_mask, mask)) {
 		smp_inval_intr();
 		cpu_pause();
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 		if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
 			kprintf("smp_invltlb %d: waited too long %08jx "
 				"dbg=%08jx %08jx\n",
@@ -970,6 +974,8 @@ smp_invltlb(void)
 				smp_idleinvl_mask.ary[0],
 				smp_idleinvl_reqs.ary[0]);
 			mdcpu->gd_xinvaltlb = 0;
+			ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
+						smp_invltlb_mask);
 			smp_invlpg(&smp_active_mask);
 			tsc_base = rdtsc();
 			if (++repeats > 10) {
@@ -1023,7 +1029,8 @@ smp_invlpg(cpumask_t *cmdmask)
 	 *
 	 * We do not include our own cpu when issuing the IPI.
 	 */
-	if (CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
+	if (all_but_self_ipi_enable &&
+	    CPUMASK_CMPMASKEQ(smp_startup_mask, mask)) {
 		all_but_self_ipi(XINVLTLB_OFFSET);
 	} else {
 		CPUMASK_NANDMASK(mask, md->mi.gd_cpumask);
@@ -1046,6 +1053,9 @@ smp_sniff(void)
 	globaldata_t gd = mycpu;
 	int dummy;
 
+	/*
+	 * Ignore all_but_self_ipi_enable here and just use all_but_self_ipi().
+	 */
 	all_but_self_ipi(XSNIFF_OFFSET);
 	gd->gd_sample_pc = smp_sniff;
 	gd->gd_sample_sp = &dummy;
@@ -1065,7 +1075,7 @@ smp_inval_intr(void)
 {
 	struct mdglobaldata *md = mdcpu;
 	cpumask_t cpumask;
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	uint64_t tsc_base = rdtsc();
 #endif
 
@@ -1135,7 +1145,7 @@ loop:
 		cpu_mfence();
 	}
 
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) {
 		kprintf("smp_inval_intr %d inv=%08jx tlbm=%08jx "
 			"idle=%08jx/%08jx\n",
diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c
index 0f2f42deb7..520f6c476a 100644
--- a/sys/platform/pc64/x86_64/pmap_inval.c
+++ b/sys/platform/pc64/x86_64/pmap_inval.c
@@ -62,11 +62,24 @@
 #include 
 #include 
 #include 
+#include 
 
 #if 1	/* DEBUGGING */
-#define LOOPMASK	(/* 32 * */ 16 * 128 * 1024 - 1)
+#define LOOPRECOVER			/* enable watchdog */
 #endif
 
+/*
+ * Watchdog recovery interval = 1.0 / (1 << radix), or 1/16 second
+ * for the initial watchdog.  If the initial watchdog fails, further
+ * instances occur at 1/2 second intervals.
+ *
+ * The watchdog value is generous for two reasons.  First, because the
+ * situation is not supposed to happen at all (but does), and second,
+ * because VMs could be very slow at handling IPIs.
+ */
+#define LOOPRECOVER_RADIX1	4	/* initial recovery */
+#define LOOPRECOVER_RADIX2	1	/* repeated recoveries */
+
 #define MAX_INVAL_PAGES		128
 
 struct pmap_inval_info {
@@ -79,10 +92,10 @@ struct pmap_inval_info {
 	int			npgs;
 	cpumask_t		done;
 	cpumask_t		mask;
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	cpumask_t		sigmask;
 	int			failed;
-	int			xloops;
+	int64_t			tsc_target;
 #endif
 } __cachealign;
 
@@ -90,7 +103,7 @@ typedef struct pmap_inval_info pmap_inval_info_t;
 
 static pmap_inval_info_t	invinfo[MAXCPU];
 extern cpumask_t		smp_invmask;
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 #ifdef LOOPMASK_IN
 extern cpumask_t		smp_in_mask;
 #endif
@@ -136,17 +149,27 @@ pmap_inval_done(pmap_t pmap)
 	crit_exit_id("inval");
 }
 
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 
 /*
- * API function - invalidation the pte at (va) and replace *ptep with
- * npte atomically across the pmap's active cpus.
- *
- * This is a holy mess.
- *
- * Returns the previous contents of *ptep.
+ * Debugging and lost IPI recovery code.
  */
 static
+__inline
+int
+loopwdog(struct pmap_inval_info *info)
+{
+	int64_t tsc;
+
+	tsc = rdtsc();
+	if (info->tsc_target - tsc < 0 && tsc_frequency) {
+		info->tsc_target = tsc + (tsc_frequency >> LOOPRECOVER_RADIX2);
+		return 1;
+	}
+	return 0;
+}
+
+static
 void
 loopdebug(const char *msg, pmap_inval_info_t *info)
 {
@@ -154,29 +177,29 @@ loopdebug(const char *msg, pmap_inval_info_t *info)
 	int cpu = mycpu->gd_cpuid;
 
 	cpu_lfence();
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	atomic_add_long(&smp_smurf_mask.ary[0], 0);
 #endif
 
-	kprintf("%s %d mode=%d m=%08jx d=%08jx "
-#ifdef LOOPMASK
+	kprintf("ipilost-%s! %d mode=%d m=%08jx d=%08jx "
+#ifdef LOOPRECOVER
 		"s=%08jx "
 #endif
 #ifdef LOOPMASK_IN
 		"in=%08jx "
 #endif
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 		"smurf=%08jx\n"
 #endif
 		, msg, cpu, info->mode,
 		info->mask.ary[0], info->done.ary[0]
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 		, info->sigmask.ary[0]
 #endif
 #ifdef LOOPMASK_IN
 		, smp_in_mask.ary[0]
 #endif
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 		, smp_smurf_mask.ary[0]
 #endif
 		);
@@ -277,8 +300,12 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
 	 * We need a critical section to prevent getting preempted while
 	 * we setup our command.  A preemption might execute its own
 	 * pmap_inval*() command and create confusion below.
+	 *
+	 * tsc_target is our watchdog timeout that will attempt to recover
+	 * from a lost IPI.  Set to 1/16 second for now.
 	 */
 	info = &invinfo[cpu];
+	info->tsc_target = rdtsc() + (tsc_frequency >> LOOPRECOVER_RADIX1);
 
 	/*
 	 * We must wait for other cpus which may still be finishing up a
@@ -289,13 +316,10 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
 	 * act on our command until we set our done bits.
 	 */
 	while (CPUMASK_TESTNZERO(info->done)) {
-#ifdef LOOPMASK
-		int loops;
-
-		loops = ++info->xloops;
-		if ((loops & LOOPMASK) == 0) {
+#ifdef LOOPRECOVER
+		if (loopwdog(info)) {
 			info->failed = 1;
-			loopdebug("orig_waitA", info);
+			loopdebug("A", info);
 			/* XXX recover from possible bug */
 			CPUMASK_ASSZERO(info->done);
 		}
@@ -316,7 +340,7 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
 	info->ptep = ptep;
 	info->npte = npte;
 	info->opte = 0;
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	info->failed = 0;
 #endif
 	info->mode = INVSTORE;
@@ -344,7 +368,7 @@ pmap_inval_smp(pmap_t pmap, vm_offset_t va, int npgs,
 	 * cpu clears its mask bit, but other cpus CAN start clearing their
 	 * mask bits).
 	 */
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	info->sigmask = tmpmask;
 	CHECKSIGMASK(info);
 #endif
@@ -428,13 +452,10 @@ pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
 	 * up a prior operation.
 	 */
 	while (CPUMASK_TESTNZERO(info->done)) {
-#ifdef LOOPMASK
-		int loops;
-
-		loops = ++info->xloops;
-		if ((loops & LOOPMASK) == 0) {
+#ifdef LOOPRECOVER
+		if (loopwdog(info)) {
 			info->failed = 1;
-			loopdebug("orig_waitB", info);
+			loopdebug("B", info);
 			/* XXX recover from possible bug */
 			CPUMASK_ASSZERO(info->done);
 		}
@@ -455,7 +476,7 @@ pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
 	info->ptep = ptep;
 	info->npte = npte;
 	info->opte = opte;
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	info->failed = 0;
 #endif
 	info->mode = INVCMPSET;
@@ -473,7 +494,7 @@ pmap_inval_smp_cmpset(pmap_t pmap, vm_offset_t va, pt_entry_t *ptep,
 	 * changing (other cpus can't clear done bits until the originating
 	 * cpu clears its mask bit).
 	 */
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 	info->sigmask = tmpmask;
 	CHECKSIGMASK(info);
 #endif
@@ -604,9 +625,6 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
 	int loopme = 0;
 	int cpu;
 	cpumask_t cpumask;
-#ifdef LOOPMASK
-	int loops;
-#endif
 
 	/*
 	 * Check all cpus for invalidations we may need to service.
@@ -618,7 +636,7 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
 	while (CPUMASK_TESTNZERO(cpumask)) {
 		int n = BSFCPUMASK(cpumask);
 
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 		KKASSERT(n >= 0 && n < MAXCPU);
 #endif
 
@@ -633,7 +651,7 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
 		if (!CPUMASK_TESTBIT(info->done, cpu))
 			continue;
 		cpu_lfence();
-#ifdef LOOPMASK
+#ifdef LOOPRECOVER
 		if (toolong) {
 			kprintf("pminvl %d->%d %08jx %08jx mode=%d\n",
 				cpu, n, info->done.ary[0], info->mask.ary[0],
@@ -708,15 +726,22 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
 			/*
 			 * Originator waits for other cpus to enter
 			 * their loop (aka quiesce).
+			 *
+			 * If this bugs out, the IPI may have been
+			 * lost.  Try to recover by resetting our
+			 * own reentrancy bit, clearing the smurf
+			 * mask for the cpus that did not respond,
+			 * and then reissuing the IPI.
 			 */
 			loopme = 1;
-#ifdef LOOPMASK
-			loops = ++info->xloops;
-			if ((loops & LOOPMASK) == 0) {
+#ifdef LOOPRECOVER
+			if (loopwdog(info)) {
 				info->failed = 1;
-				loopdebug("orig_waitC", info);
+				loopdebug("C", info);
 				/* XXX recover from possible bug */
 				mdcpu->gd_xinvaltlb = 0;
+				ATOMIC_CPUMASK_NANDMASK(smp_smurf_mask,
+							info->mask);
 				cpu_disable_intr();
 				smp_invlpg(&smp_active_mask);
 				cpu_enable_intr();
@@ -767,9 +792,7 @@ pmap_inval_intr(cpumask_t *cpumaskp, int toolong)
 					va += PAGE_SIZE;
 				}
 			}
 
-#ifdef LOOPMASK
-			info->xloops = 0;
-#endif
+			/* leave loopme alone */
 			/* other cpus may still be finishing up */
 			/* can't race originator since that's us */
-- 
2.11.4.GIT
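Editor's illustration (not part of the patch): the LOOPRECOVER watchdog above reduces to a little TSC arithmetic -- arm a deadline a power-of-two fraction of the TSC frequency in the future (1/16 second for LOOPRECOVER_RADIX1), and once it passes, report a timeout and re-arm at the longer repeat interval (1/2 second for LOOPRECOVER_RADIX2).  The standalone C sketch below models only that arithmetic; read of the TSC via __rdtsc(), the hard-coded tsc_hz, struct ipi_watchdog and the demo main() are invented for this example and are not DragonFly APIs.  The signed subtraction mirrors the info->tsc_target - tsc < 0 test in loopwdog() so the comparison stays valid if the counter wraps.

/*
 * Standalone sketch of the LOOPRECOVER watchdog arithmetic.
 * Assumptions: x86 with a usable TSC, GCC/Clang (__rdtsc from
 * x86intrin.h), and a hard-coded TSC frequency purely for the demo.
 */
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

#define WDOG_RADIX_INITIAL	4	/* first check after 1/16 second */
#define WDOG_RADIX_REPEAT	1	/* later checks every 1/2 second */

static uint64_t tsc_hz = 3000000000ULL;	/* assumed 3 GHz; calibrate on real HW */

struct ipi_watchdog {
	int64_t tsc_target;		/* deadline, in TSC ticks */
};

/* Arm the short initial deadline before starting to wait. */
static void
wdog_arm(struct ipi_watchdog *wd)
{
	wd->tsc_target = (int64_t)__rdtsc() + (int64_t)(tsc_hz >> WDOG_RADIX_INITIAL);
}

/*
 * Poll while spinning.  Returns 1 once the deadline has passed, at
 * which point the caller would attempt recovery (re-issue the IPI in
 * the kernel case); the deadline is re-armed at the repeat interval.
 */
static int
wdog_expired(struct ipi_watchdog *wd)
{
	int64_t tsc = (int64_t)__rdtsc();

	if (wd->tsc_target - tsc < 0) {
		wd->tsc_target = tsc + (int64_t)(tsc_hz >> WDOG_RADIX_REPEAT);
		return 1;
	}
	return 0;
}

int
main(void)
{
	struct ipi_watchdog wd;
	int fired = 0;

	wdog_arm(&wd);
	while (fired < 3) {		/* pretend the "IPI" is never answered */
		if (wdog_expired(&wd))
			printf("watchdog %d: would re-issue the IPI here\n", ++fired);
	}
	return 0;
}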
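A second illustration, also not from the patch: both recovery sites (smp_invltlb() and the originator wait in pmap_inval_intr()) clear the stalled cpus out of smp_smurf_mask before re-invoking smp_invlpg().  As I read the code, the smurf mask is what suppresses duplicate Xinvltlb IPIs, so a cpu left marked there would be skipped by the re-send and could never recover.  The toy model below shows that bookkeeping with a plain 64-bit mask standing in for cpumask_t; send_xinvltlb() is a hypothetical printf stub, not the kernel's IPI path.

/*
 * Toy model of the lost-IPI re-issue bookkeeping.  A uint64_t stands
 * in for cpumask_t and send_xinvltlb() is a stub; none of this is the
 * kernel's real cpumask or IPI API.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t smurf_mask;	/* cpus that already have an Xinvltlb queued */

/* Stub for the IPI send path: skips cpus already in the smurf mask. */
static void
send_xinvltlb(uint64_t targets)
{
	uint64_t actually_sent = targets & ~smurf_mask;

	smurf_mask |= actually_sent;
	printf("sending Xinvltlb to cpus %#" PRIx64 "\n", actually_sent);
}

/*
 * Watchdog recovery: the cpus in 'waiting' never answered.  Clear them
 * out of the smurf mask first, otherwise the re-send would treat the
 * IPI as still in flight and skip them.
 */
static void
reissue_lost_ipi(uint64_t waiting)
{
	smurf_mask &= ~waiting;
	send_xinvltlb(waiting);
}

int
main(void)
{
	send_xinvltlb(0x0f);	/* initial broadcast to cpus 0-3 */
	reissue_lost_ipi(0x0c);	/* cpus 2 and 3 never responded */
	return 0;
}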