From 5b49787bf81ce6370fea5dfb5cde2d0dddc5c13b Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 4 Oct 2017 21:46:57 -0700 Subject: [PATCH] kernel - Refactor smp collision statistics * Add an indefinite wait timing API (sys/indefinite.h, sys/indefinite2.h). This interface uses the TSC and will record lock latencies to our pcpu stats in microseconds. The systat -pv 1 display shows this under smpcoll. Note that latencies generated by tokens, lockmgr, and mutex locks do not necessarily reflect actual lost cpu time as the kernel will schedule other threads while those are blocked, if other threads are available. * Formalize TSC operations more, supply a type (tsc_uclock_t and tsc_sclock_t). * Reinstrument lockmgr, mutex, token, and spinlocks to use the new indefinite timing interface. --- sys/cpu/x86_64/include/cpufunc.h | 9 +- sys/kern/kern_clock.c | 4 +- sys/kern/kern_lock.c | 58 +++++----- sys/kern/kern_mutex.c | 22 ++-- sys/kern/kern_spinlock.c | 96 ++-------------- sys/kern/lwkt_thread.c | 10 +- sys/kern/lwkt_token.c | 17 ++- sys/net/altq/altq_subr.c | 2 +- sys/platform/pc64/apic/lapic.c | 6 +- sys/platform/pc64/include/clock.h | 6 +- sys/platform/pc64/isa/clock.c | 23 ++-- sys/platform/pc64/x86_64/mp_machdep.c | 4 +- sys/platform/pc64/x86_64/pmap_inval.c | 6 +- sys/platform/pc64/x86_64/trap.c | 2 +- sys/platform/vkernel64/include/clock.h | 6 +- sys/platform/vkernel64/platform/init.c | 5 +- sys/sys/buf2.h | 2 +- sys/sys/{microtime_pcpu.h => indefinite.h} | 59 +++------- sys/sys/indefinite2.h | 172 +++++++++++++++++++++++++++++ sys/sys/lock.h | 3 +- sys/sys/microtime_pcpu.h | 5 +- sys/sys/mutex.h | 7 +- sys/sys/mutex2.h | 12 ++ sys/sys/thread.h | 4 + sys/sys/time.h | 4 +- sys/vfs/nfs/nfs_vfsops.c | 4 +- 26 files changed, 348 insertions(+), 200 deletions(-) copy sys/sys/{microtime_pcpu.h => indefinite.h} (65%) create mode 100644 sys/sys/indefinite2.h diff --git a/sys/cpu/x86_64/include/cpufunc.h b/sys/cpu/x86_64/include/cpufunc.h index 1d95c11190..a379bbcc46 
100644 --- a/sys/cpu/x86_64/include/cpufunc.h +++ b/sys/cpu/x86_64/include/cpufunc.h @@ -42,6 +42,7 @@ #include #include +#include #include #include @@ -549,20 +550,20 @@ rdpmc(u_int pmc) #define _RDTSC_SUPPORTED_ -static __inline u_int64_t +static __inline tsc_uclock_t rdtsc(void) { u_int32_t low, high; __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); - return (low | ((u_int64_t)high << 32)); + return (low | ((tsc_uclock_t)high << 32)); } #ifdef _KERNEL #include #include -static __inline u_int64_t +static __inline tsc_uclock_t rdtsc_ordered(void) { if (cpu_vendor_id == CPU_VENDOR_INTEL) @@ -953,7 +954,7 @@ u_short rfs(void); u_short rgs(void); u_int64_t rdmsr(u_int msr); u_int64_t rdpmc(u_int pmc); -u_int64_t rdtsc(void); +tsc_uclock_t rdtsc(void); u_int read_rflags(void); void wbinvd(void); void write_rflags(u_int rf); diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index 22dc72a992..5b5cc0c809 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -316,7 +316,7 @@ initclocks(void *dummy) initclocks_pcpu(); clocks_running = 1; if (kpmap) { - kpmap->tsc_freq = (uint64_t)tsc_frequency; + kpmap->tsc_freq = tsc_frequency; kpmap->tick_freq = hz; } } @@ -1672,7 +1672,7 @@ pps_event(struct pps_state *pps, sysclock_t count, int event) * * Returns -1 if the TSC is not supported. */ -int64_t +tsc_uclock_t tsc_get_target(int ns) { #if defined(_RDTSC_SUPPORTED_) diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c index d512a862c2..f95acb3315 100644 --- a/sys/kern/kern_lock.c +++ b/sys/kern/kern_lock.c @@ -46,6 +46,7 @@ #include #include #include +#include static void undo_upreq(struct lock *lkp); @@ -63,6 +64,10 @@ SYSCTL_PROC(_kern, OID_AUTO, cancel_test, CTLTYPE_INT|CTLFLAG_RW, 0, 0, #endif +int lock_test_mode; +SYSCTL_INT(_debug, OID_AUTO, lock_test_mode, CTLFLAG_RW, + &lock_test_mode, 0, ""); + /* * Locking primitives implementation. * Locks provide shared/exclusive sychronization. 
@@ -93,11 +98,13 @@ debuglockmgr(struct lock *lkp, u_int flags, int pflags; int wflags; int timo; + int info_init; #ifdef DEBUG_LOCKS int i; #endif error = 0; + info_init = 0; if (mycpu->gd_intr_nesting_level && (flags & LK_NOWAIT) == 0 && @@ -198,11 +205,12 @@ again: goto again; } - mycpu->gd_cnt.v_lock_name[0] = 'S'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - lkp->lk_wmesg, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; + if (info_init == 0 && + (lkp->lk_flags & LK_NOCOLLSTATS) == 0) { + indefinite_init(&td->td_indefinite, + lkp->lk_wmesg, 1, 'l'); + info_init = 1; + } error = tsleep(lkp, pflags | PINTERLOCKED, lkp->lk_wmesg, timo); @@ -284,11 +292,12 @@ again: goto again; } - mycpu->gd_cnt.v_lock_name[0] = 'X'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - lkp->lk_wmesg, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; + if (info_init == 0 && + (lkp->lk_flags & LK_NOCOLLSTATS) == 0) { + indefinite_init(&td->td_indefinite, lkp->lk_wmesg, + 1, 'L'); + info_init = 1; + } error = tsleep(lkp, pflags | PINTERLOCKED, lkp->lk_wmesg, timo); @@ -298,6 +307,7 @@ again: error = ENOLCK; break; } + indefinite_check(&td->td_indefinite); goto again; case LK_DOWNGRADE: @@ -436,6 +446,13 @@ again: wflags |= (count - 1); } + if (info_init == 0 && + (lkp->lk_flags & LK_NOCOLLSTATS) == 0) { + indefinite_init(&td->td_indefinite, lkp->lk_wmesg, + 1, 'U'); + info_init = 1; + } + if (atomic_cmpset_int(&lkp->lk_count, count, wflags)) { COUNT(td, -1); @@ -445,12 +462,6 @@ again: if ((count & (LKC_UPREQ|LKC_MASK)) == (LKC_UPREQ | 1)) wakeup(lkp); - mycpu->gd_cnt.v_lock_name[0] = 'U'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - lkp->lk_wmesg, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; - error = tsleep(lkp, pflags | PINTERLOCKED, lkp->lk_wmesg, timo); if (error) { @@ -475,6 +486,7 @@ again: else flags = LK_WAITUPGRADE; /* we own the bit */ } + indefinite_check(&td->td_indefinite); goto again; case LK_WAITUPGRADE: 
@@ -503,12 +515,6 @@ again: timo = (extflags & LK_TIMELOCK) ? lkp->lk_timo : 0; tsleep_interlock(lkp, pflags); if (atomic_fetchadd_int(&lkp->lk_count, 0) == count) { - mycpu->gd_cnt.v_lock_name[0] = 'U'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - lkp->lk_wmesg, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; - error = tsleep(lkp, pflags | PINTERLOCKED, lkp->lk_wmesg, timo); if (error) { @@ -523,6 +529,7 @@ again: } /* retry */ } + indefinite_check(&td->td_indefinite); goto again; case LK_RELEASE: @@ -673,6 +680,10 @@ again: flags & LK_TYPE_MASK); /* NOTREACHED */ } + + if (info_init) + indefinite_done(&td->td_indefinite); + return (error); } @@ -903,12 +914,9 @@ sysctl_cancel_lock(SYSCTL_HANDLER_ARGS) if (req->newptr) { SYSCTL_XUNLOCK(); lockmgr(&cancel_lk, LK_EXCLUSIVE); - kprintf("x"); error = tsleep(&error, PCATCH, "canmas", hz * 5); lockmgr(&cancel_lk, LK_CANCEL_BEG); - kprintf("y"); error = tsleep(&error, PCATCH, "canmas", hz * 5); - kprintf("z"); lockmgr(&cancel_lk, LK_RELEASE); SYSCTL_XLOCK(); SYSCTL_OUT(req, &error, sizeof(error)); diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index b7b247f1be..60cd51b0b8 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -57,12 +57,14 @@ #include #include #include +#include #include #include #include #include +#include static int mtx_chain_link_ex(mtx_t *mtx, u_int olock); static int mtx_chain_link_sh(mtx_t *mtx, u_int olock); @@ -936,8 +938,14 @@ mtx_delete_link(mtx_t *mtx, mtx_link_t *link) int mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to) { + thread_t td = curthread; int error; + if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0) { + indefinite_init(&td->td_indefinite, mtx->mtx_ident, 1, + ((link->state & MTX_LINK_LINKED_SH) ? 'm' : 'M')); + } + /* * Sleep. Handle false wakeups, interruptions, etc. * The link may also have been aborted. 
The LINKED @@ -949,20 +957,13 @@ mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to) tsleep_interlock(link, 0); cpu_lfence(); if (link->state & MTX_LINK_LINKED) { - if (link->state & MTX_LINK_LINKED_SH) - mycpu->gd_cnt.v_lock_name[0] = 'S'; - else - mycpu->gd_cnt.v_lock_name[0] = 'X'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - mtx->mtx_ident, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; - error = tsleep(link, flags | PINTERLOCKED, mtx->mtx_ident, to); if (error) break; } + if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0) + indefinite_check(&td->td_indefinite); } /* @@ -1014,6 +1015,9 @@ mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to) */ link->state = MTX_LINK_IDLE; + if ((mtx->mtx_flags & MTXF_NOCOLLSTATS) == 0) + indefinite_done(&td->td_indefinite); + return error; } diff --git a/sys/kern/kern_spinlock.c b/sys/kern/kern_spinlock.c index c009ac0a4f..031cbb91bd 100644 --- a/sys/kern/kern_spinlock.c +++ b/sys/kern/kern_spinlock.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -72,12 +73,6 @@ struct spinlock pmap_spin = SPINLOCK_INITIALIZER(pmap_spin, "pmap_spin"); -struct indefinite_info { - sysclock_t base; - int secs; - const char *ident; -}; - /* * Kernal Trace */ @@ -109,9 +104,6 @@ SYSCTL_LONG(_debug, OID_AUTO, spinlocks_add_latency, CTLFLAG_RW, #endif -static int spin_indefinite_check(struct spinlock *spin, - struct indefinite_info *info); - /* * We contested due to another exclusive lock holder. We lose. * @@ -178,8 +170,7 @@ spin_trylock_contested(struct spinlock *spin) void _spin_lock_contested(struct spinlock *spin, const char *ident, int value) { - struct indefinite_info info = { 0, 0, ident }; - int i; + thread_t td = curthread; /* * WARNING! Caller has already incremented the lock. 
We must @@ -194,6 +185,7 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value) if (atomic_cmpset_int(&spin->counta, SPINLOCK_SHARED | 1, 1)) return; } + indefinite_init(&td->td_indefinite, ident, 0, 'S'); /* * Transfer our exclusive request to the high bits and clear the @@ -211,16 +203,9 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value) if (value & SPINLOCK_SHARED) atomic_clear_int(&spin->counta, SPINLOCK_SHARED); -#ifdef DEBUG_LOCKS_LATENCY - long j; - for (j = spinlocks_add_latency; j > 0; --j) - cpu_ccfence(); -#endif /* * Spin until we can acquire a low-count of 1. */ - i = 0; - /*logspin(beg, spin, 'w');*/ for (;;) { /* * If the low bits are zero, try to acquire the exclusive lock @@ -243,20 +228,10 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value) (ovalue - SPINLOCK_EXCLWAIT) | 1)) { break; } - if ((++i & 0x7F) == 0x7F) { - mycpu->gd_cnt.v_lock_name[0] = 'X'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - ident, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; - if (spin_indefinite_check(spin, &info)) - break; - } -#ifdef _KERNEL_VIRTUAL - pthread_yield(); -#endif + if (indefinite_check(&td->td_indefinite)) + break; } - /*logspin(end, spin, 'w');*/ + indefinite_done(&td->td_indefinite); } /* @@ -269,8 +244,9 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value) void _spin_lock_shared_contested(struct spinlock *spin, const char *ident) { - struct indefinite_info info = { 0, 0, ident }; - int i; + thread_t td = curthread; + + indefinite_init(&td->td_indefinite, ident, 0, 's'); /* * Undo the inline's increment. @@ -283,8 +259,6 @@ _spin_lock_shared_contested(struct spinlock *spin, const char *ident) cpu_ccfence(); #endif - /*logspin(beg, spin, 'w');*/ - i = 0; for (;;) { /* * Loop until we can acquire the shared spinlock. 
Note that @@ -315,56 +289,10 @@ _spin_lock_shared_contested(struct spinlock *spin, const char *ident) ovalue + 1)) break; } - if ((++i & 0x7F) == 0x7F) { - mycpu->gd_cnt.v_lock_name[0] = 'S'; - strncpy(mycpu->gd_cnt.v_lock_name + 1, - ident, - sizeof(mycpu->gd_cnt.v_lock_name) - 2); - ++mycpu->gd_cnt.v_lock_colls; - if (spin_indefinite_check(spin, &info)) - break; - } -#ifdef _KERNEL_VIRTUAL - pthread_yield(); -#endif - } - /*logspin(end, spin, 'w');*/ -} - -static -int -spin_indefinite_check(struct spinlock *spin, struct indefinite_info *info) -{ - sysclock_t count; - - cpu_spinlock_contested(); - - count = sys_cputimer->count(); - if (info->secs == 0) { - info->base = count; - ++info->secs; - } else if (count - info->base > sys_cputimer->freq) { - kprintf("spin_lock: %s(%p), indefinite wait (%d secs)!\n", - info->ident, spin, info->secs); - info->base = count; - ++info->secs; - if (panicstr) - return (TRUE); -#if defined(INVARIANTS) - if (spin_lock_test_mode) { - print_backtrace(-1); - return (TRUE); - } -#endif -#if defined(INVARIANTS) - if (info->secs == 11) - print_backtrace(-1); -#endif - if (info->secs == 60) - panic("spin_lock: %s(%p), indefinite wait!", - info->ident, spin); + if (indefinite_check(&td->td_indefinite)) + break; } - return (FALSE); + indefinite_done(&td->td_indefinite); } /* diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index d3a26f5706..e6e4d62943 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -52,9 +52,11 @@ #include #include #include +#include #include #include +#include #include @@ -701,7 +703,6 @@ lwkt_switch(void) { goto havethread; } - ++gd->gd_cnt.v_lock_colls; ++ntd->td_contended; /* overflow ok */ #ifdef LOOPMASK if (tsc_frequency && rdtsc() - tsc_base > tsc_frequency) { @@ -735,7 +736,6 @@ lwkt_switch(void) goto havethread; } ++ntd->td_contended; /* overflow ok */ - ++gd->gd_cnt.v_lock_colls; } /* @@ -766,6 +766,12 @@ havethread: ++gd->gd_cnt.v_swtch; gd->gd_idle_repeat = 0; + /* + * If we were 
busy waiting record final disposition + */ + if (ntd->td_indefinite.type) + indefinite_done(&ntd->td_indefinite); + havethread_preempted: /* * If the new target does not need the MP lock and we are holding it, diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c index 8e35887b3e..70830456e2 100644 --- a/sys/kern/lwkt_token.c +++ b/sys/kern/lwkt_token.c @@ -57,10 +57,12 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -480,10 +482,13 @@ lwkt_getalltokens(thread_t td, int spinning) */ KASSERT(tok->t_desc, ("token %p is not initialized", tok)); - strncpy(td->td_gd->gd_cnt.v_lock_name, - tok->t_desc, - sizeof(td->td_gd->gd_cnt.v_lock_name) - 1); + if (td->td_indefinite.type == 0) { + indefinite_init(&td->td_indefinite, + tok->t_desc, 1, 't'); + } else { + indefinite_check(&td->td_indefinite); + } if (lwkt_sched_debug > 0) { --lwkt_sched_debug; kprintf("toka %p %s %s\n", @@ -592,6 +597,12 @@ _lwkt_getalltokens_sorted(thread_t td) * Otherwise we failed to acquire all the tokens. * Release whatever we did get. 
*/ + if (td->td_indefinite.type == 0) { + indefinite_init(&td->td_indefinite, + tok->t_desc, 1, 't'); + } else { + indefinite_check(&td->td_indefinite); + } if (lwkt_sched_debug > 0) { --lwkt_sched_debug; kprintf("tokb %p %s %s\n", diff --git a/sys/net/altq/altq_subr.c b/sys/net/altq/altq_subr.c index 00381c0607..0210a9120c 100644 --- a/sys/net/altq/altq_subr.c +++ b/sys/net/altq/altq_subr.c @@ -867,7 +867,7 @@ init_machclk(void) */ #ifdef _RDTSC_SUPPORTED_ if (tsc_present) - machclk_freq = (uint64_t)tsc_frequency; + machclk_freq = tsc_frequency; #endif /* diff --git a/sys/platform/pc64/apic/lapic.c b/sys/platform/pc64/apic/lapic.c index 7bc552ed8a..5de6239026 100644 --- a/sys/platform/pc64/apic/lapic.c +++ b/sys/platform/pc64/apic/lapic.c @@ -665,7 +665,8 @@ apic_ipi(int dest_type, int vector, int delivery_mode) tsc = rdtsc(); while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) { cpu_pause(); - if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) { + if ((tsc_sclock_t)(rdtsc() - + (tsc + tsc_frequency)) > 0) { kprintf("apic_ipi stall cpu %d (sing)\n", mycpuid); tsc = rdtsc(); @@ -698,7 +699,8 @@ single_apic_ipi(int cpu, int vector, int delivery_mode) tsc = rdtsc(); while ((lapic->icr_lo & APIC_DELSTAT_MASK) != 0) { cpu_pause(); - if ((int64_t)(rdtsc() - (tsc + tsc_frequency)) > 0) { + if ((tsc_sclock_t)(rdtsc() - + (tsc + tsc_frequency)) > 0) { kprintf("single_apic_ipi stall cpu %d (sing)\n", mycpuid); tsc = rdtsc(); diff --git a/sys/platform/pc64/include/clock.h b/sys/platform/pc64/include/clock.h index fda63d7527..4011bc7a35 100644 --- a/sys/platform/pc64/include/clock.h +++ b/sys/platform/pc64/include/clock.h @@ -25,6 +25,9 @@ typedef struct TOTALDELAY { sysclock_t last_clock; } TOTALDELAY; +typedef uint64_t tsc_uclock_t; +typedef int64_t tsc_sclock_t; + /* * i386 to clock driver interface. * XXX large parts of the driver and its interface are misplaced. 
@@ -36,9 +39,10 @@ extern int timer0_max_count; extern int tsc_present; extern int tsc_invariant; extern int tsc_mpsync; -extern int64_t tsc_frequency; extern int tsc_is_broken; extern int wall_cmos_clock; +extern tsc_uclock_t tsc_frequency; +extern tsc_uclock_t tsc_oneus_approx; /* do not use for fine calc, min 1 */ /* * Driver to clock driver interface. diff --git a/sys/platform/pc64/isa/clock.c b/sys/platform/pc64/isa/clock.c index ee45f6f9a5..496f09c9ce 100644 --- a/sys/platform/pc64/isa/clock.c +++ b/sys/platform/pc64/isa/clock.c @@ -106,10 +106,12 @@ int disable_rtc_set; /* disable resettodr() if != 0 */ int tsc_present; int tsc_invariant; int tsc_mpsync; -int64_t tsc_frequency; int tsc_is_broken; int wall_cmos_clock; /* wall CMOS clock assumed if != 0 */ int timer0_running; +tsc_uclock_t tsc_frequency; +tsc_uclock_t tsc_oneus_approx; /* always at least 1, approx only */ + enum tstate { RELEASED, ACQUIRED }; enum tstate timer0_state; enum tstate timer1_state; @@ -557,7 +559,7 @@ readrtc(int port) static u_int calibrate_clocks(void) { - u_int64_t old_tsc; + tsc_uclock_t old_tsc; u_int tot_count; sysclock_t count, prev_count; int sec, start_sec, timeout; @@ -633,6 +635,7 @@ calibrate_clocks(void) (intmax_t)tsc_frequency); } } + tsc_oneus_approx = ((tsc_frequency|1) + 999999) / 1000000; kprintf("i8254 clock: %u Hz\n", tot_count); return (tot_count); @@ -849,15 +852,15 @@ startrtclock(void) cputimer_set_frequency(&i8254_cputimer, freq); } else { if (bootverbose) - kprintf( - "%d Hz differs from default of %d Hz by more than 1%%\n", - freq, i8254_cputimer.freq); + kprintf("%d Hz differs from default of %d Hz " + "by more than 1%%\n", + freq, i8254_cputimer.freq); tsc_frequency = 0; } if (tsc_frequency != 0 && calibrate_timers_with_rtc == 0) { - kprintf( -"hw.calibrate_timers_with_rtc not set - using old calibration method\n"); + kprintf("hw.calibrate_timers_with_rtc not " + "set - using old calibration method\n"); tsc_frequency = 0; } @@ -883,8 +886,10 @@ 
skip_rtc_based: tsc_invariant ? " invariant" : "", (intmax_t)tsc_frequency); } + tsc_oneus_approx = ((tsc_frequency|1) + 999999) / 1000000; - EVENTHANDLER_REGISTER(shutdown_post_sync, resettodr_on_shutdown, NULL, SHUTDOWN_PRI_LAST); + EVENTHANDLER_REGISTER(shutdown_post_sync, resettodr_on_shutdown, + NULL, SHUTDOWN_PRI_LAST); } /* @@ -1247,7 +1252,7 @@ static void tsc_mpsync_test_loop(struct tsc_mpsync_arg *arg) { struct globaldata *gd = mycpu; - uint64_t test_end, test_begin; + tsc_uclock_t test_end, test_begin; u_int i; if (bootverbose) { diff --git a/sys/platform/pc64/x86_64/mp_machdep.c b/sys/platform/pc64/x86_64/mp_machdep.c index 310d6fc734..80d5b2abb0 100644 --- a/sys/platform/pc64/x86_64/mp_machdep.c +++ b/sys/platform/pc64/x86_64/mp_machdep.c @@ -894,7 +894,7 @@ smp_invltlb(void) cpumask_t mask; unsigned long rflags; #ifdef LOOPRECOVER - uint64_t tsc_base = rdtsc(); + tsc_uclock_t tsc_base = rdtsc(); int repeats = 0; #endif @@ -1115,7 +1115,7 @@ smp_inval_intr(void) struct mdglobaldata *md = mdcpu; cpumask_t cpumask; #ifdef LOOPRECOVER - uint64_t tsc_base = rdtsc(); + tsc_uclock_t tsc_base = rdtsc(); #endif #if 0 diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c index a572da847f..65e4ba70b9 100644 --- a/sys/platform/pc64/x86_64/pmap_inval.c +++ b/sys/platform/pc64/x86_64/pmap_inval.c @@ -93,7 +93,7 @@ struct pmap_inval_info { #ifdef LOOPRECOVER cpumask_t sigmask; int failed; - int64_t tsc_target; + tsc_uclock_t tsc_target; #endif } __cachealign; @@ -163,10 +163,10 @@ __inline int loopwdog(struct pmap_inval_info *info) { - int64_t tsc; + tsc_uclock_t tsc; tsc = rdtsc(); - if (info->tsc_target - tsc < 0 && tsc_frequency) { + if ((tsc_sclock_t)(info->tsc_target - tsc) < 0 && tsc_frequency) { info->tsc_target = tsc + (tsc_frequency * LOOPRECOVER_TIMEOUT2); return 1; } diff --git a/sys/platform/pc64/x86_64/trap.c b/sys/platform/pc64/x86_64/trap.c index a6f0678d24..e701ff39a9 100644 --- a/sys/platform/pc64/x86_64/trap.c 
+++ b/sys/platform/pc64/x86_64/trap.c @@ -1223,7 +1223,7 @@ syscall2(struct trapframe *frame) * is responsible for getting the MP lock. */ #ifdef SYSCALL_DEBUG - uint64_t tscval = rdtsc(); + tsc_uclock_t tscval = rdtsc(); #endif error = (*callp->sy_call)(&args); #ifdef SYSCALL_DEBUG diff --git a/sys/platform/vkernel64/include/clock.h b/sys/platform/vkernel64/include/clock.h index c145eefabf..ed49bfbb7c 100644 --- a/sys/platform/vkernel64/include/clock.h +++ b/sys/platform/vkernel64/include/clock.h @@ -15,6 +15,9 @@ #include #endif +typedef uint64_t tsc_uclock_t; +typedef int64_t tsc_sclock_t; + /* * i386 to clock driver interface. * XXX large parts of the driver and its interface are misplaced. @@ -26,9 +29,10 @@ extern int timer0_max_count; extern int tsc_present; extern int tsc_invariant; extern int tsc_mpsync; -extern int64_t tsc_frequency; extern int tsc_is_broken; extern int wall_cmos_clock; +extern tsc_uclock_t tsc_frequency; +extern tsc_uclock_t tsc_oneus_approx; /* do not use for fine calc, min 1 */ /* * Driver to clock driver interface. 
diff --git a/sys/platform/vkernel64/platform/init.c b/sys/platform/vkernel64/platform/init.c index 9cbfc36409..f6f7fbbbfe 100644 --- a/sys/platform/vkernel64/platform/init.c +++ b/sys/platform/vkernel64/platform/init.c @@ -115,7 +115,6 @@ u_int cpu_feature; /* XXX */ int tsc_present; int tsc_invariant; int tsc_mpsync; -int64_t tsc_frequency; int optcpus; /* number of cpus - see mp_start() */ int cpu_bits; int lwp_cpu_lock; /* if/how to lock virtual CPUs to real CPUs */ @@ -127,6 +126,9 @@ int vmm_enabled; /* VMM HW assisted enable */ int use_precise_timer = 0; /* use a precise timer (more expensive) */ struct privatespace *CPU_prvspace; +tsc_uclock_t tsc_frequency; +tsc_uclock_t tsc_oneus_approx; + extern uint64_t KPML4phys; /* phys addr of kernel level 4 */ static struct trapframe proc0_tf; @@ -481,6 +483,7 @@ main(int ac, char **av) sysctlbyname("hw.tsc_frequency", &tsc_frequency, &vsize, NULL, 0); if (tsc_present) cpu_feature |= CPUID_TSC; + tsc_oneus_approx = ((tsc_frequency|1) + 999999) / 1000000; /* * Check SSE diff --git a/sys/sys/buf2.h b/sys/sys/buf2.h index ed29fcd159..b40d882e16 100644 --- a/sys/sys/buf2.h +++ b/sys/sys/buf2.h @@ -67,7 +67,7 @@ * Initialize a lock. */ #define BUF_LOCKINIT(bp) \ - lockinit(&(bp)->b_lock, buf_wmesg, 0, 0) + lockinit(&(bp)->b_lock, buf_wmesg, 0, LK_NOCOLLSTATS) /* * diff --git a/sys/sys/microtime_pcpu.h b/sys/sys/indefinite.h similarity index 65% copy from sys/sys/microtime_pcpu.h copy to sys/sys/indefinite.h index 32e1a7967b..c6d8673966 100644 --- a/sys/sys/microtime_pcpu.h +++ b/sys/sys/indefinite.h @@ -1,8 +1,8 @@ /* - * Copyright (c) 2014 The DragonFly Project. All rights reserved. + * Copyright (c) 2017 The DragonFly Project. All rights reserved. 
* * This code is derived from software contributed to The DragonFly Project - * by Sepherosa Ziehau + * by Matthew Dillon * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -31,49 +31,26 @@ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ - -#ifndef _SYS_MICROTIME_PCPU_H_ -#define _SYS_MICROTIME_PCPU_H_ - -#ifndef _SYS_PARAM_H_ -#include -#endif - -#ifndef _SYS_TIME_H_ -#include -#endif - -#include -#include +#ifndef _SYS_INDEFINITE_H_ +#define _SYS_INDEFINITE_H_ /* - * This 'time' only guarantees monotonicly increment on the same CPU + * Indefinite info collection and handling code for contention loops */ +#ifndef _MACHINE_CLOCK_H_ +#include +#endif -union microtime_pcpu { - struct timeval tv; - uint64_t tsc; -}; +extern int lock_test_mode; -static __inline void -microtime_pcpu_get(union microtime_pcpu *t) -{ - if (tsc_invariant) - t->tsc = rdtsc(); - else - microuptime(&t->tv); -} +struct indefinite_info { + tsc_uclock_t base; + const char *ident; + int secs; + int count; + char type; +}; -static __inline int -microtime_pcpu_diff(const union microtime_pcpu *s, - const union microtime_pcpu *e) -{ - if (tsc_invariant) { - return (((e->tsc - s->tsc) * 1000000) / tsc_frequency); - } else { - return ((e->tv.tv_usec - s->tv.tv_usec) + - (e->tv.tv_sec - s->tv.tv_sec) * 1000000); - } -} +typedef struct indefinite_info indefinite_info_t; -#endif /* !_SYS_MICROTIME_PCPU_H_ */ +#endif diff --git a/sys/sys/indefinite2.h b/sys/sys/indefinite2.h new file mode 100644 index 0000000000..9d2f77a4f8 --- /dev/null +++ b/sys/sys/indefinite2.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017 The DragonFly Project. All rights reserved. 
+ * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+#ifndef _SYS_INDEFINITE2_H_
+#define _SYS_INDEFINITE2_H_
+
+/*
+ * Indefinite info collection and handling code for contention loops
+ */
+#ifndef _SYS_INDEFINITE_H_
+#include <sys/indefinite.h>
+#endif
+#ifndef _SYS_GLOBALDATA_H_
+#include <sys/globaldata.h>
+#endif
+
+/*
+ * Initialize the indefinite state (only if the TSC is supported)
+ */
+static __inline void
+indefinite_init(indefinite_info_t *info, const char *ident, int now, char type)
+{
+	if (tsc_frequency) {
+		info->base = rdtsc();
+		info->ident = ident;
+		info->secs = 0;
+		info->count = 0;
+		info->type = type;
+
+		if (now) {		/* publish the lock name immediately */
+			mycpu->gd_cnt.v_lock_name[0] = info->type;
+			strncpy(mycpu->gd_cnt.v_lock_name + 1, info->ident,
+				sizeof(mycpu->gd_cnt.v_lock_name) - 2);
+		}
+	}
+}
+
+/*
+ * Poll from a contention loop (TRUE = stop waiting); logs stalls, adds usecs.
+ */
+static __inline int
+indefinite_check(indefinite_info_t *info)
+{
+	tsc_uclock_t delta;
+	const char *str;
+
+#ifdef _KERNEL_VIRTUAL
+	pthread_yield();
+#else
+	cpu_pause();
+#endif
+	if (info->type == 0)		/* not armed (no TSC, or done) */
+		return FALSE;
+	if (++info->count != 128)	/* sample the TSC every 128th call */
+		return FALSE;
+	info->count = 0;
+	delta = rdtsc() - info->base;
+
+	/*
+	 * Ignore minor one-second interval error accumulation in
+	 * favor of ensuring that info->base is fully synchronized.
+	 */
+	if (info->secs == 0 && delta > tsc_oneus_approx) {
+		mycpu->gd_cnt.v_lock_name[0] = info->type;
+		strncpy(mycpu->gd_cnt.v_lock_name + 1, info->ident,
+			sizeof(mycpu->gd_cnt.v_lock_name) - 2);
+	}
+	if (delta >= tsc_frequency) {
+		info->secs += delta / tsc_frequency;
+		info->base += delta;
+		mycpu->gd_cnt.v_lock_colls += delta / tsc_frequency * 1000000U;
+
+		switch(info->type) {
+		case 's':
+			str = "spin_lock_sh";
+			break;
+		case 'S':
+			str = "spin_lock_ex";
+			break;
+		case 'm':
+			str = "mutex_sh";
+			break;
+		case 'M':
+			str = "mutex_ex";
+			break;
+		case 'l':
+			str = "lock_sh";
+			break;
+		case 'L':
+			str = "lock_ex";
+			break;
+		case 't':
+			str = "token";
+			break;
+		default:
+			str = "lock(?)";
+			break;
+		}
+		kprintf("%s: %s, indefinite wait (%d secs)!\n",
+			str, info->ident, info->secs);
+		if (panicstr)
+			return TRUE;
+#if defined(INVARIANTS)
+		if (lock_test_mode) {
+			print_backtrace(-1);
+			return TRUE;
+		}
+#endif
+#if defined(INVARIANTS)
+		if (info->secs == 11 &&
+		    (info->type == 's' || info->type == 'S')) {
+			print_backtrace(-1);
+		}
+#endif
+		if (info->secs >= 60 &&	/* secs may advance by more than 1 */
+		    (info->type == 's' || info->type == 'S')) {
+			panic("%s: %s, indefinite wait!", str, info->ident);
+		}
+
+	}
+	return FALSE;
+}
+
+/*
+ * Finalize the state, record collision time in microseconds.
+ */
+static __inline void
+indefinite_done(indefinite_info_t *info)
+{
+	tsc_uclock_t delta;
+
+	if (info->type) {
+		delta = rdtsc() - info->base;
+		delta = delta * 1000000U / tsc_frequency;
+		if (lock_test_mode && delta > 1000)
+			kprintf("TEST %s (%lu)\n", info->ident, delta);
+		mycpu->gd_cnt.v_lock_colls += delta;
+		info->type = 0;		/* disarm until the next init */
+	}
+}
+
+#endif
diff --git a/sys/sys/lock.h b/sys/sys/lock.h
index d60ef13ebf..42c34d87c5 100644
--- a/sys/sys/lock.h
+++ b/sys/sys/lock.h
@@ -163,10 +163,11 @@ struct lock {
  * The first three flags may be set in lock_init to set their mode permanently,
  * or passed in as arguments to the lock manager.
*/ -#define LK_EXTFLG_MASK 0x07000070 /* mask of external flags */ +#define LK_EXTFLG_MASK 0x070000F0 /* mask of external flags */ #define LK_NOWAIT 0x00000010 /* do not sleep to await lock */ #define LK_SLEEPFAIL 0x00000020 /* sleep, then return failure */ #define LK_CANRECURSE 0x00000040 /* allow recursive exclusive lock */ +#define LK_NOCOLLSTATS 0x00000080 /* v_lock_coll not applicable */ #define LK_CANCELABLE 0x01000000 /* blocked caller can be canceled */ #define LK_TIMELOCK 0x02000000 #define LK_PCATCH 0x04000000 /* timelocked with signal catching */ diff --git a/sys/sys/microtime_pcpu.h b/sys/sys/microtime_pcpu.h index 32e1a7967b..7cd73c32c9 100644 --- a/sys/sys/microtime_pcpu.h +++ b/sys/sys/microtime_pcpu.h @@ -66,10 +66,11 @@ microtime_pcpu_get(union microtime_pcpu *t) static __inline int microtime_pcpu_diff(const union microtime_pcpu *s, - const union microtime_pcpu *e) + const union microtime_pcpu *e) { if (tsc_invariant) { - return (((e->tsc - s->tsc) * 1000000) / tsc_frequency); + return (((e->tsc - s->tsc) * 1000000) / + (tsc_sclock_t)tsc_frequency); } else { return ((e->tv.tv_usec - s->tv.tv_usec) + (e->tv.tv_sec - s->tv.tv_sec) * 1000000); diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h index 51b7ea2048..38fa18b947 100644 --- a/sys/sys/mutex.h +++ b/sys/sys/mutex.h @@ -65,17 +65,20 @@ typedef struct mtx_link mtx_link_t; struct mtx { volatile u_int mtx_lock; - int mtx_reserved01; /* future use & struct alignmnent */ + uint32_t mtx_flags; struct thread *mtx_owner; mtx_link_t *mtx_exlink; mtx_link_t *mtx_shlink; const char *mtx_ident; } __cachealign; +#define MTXF_NOCOLLSTATS 0x00000001 /* v_lock_coll not applicable */ + typedef struct mtx mtx_t; typedef u_int mtx_state_t; -#define MTX_INITIALIZER(ident) { .mtx_lock = 0, .mtx_owner = NULL, \ +#define MTX_INITIALIZER(ident) { .mtx_lock = 0, .mtx_flags = 0, \ + .mtx_owner = NULL, \ .mtx_exlink = NULL, .mtx_shlink = NULL, \ .mtx_ident = ident } diff --git a/sys/sys/mutex2.h b/sys/sys/mutex2.h index 
b03d1a4f8f..a0bc402a57 100644 --- a/sys/sys/mutex2.h +++ b/sys/sys/mutex2.h @@ -53,6 +53,18 @@ static __inline void mtx_init(mtx_t *mtx, const char *ident) { mtx->mtx_lock = 0; + mtx->mtx_flags = 0; + mtx->mtx_owner = NULL; + mtx->mtx_exlink = NULL; + mtx->mtx_shlink = NULL; + mtx->mtx_ident = ident; +} + +static __inline void +mtx_init_flags(mtx_t *mtx, const char *ident, uint32_t flags) +{ + mtx->mtx_lock = 0; + mtx->mtx_flags = flags; mtx->mtx_owner = NULL; mtx->mtx_exlink = NULL; mtx->mtx_shlink = NULL; diff --git a/sys/sys/thread.h b/sys/sys/thread.h index 052373106b..b4819cec9e 100644 --- a/sys/sys/thread.h +++ b/sys/sys/thread.h @@ -35,6 +35,9 @@ #ifndef _SYS_IOSCHED_H_ #include #endif +#ifndef _SYS_INDEFINITE_H_ +#include +#endif #include struct globaldata; @@ -272,6 +275,7 @@ struct thread { int td_in_crit_report; #endif struct md_thread td_mach; + indefinite_info_t td_indefinite; #ifdef DEBUG_LOCKS #define SPINLOCK_DEBUG_ARRAY_SIZE 32 int td_spinlock_stack_id[SPINLOCK_DEBUG_ARRAY_SIZE]; diff --git a/sys/sys/time.h b/sys/sys/time.h index a5f8624936..2a101d46a1 100644 --- a/sys/sys/time.h +++ b/sys/sys/time.h @@ -35,6 +35,7 @@ #ifdef _KERNEL #include +#include #else #include #endif @@ -221,11 +222,12 @@ int tvtohz_high(struct timeval *); int tvtohz_low(struct timeval *); int tstohz_high(struct timespec *); int tstohz_low(struct timespec *); -int64_t tsc_get_target(int ns); int tsc_test_target(int64_t target); void tsc_delay(int ns); int nanosleep1(struct timespec *rqt, struct timespec *rmt); +tsc_uclock_t tsc_get_target(int ns); + #else /* !_KERNEL */ #include diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 2af2e01555..af1224e63a 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -1031,8 +1031,8 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, } else { nmp = objcache_get(nfsmount_objcache, M_WAITOK); bzero((caddr_t)nmp, sizeof (struct nfsmount)); - mtx_init(&nmp->nm_rxlock, "nfsrx"); - 
mtx_init(&nmp->nm_txlock, "nfstx"); + mtx_init_flags(&nmp->nm_rxlock, "nfsrx", MTXF_NOCOLLSTATS); + mtx_init_flags(&nmp->nm_txlock, "nfstx", MTXF_NOCOLLSTATS); TAILQ_INIT(&nmp->nm_uidlruhead); TAILQ_INIT(&nmp->nm_bioq); TAILQ_INIT(&nmp->nm_reqq); -- 2.11.4.GIT