From cc705b823a33a247956d2a54a4e929a0ca2936d9 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sun, 22 Apr 2018 18:56:14 -0700
Subject: [PATCH] kernel - Carefully refactor contended tokens and spinlocks

* Carefully put the exponential backoff back in for spinlocks, and
  implement it for tokens.  Only applicable to exclusive locks, capped
  via sysctl (mjg).  Tested on a dual-socket Xeon.

* Exclusive priority over shared locks reduces the shared/exclusive
  starvation that can occur when exclusive locks use exponential
  backoff.

* Exponential backoff significantly improves performance for heavily
  contended exclusive locks by allowing some degree of burst operation.

* Implement TSC windowing for shared locks (and a little for exclusive
  locks too).  This prevents heavily contended exclusive locks from
  completely starving shared locks by using windowing to disable the
  exclusive-priority mechanic for shared locks.  This allows a few
  contending shared locks to compete on equal ground with exclusive
  locks.

Suggested-by: mjg
---
 sys/kern/kern_spinlock.c | 122 +++++++++++++++++++++++++----------------------
 sys/kern/lwkt_token.c    |  70 +++++++++++++++++++--------
 2 files changed, 117 insertions(+), 75 deletions(-)

diff --git a/sys/kern/kern_spinlock.c b/sys/kern/kern_spinlock.c
index f2fd60d536..11108ad4d9 100644
--- a/sys/kern/kern_spinlock.c
+++ b/sys/kern/kern_spinlock.c
@@ -104,6 +104,15 @@ SYSCTL_LONG(_debug, OID_AUTO, spinlocks_add_latency, CTLFLAG_RW,
 #endif
 
+static long spin_backoff_max = 4096;
+SYSCTL_LONG(_debug, OID_AUTO, spin_backoff_max, CTLFLAG_RW,
+    &spin_backoff_max, 0,
+    "Spinlock exponential backoff limit");
+static long spin_window_shift = 8;      /* 1 << n clock cycles, approx */
+SYSCTL_LONG(_debug, OID_AUTO, spin_window_shift, CTLFLAG_RW,
+    &spin_window_shift, 0,
+    "Spinlock TSC windowing");
+
 /*
  * We contested due to another exclusive lock holder.  We lose.
  *
@@ -143,18 +152,21 @@ spin_trylock_contested(struct spinlock *spin)
  * use atomic_fetchadd_int() for both spinlock types in the critical
  * path.
  *
- * Backoff algorithms can create even worse starvation problems, particularly
- * on multi-socket cpus, and don't really improve performance when a lot
- * of cores are contending.  However, if we are contested on an exclusive
- * lock due to a large number of shared locks being present, we throw in
- * extra cpu_pause()'s to account for the necessary time it will take other
- * cores to contend among themselves and release their shared locks.
+ * The exponential (n^1.5) backoff algorithm is designed to both reduce
+ * cache bus contention between cpu cores and sockets, and to allow some
+ * bursting of exclusive locks in heavily contended situations to improve
+ * performance.
+ *
+ * The exclusive lock priority mechanism prevents even heavily contended
+ * exclusive locks from being starved by shared locks.
  */
 void
 _spin_lock_contested(struct spinlock *spin, const char *ident, int value)
 {
         indefinite_info_t info;
         uint32_t ovalue;
+        long expbackoff;
+        long loop;
 
         /*
          * WARNING! Caller has already incremented the lock.  We must
          *
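For reference, the growth rate of the new backoff is easy to see in isolation.  The short user-space sketch below (not part of the patch) reproduces the (expbackoff + 1) * 3 / 2 progression and the 6 + spin_backoff_max cap; it assumes an x86-64 compiler with GCC/Clang intrinsics, and backoff_max and cpu_relax() are illustrative stand-ins for the sysctl and the kernel's cpu_pause(), not kernel symbols.

#include <stdio.h>
#include <immintrin.h>

static const long backoff_max = 4096;   /* stand-in for debug.spin_backoff_max */

/* Stand-in for the kernel's cpu_pause() loop */
static void
cpu_relax(long loops)
{
        while (loops-- > 0)
                _mm_pause();
}

int
main(void)
{
        long expbackoff = 0;
        int retry;

        for (retry = 1; retry <= 12; ++retry) {
                expbackoff = (expbackoff + 1) * 3 / 2;  /* 1, 3, 6, 10, 16, ... */
                if (expbackoff > 6 + backoff_max)       /* same cap as the patch */
                        expbackoff = 6 + backoff_max;
                printf("retry %2d: pause %ld times\n", retry, expbackoff);
                cpu_relax(expbackoff);
        }
        return 0;
}

Compiled with cc -O2, it prints the pause counts (1, 3, 6, 10, 16, ...) that a contending cpu would burn per retry before re-testing the lock.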
@@ -164,11 +176,14 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value)
          * Handle the degenerate case where the spinlock is flagged SHARED
          * with only our reference.  We can convert it to EXCLUSIVE.
          */
-        ++value;
-        if (value == (SPINLOCK_SHARED | 1)) {
+        if (value == (SPINLOCK_SHARED | 1) - 1) {
                 if (atomic_cmpset_int(&spin->counta, SPINLOCK_SHARED | 1, 1))
                         return;
         }
+        /* ++value; value not used after this */
+        info.type = 0;          /* avoid improper gcc warning */
+        info.ident = NULL;      /* avoid improper gcc warning */
+        expbackoff = 0;
         /*
          * Transfer our exclusive request to the high bits and clear the
          *
@@ -189,26 +204,24 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value)
                 ovalue &= ~SPINLOCK_SHARED;
         }
 
-        indefinite_init(&info, ident, 0, 'S');
-
-        /*
-         * Spin until we can acquire a low-count of 1.
-         */
         for (;;) {
+                expbackoff = (expbackoff + 1) * 3 / 2;
+                if (expbackoff == 6)            /* 1, 3, 6, 10, ... */
+                        indefinite_init(&info, ident, 0, 'S');
+                if ((rdtsc() >> spin_window_shift) % ncpus != mycpuid) {
+                        for (loop = expbackoff; loop; --loop)
+                                cpu_pause();
+                }
+                /*cpu_lfence();*/
+
                 /*
                  * If the low bits are zero, try to acquire the exclusive lock
                  * by transfering our high bit reservation to the low bits.
                  *
-                 * NOTE: Reading spin->counta prior to the swap is extremely
-                 *       important on multi-chip/many-core boxes.  On 48-core
-                 *       this one change improves fully concurrent all-cores
-                 *       compiles by 100% or better.
-                 *
-                 *       I can't emphasize enough how important the pre-read
-                 *       is in preventing hw cache bus armageddon on
-                 *       multi-chip systems.  And on single-chip/multi-core
-                 *       systems it just doesn't hurt.
+                 * NOTE: Avoid unconditional atomic op by testing ovalue,
+                 *       otherwise we get cache bus armageddon.
                  */
+                ovalue = spin->counta;
                 cpu_ccfence();
                 if ((ovalue & (SPINLOCK_EXCLWAIT - 1)) == 0) {
                         if (atomic_fcmpset_int(&spin->counta, &ovalue,
@@ -217,32 +230,15 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value)
                         }
                         continue;
                 }
-
-                /*
-                 * Throw in extra cpu_pause()'s when we are waiting on
-                 * multiple other shared lock holders to release (the
-                 * indefinite_check() also throws one in).
-                 *
-                 * We know these are shared lock holders when the count
-                 * is larger than 1, because an exclusive lock holder can
-                 * only have one count.  Do this optimization only when
-                 * the number of shared lock holders is 3 or greater.
-                 */
-                ovalue &= SPINLOCK_EXCLWAIT - 1;
-                while (ovalue > 2) {
-                        cpu_pause();
-                        cpu_pause();
-                        --ovalue;
+                if (expbackoff > 6 + spin_backoff_max)
+                        expbackoff = 6 + spin_backoff_max;
+                if (expbackoff >= 6) {
+                        if (indefinite_check(&info))
+                                break;
                 }
-
-                if (indefinite_check(&info))
-                        break;
-                /*
-                 * ovalue was wrong anyway, just reload
-                 */
-                ovalue = spin->counta;
         }
-        indefinite_done(&info);
+        if (expbackoff >= 6)
+                indefinite_done(&info);
 }
 
 /*
@@ -251,6 +247,17 @@ _spin_lock_contested(struct spinlock *spin, const char *ident, int value)
  *
  * This is not in the critical path unless there is contention between
  * shared and exclusive holders.
+ *
+ * Exclusive locks have priority over shared locks.  However, this can
+ * cause shared locks to be starved when large numbers of threads are
+ * competing for exclusive locks so the shared lock code uses TSC-windowing
+ * to selectively ignore the exclusive priority mechanism.  This has the
+ * effect of allowing a limited number of shared locks to compete against
+ * exclusive waiters at any given moment.
+ *
+ * Note that shared locks do not implement exponential backoff.  Instead,
+ * the shared lock simply polls the lock value.  One cpu_pause() is built
+ * into indefinite_check().
  */
 void
 _spin_lock_shared_contested(struct spinlock *spin, const char *ident)
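The TSC windowing referred to in the comment above can be sketched on its own.  The user-space fragment below (not part of the patch) shows how the window owner is derived from the TSC; __rdtsc() assumes x86-64 with GCC/Clang, and ncpus/mycpuid are assumed demo values rather than the kernel globals.

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

static const int window_shift = 8;      /* stand-in for debug.spin_window_shift */

/* Which cpu "owns" the current ~(1 << window_shift)-cycle TSC window */
static int
window_owner(int ncpus)
{
        return (int)((__rdtsc() >> window_shift) % (uint64_t)ncpus);
}

int
main(void)
{
        int ncpus = 8;          /* assumed cpu count for the demo */
        int mycpuid = 3;        /* assumed current cpu for the demo */
        int i;

        /*
         * A shared waiter ignores the exclusive-priority (EXCLWAIT) bits
         * only while it owns the window; an exclusive waiter skips its
         * backoff pause during its own window.
         */
        for (i = 0; i < 5; ++i) {
                int owner = window_owner(ncpus);

                printf("window owner = %d, may ignore EXCLWAIT = %s\n",
                       owner, (owner == mycpuid) ? "yes" : "no");
        }
        return 0;
}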
@@ -291,25 +298,28 @@
                  * systems it just doesn't hurt.
                  */
                 cpu_ccfence();
-                if (ovalue == 0) {
+
+                /*
+                 * Ignore the EXCLWAIT bits if we are inside our window.
+                 */
+                if ((ovalue & (SPINLOCK_EXCLWAIT - 1)) == 0 &&
+                    (rdtsc() >> spin_window_shift) % ncpus == mycpuid) {
                         if (atomic_fcmpset_int(&spin->counta, &ovalue,
-                                               SPINLOCK_SHARED | 1)) {
+                                              ovalue | SPINLOCK_SHARED | 1)) {
                                 break;
                         }
                         continue;
                 }
 
                 /*
-                 * Ignore the EXCLWAIT bits if we have waited too long.
-                 * This would be a situation where most of the cpu cores
-                 * are concurrently cycling both shared and exclusive use
-                 * of the same spinlock, which can cause one or more cores
-                 * to wait indefinitely on a shared spinlock.  This can
-                 * only occur in the most extreme testing environments.
+                 * Check ovalue tightly (no exponential backoff for shared
+                 * locks, that would result in horrible performance.  Instead,
+                 * shared locks depend on the exclusive priority mechanism
+                 * to avoid starving exclusive locks).
                  */
-                if (info.secs > 1 && (ovalue & (SPINLOCK_EXCLWAIT - 1)) == 0) {
+                if (ovalue == 0) {
                         if (atomic_fcmpset_int(&spin->counta, &ovalue,
-                                               ovalue | SPINLOCK_SHARED | 1)) {
+                                              SPINLOCK_SHARED | 1)) {
                                 break;
                         }
                         continue;
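Before moving on to the token changes, here is a minimal user-space model (not part of the patch) of how the two spinlock mechanisms compose for an exclusive waiter: exponential backoff capped at a maximum, with the pause skipped while the waiter owns the TSC window.  It uses C11 atomics, x86 intrinsics, and a toy lock word; the real code above operates on spin->counta with the SPINLOCK_SHARED/SPINLOCK_EXCLWAIT encoding instead.

#include <stdatomic.h>
#include <stdint.h>
#include <x86intrin.h>

#define BACKOFF_MAX     4096            /* stand-in for debug.spin_backoff_max */
#define WINDOW_SHIFT    8               /* stand-in for debug.spin_window_shift */

struct toy_spinlock {
        atomic_int locked;              /* 0 = free, 1 = held */
};

static void
toy_spin_lock(struct toy_spinlock *sp, int ncpus, int mycpuid)
{
        long expbackoff = 0;
        long loop;

        while (atomic_exchange_explicit(&sp->locked, 1,
                                        memory_order_acquire) != 0) {
                /* Exponential growth: 1, 3, 6, 10, 16, ... capped */
                expbackoff = (expbackoff + 1) * 3 / 2;
                if (expbackoff > 6 + BACKOFF_MAX)
                        expbackoff = 6 + BACKOFF_MAX;

                /* Pause only while some other cpu owns the TSC window */
                if ((__rdtsc() >> WINDOW_SHIFT) % (uint64_t)ncpus !=
                    (uint64_t)mycpuid) {
                        for (loop = expbackoff; loop; --loop)
                                _mm_pause();
                }
        }
}

static void
toy_spin_unlock(struct toy_spinlock *sp)
{
        atomic_store_explicit(&sp->locked, 0, memory_order_release);
}

int
main(void)
{
        static struct toy_spinlock lk;  /* zero-initialized: unlocked */

        toy_spin_lock(&lk, 8, 3);       /* assumed ncpus = 8, current cpu = 3 */
        toy_spin_unlock(&lk);
        return 0;
}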
diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c
index 1926cdfd51..739313fd90 100644
--- a/sys/kern/lwkt_token.c
+++ b/sys/kern/lwkt_token.c
@@ -82,10 +82,8 @@
 
 extern int lwkt_sched_debug;
 
-#define TOKEN_PRIME     66555444443333333ULL
-
 #ifndef LWKT_NUM_POOL_TOKENS
-#define LWKT_NUM_POOL_TOKENS    4096    /* power of 2 */
+#define LWKT_NUM_POOL_TOKENS    4001
 #endif
 
 struct lwkt_pool_token {
@@ -144,9 +142,17 @@ struct lwkt_token sigio_token = LWKT_TOKEN_INITIALIZER(sigio_token);
 struct lwkt_token tty_token = LWKT_TOKEN_INITIALIZER(tty_token);
 struct lwkt_token vnode_token = LWKT_TOKEN_INITIALIZER(vnode_token);
 
-static int lwkt_token_spin = 5;
-SYSCTL_INT(_lwkt, OID_AUTO, token_spin, CTLFLAG_RW,
-    &lwkt_token_spin, 0, "Decontention spin loops");
+/*
+ * Exponential backoff (exclusive tokens) and TSC windowing (shared tokens)
+ * parameters.  We generally want a smaller backoff than we use for spinlocks
+ * because we can fall back to the scheduler.
+ */
+static int token_backoff_max __cachealign = 1024;
+SYSCTL_INT(_lwkt, OID_AUTO, token_backoff_max, CTLFLAG_RW,
+    &token_backoff_max, 0, "Tokens exponential backoff");
+static int token_window_shift __cachealign = 8;
+SYSCTL_INT(_lwkt, OID_AUTO, token_window_shift, CTLFLAG_RW,
+    &token_window_shift, 0, "Tokens TSC windowing shift");
 
 /*
  * The collision count is bumped every time the LWKT scheduler fails
@@ -199,12 +205,10 @@
 static __inline
 lwkt_token_t
 _lwkt_token_pool_lookup(void *ptr)
 {
-        uintptr_t n;
+        uint32_t i;
 
-        n = (uintptr_t)ptr + ((uintptr_t)ptr >> 18);
-        n = (n % TOKEN_PRIME);
-        n = (n ^ (n >> 16)) & (LWKT_NUM_POOL_TOKENS - 1);
-        return (&pool_tokens[n].token);
+        i = (uint32_t)(uintptr_t)ptr % LWKT_NUM_POOL_TOKENS;
+        return (&pool_tokens[i].token);
 }
 
 /*
@@ -330,7 +334,7 @@ _lwkt_trytokref(lwkt_tokref_t ref, thread_t td, long mode)
         for (;;) {
                 oref = tok->t_ref;      /* can be NULL */
                 cpu_ccfence();
-                if ((count & (TOK_EXCLUSIVE|TOK_EXCLREQ)) == 0 ||
+                if ((count & (TOK_EXCLUSIVE|mode)) == 0 ||
                     ((count & TOK_EXCLUSIVE) == 0 &&
                      td->td_toks_stop != &td->td_toks_base + 1)
                 ) {
@@ -368,16 +372,44 @@
 static __inline
 int
 _lwkt_trytokref_spin(lwkt_tokref_t ref, thread_t td, long mode)
 {
-        int     spin;
-
         if (_lwkt_trytokref(ref, td, mode))
                 return TRUE;
-        for (spin = lwkt_token_spin; spin > 0; --spin) {
-                cpu_pause();
-                cpu_pause();
-                if (_lwkt_trytokref(ref, td, mode))
-                        return TRUE;
+
+        if (mode & TOK_EXCLUSIVE) {
+                /*
+                 * Contested exclusive token, use exponential backoff
+                 * algorithm.
+                 */
+                long expbackoff;
+                long loop;
+
+                expbackoff = 0;
+                while (expbackoff < 6 + token_backoff_max) {
+                        expbackoff = (expbackoff + 1) * 3 / 2;
+                        if ((rdtsc() >> token_window_shift) % ncpus != mycpuid) {
+                                for (loop = expbackoff; loop; --loop)
+                                        cpu_pause();
+                        }
+                        if (_lwkt_trytokref(ref, td, mode))
+                                return TRUE;
+                }
+        } else {
+                /*
+                 * Contested shared token, use TSC windowing.  Note that
+                 * exclusive tokens have priority over shared tokens only
+                 * for the first token.
+                 */
+                if ((rdtsc() >> token_window_shift) % ncpus == mycpuid) {
+                        if (_lwkt_trytokref(ref, td, mode & ~TOK_EXCLREQ))
+                                return TRUE;
+                } else {
+                        if (_lwkt_trytokref(ref, td, mode))
+                                return TRUE;
+                }
         }
+        ++mycpu->gd_cnt.v_lock_colls;
+
         return FALSE;
 }
-- 
2.11.4.GIT
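The shared-token path above hinges on dropping TOK_EXCLREQ from the mode while inside the TSC window.  A reduced user-space sketch (not part of the patch) of that decision follows; the TOK_* values, try_token(), and the demo cpu numbers are assumptions, not the kernel's definitions.

#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>

#define TOK_EXCLUSIVE   0x00000001      /* assumed bit layout for the demo */
#define TOK_EXCLREQ     0x00000002      /* assumed bit layout for the demo */

static const int window_shift = 8;      /* stand-in for lwkt.token_window_shift */

/* Toy acquire test: a shared attempt fails if any bit in (TOK_EXCLUSIVE|mode) is set */
static int
try_token(unsigned int count, unsigned int mode)
{
        return ((count & (TOK_EXCLUSIVE | mode)) == 0);
}

int
main(void)
{
        unsigned int count = TOK_EXCLREQ;       /* an exclusive waiter is queued */
        unsigned int mode = TOK_EXCLREQ;        /* shared request, honoring EXCLREQ */
        int ncpus = 8, mycpuid = 3;             /* assumed demo values */
        int ok;

        /*
         * Inside our TSC window, drop TOK_EXCLREQ from the mode so a
         * pending exclusive request no longer blocks the shared attempt;
         * outside the window, exclusive waiters keep their priority.
         */
        if ((__rdtsc() >> window_shift) % (uint64_t)ncpus == (uint64_t)mycpuid)
                ok = try_token(count, mode & ~TOK_EXCLREQ);
        else
                ok = try_token(count, mode);
        printf("shared token acquired: %s\n", ok ? "yes" : "no");
        return 0;
}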