From 31efdff0873efcd437dc814d7de1c4bf1fc1f61f Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sun, 15 Oct 2017 10:54:59 -0700 Subject: [PATCH] kernel - Simplify umtx_sleep and umtx_wakeup support * Rip out the vm_page_action / vm_page_event() API. This code was fairly SMP unfriendly and created serious bottlenecks with large threaded user programs using mutexes. * Replace with a simpler mechanism that simply wakes up any UMTX domain tsleeps after a fork(). * Implement a 4uS spin loop in umtx_sleep() similar to what the pipe code does. --- sys/kern/kern_fork.c | 22 +++++++++ sys/kern/kern_umtx.c | 97 +++++++++++++++++++++++++++---------- sys/vm/vm_fault.c | 15 +++--- sys/vm/vm_page.c | 133 --------------------------------------------------- sys/vm/vm_page.h | 21 +------- sys/vm/vm_page2.h | 18 ------- 6 files changed, 104 insertions(+), 202 deletions(-) diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 6dd53d371d..d6d5e5a9d1 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -107,6 +107,26 @@ rb_lwp_compare(struct lwp *lp1, struct lwp *lp2) RB_GENERATE2(lwp_rb_tree, lwp, u.lwp_rbnode, rb_lwp_compare, lwpid_t, lwp_tid); /* + * When forking, memory underpinning umtx-supported mutexes may be set + * COW causing the physical address to change. We must wakeup any threads + * blocked on the physical address to allow them to re-resolve their VM. + */ +static void +wake_umtx_threads(struct proc *p1) +{ + struct lwp *lp; + struct thread *td; + + RB_FOREACH(lp, lwp_rb_tree, &p1->p_lwp_tree) { + td = lp->lwp_thread; + if (td && (td->td_flags & TDF_TSLEEPQ) && + (td->td_wdomain & PDOMAIN_MASK) == PDOMAIN_UMTX) { + wakeup_domain(td->td_wchan, PDOMAIN_UMTX); + } + } +} + +/* * fork() system call */ int @@ -312,6 +332,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) } vm_fork(p1, 0, flags); + wake_umtx_threads(p1); /* * Close all file descriptors. @@ -634,6 +655,7 @@ fork1(struct lwp *lp1, int flags, struct proc **procp) PHOLD(p1); vm_fork(p1, p2, flags); + wake_umtx_threads(p1); /* * Create the first lwp associated with the new proc. diff --git a/sys/kern/kern_umtx.c b/sys/kern/kern_umtx.c index a2ce14f13a..ec3af2c62b 100644 --- a/sys/kern/kern_umtx.c +++ b/sys/kern/kern_umtx.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -67,7 +68,14 @@ #include -static void umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action); +/* + * Improve umtx performance by polling for 4uS before going to sleep. + * This can avoid many IPIs in typical pthreads mutex situations. + */ +#ifdef _RDTSC_SUPPORTED_ +static int umtx_delay = 4000; +SYSCTL_INT(_kern, OID_AUTO, umtx_delay, CTLFLAG_RW, &umtx_delay, 0, ""); +#endif /* * If the contents of the userland-supplied pointer matches the specified @@ -101,12 +109,11 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) { struct lwbuf lwb_cache; struct lwbuf *lwb; - struct vm_page_action action; vm_page_t m; void *waddr; int offset; int timeout; - int error = EBUSY; + int error; if (uap->timeout < 0) return (EINVAL); @@ -125,12 +132,23 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) * Otherwise the physical page we sleep on my not match the page * being woken up. * + * The returned page is held, and this hold count prevents it from + * being paged out. This is important since we are sleeping on what + * is essentially a physical address (which avoids all sorts of + * collision-space issues). + * * WARNING! We can only use vm_fault_page*() for reading data. 
We * cannot use it for writing data because there is no pmap * interlock to protect against flushes/pageouts. + * + * (XXX with recent code work, this may no longer be an issue) + * + * WARNING! If the user program replaces the mapping of the underlying + * uap->ptr, the physical address may change and the umtx code + * will not be able to match wakeups with tsleeps. */ m = vm_fault_page_quick((vm_offset_t)uap->ptr, - VM_PROT_READ|VM_PROT_WRITE, &error, NULL); + VM_PROT_READ | VM_PROT_WRITE, &error, NULL); if (m == NULL) { error = EFAULT; goto done; @@ -138,38 +156,72 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) lwb = lwbuf_alloc(m, &lwb_cache); offset = (vm_offset_t)uap->ptr & PAGE_MASK; - /* - * The critical section is required to interlock the tsleep against - * a wakeup from another cpu. The lfence forces synchronization. - */ + error = EBUSY; if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) { +#ifdef _RDTSC_SUPPORTED_ + /* + * Poll a little while before sleeping, most mutexes are + * short-lived. + */ + if (umtx_delay) { + int64_t tsc_target; + int good = 0; + + tsc_target = tsc_get_target(umtx_delay); + while (tsc_test_target(tsc_target) == 0) { + cpu_lfence(); + if (*(int *)(lwbuf_kva(lwb) + offset) != uap->value) { + good = 1; + break; + } + cpu_pause(); + } + if (good) { + error = EBUSY; + goto skip; + } + } +#endif + /* + * Calculate the timeout. This will be acccurate to within ~2 ticks. + */ if ((timeout = uap->timeout) != 0) { timeout = (timeout / 1000000) * hz + ((timeout % 1000000) * hz + 999999) / 1000000; } + + /* + * Calculate the physical address of the mutex. This gives us + * good distribution between unrelated processes using the + * feature. + */ waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset); /* * Wake us up if the memory location COWs while we are sleeping. + * Use a critical section to tighten up the interlock. Also, + * tsleep_remove() requires the caller be in a critical section. */ crit_enter(); - vm_page_init_action(m, &action, umtx_sleep_page_action_cow, waddr); - vm_page_register_action(&action, VMEVENT_COW); /* * We must interlock just before sleeping. If we interlock before * registration the lock operations done by the registration can * interfere with it. + * + * We cannot leave our interlock hanging on return because this + * will interfere with umtx_wakeup() calls with limited wakeup + * counts. */ tsleep_interlock(waddr, PCATCH | PDOMAIN_UMTX); - if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value && - action.event == VMEVENT_COW) { + cpu_lfence(); + if (*(int *)(lwbuf_kva(lwb) + offset) == uap->value) { error = tsleep(waddr, PCATCH | PINTERLOCKED | PDOMAIN_UMTX, "umtxsl", timeout); } else { + tsleep_remove(curthread); error = EBUSY; } - vm_page_unregister_action(&action); crit_exit(); /* Always break out in case of signal, even if restartable */ if (error == ERESTART) @@ -177,7 +229,7 @@ sys_umtx_sleep(struct umtx_sleep_args *uap) } else { error = EBUSY; } - +skip: lwbuf_free(lwb); vm_page_unhold(m); done: @@ -185,17 +237,6 @@ done: } /* - * If this page is being copied it may no longer represent the page - * underlying our virtual address. Wake up any umtx_sleep()'s - * that were waiting on its physical address to force them to retry. 
- */ -static void -umtx_sleep_page_action_cow(vm_page_t m, vm_page_action_t action) -{ - wakeup_domain(action->data, PDOMAIN_UMTX); -} - -/* * umtx_wakeup { const int *ptr, int count } * * Wakeup the specified number of processes held in umtx_sleep() on the @@ -227,7 +268,7 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap) if ((vm_offset_t)uap->ptr & (sizeof(int) - 1)) return (EFAULT); m = vm_fault_page_quick((vm_offset_t)uap->ptr, - VM_PROT_READ, &error, NULL); + VM_PROT_READ | VM_PROT_WRITE, &error, NULL); if (m == NULL) { error = EFAULT; goto done; @@ -235,12 +276,16 @@ sys_umtx_wakeup(struct umtx_wakeup_args *uap) offset = (vm_offset_t)uap->ptr & PAGE_MASK; waddr = (void *)((intptr_t)VM_PAGE_TO_PHYS(m) + offset); +#if 1 if (uap->count == 1) { wakeup_domain_one(waddr, PDOMAIN_UMTX); } else { /* XXX wakes them all up for now */ wakeup_domain(waddr, PDOMAIN_UMTX); } +#else + wakeup_domain(waddr, PDOMAIN_UMTX); +#endif vm_page_unhold(m); error = 0; done: diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index 29a1bb2b3b..fb0d3d07c4 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -322,8 +322,9 @@ RetryFault: /* * Find the vm_map_entry representing the backing store and resolve * the top level object and page index. This may have the side - * effect of executing a copy-on-write on the map entry and/or - * creating a shadow object, but will not COW any actual VM pages. + * effect of executing a copy-on-write on the map entry, + * creating a shadow object, or splitting an anonymous entry for + * performance, but will not COW any actual VM pages. * * On success fs.map is left read-locked and various other fields * are initialized but not otherwise referenced or locked. @@ -774,11 +775,15 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, * * This works great for normal programs but will always return * NULL for host lookups of vkernel maps in VMM mode. + * + * NOTE: pmap_fault_page_quick() might not busy the page. If + * VM_PROT_WRITE or VM_PROT_OVERRIDE_WRITE is set in + * fault_type and pmap_fault_page_quick() returns non-NULL, + * it will safely dirty the returned vm_page_t for us. We + * cannot safely dirty it here (it might not be busy). */ fs.m = pmap_fault_page_quick(map->pmap, vaddr, fault_type, busyp); if (fs.m) { - if (fault_type & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE)) - vm_page_dirty(fs.m); *errorp = 0; return(fs.m); } @@ -2075,7 +2080,6 @@ readrest: KKASSERT(fs->first_shared == 0); vm_page_copy(fs->m, fs->first_m); vm_page_protect(fs->m, VM_PROT_NONE); - vm_page_event(fs->m, VMEVENT_COW); } /* @@ -2415,7 +2419,6 @@ vm_fault_copy_entry(vm_map_t dst_map, vm_map_t src_map, panic("vm_fault_copy_wired: page missing"); vm_page_copy(src_m, dst_m); - vm_page_event(src_m, VMEVENT_COW); /* * Enter it in the pmap... diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 08ce9dc747..aa6df20c10 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -115,35 +115,11 @@ static vm_page_t vm_page_select_cache(u_short pg_color); static vm_page_t _vm_page_list_find2(int basequeue, int index); static void _vm_page_deactivate_locked(vm_page_t m, int athead); -MALLOC_DEFINE(M_ACTIONHASH, "acthash", "vmpage action hash"); - /* * Array of tailq lists */ __cachealign struct vpgqueues vm_page_queues[PQ_COUNT]; -LIST_HEAD(vm_page_action_list, vm_page_action); - -/* - * Action hash for user umtx support. Contention is governed by both - * tsleep/wakeup handling (kern/kern_synch.c) and action_hash[] below. 
- * Because action_hash[] represents active table locks, a modest fixed - * value well in excess of MAXCPU works here. - * - * There is also scan overhead depending on the number of threads in - * umtx*() calls, so we also size the hash table based on maxproc. - */ -struct vm_page_action_hash { - struct vm_page_action_list list; - struct lock lk; -} __cachealign; - -#define VMACTION_MINHSIZE 256 - -struct vm_page_action_hash *action_hash; -static int vmaction_hsize; -static int vmaction_hmask; - static volatile int vm_pages_waiting; static struct alist vm_contig_alist; static struct almeta vm_contig_ameta[ALIST_RECORDS_65536]; @@ -562,8 +538,6 @@ vm_numa_organize(vm_paddr_t ran_beg, vm_paddr_t bytes, int physid) * allocations. * * We leave vm_dma_reserved bytes worth of free pages in the reserve pool. - * - * Also setup the action_hash[] table here (which is only used by userland) */ static void vm_page_startup_finish(void *dummy __unused) @@ -574,7 +548,6 @@ vm_page_startup_finish(void *dummy __unused) alist_blk_t xcount; alist_blk_t bfree; vm_page_t m; - int i; spin_lock(&vm_contig_spin); for (;;) { @@ -643,33 +616,6 @@ vm_page_startup_finish(void *dummy __unused) (intmax_t)(vmstats.v_dma_pages - vm_contig_alist.bl_free) * (PAGE_SIZE / 1024), (intmax_t)vm_contig_alist.bl_free * (PAGE_SIZE / 1024)); - - /* - * Scale the action_hash[] array. Primary contention occurs due - * to cpu locks, scaled to ncpus, and scan overhead may be incurred - * depending on the number of threads, which we scale to maxproc. - * - * NOTE: Action lock might recurse due to callback, so allow - * recursion. - */ - vmaction_hsize = VMACTION_MINHSIZE; - if (vmaction_hsize < ncpus * 2) - vmaction_hsize = ncpus * 2; - if (vmaction_hsize < maxproc / 16) - vmaction_hsize = maxproc / 16; - vmaction_hmask = 1; - while (vmaction_hmask < vmaction_hsize) - vmaction_hmask = (vmaction_hmask << 1) | 1; - vmaction_hsize = vmaction_hmask + 1; - - action_hash = kmalloc(sizeof(action_hash[0]) * vmaction_hsize, - M_ACTIONHASH, - M_WAITOK | M_ZERO); - - for (i = 0; i < vmaction_hsize; i++) { - LIST_INIT(&action_hash[i].list); - lockinit(&action_hash[i].lk, "actlk", 0, LK_CANRECURSE); - } } SYSINIT(vm_pgend, SI_SUB_PROC0_POST, SI_ORDER_ANY, vm_page_startup_finish, NULL); @@ -3274,85 +3220,6 @@ vm_page_test_dirty(vm_page_t m) } } -/* - * Register an action, associating it with its vm_page - */ -void -vm_page_register_action(vm_page_action_t action, vm_page_event_t event) -{ - struct vm_page_action_hash *hash; - int hv; - - hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask; - hash = &action_hash[hv]; - - lockmgr(&hash->lk, LK_EXCLUSIVE); - vm_page_flag_set(action->m, PG_ACTIONLIST); - action->event = event; - LIST_INSERT_HEAD(&hash->list, action, entry); - lockmgr(&hash->lk, LK_RELEASE); -} - -/* - * Unregister an action, disassociating it from its related vm_page - */ -void -vm_page_unregister_action(vm_page_action_t action) -{ - struct vm_page_action_hash *hash; - int hv; - - hv = (int)((intptr_t)action->m >> 8) & vmaction_hmask; - hash = &action_hash[hv]; - lockmgr(&hash->lk, LK_EXCLUSIVE); - if (action->event != VMEVENT_NONE) { - action->event = VMEVENT_NONE; - LIST_REMOVE(action, entry); - - if (LIST_EMPTY(&hash->list)) - vm_page_flag_clear(action->m, PG_ACTIONLIST); - } - lockmgr(&hash->lk, LK_RELEASE); -} - -/* - * Issue an event on a VM page. Corresponding action structures are - * removed from the page's list and called. - * - * If the vm_page has no more pending action events we clear its - * PG_ACTIONLIST flag. 
- */ -void -vm_page_event_internal(vm_page_t m, vm_page_event_t event) -{ - struct vm_page_action_hash *hash; - struct vm_page_action *scan; - struct vm_page_action *next; - int hv; - int all; - - hv = (int)((intptr_t)m >> 8) & vmaction_hmask; - hash = &action_hash[hv]; - all = 1; - - lockmgr(&hash->lk, LK_EXCLUSIVE); - LIST_FOREACH_MUTABLE(scan, &hash->list, entry, next) { - if (scan->m == m) { - if (scan->event == event) { - scan->event = VMEVENT_NONE; - LIST_REMOVE(scan, entry); - scan->func(m, scan); - /* XXX */ - } else { - all = 0; - } - } - } - if (all) - vm_page_flag_clear(m, PG_ACTIONLIST); - lockmgr(&hash->lk, LK_RELEASE); -} - #include "opt_ddb.h" #ifdef DDB #include diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 120532804d..3d06ef1e7b 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -99,19 +99,6 @@ #endif -typedef enum vm_page_event { VMEVENT_NONE, VMEVENT_COW } vm_page_event_t; - -struct vm_page_action { - LIST_ENTRY(vm_page_action) entry; - struct vm_page *m; - vm_page_event_t event; - void (*func)(struct vm_page *, - struct vm_page_action *); - void *data; -}; - -typedef struct vm_page_action *vm_page_action_t; - /* * Management of resident (logical) pages. * @@ -301,12 +288,11 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT]; #define PG_RAM 0x00002000 /* read ahead mark */ #define PG_SWAPPED 0x00004000 /* backed by swap */ #define PG_NOTMETA 0x00008000 /* do not back with swap */ -#define PG_ACTIONLIST 0x00010000 /* lookaside action list present */ +#define PG_UNUSED10000 0x00010000 #define PG_SBUSY 0x00020000 /* soft-busy also set */ #define PG_NEED_COMMIT 0x00040000 /* clean page requires commit */ -#define PG_KEEP_NEWPAGE_MASK (PG_BUSY | PG_SBUSY | \ - PG_WANTED | PG_ACTIONLIST) +#define PG_KEEP_NEWPAGE_MASK (PG_BUSY | PG_SBUSY | PG_WANTED) /* @@ -451,10 +437,7 @@ void vm_page_zero_invalid(vm_page_t m, boolean_t setvalid); void vm_page_free_toq(vm_page_t m); void vm_page_free_contig(vm_page_t m, unsigned long size); vm_page_t vm_page_free_fromq_fast(void); -void vm_page_event_internal(vm_page_t, vm_page_event_t); void vm_page_dirty(vm_page_t m); -void vm_page_register_action(vm_page_action_t action, vm_page_event_t event); -void vm_page_unregister_action(vm_page_action_t action); void vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg); void VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m, int also_m_busy, const char *wmsg VM_PAGE_DEBUG_ARGS); diff --git a/sys/vm/vm_page2.h b/sys/vm/vm_page2.h index 8dac2c4a2b..c7b1930a20 100644 --- a/sys/vm/vm_page2.h +++ b/sys/vm/vm_page2.h @@ -166,24 +166,6 @@ vm_paging_needed(void) return 0; } -static __inline -void -vm_page_event(vm_page_t m, vm_page_event_t event) -{ - if (m->flags & PG_ACTIONLIST) - vm_page_event_internal(m, event); -} - -static __inline -void -vm_page_init_action(vm_page_t m, vm_page_action_t action, - void (*func)(vm_page_t, vm_page_action_t), void *data) -{ - action->m = m; - action->func = func; - action->data = data; -} - /* * Clear dirty bits in the VM page but truncate the * end to a DEV_BSIZE'd boundary. -- 2.11.4.GIT