From e32fb2aa5ec45f533085d913b32f3b5b441047fe Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sun, 19 May 2019 12:59:49 -0700
Subject: [PATCH] kernel - VM rework part 13 - Core pmap work, stabilize &
 optimize

* Refactor the vm_page_hash hash again to get a better distribution.

* I tried to only hash shared objects, but this resulted in a number of
  edge cases where program re-use could miss the optimization.

* Add a sysctl vm.page_hash_vnode_only (default off).  If turned on,
  only vm_page's associated with vnodes will be hashed.  This should
  generally not be necessary.

* Refactor vm_page_list_find2() again to avoid all duplicate queue
  checks.  This time I mocked the algorithm up in userland and twisted
  it until it did what I wanted.

* VM_FAULT_QUICK_DEBUG was accidentally left on, turn it off.

* Do not remove the original page from the pmap when vm_fault_object()
  must do a COW.  And just in case this is ever added back in later,
  don't do it using pmap_remove_specific() !!!  Use pmap_remove_pages()
  to avoid the backing scan lock.

  vm_fault_page() will now do this removal (for procfs rwmem), the
  normal vm_fault will of course replace the page anyway, and the umtx
  code uses different recovery mechanisms now and should be ok.

* Optimize vm_map_entry_shadow() for the situation where the old object
  is no longer shared.  Get rid of an unnecessary transient kmalloc()
  and vm_object_hold_shared().
---
 sys/vm/vm_fault.c       |  49 +++++++-----------
 sys/vm/vm_map.c         |   9 ++--
 sys/vm/vm_page.c        | 129 ++++++++++++++++++++++++++++++++++--------------
 test/debug/vmpageinfo.c |   2 +-
 4 files changed, 116 insertions(+), 73 deletions(-)

diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index f7801c1ab2..73755e1b07 100644
--- a/sys/vm/vm_fault.c
+++ b/sys/vm/vm_fault.c
@@ -161,10 +161,17 @@ __read_mostly int vm_shared_fault = 1;
 TUNABLE_INT("vm.shared_fault", &vm_shared_fault);
 SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW,
 	   &vm_shared_fault, 0, "Allow shared token on vm_object");
-__read_mostly static int vm_fault_quick_enable = 0;
+__read_mostly static int vm_fault_quick_enable = 1;
 TUNABLE_INT("vm.fault_quick", &vm_fault_quick_enable);
 SYSCTL_INT(_vm, OID_AUTO, fault_quick, CTLFLAG_RW,
 	   &vm_fault_quick_enable, 0, "Allow fast vm_fault shortcut");
+
+/*
+ * Define here for debugging ioctls.  Note that these are globals, so
+ * they will cause a ton of cache line bouncing.  Only use for debugging
+ * purposes.
+ */
+/*#define VM_FAULT_QUICK_DEBUG */
 #ifdef VM_FAULT_QUICK_DEBUG
 static long vm_fault_quick_success_count = 0;
 SYSCTL_LONG(_vm, OID_AUTO, fault_quick_success_count, CTLFLAG_RW,
@@ -865,14 +872,6 @@ vm_fault_quick(struct faultstate *fs, vm_pindex_t first_pindex,
 
 	/*
 	 * Don't waste time if the object is only being used by one vm_map.
-	 *
-	 * WARNING! We can't rely on obj->ref_count here because it might
-	 *	    be part of a shared ba chain, and we can't rely on
-	 *	    ba->refs for the same reason.  The combination of it
-	 *	    being the ba embedded in the entry (aka first_ba) AND
-	 *	    ref_count == 1 would work, but OBJ_ONEMAPPING is better
-	 *	    because it will remain flagged even when ref_count > 1
-	 *	    for situations where entries are clipped.
 	 */
 	obj = fs->first_ba->object;
 	if (obj->flags & OBJ_ONEMAPPING)
@@ -2265,26 +2264,15 @@ next:
 			/*
 			 * Oh, well, lets copy it.
 			 *
-			 * Why are we unmapping the original page
-			 * here?  Well, in short, not all accessors
-			 * of user memory go through the pmap.  The
-			 * procfs code doesn't have access user memory
-			 * via a local pmap, so vm_fault_page*()
-			 * can't call pmap_enter().  And the umtx*()
-			 * code may modify the COW'd page via a DMAP
-			 * or kernel mapping and not via the pmap,
-			 * leaving the original page still mapped
-			 * read-only into the pmap.
-			 *
-			 * So we have to remove the page from at
-			 * least the current pmap if it is in it.
+			 * We used to unmap the original page here
+			 * because vm_fault_page() didn't and this
+			 * would cause havoc for the umtx*() code
+			 * and the procfs code.
 			 *
-			 * We used to just remove it from all pmaps
-			 * but that creates inefficiencies on SMP,
-			 * particularly for COW program & library
-			 * mappings that are concurrently exec'd.
-			 * Only remove the page from the current
-			 * pmap.
+			 * This is no longer necessary.  The
+			 * vm_fault_page() routine will now unmap the
+			 * page after a COW, and the umtx code will
+			 * recover on its own.
 			 */
 			/*
 			 * NOTE: Since fs->m is a backing page, it
@@ -2293,10 +2281,9 @@ next:
 			 */
 			KKASSERT(fs->first_shared == 0);
 			vm_page_copy(fs->m, fs->first_m);
-			/*vm_page_protect(fs->m, VM_PROT_NONE);*/
-			pmap_remove_specific(
+			/* pmap_remove_specific(
 			    &curthread->td_lwp->lwp_vmspace->vm_pmap,
-			    fs->m);
+			    fs->m); */
 		}
 
 		/*
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index c478b62417..aa3071c435 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -740,28 +740,29 @@ vm_map_entry_shadow(vm_map_entry_t entry)
 		length = 0x7FFFFFFF;
 	else
 		length = atop(entry->ba.end - entry->ba.start);
-	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
 
 	/*
 	 * Don't create the new object if the old object isn't shared.
+	 * This case occurs quite often when programs fork/exec/wait.
 	 *
 	 * Caller ensures source exists (all backing_ba's must have objects),
 	 * typically indirectly by virtue of the NEEDS_COPY flag being set.
+	 * We have a ref on source by virtue of the entry and do not need
+	 * to lock it to do this test.
 	 */
 	source = entry->ba.object;
 	KKASSERT(source);
-	vm_object_hold_shared(source);
 
 	if (source->type != OBJT_VNODE) {
 		if (source->ref_count == 1 &&
 		    source->handle == NULL &&
 		    (source->type == OBJT_DEFAULT ||
 		     source->type == OBJT_SWAP)) {
-			vm_object_drop(source);
-			kfree(ba, M_MAP_BACKING);
 			goto done;
 		}
 	}
+	ba = kmalloc(sizeof(*ba), M_MAP_BACKING, M_INTWAIT); /* copied later */
+	vm_object_hold_shared(source);
 
 	/*
 	 * Once it becomes part of a backing_ba chain it can wind up anywhere,
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 583619dd14..9da2327b7a 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -106,6 +106,9 @@ struct vm_page_hash_elm {
 	int	unused01;
 };
 
+#define VM_PAGE_HASH_SET	4		/* power of 2, set-assoc */
+#define VM_PAGE_HASH_MAX	(1024 * 1024)	/* power of 2, max size */
+
 /*
  * SET - Minimum required set associative size, must be a power of 2.  We
  * want this to match or exceed the set-associativeness of the cpu,
@@ -130,8 +133,9 @@ static struct alist vm_contig_alist;
 static struct almeta vm_contig_ameta[ALIST_RECORDS_65536];
 static struct spinlock vm_contig_spin =
 	SPINLOCK_INITIALIZER(&vm_contig_spin, "vm_contig_spin");
 
-static struct vm_page_hash_elm *vm_page_hash;
+__read_mostly static int vm_page_hash_vnode_only;
 __read_mostly static int vm_page_hash_size;
+__read_mostly static struct vm_page_hash_elm *vm_page_hash;
 static u_long vm_dma_reserved = 0;
 TUNABLE_ULONG("vm.dma_reserved", &vm_dma_reserved);
@@ -140,6 +144,14 @@ SYSCTL_ULONG(_vm, OID_AUTO, dma_reserved, CTLFLAG_RD, &vm_dma_reserved, 0,
 SYSCTL_UINT(_vm, OID_AUTO, dma_free_pages, CTLFLAG_RD,
 	    &vm_contig_alist.bl_free, 0, "Memory reserved for DMA");
 
+SYSCTL_INT(_vm, OID_AUTO, page_hash_vnode_only, CTLFLAG_RW,
+	    &vm_page_hash_vnode_only, 0, "Only hash vnode pages");
+#if 0
+static int vm_page_hash_debug;
+SYSCTL_INT(_vm, OID_AUTO, page_hash_debug, CTLFLAG_RW,
+	    &vm_page_hash_debug, 0, "Debug the vm_page hash table");
+#endif
+
 static int vm_contig_verbose = 0;
 TUNABLE_INT("vm.contig_verbose", &vm_contig_verbose);
 
@@ -706,15 +718,18 @@ vm_page_startup_finish(void *dummy __unused)
 	 * Set the set_assoc_mask based on the fitted number of CPUs.
	 * This is a mask, so we subject 1.
 	 *
-	 * w/PQ_L2_SIZE = 1024:
+	 * w/PQ_L2_SIZE = 1024, don't let the associativity drop below 8.
+	 * So if we have 256 CPUs, two hyper-threads will wind up sharing.
 	 *
-	 * Don't let the associativity drop below 8.  So if we have
-	 * 256 CPUs, two hyper-threads will wind up sharing.  The
-	 * maximum is PQ_L2_SIZE.
+	 * The maximum is PQ_L2_SIZE.  However, we limit the starting
+	 * maximum to 16 (mask = 15) in order to improve the cache locality
+	 * of related kernel data structures.
 	 */
 	mask = PQ_L2_SIZE / ncpus_fit - 1;
 	if (mask < 7)		/* minimum is 8-way w/256 CPU threads */
 		mask = 7;
+	if (mask < 15)
+		mask = 15;
 	cpu_ccfence();
 	set_assoc_mask = mask;
 
@@ -795,8 +810,8 @@ vm_page_startup_finish(void *dummy __unused)
 	vm_page_hash_size = 4096;
 	while (vm_page_hash_size < (vm_page_array_size / 16))
 		vm_page_hash_size <<= 1;
-	if (vm_page_hash_size > 1024*1024)
-		vm_page_hash_size = 1024*1024;
+	if (vm_page_hash_size > VM_PAGE_HASH_MAX)
+		vm_page_hash_size = VM_PAGE_HASH_MAX;
 
 	/*
 	 * hash table for vm_page_lookup_quick()
@@ -1500,9 +1515,12 @@ vm_page_hash_hash(vm_object_t object, vm_pindex_t pindex)
 {
 	size_t hi;
 
-	hi = ((object->pg_color << 8) ^ (uintptr_t)object) + (pindex << 2);
-	hi &= vm_page_hash_size - 1;
-	hi &= ~3;
+	/* mix it up */
+	hi = (intptr_t)object ^ object->pg_color ^ pindex;
+	hi += object->pg_color * pindex;
+	hi = hi ^ (hi >> 20);
+	hi &= vm_page_hash_size - 1;		/* bounds */
+	hi &= ~(VM_PAGE_HASH_SET - 1);		/* set-assoc */
 	return (&vm_page_hash[hi]);
 }
 
@@ -1522,7 +1540,7 @@ vm_page_hash_get(vm_object_t object, vm_pindex_t pindex)
 	if (vm_page_hash == NULL)
 		return NULL;
 	mp = vm_page_hash_hash(object, pindex);
-	for (i = 0; i < 4; ++i) {
+	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
 		m = mp[i].m;
 		cpu_ccfence();
 		if (m == NULL)
@@ -1552,28 +1570,49 @@ vm_page_hash_enter(vm_page_t m)
 	struct vm_page_hash_elm *best;
 	int i;
 
-	if (vm_page_hash &&
-	    m > &vm_page_array[0] &&
-	    m < &vm_page_array[vm_page_array_size]) {
-		mp = vm_page_hash_hash(m->object, m->pindex);
-		best = mp;
-		for (i = 0; i < 4; ++i) {
-			if (mp[i].m == m) {
-				mp[i].ticks = ticks;
-				return;
-			}
+	/*
+	 * Only enter type-stable vm_pages with well-shared objects.
+	 */
+	if (vm_page_hash == NULL ||
+	    m < &vm_page_array[0] ||
+	    m >= &vm_page_array[vm_page_array_size])
+		return;
+	if (m->object == NULL)
+		return;
+#if 0
+	/*
+	 * Disabled at the moment, there are some degenerate conditions
+	 * with often-exec'd programs that get ignored.  In particular,
+	 * the kernel's elf loader does a vn_rdwr() on the first page of
+	 * a binary.
+	 */
+	if (m->object->ref_count <= 2 || (m->object->flags & OBJ_ONEMAPPING))
+		return;
+#endif
+	if (vm_page_hash_vnode_only && m->object->type != OBJT_VNODE)
+		return;
 
-			/*
-			 * The best choice is the oldest entry
-			 */
-			if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
-			    (int)(ticks - mp[i].ticks) < 0) {
-				best = &mp[i];
-			}
+	/*
+	 * Locate the set for this page and enter it, replacing the oldest slot.
+	 */
+	mp = vm_page_hash_hash(m->object, m->pindex);
+	best = mp;
+	for (i = 0; i < VM_PAGE_HASH_SET; ++i) {
+		if (mp[i].m == m) {
+			mp[i].ticks = ticks;
+			return;
+		}
+
+		/*
+		 * The best choice is the oldest entry
+		 */
+		if ((ticks - best->ticks) < (ticks - mp[i].ticks) ||
+		    (int)(ticks - mp[i].ticks) < 0) {
+			best = &mp[i];
 		}
-		best->m = m;
-		best->ticks = ticks;
 	}
+	best->m = m;
+	best->ticks = ticks;
 }
 
 /*
@@ -1902,12 +1941,17 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
 	struct vpgqueues *pq;
 	vm_page_t m = NULL;
 	int pqmask = set_assoc_mask >> 1;
-	int pqstart = 0;
 	int pqi;
-	int i;
+	int range;
+	int skip_start;
+	int skip_next;
+	int count;
 
 	index &= PQ_L2_MASK;
 	pq = &vm_page_queues[basequeue];
+	count = 0;
+	skip_start = -1;
+	skip_next = -1;
 
 	/*
 	 * Run local sets of 16, 32, 64, 128, up to the entire queue if all
@@ -1922,9 +1966,16 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
 	 */
 	do {
 		pqmask = (pqmask << 1) | 1;
-		for (i = pqstart; i <= pqmask; ++i) {
-			pqi = (index & ~pqmask) | ((index + i) & pqmask);
-			if (TAILQ_FIRST(&pq[pqi].pl)) {
+
+		pqi = index;
+		range = pqmask + 1;
+
+		while (range > 0) {
+			if (pqi >= skip_start && pqi < skip_next) {
+				range -= skip_next - pqi;
+				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
+			}
+			if (range > 0 && TAILQ_FIRST(&pq[pqi].pl)) {
 				spin_lock(&pq[pqi].spin);
 				TAILQ_FOREACH(m, &pq[pqi].pl, pageq) {
 					if (spin_trylock(&m->spin) == 0)
@@ -1936,14 +1987,18 @@ _vm_page_list_find2(int basequeue, int index, int *lastp)
 					 * If we had to wander too far, set
 					 * *lastp to skip past empty queues.
 					 */
-					if (i >= 8)
+					if (count >= 8)
 						*lastp = pqi & PQ_L2_MASK;
 					return(m);
 				}
 				spin_unlock(&pq[pqi].spin);
 			}
+			--range;
+			++count;
+			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
 		}
-		pqstart = i;
+		skip_start = pqi & ~pqmask;
+		skip_next = (pqi | pqmask) + 1;
 	} while (pqmask != PQ_L2_MASK);
 
 	return(m);
diff --git a/test/debug/vmpageinfo.c b/test/debug/vmpageinfo.c
index 77419afe99..c7d0a042df 100644
--- a/test/debug/vmpageinfo.c
+++ b/test/debug/vmpageinfo.c
@@ -1,7 +1,7 @@
 /*
  * VMPAGEINFO.C
  *
- * cc -I/usr/src/sys vmpageinfo.c -o /usr/local/bin/vmpageinfo -lkvm
+ * cc -I/usr/src/sys vmpageinfo.c -o ~/bin/vmpageinfo -lkvm
  *
  * vmpageinfo
  *
-- 
2.11.4.GIT
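
A minimal userland sketch of the new vm_page_hash_hash() mixing and set
selection follows.  It is illustrative only and is not part of the patch:
struct obj, HASH_SIZE, HASH_SET, and hash_index() are stand-ins invented
here; only the body of hash_index() mirrors the kernel change above.

/*
 * Userland sketch (not kernel code).  Mirrors the mixing and
 * set-associative masking added to vm_page_hash_hash(): the object
 * pointer, pg_color, and pindex are mixed, high bits are folded down,
 * the result is bounded to the table size, and the low bits are
 * cleared so the index points at the start of a 4-entry set.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define HASH_SIZE	(64 * 1024)	/* stand-in for vm_page_hash_size */
#define HASH_SET	4		/* mirrors VM_PAGE_HASH_SET */

struct obj {				/* stand-in for vm_object */
	int	pg_color;
};

static size_t
hash_index(struct obj *object, uint64_t pindex)
{
	size_t hi;

	/* mix it up */
	hi = (intptr_t)object ^ object->pg_color ^ pindex;
	hi += (size_t)object->pg_color * pindex;
	hi = hi ^ (hi >> 20);
	hi &= HASH_SIZE - 1;			/* bounds */
	hi &= ~((size_t)HASH_SET - 1);		/* set-assoc */

	return hi;
}

int
main(void)
{
	struct obj o = { .pg_color = 5 };
	uint64_t pindex;

	/* adjacent pindexes should spread across different sets */
	for (pindex = 0; pindex < 8; ++pindex) {
		printf("pindex %llu -> set base %zu\n",
		       (unsigned long long)pindex, hash_index(&o, pindex));
	}
	return 0;
}

Clearing the low bits in the last step is what makes the table
set-associative: vm_page_hash_get() and vm_page_hash_enter() both start
at the returned element and scan VM_PAGE_HASH_SET consecutive entries,
so a lookup probes the same small, cache-friendly set that the enter
path filled.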
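
The reworked _vm_page_list_find2() scan can likewise be mocked up in
userland, much as the commit message describes.  The sketch below is
illustrative only: the PQ_L2_SIZE and set_assoc_mask values, scan(), and
visited[] are stand-ins, and the per-queue spinlock/trylock work is
omitted.  It reproduces just the doubling-mask walk and the
skip_start/skip_next bookkeeping to show that, with all queues empty, no
queue index is ever examined twice.

/*
 * Userland mock (not kernel code) of the reworked _vm_page_list_find2()
 * scan order: start with a small aligned block around the requested
 * index, then keep doubling the block size while skipping the sub-block
 * the previous pass already covered.
 */
#include <assert.h>
#include <stdio.h>

#define PQ_L2_SIZE	1024			/* stand-in queue count */
#define PQ_L2_MASK	(PQ_L2_SIZE - 1)

static int set_assoc_mask = 15;			/* stand-in: 16-way sets */
static int visited[PQ_L2_SIZE];

static void
scan(int index)
{
	int pqmask = set_assoc_mask >> 1;
	int pqi;
	int range;
	int skip_start = -1;
	int skip_next = -1;

	index &= PQ_L2_MASK;
	do {
		pqmask = (pqmask << 1) | 1;

		pqi = index;
		range = pqmask + 1;

		while (range > 0) {
			/* skip the block handled by the previous pass */
			if (pqi >= skip_start && pqi < skip_next) {
				range -= skip_next - pqi;
				pqi = (pqi & ~pqmask) | (skip_next & pqmask);
			}
			if (range > 0)
				++visited[pqi];	/* kernel probes pq[pqi] here */
			--range;
			pqi = (pqi & ~pqmask) | ((pqi + 1) & pqmask);
		}
		skip_start = pqi & ~pqmask;
		skip_next = (pqi | pqmask) + 1;
	} while (pqmask != PQ_L2_MASK);
}

int
main(void)
{
	int i;

	scan(37);				/* arbitrary starting index */
	for (i = 0; i < PQ_L2_SIZE; ++i)
		assert(visited[i] == 1);
	printf("each of the %d queues examined exactly once\n", PQ_L2_SIZE);
	return 0;
}

Recording the block just covered in skip_start/skip_next lets each
doubling pass restart from the faulting index yet only touch the half of
the enlarged block that earlier passes did not visit, which is what the
commit message means by avoiding all duplicate queue checks.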