From df49ec1e8e3e6cf6347f36d778f14b9c9baaff2d Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Tue, 21 May 2019 13:55:43 -0700
Subject: [PATCH] kernel - VM rework part 18 - Cleanup

* Significantly reduce the zone limit for pvzone (for pmap pv_entry
  structures).  pv_entry's are no longer allocated on a per-page basis
  so the limit can be made much smaller.

  This also has the effect of reducing the per-cpu cache limit, which
  ultimately stabilizes wired memory use for the zone.

* Also reduce the generic per-cpu cache limit for zones.  This only
  really affects the pvzone.

* Make pvzone, mapentzone, and swap_zone __read_mostly.

* Enhance vmstat -z to report current structural use and actual total
  memory use.

* Also clean up the copyright statement for vm/vm_zone.c.  John Dyson's
  original copyright was slightly different than the BSD copyright and
  stipulated no changes, so separate out the DragonFly addendum.
---
 sys/config/LINT64               | 14 -----------
 sys/platform/pc64/conf/options  |  1 -
 sys/platform/pc64/x86_64/pmap.c | 42 ++++++++++++++++++++------------
 sys/vm/swap_pager.c             |  2 +-
 sys/vm/vm_map.c                 |  2 +-
 sys/vm/vm_zone.c                | 46 +++++++++++++++++++++++++++++++----
 usr.bin/vmstat/vmstat.c         | 54 +++++++++++++++++++++++++++--------------
 7 files changed, 105 insertions(+), 56 deletions(-)

diff --git a/sys/config/LINT64 b/sys/config/LINT64
index e5eb5df277..99eccc3108 100644
--- a/sys/config/LINT64
+++ b/sys/config/LINT64
@@ -1720,20 +1720,6 @@ options 	BOOTP_NFSROOT	# NFS mount root filesystem using BOOTP info
 options 	BOOTP_COMPAT	# Workaround for broken bootp daemons.
 options 	BOOTP_WIRED_TO=fxp0	# Use interface fxp0 for BOOTP
 
-#
-# Set the number of PV entries per process.  Increasing this can
-# stop panics related to heavy use of shared memory.  However, that can
-# (combined with large amounts of physical memory) cause panics at
-# boot time due the kernel running out of VM space.
-#
-# If you're tweaking this, you might also want to increase the sysctls
-# "vm.v_free_min", "vm.v_free_reserved", and "vm.v_free_target".
-#
-# The value below is the one more than the default.
-#
-options 	PMAP_SHPGPERPROC=201
-
-#
 # Disable swapping.  This option removes all code which actually performs
 # swapping, so it's not possible to turn it back on at run-time.
 #
diff --git a/sys/platform/pc64/conf/options b/sys/platform/pc64/conf/options
index 8ef0cc23c7..934805992f 100644
--- a/sys/platform/pc64/conf/options
+++ b/sys/platform/pc64/conf/options
@@ -1,4 +1,3 @@
-PMAP_SHPGPERPROC	opt_pmap.h
 PPC_PROBE_CHIPSET	opt_ppc.h
 PPC_DEBUG		opt_ppc.h
 MAXMEM
diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index f8cfdb0cf3..cf5cdbd55f 100644
--- a/sys/platform/pc64/x86_64/pmap.c
+++ b/sys/platform/pc64/x86_64/pmap.c
@@ -96,9 +96,6 @@
 #include
 
 #define PMAP_KEEP_PDIRS
-#ifndef PMAP_SHPGPERPROC
-#define PMAP_SHPGPERPROC 2000
-#endif
 
 #if defined(DIAGNOSTIC)
 #define PMAP_DIAGNOSTIC
@@ -247,10 +244,9 @@ static uint64_t DMPDPphys;	/* phys addr of direct mapped level 3 */
 /*
  * Data for the pv entry allocation mechanism
  */
-static vm_zone_t pvzone;
+__read_mostly static vm_zone_t pvzone;
+__read_mostly static int pmap_pagedaemon_waken = 0;
 static struct vm_zone pvzone_store;
-static vm_pindex_t pv_entry_max=0, pv_entry_high_water=0;
-static int pmap_pagedaemon_waken = 0;
 static struct pv_entry *pvinit;
 
 /*
@@ -321,6 +317,10 @@ static int pmap_pv_debug = 50;
 SYSCTL_INT(_machdep, OID_AUTO, pmap_pv_debug, CTLFLAG_RW,
 	   &pmap_pv_debug, 0, "");
 
+static long vm_pmap_pv_entries;
+SYSCTL_LONG(_vm, OID_AUTO, pmap_pv_entries, CTLFLAG_RD,
+	    &vm_pmap_pv_entries, 0, "");
+
 /* Standard user access funtions */
 extern int std_copyinstr (const void *udaddr, void *kaddr, size_t len,
 	   size_t *lencopied);
@@ -1363,20 +1363,29 @@ static void dump_pmap(pmap_t pmap, pt_entry_t pte, int level, vm_offset_t base);
 void
 pmap_init2(void)
 {
-	vm_pindex_t shpgperproc = PMAP_SHPGPERPROC;
 	vm_pindex_t entry_max;
 
-	TUNABLE_LONG_FETCH("vm.pmap.shpgperproc", &shpgperproc);
-	pv_entry_max = shpgperproc * maxproc + vm_page_array_size;
-	TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &pv_entry_max);
-	pv_entry_high_water = 9 * (pv_entry_max / 10);
+	/*
+	 * We can significantly reduce pv_entry_max from historical
+	 * levels because pv_entry's are no longer used for PTEs at the
+	 * leaves.  This prevents excessive pcpu caching on many-core
+	 * boxes (even with the further '/ 64' done in zinitna()).
+	 *
+	 * Remember, however, that processes can share physical pages
+	 * with each process still needing the pdp/pd/pt infrastructure
+	 * (which still use pv_entry's).  And don't just assume that
+	 * every PT will be completely filled up.  So don't make it
+	 * too small.
+	 */
+	entry_max = maxproc * 32 + vm_page_array_size / 16;
+	TUNABLE_LONG_FETCH("vm.pmap.pv_entries", &entry_max);
+	vm_pmap_pv_entries = entry_max;
 
 	/*
 	 * Subtract out pages already installed in the zone (hack)
 	 */
-	entry_max = pv_entry_max - vm_page_array_size;
-	if (entry_max <= 0)
-		entry_max = 1;
+	if (entry_max <= MINPV)
+		entry_max = MINPV;
 
 	zinitna(pvzone, NULL, 0, entry_max, ZONE_INTERRUPT);
@@ -3840,8 +3849,9 @@
 		return;
 	pmap_pagedaemon_waken = 0;
 	if (warningdone < 5) {
-		kprintf("pmap_collect: collecting pv entries -- "
-			"suggest increasing PMAP_SHPGPERPROC\n");
+		kprintf("pmap_collect: pv_entries exhausted -- "
+			"suggest increasing vm.pmap_pv_entries above %ld\n",
+			vm_pmap_pv_entries);
 		warningdone++;
 	}
 
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 5204e8a3eb..21d02c216f 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -206,7 +206,7 @@ SYSCTL_INT(_vm, OID_AUTO, swap_size,
 SYSCTL_INT(_vm, OID_AUTO, report_swap_allocs,
 	   CTLFLAG_RW, &vm_report_swap_allocs, 0, "");
 
-vm_zone_t swap_zone;
+__read_mostly vm_zone_t swap_zone;
 
 /*
  * Red-Black tree for swblock entries
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index aa3071c435..22be1506f8 100644
--- a/sys/vm/vm_map.c
+++ b/sys/vm/vm_map.c
@@ -138,7 +138,7 @@ static struct objcache *vmspace_cache;
 	((((entry)->ba.start ^ (entry)->ba.end) & ~MAP_ENTRY_PARTITION_MASK) == 0)
 
 static struct vm_zone mapentzone_store;
-static vm_zone_t mapentzone;
+__read_mostly static vm_zone_t mapentzone;
 
 static struct vm_map_entry map_entry_init[MAX_MAPENT];
 static struct vm_map_entry cpu_map_entry_init_bsp[MAPENTRYBSP_CACHE];
diff --git a/sys/vm/vm_zone.c b/sys/vm/vm_zone.c
index e730f0eac1..1172dbe3cd 100644
--- a/sys/vm/vm_zone.c
+++ b/sys/vm/vm_zone.c
@@ -1,8 +1,5 @@
 /*
- * (MPSAFE)
- *
- * Copyright (c) 1997, 1998 John S. Dyson
- * All rights reserved.
+ * Copyright (c) 1997, 1998 John S. Dyson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -14,6 +11,38 @@
  * John S. Dyson.
  *
  * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
+ *
+ * Copyright (c) 2003-2017,2019 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  */
 
 #include
@@ -303,7 +332,6 @@ zinitna(vm_zone_t z, char *name, size_t size, long nentries, uint32_t flags)
 		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT |
 				VM_ALLOC_NORMAL | VM_ALLOC_RETRY;
 		z->zmax += nentries;
-		z->zmax_pcpu = z->zmax / ncpus / 16;
 
 	/*
 	 * Set reasonable pcpu cache bounds.  Low-memory systems
@@ -313,6 +341,7 @@
 	 * In particular, pvzone can wind up being excessive and
 	 * waste memory unnecessarily.
 	 */
+	z->zmax_pcpu = z->zmax / ncpus / 64;
 	if (z->zmax_pcpu < 1024)
 		z->zmax_pcpu = 1024;
 	if (z->zmax_pcpu * z->zsize > 16*1024*1024)
@@ -530,10 +559,15 @@ zget(vm_zone_t z)
 		noffset = (size_t)z->zpagecount * PAGE_SIZE;
 		/* noffset -= noffset % z->zsize; */
 		savezpc = z->zpagecount;
+
+		/*
+		 * Track total memory use and kmem offset.
+		 */
 		if (z->zpagecount + nalloc > z->zpagemax)
 			z->zpagecount = z->zpagemax;
 		else
 			z->zpagecount += nalloc;
+
 		item = (char *)z->zkva + noffset;
 		npages = z->zpagecount - savezpc;
 		nitems = ((size_t)(savezpc + npages) * PAGE_SIZE - noffset) /
@@ -570,6 +604,7 @@
 	 * by vm_map_entry_reserve_cpu_init().
 	 */
 	nbytes = (size_t)z->zalloc * PAGE_SIZE;
+	z->zpagecount += z->zalloc;	/* Track total memory use */
 	item = (void *)kmem_alloc3(&kernel_map, nbytes,
 				   VM_SUBSYS_ZALLOC, KM_KRESERVE);
 
@@ -587,6 +622,7 @@
 	 * Otherwise allocate KVA from the kernel_map.
 	 */
 	nbytes = (size_t)z->zalloc * PAGE_SIZE;
+	z->zpagecount += z->zalloc;	/* Track total memory use */
 	item = (void *)kmem_alloc3(&kernel_map, nbytes,
 				   VM_SUBSYS_ZALLOC, 0);
diff --git a/usr.bin/vmstat/vmstat.c b/usr.bin/vmstat/vmstat.c
index b06a9177bc..06279f0a5c 100644
--- a/usr.bin/vmstat/vmstat.c
+++ b/usr.bin/vmstat/vmstat.c
@@ -1064,12 +1064,16 @@ dozmem(u_int interval, int reps)
 	struct zlist	zlist;
 	struct vm_zone	*kz;
 	struct vm_zone	zone;
-	struct vm_zone	copy;
 	struct vm_zone	save[MAXSAVE];
+	long zfreecnt_prev;
+	long znalloc_prev;
+	long zfreecnt_next;
+	long znalloc_next;
 	char name[64];
 	size_t namesz;
-	int i;
 	int first = 1;
+	int i;
+	int n;
 
 	bzero(save, sizeof(save));
@@ -1084,9 +1088,21 @@ again:
 			perror("kvm_read");
 			break;
 		}
-		copy = zone;
-		zone.znalloc -= save[i].znalloc;
-		save[i] = copy;
+		zfreecnt_prev = save[i].zfreecnt;
+		znalloc_prev = save[i].znalloc;
+		for (n = 0; n < SMP_MAXCPU; ++n) {
+			zfreecnt_prev += save[i].zpcpu[n].zfreecnt;
+			znalloc_prev += save[i].zpcpu[n].znalloc;
+		}
+
+		zfreecnt_next = zone.zfreecnt;
+		znalloc_next = zone.znalloc;
+		for (n = 0; n < SMP_MAXCPU; ++n) {
+			zfreecnt_next += zone.zpcpu[n].zfreecnt;
+			znalloc_next += zone.zpcpu[n].znalloc;
+		}
+		save[i] = zone;
+
 		namesz = sizeof(name);
 		if (kvm_readstr(kd, (intptr_t)zone.zname, name, &namesz) == NULL) {
 			perror("kvm_read");
@@ -1095,25 +1111,27 @@ again:
 		if (first && interval) {
 			/* do nothing */
 		} else if (zone.zmax) {
-			printf("%-10s %9ld/%9ld %5ldM used"
-			       " use=%-9lu %6.2f%%\n",
+			printf("%-10s %9ld / %-9ld %5ldM used"
+			       " %6.2f%% ",
 			       name,
-			       (long)(zone.ztotal - zone.zfreecnt),
+			       (long)(zone.ztotal - zfreecnt_next),
 			       (long)zone.zmax,
-			       (long)(zone.ztotal - zone.zfreecnt) *
-					zone.zsize / (1024 * 1024),
-			       (unsigned long)zone.znalloc,
-			       (double)(zone.ztotal - zone.zfreecnt) *
+			       (long)zone.zpagecount * 4096 / (1024 * 1024),
+			       (double)(zone.ztotal - zfreecnt_next) *
 					100.0 / (double)zone.zmax);
 		} else {
-			printf("%-10s %9ld %5ldM used"
-			       " use=%-9lu\n",
+			printf("%-10s %9ld %5ldM used"
+			       " ",
 			       name,
-			       (long)(zone.ztotal - zone.zfreecnt),
-			       (long)(zone.ztotal - zone.zfreecnt) *
-					zone.zsize / (1024 * 1024),
-			       (unsigned long)zone.znalloc);
+			       (long)(zone.ztotal - zfreecnt_next),
+			       (long)(zone.ztotal - zfreecnt_next) *
+					zone.zsize / (1024 * 1024));
 		}
+		if (first == 0) {
+			printf("use=%ld\n", znalloc_next - znalloc_prev);
+		} else if (interval == 0)
+			printf("\n");
+
 		kz = LIST_NEXT(&zone, zlink);
 		++i;
 	}
-- 
2.11.4.GIT
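
Illustration (not part of the patch): pmap_init2() now budgets pv_entry structures as maxproc * 32 + vm_page_array_size / 16 instead of scaling the limit by PMAP_SHPGPERPROC pages per process.  A minimal standalone sketch of that arithmetic follows; maxproc, vm_page_array_size, and the MINPV floor are made-up example values, and the vm.pmap.pv_entries tunable override is reduced to a comment.

/*
 * Sketch of the pv_entry zone budget computed by pmap_init2() after
 * this patch.  Example numbers only; MINPV is an assumed stand-in for
 * the kernel's constant.
 */
#include <stdio.h>

#define MINPV 2048			/* assumed floor */

int main(void)
{
	long maxproc = 32768;		/* example: kern.maxproc */
	long vm_page_array_size = 4194304; /* example: 16GB in 4K pages */

	/* New formula: no longer proportional to pages-per-process. */
	long entry_max = maxproc * 32 + vm_page_array_size / 16;

	/* TUNABLE_LONG_FETCH("vm.pmap.pv_entries", ...) would override. */
	if (entry_max <= MINPV)
		entry_max = MINPV;

	printf("pv_entry zone limit: %ld entries\n", entry_max);
	return 0;
}

For these example values the budget comes to 1310720 entries, far below the old shpgperproc * maxproc + vm_page_array_size sizing.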
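Illustration (not part of the patch): zinitna() now divides the per-cpu cache bound by 64 instead of 16 and then clamps it.  A sketch of just that clamp, feeding in the entry_max from the previous example; note the assignment under the 16MB test is an assumption, since the hunk above ends at the comparison.

/*
 * Sketch of the per-cpu cache clamp applied by zinitna() after this
 * patch.  zmax, zsize, and ncpus are example inputs, not kernel state;
 * the 16MB cap's assignment line is assumed from context.
 */
#include <stdio.h>

int main(void)
{
	long zmax = 1310720;	/* example zone-wide item limit */
	long zsize = 112;	/* example pv_entry size in bytes */
	long ncpus = 64;

	long zmax_pcpu = zmax / ncpus / 64;	/* was "/ 16" */
	if (zmax_pcpu < 1024)
		zmax_pcpu = 1024;		/* floor */
	if (zmax_pcpu * zsize > 16*1024*1024)
		zmax_pcpu = 16*1024*1024 / zsize;	/* assumed cap */

	printf("pcpu cache limit: %ld items (%ld KB)\n",
	       zmax_pcpu, zmax_pcpu * zsize / 1024);
	return 0;
}

On a 64-cpu box this lands on the 1024-item floor (about 112KB per cpu), which is why the smaller zone limit stabilizes wired memory use.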
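Illustration (not part of the patch): the vmstat -z change folds the per-cpu cache counters into the global ones before printing, and derives "use=" from the delta of total allocations between samples instead of mutating the saved copy.  The structs below are simplified stand-ins for the kernel's struct vm_zone; SMP_MAXCPU and the sample values are examples.

/*
 * Sketch of the counter aggregation the patched dozmem() performs.
 */
#include <stdio.h>

#define SMP_MAXCPU 8		/* stand-in; the real constant is larger */

struct pcpu_counters { long zfreecnt, znalloc; };
struct zone_sample {
	long zfreecnt, znalloc, ztotal;
	struct pcpu_counters zpcpu[SMP_MAXCPU];
};

/* Fold the global counters and every per-cpu cache into totals. */
static void
aggregate(const struct zone_sample *z, long *freecnt, long *nalloc)
{
	*freecnt = z->zfreecnt;
	*nalloc = z->znalloc;
	for (int n = 0; n < SMP_MAXCPU; ++n) {
		*freecnt += z->zpcpu[n].zfreecnt;
		*nalloc += z->zpcpu[n].znalloc;
	}
}

int main(void)
{
	struct zone_sample prev = { 100, 5000, 10000,
		{ { 10, 40 }, { 5, 25 } } };	/* earlier pass (save[i]) */
	struct zone_sample next = { 80, 5600, 10000,
		{ { 12, 55 }, { 9, 30 } } };	/* current pass */
	long zfreecnt_prev, znalloc_prev, zfreecnt_next, znalloc_next;

	aggregate(&prev, &zfreecnt_prev, &znalloc_prev);
	aggregate(&next, &zfreecnt_next, &znalloc_next);

	/* "in use" counts structures; "use=" is allocations per interval */
	printf("in use: %ld   use=%ld\n",
	       next.ztotal - zfreecnt_next, znalloc_next - znalloc_prev);
	return 0;
}

Note also that the patched memory column reports actual wired footprint via zone.zpagecount, multiplied by a hard-coded 4096-byte page size.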