From 586c43085fc900273732f99de6c9ef43f73ded76 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 12 Aug 2017 12:24:16 -0700
Subject: [PATCH] kernel - Break up scheduler and loadavg callout

* Change the scheduler and loadavg callouts from cpu 0 to all cpus,
  and adjust allproc_scan() and alllwp_scan() to segment the hash
  table when asked.

  Every cpu is now tasked with handling the nominal scheduler recalc
  and nominal load calculation for a portion of the process list.
  The portion is unrelated to which cpu(s) the processes are actually
  scheduled on; it is strictly a way to spread the work around, split
  up by hash range.

* Significantly reduces cpu 0 stalls when a large number of user
  processes or threads are present (that is, in the tens of thousands
  or more).  In the test below, before this change, cpu 0 was
  straining under 40%+ interrupt load (from the callout).  After this
  change the load is spread across all cpus, approximately 1.5% per
  cpu.

* Tested with 400,000 running user processes on a 32-thread
  dual-socket xeon (yes, these numbers are real):

  12:27PM  up 8 mins, 3 users, load avg: 395143.28, 270541.13, 132638.33
  12:33PM  up 14 mins, 3 users, load avg: 399496.57, 361405.54, 225669.14

* NOTE: There are still a number of other non-segmented allproc scans
  in the system, particularly related to paging and swapping.

* NOTE: Further spreading-out of the work may be needed, using a more
  frequent callout and a smaller hash index range for each pass.
---
 sys/kern/imgact_elf.c         |  2 +-
 sys/kern/init_main.c          |  2 +-
 sys/kern/kern_descrip.c       |  4 +--
 sys/kern/kern_ktrace.c        |  4 +--
 sys/kern/kern_proc.c          | 30 ++++++++++++++++---
 sys/kern/kern_resource.c      |  8 ++---
 sys/kern/kern_sig.c           |  2 +-
 sys/kern/kern_synch.c         | 70 +++++++++++++++++++++++++++++++------------
 sys/kern/vfs_syscalls.c       |  4 +--
 sys/sys/globaldata.h          |  8 ++++-
 sys/sys/proc.h                |  6 ++--
 sys/vfs/procfs/procfs_vnops.c |  2 +-
 sys/vm/vm_glue.c              |  7 +++--
 sys/vm/vm_meter.c             |  2 +-
 sys/vm/vm_object.c            |  2 +-
 sys/vm/vm_pageout.c           |  4 +--
 16 files changed, 111 insertions(+), 46 deletions(-)

diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 84b3e1650c..5ece1375e4 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -200,7 +200,7 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry)
 
 	info.rval = FALSE;
 	info.entry = entry;
-	allproc_scan(elf_brand_inuse_callback, &info);
+	allproc_scan(elf_brand_inuse_callback, &info, 0);
 	return (info.rval);
 }
 
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 4930f36eb5..2d89f222af 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -467,7 +467,7 @@ proc0_post(void *dummy __unused)
 	 * Now we can look at the time, having had a chance to verify the
 	 * time from the file system.  Pretend that proc0 started now.
 	 */
-	allproc_scan(proc0_post_callback, NULL);
+	allproc_scan(proc0_post_callback, NULL, 0);
 
 	/*
 	 * Give the ``random'' number generator a thump.
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index d3bea26d02..402f845f75 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1408,7 +1408,7 @@ fdrevoke(void *f_data, short f_type, struct ucred *cred)
 	 * the FREVOKED already set in the fp and do the right thing.
 	 */
 	if (info.found)
-		allproc_scan(fdrevoke_proc_callback, &info);
+		allproc_scan(fdrevoke_proc_callback, &info, 0);
 	fdrop(info.nfp);
 	return(0);
 }
@@ -2782,7 +2782,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 	info.count = 0;
 	info.error = 0;
 	info.req = req;
-	allproc_scan(sysctl_kern_file_callback, &info);
+	allproc_scan(sysctl_kern_file_callback, &info, 0);
 
 	/*
 	 * When just calculating the size, overestimate a bit to try to
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index fdddb895a2..7f9d1bb663 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -302,7 +302,7 @@ sys_ktrace(struct ktrace_args *uap)
 		info.tracenode = tracenode;
 		info.error = 0;
 		info.rootclear = 0;
-		allproc_scan(ktrace_clear_callback, &info);
+		allproc_scan(ktrace_clear_callback, &info, 0);
 		error = info.error;
 		goto done;
 	}
@@ -616,7 +616,7 @@ ktrwrite(struct lwp *lp, struct ktr_header *kth, struct uio *uio)
 			info.tracenode = tracenode;
 			info.error = 0;
 			info.rootclear = 1;
-			allproc_scan(ktrace_clear_callback, &info);
+			allproc_scan(ktrace_clear_callback, &info, 0);
 		}
 		ktrdestroy(&tracenode);
 	}
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 6758d9561f..e6f876de61 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -1254,19 +1254,30 @@ proc_userunmap(struct proc *p)
  * No requirements.
  */
 void
-allproc_scan(int (*callback)(struct proc *, void *), void *data)
+allproc_scan(int (*callback)(struct proc *, void *), void *data, int segmented)
 {
 	int limit = nprocs + ncpus;
 	struct proc *p;
+	int ns;
+	int ne;
 	int r;
 	int n;
 
+	if (segmented) {
+		int id = mycpu->gd_cpuid;
+		ns = id * ALLPROC_HSIZE / ncpus;
+		ne = (id + 1) * ALLPROC_HSIZE / ncpus;
+	} else {
+		ns = 0;
+		ne = ALLPROC_HSIZE;
+	}
+
 	/*
 	 * prg->proc_token protects the allproc list and PHOLD() prevents the
 	 * process from being removed from the allproc list or the zombproc
 	 * list.
 	 */
-	for (n = 0; n < ALLPROC_HSIZE; ++n) {
+	for (n = ns; n < ne; ++n) {
 		procglob_t *prg = &procglob[n];
 		if (LIST_FIRST(&prg->allproc) == NULL)
 			continue;
@@ -1301,14 +1312,25 @@ allproc_scan(int (*callback)(struct proc *, void *), void *data)
  * No requirements.
  */
 void
-alllwp_scan(int (*callback)(struct lwp *, void *), void *data)
+alllwp_scan(int (*callback)(struct lwp *, void *), void *data, int segmented)
 {
 	struct proc *p;
 	struct lwp *lp;
+	int ns;
+	int ne;
 	int r = 0;
 	int n;
 
-	for (n = 0; n < ALLPROC_HSIZE; ++n) {
+	if (segmented) {
+		int id = mycpu->gd_cpuid;
+		ns = id * ALLPROC_HSIZE / ncpus;
+		ne = (id + 1) * ALLPROC_HSIZE / ncpus;
+	} else {
+		ns = 0;
+		ne = ALLPROC_HSIZE;
+	}
+
+	for (n = ns; n < ne; ++n) {
 		procglob_t *prg = &procglob[n];
 
 		if (LIST_FIRST(&prg->allproc) == NULL)
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index 1012cfb5cc..8d1cbbbd9a 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -134,7 +134,7 @@ sys_getpriority(struct getpriority_args *uap)
 			uap->who = curtd->td_ucred->cr_uid;
 		info.low = low;
 		info.who = uap->who;
-		allproc_scan(getpriority_callback, &info);
+		allproc_scan(getpriority_callback, &info, 0);
 		low = info.low;
 		break;
 
@@ -251,7 +251,7 @@ restart:
 		info.who = uap->who;
 		info.error = 0;
 		info.found = 0;
-		allproc_scan(setpriority_callback, &info);
+		allproc_scan(setpriority_callback, &info, 0);
 		error = info.error;
 		found = info.found;
 		break;
@@ -374,7 +374,7 @@ sys_ioprio_get(struct ioprio_get_args *uap)
 			uap->who = curtd->td_ucred->cr_uid;
 		info.high = high;
 		info.who = uap->who;
-		allproc_scan(ioprio_get_callback, &info);
+		allproc_scan(ioprio_get_callback, &info, 0);
 		high = info.high;
 		break;
 	default:
@@ -491,7 +491,7 @@ restart:
 		info.who = uap->who;
 		info.error = 0;
 		info.found = 0;
-		allproc_scan(ioprio_set_callback, &info);
+		allproc_scan(ioprio_set_callback, &info, 0);
 		error = info.error;
 		found = info.found;
 		break;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 294e67a2ee..26a8d32c77 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -672,7 +672,7 @@ dokillpg(int sig, int pgid, int all)
 		/*
 		 * broadcast
 		 */
-		allproc_scan(killpg_all_callback, &info);
+		allproc_scan(killpg_all_callback, &info, 0);
 	} else {
 		if (pgid == 0) {
 			/*
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index dcf2cf17eb..9fe1901faa 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -77,8 +77,6 @@ int	safepri;
 int	tsleep_now_works;
 int	tsleep_crypto_dump = 0;
 
-static struct callout loadav_callout;
-static struct callout schedcpu_callout;
 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
 
 #define __DEALL(ident)	__DEQUALIFY(void *, ident)
@@ -164,11 +162,13 @@ static int schedcpu_resource(struct proc *p, void *data __unused);
 static void
 schedcpu(void *arg)
 {
-	allproc_scan(schedcpu_stats, NULL);
-	allproc_scan(schedcpu_resource, NULL);
-	wakeup((caddr_t)&lbolt);
-	wakeup(lbolt_syncer);
-	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
+	allproc_scan(schedcpu_stats, NULL, 1);
+	allproc_scan(schedcpu_resource, NULL, 1);
+	if (mycpu->gd_cpuid == 0) {
+		wakeup((caddr_t)&lbolt);
+		wakeup(lbolt_syncer);
+	}
+	callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
 }
 
 /*
@@ -282,6 +282,27 @@ schedcpu_resource(struct proc *p, void *data __unused)
 }
 
 /*
+ * Setup and kick off the pcpu scheduler/loadavg callouts on all cpus.
+ */
+static void
+schedcpu_setup(void *arg)
+{
+	globaldata_t save_gd = mycpu;
+	globaldata_t gd;
+	int n;
+
+	for (n = 0; n < ncpus; ++n) {
+		gd = globaldata_find(n);
+		lwkt_setcpu_self(gd);
+		callout_init_mp(&gd->gd_loadav_callout);
+		callout_init_mp(&gd->gd_schedcpu_callout);
+		schedcpu(NULL);
+		loadav(NULL);
+	}
+	lwkt_setcpu_self(save_gd);
+}
+
+/*
  * This is only used by ps.  Generate a cpu percentage use over
  * a period of one second.
  */
@@ -1220,22 +1241,36 @@ tstop(void)
 
 /*
  * Compute a tenex style load average of a quantity on
- * 1, 5 and 15 minute intervals.
+ * 1, 5 and 15 minute intervals.  This is a pcpu callout.
+ *
+ * We segment the lwp scan on a pcpu basis.  This does NOT
+ * mean the associated lwps are on this cpu, it is done
+ * just to break the work up.
+ *
+ * The callout on cpu0 rolls up the stats from the other
+ * cpus.
  */
 static int loadav_count_runnable(struct lwp *p, void *data);
 
 static void
 loadav(void *arg)
 {
+	globaldata_t gd = mycpu;
 	struct loadavg *avg;
 	int i, nrun;
 
 	nrun = 0;
-	alllwp_scan(loadav_count_runnable, &nrun);
-	avg = &averunnable;
-	for (i = 0; i < 3; i++) {
-		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
-		    (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+	alllwp_scan(loadav_count_runnable, &nrun, 1);
+	gd->gd_loadav_nrunnable = nrun;
+	if (gd->gd_cpuid == 0) {
+		avg = &averunnable;
+		nrun = 0;
+		for (i = 0; i < ncpus; ++i)
+			nrun += globaldata_find(i)->gd_loadav_nrunnable;
+		for (i = 0; i < 3; i++) {
+			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
+			    (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+		}
 	}
 
 	/*
@@ -1243,7 +1278,8 @@ loadav(void *arg)
 	 * random variation to avoid synchronisation with processes that
 	 * run at regular intervals.
 	 */
-	callout_reset(&loadav_callout, hz * 4 + (int)(krandom() % (hz * 2 + 1)),
+	callout_reset(&gd->gd_loadav_callout,
+		      hz * 4 + (int)(krandom() % (hz * 2 + 1)),
 		      loadav, NULL);
 }
 
@@ -1283,12 +1319,8 @@ collect_load_callback(int n)
 static void
 sched_setup(void *dummy)
 {
-	callout_init_mp(&loadav_callout);
-	callout_init_mp(&schedcpu_callout);
 	kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
 			  KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));
 	/* Kick off timeout driven events by calling first time. */
-	schedcpu(NULL);
-	loadav(NULL);
+	schedcpu_setup(NULL);
 }
-
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 787bec0f1d..b24dccbd88 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -514,7 +514,7 @@ checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
 	info.old_nch = *old_nch;
 	info.new_nch = *new_nch;
 	info.new_vp = newdp;
-	allproc_scan(checkdirs_callback, &info);
+	allproc_scan(checkdirs_callback, &info, 0);
 	vput(newdp);
 }
 
@@ -745,7 +745,7 @@ dounmount(struct mount *mp, int flags)
 	cache_clearmntcache();
 	if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
 	    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
-		allproc_scan(&unmount_allproc_cb, mp);
+		allproc_scan(&unmount_allproc_cb, mp, 0);
 	}
 
 	cache_clearmntcache();
diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h
index 65a7bd6e8a..d6a6ee69e8 100644
--- a/sys/sys/globaldata.h
+++ b/sys/sys/globaldata.h
@@ -84,6 +84,9 @@
 #ifndef _SYS_SYSID_H_
 #include <sys/sysid.h>		/* sysid_t */
 #endif
+#ifndef _SYS_CALLOUT_H_
+#include <sys/callout.h>
+#endif
 
 /*
  * This structure maps out the global data that needs to be kept on a
@@ -182,7 +185,10 @@ struct globaldata {
 	uint64_t	gd_cpumask_offset;
 	struct vmstats	gd_vmstats;	/* pcpu local copy of vmstats */
 	struct vmstats	gd_vmstats_adj;	/* pcpu adj for vmstats */
-	uint64_t	gd_reserved64[1];
+	struct callout	gd_loadav_callout;	/* loadavg calc */
+	struct callout	gd_schedcpu_callout;	/* scheduler/stats */
+	uint32_t	gd_loadav_nrunnable;	/* pcpu lwps nrunnable */
+	uint32_t	gd_reserved32[1];
 	void		*gd_preserved[4];	/* future fields */
 	/* extended by <machine/globaldata.h> */
 };
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index efd4f2dd7f..0100050dc8 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -540,8 +540,10 @@ int	enterpgrp (struct proc *p, pid_t pgid, int mksess);
 void	proc_add_allproc(struct proc *p);
 void	proc_move_allproc_zombie(struct proc *);
 void	proc_remove_zombie(struct proc *);
-void	allproc_scan(int (*callback)(struct proc *, void *), void *data);
-void	alllwp_scan(int (*callback)(struct lwp *, void *), void *data);
+void	allproc_scan(int (*callback)(struct proc *, void *), void *data,
+			int segmented);
+void	alllwp_scan(int (*callback)(struct lwp *, void *), void *data,
+			int segmented);
 void	zombproc_scan(int (*callback)(struct proc *, void *), void *data);
 void	fixjobc (struct proc *p, struct pgrp *pgrp, int entering);
 void	updatepcpu(struct lwp *, int, int);
diff --git a/sys/vfs/procfs/procfs_vnops.c b/sys/vfs/procfs/procfs_vnops.c
index 0ba6786bb5..98bd17f3f0 100644
--- a/sys/vfs/procfs/procfs_vnops.c
+++ b/sys/vfs/procfs/procfs_vnops.c
@@ -1000,7 +1000,7 @@ procfs_readdir_root(struct vop_readdir_args *ap)
 			break;
 	}
 	if (res >= 0)
-		allproc_scan(procfs_readdir_root_callback, &info);
+		allproc_scan(procfs_readdir_root_callback, &info, 0);
 
 	uio->uio_offset = (off_t)info.i;
 	return (info.error);
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 164086456b..1783ff852e 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -348,10 +348,13 @@ loop:
 
 	/*
 	 * Look for a good candidate to wake up
+	 *
+	 * XXX we should make the schedule thread pcpu and then use a
+	 *     segmented allproc scan.
 	 */
 	info.pp = NULL;
 	info.ppri = INT_MIN;
-	allproc_scan(scheduler_callback, &info);
+	allproc_scan(scheduler_callback, &info, 0);
 
 	/*
 	 * Nothing to do, back to sleep for at least 1/10 of a second.  If
@@ -505,7 +508,7 @@ static int swapout_procs_callback(struct proc *p, void *data);
 void
 swapout_procs(int action)
 {
-	allproc_scan(swapout_procs_callback, &action);
+	allproc_scan(swapout_procs_callback, &action, 0);
 }
 
 static int
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 0c95be3308..8e07b91b4b 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -129,7 +129,7 @@ do_vmtotal(SYSCTL_HANDLER_ARGS)
 	/*
 	 * Calculate process statistics.
 	 */
-	allproc_scan(do_vmtotal_callback, &total);
+	allproc_scan(do_vmtotal_callback, &total, 0);
 
 	/*
 	 * Adjust for sysctl return.  Add real memory into virtual memory.
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 016e60e2bc..843a55579d 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -3026,7 +3026,7 @@ vm_object_in_map(vm_object_t object)
 
 	info.rv = 0;
 	info.object = object;
-	allproc_scan(vm_object_in_map_callback, &info);
+	allproc_scan(vm_object_in_map_callback, &info, 0);
 	if (info.rv)
 		return 1;
 	if( _vm_object_in_map(&kernel_map, object, 0))
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index d5b1ef18ed..579af7ed51 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -1543,7 +1543,7 @@ vm_pageout_scan_cache(int avail_shortage, int pass,
 		lastkillticks = ticks;
 		info.bigproc = NULL;
 		info.bigsize = 0;
-		allproc_scan(vm_pageout_scan_callback, &info);
+		allproc_scan(vm_pageout_scan_callback, &info, 0);
 		if (info.bigproc != NULL) {
 			kprintf("Try to kill process %d %s\n",
 			    info.bigproc->p_pid, info.bigproc->p_comm);
@@ -2192,7 +2192,7 @@ vm_daemon(void)
 		/*
 		 * scan the processes for exceeding their rlimits or if
 		 * process is swapped out -- deactivate pages
 		 */
-		allproc_scan(vm_daemon_callback, NULL);
+		allproc_scan(vm_daemon_callback, NULL, 0);
 	}
 }
-- 
2.11.4.GIT
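
Note on the segmentation arithmetic (illustrative, not part of the
commit): the per-cpu bucket ranges computed in allproc_scan() and
alllwp_scan() tile the hash table exactly.  The stand-alone sketch
below demonstrates this; ALLPROC_HSIZE and NCPUS are stand-ins for
the kernel's actual values, and the id loop stands in for
mycpu->gd_cpuid on each cpu:

	#include <stdio.h>

	#define ALLPROC_HSIZE	1024	/* stand-in for the kernel hash size */
	#define NCPUS		32	/* stand-in for ncpus */

	int
	main(void)
	{
		int id;

		for (id = 0; id < NCPUS; ++id) {
			/* same arithmetic as the segmented scans */
			int ns = id * ALLPROC_HSIZE / NCPUS;
			int ne = (id + 1) * ALLPROC_HSIZE / NCPUS;

			printf("cpu %2d scans buckets [%4d, %4d)\n",
			    id, ns, ne);
		}
		return 0;
	}

Because ns for cpu id+1 equals ne for cpu id, every bucket lands in
exactly one range, with no bucket scanned twice or skipped, even when
the cpu count does not divide the table size evenly.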
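The load average filter itself is unchanged by the patch: cpu0 merely
sums gd_loadav_nrunnable across cpus before applying the usual
fixed-point exponential decay.  The sketch below shows that filter in
isolation; FSHIFT and the cexp[] values are the traditional BSD
constants and are assumed here, since they do not appear in this diff.
At a constant 400,000 runnable lwps it converges toward figures like
the uptime lines quoted in the commit message:

	#include <stdint.h>
	#include <stdio.h>

	#define FSHIFT	11		/* assumed fixed-point shift */
	#define FSCALE	(1 << FSHIFT)

	/* assumed: exp(-5/60), exp(-5/300), exp(-5/900) in fixed point */
	static const int64_t cexp[3] = { 1884, 2014, 2036 };

	int
	main(void)
	{
		int64_t ldavg[3] = { 0, 0, 0 };	/* 1, 5, 15 min averages */
		int64_t nrun = 400000;		/* rolled-up runnable lwps */
		int tick, i;

		/* simulate the ~5 second callout firing for ~10 minutes */
		for (tick = 0; tick < 120; ++tick) {
			for (i = 0; i < 3; i++) {
				ldavg[i] = (cexp[i] * ldavg[i] +
				    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
			}
		}
		printf("load avg: %.2f, %.2f, %.2f\n",
		    (double)ldavg[0] / FSCALE,
		    (double)ldavg[1] / FSCALE,
		    (double)ldavg[2] / FSCALE);
		return 0;
	}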