From 586c43085fc900273732f99de6c9ef43f73ded76 Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Sat, 12 Aug 2017 12:24:16 -0700
Subject: [PATCH] kernel - Break up scheduler and loadavg callout

* Change the scheduler and loadavg callouts from cpu 0 to all cpus,
  and adjust allproc_scan() and alllwp_scan() to segment the hash
  table when asked.

  Every cpu is now tasked with handling the nominal scheduler recalc
  and nominal load calculation for a portion of the process list.
  The portion is unrelated to which cpu(s) the processes are actually
  scheduled on; it is strictly a way to spread the work around, split
  up by hash range.

* Significantly reduces cpu 0 stalls when a large number of user
  processes or threads are present (that is, in the tens of thousands
  or more).  In the test below, before this change, cpu 0 was
  straining under 40%+ interrupt load (from the callout).  After this
  change the load is spread across all cpus, approximately 1.5% per
  cpu.

* Tested with 400,000 running user processes on a 32-thread
  dual-socket xeon (yes, these numbers are real):

  12:27PM  up 8 mins, 3 users, load avg: 395143.28, 270541.13, 132638.33
  12:33PM  up 14 mins, 3 users, load avg: 399496.57, 361405.54, 225669.14

* NOTE: There are still a number of other non-segmented allproc scans
  in the system, particularly related to paging and swapping.

* NOTE: Further spreading-out of the work may be needed, using a more
  frequent callout and a smaller hash index range for each pass.
---
 sys/kern/imgact_elf.c         |  2 +-
 sys/kern/init_main.c          |  2 +-
 sys/kern/kern_descrip.c       |  4 +--
 sys/kern/kern_ktrace.c        |  4 +--
 sys/kern/kern_proc.c          | 30 ++++++++++++++++---
 sys/kern/kern_resource.c      |  8 ++---
 sys/kern/kern_sig.c           |  2 +-
 sys/kern/kern_synch.c         | 70 +++++++++++++++++++++++++++++++------------
 sys/kern/vfs_syscalls.c       |  4 +--
 sys/sys/globaldata.h          |  8 ++++-
 sys/sys/proc.h                |  6 ++--
 sys/vfs/procfs/procfs_vnops.c |  2 +-
 sys/vm/vm_glue.c              |  7 +++--
 sys/vm/vm_meter.c             |  2 +-
 sys/vm/vm_object.c            |  2 +-
 sys/vm/vm_pageout.c           |  4 +--
 16 files changed, 111 insertions(+), 46 deletions(-)

diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
index 84b3e1650c..5ece1375e4 100644
--- a/sys/kern/imgact_elf.c
+++ b/sys/kern/imgact_elf.c
@@ -200,7 +200,7 @@ __elfN(brand_inuse)(Elf_Brandinfo *entry)
 
 	info.rval = FALSE;
 	info.entry = entry;
-	allproc_scan(elf_brand_inuse_callback, &info);
+	allproc_scan(elf_brand_inuse_callback, &info, 0);
 	return (info.rval);
 }
 
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 4930f36eb5..2d89f222af 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -467,7 +467,7 @@ proc0_post(void *dummy __unused)
 	 * Now we can look at the time, having had a chance to verify the
 	 * time from the file system.  Pretend that proc0 started now.
 	 */
-	allproc_scan(proc0_post_callback, NULL);
+	allproc_scan(proc0_post_callback, NULL, 0);
 
 	/*
 	 * Give the ``random'' number generator a thump.
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index d3bea26d02..402f845f75 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -1408,7 +1408,7 @@ fdrevoke(void *f_data, short f_type, struct ucred *cred)
 	 * the FREVOKED already set in the fp and do the right thing.
 	 */
 	if (info.found)
-		allproc_scan(fdrevoke_proc_callback, &info);
+		allproc_scan(fdrevoke_proc_callback, &info, 0);
 	fdrop(info.nfp);
 	return(0);
 }
@@ -2782,7 +2782,7 @@ sysctl_kern_file(SYSCTL_HANDLER_ARGS)
 	info.count = 0;
 	info.error = 0;
 	info.req = req;
-	allproc_scan(sysctl_kern_file_callback, &info);
+	allproc_scan(sysctl_kern_file_callback, &info, 0);
 
 	/*
 	 * When just calculating the size, overestimate a bit to try to
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index fdddb895a2..7f9d1bb663 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -302,7 +302,7 @@ sys_ktrace(struct ktrace_args *uap)
 		info.tracenode = tracenode;
 		info.error = 0;
 		info.rootclear = 0;
-		allproc_scan(ktrace_clear_callback, &info);
+		allproc_scan(ktrace_clear_callback, &info, 0);
 		error = info.error;
 		goto done;
 	}
@@ -616,7 +616,7 @@ ktrwrite(struct lwp *lp, struct ktr_header *kth, struct uio *uio)
 			info.tracenode = tracenode;
 			info.error = 0;
 			info.rootclear = 1;
-			allproc_scan(ktrace_clear_callback, &info);
+			allproc_scan(ktrace_clear_callback, &info, 0);
 		}
 		ktrdestroy(&tracenode);
 	}
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 6758d9561f..e6f876de61 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -1254,19 +1254,30 @@ proc_userunmap(struct proc *p)
  * No requirements.
  */
 void
-allproc_scan(int (*callback)(struct proc *, void *), void *data)
+allproc_scan(int (*callback)(struct proc *, void *), void *data, int segmented)
 {
 	int limit = nprocs + ncpus;
 	struct proc *p;
+	int ns;
+	int ne;
 	int r;
 	int n;
 
+	if (segmented) {
+		int id = mycpu->gd_cpuid;
+		ns = id * ALLPROC_HSIZE / ncpus;
+		ne = (id + 1) * ALLPROC_HSIZE / ncpus;
+	} else {
+		ns = 0;
+		ne = ALLPROC_HSIZE;
+	}
+
 	/*
 	 * prg->proc_token protects the allproc list and PHOLD() prevents the
 	 * process from being removed from the allproc list or the zombproc
 	 * list.
 	 */
-	for (n = 0; n < ALLPROC_HSIZE; ++n) {
+	for (n = ns; n < ne; ++n) {
 		procglob_t *prg = &procglob[n];
 		if (LIST_FIRST(&prg->allproc) == NULL)
 			continue;
@@ -1301,14 +1312,25 @@ allproc_scan(int (*callback)(struct proc *, void *), void *data)
  * No requirements.
  */
 void
-alllwp_scan(int (*callback)(struct lwp *, void *), void *data)
+alllwp_scan(int (*callback)(struct lwp *, void *), void *data, int segmented)
 {
 	struct proc *p;
 	struct lwp *lp;
+	int ns;
+	int ne;
 	int r = 0;
 	int n;
 
-	for (n = 0; n < ALLPROC_HSIZE; ++n) {
+	if (segmented) {
+		int id = mycpu->gd_cpuid;
+		ns = id * ALLPROC_HSIZE / ncpus;
+		ne = (id + 1) * ALLPROC_HSIZE / ncpus;
+	} else {
+		ns = 0;
+		ne = ALLPROC_HSIZE;
+	}
+
+	for (n = ns; n < ne; ++n) {
 		procglob_t *prg = &procglob[n];
 
 		if (LIST_FIRST(&prg->allproc) == NULL)
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index 1012cfb5cc..8d1cbbbd9a 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -134,7 +134,7 @@ sys_getpriority(struct getpriority_args *uap)
 			uap->who = curtd->td_ucred->cr_uid;
 		info.low = low;
 		info.who = uap->who;
-		allproc_scan(getpriority_callback, &info);
+		allproc_scan(getpriority_callback, &info, 0);
 		low = info.low;
 		break;
 
@@ -251,7 +251,7 @@ restart:
 		info.who = uap->who;
 		info.error = 0;
 		info.found = 0;
-		allproc_scan(setpriority_callback, &info);
+		allproc_scan(setpriority_callback, &info, 0);
 		error = info.error;
 		found = info.found;
 		break;
@@ -374,7 +374,7 @@ sys_ioprio_get(struct ioprio_get_args *uap)
 			uap->who = curtd->td_ucred->cr_uid;
 		info.high = high;
 		info.who = uap->who;
-		allproc_scan(ioprio_get_callback, &info);
+		allproc_scan(ioprio_get_callback, &info, 0);
 		high = info.high;
 		break;
 	default:
@@ -491,7 +491,7 @@ restart:
 		info.who = uap->who;
 		info.error = 0;
 		info.found = 0;
-		allproc_scan(ioprio_set_callback, &info);
+		allproc_scan(ioprio_set_callback, &info, 0);
 		error = info.error;
 		found = info.found;
 		break;
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 294e67a2ee..26a8d32c77 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -672,7 +672,7 @@ dokillpg(int sig, int pgid, int all)
 		/*
 		 * broadcast
 		 */
-		allproc_scan(killpg_all_callback, &info);
+		allproc_scan(killpg_all_callback, &info, 0);
 	} else {
 		if (pgid == 0) {
 			/*
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index dcf2cf17eb..9fe1901faa 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -77,8 +77,6 @@ int	safepri;
 int	tsleep_now_works;
 int	tsleep_crypto_dump = 0;
 
-static struct callout loadav_callout;
-static struct callout schedcpu_callout;
 MALLOC_DEFINE(M_TSLEEP, "tslpque", "tsleep queues");
 
 #define __DEALL(ident)	__DEQUALIFY(void *, ident)
@@ -164,11 +162,13 @@ static int schedcpu_resource(struct proc *p, void *data __unused);
 static void
 schedcpu(void *arg)
 {
-	allproc_scan(schedcpu_stats, NULL);
-	allproc_scan(schedcpu_resource, NULL);
-	wakeup((caddr_t)&lbolt);
-	wakeup(lbolt_syncer);
-	callout_reset(&schedcpu_callout, hz, schedcpu, NULL);
+	allproc_scan(schedcpu_stats, NULL, 1);
+	allproc_scan(schedcpu_resource, NULL, 1);
+	if (mycpu->gd_cpuid == 0) {
+		wakeup((caddr_t)&lbolt);
+		wakeup(lbolt_syncer);
+	}
+	callout_reset(&mycpu->gd_schedcpu_callout, hz, schedcpu, NULL);
 }
 
 /*
@@ -282,6 +282,27 @@ schedcpu_resource(struct proc *p, void *data __unused)
 }
 
 /*
+ * Setup and kick off the pcpu scheduler/loadavg callouts on all cpus.
+ */
+static void
+schedcpu_setup(void *arg)
+{
+	globaldata_t save_gd = mycpu;
+	globaldata_t gd;
+	int n;
+
+	for (n = 0; n < ncpus; ++n) {
+		gd = globaldata_find(n);
+		lwkt_setcpu_self(gd);
+		callout_init_mp(&gd->gd_loadav_callout);
+		callout_init_mp(&gd->gd_schedcpu_callout);
+		schedcpu(NULL);
+		loadav(NULL);
+	}
+	lwkt_setcpu_self(save_gd);
+}
+
+/*
  * This is only used by ps.  Generate a cpu percentage use over
  * a period of one second.
  */
@@ -1220,22 +1241,36 @@ tstop(void)
 
 /*
  * Compute a tenex style load average of a quantity on
- * 1, 5 and 15 minute intervals.
+ * 1, 5 and 15 minute intervals.  This is a pcpu callout.
+ *
+ * We segment the lwp scan on a pcpu basis.  This does NOT
+ * mean the associated lwps are on this cpu, it is done
+ * just to break the work up.
+ *
+ * The callout on cpu0 rolls up the stats from the other
+ * cpus.
  */
 static int loadav_count_runnable(struct lwp *p, void *data);
 
 static void
 loadav(void *arg)
 {
+	globaldata_t gd = mycpu;
 	struct loadavg *avg;
 	int i, nrun;
 
 	nrun = 0;
-	alllwp_scan(loadav_count_runnable, &nrun);
-	avg = &averunnable;
-	for (i = 0; i < 3; i++) {
-		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
-		    (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+	alllwp_scan(loadav_count_runnable, &nrun, 1);
+	gd->gd_loadav_nrunnable = nrun;
+	if (gd->gd_cpuid == 0) {
+		avg = &averunnable;
+		nrun = 0;
+		for (i = 0; i < ncpus; ++i)
+			nrun += globaldata_find(i)->gd_loadav_nrunnable;
+		for (i = 0; i < 3; i++) {
+			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
+			    (long)nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
+		}
 	}
 
 	/*
@@ -1243,7 +1278,8 @@ loadav(void *arg)
 	 * random variation to avoid synchronisation with processes that
 	 * run at regular intervals.
 	 */
-	callout_reset(&loadav_callout, hz * 4 + (int)(krandom() % (hz * 2 + 1)),
+	callout_reset(&gd->gd_loadav_callout,
+		      hz * 4 + (int)(krandom() % (hz * 2 + 1)),
 		      loadav, NULL);
 }
 
@@ -1283,12 +1319,8 @@ collect_load_callback(int n)
 static void
 sched_setup(void *dummy)
 {
-	callout_init_mp(&loadav_callout);
-	callout_init_mp(&schedcpu_callout);
 	kcollect_register(KCOLLECT_LOAD, "load", collect_load_callback,
 			  KCOLLECT_SCALE(KCOLLECT_LOAD_FORMAT, 0));
 	/* Kick off timeout driven events by calling first time. */
-	schedcpu(NULL);
-	loadav(NULL);
+	schedcpu_setup(NULL);
 }
-
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 787bec0f1d..b24dccbd88 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -514,7 +514,7 @@ checkdirs(struct nchandle *old_nch, struct nchandle *new_nch)
 	info.old_nch = *old_nch;
 	info.new_nch = *new_nch;
 	info.new_vp = newdp;
-	allproc_scan(checkdirs_callback, &info);
+	allproc_scan(checkdirs_callback, &info, 0);
 	vput(newdp);
 }
 
@@ -745,7 +745,7 @@ dounmount(struct mount *mp, int flags)
 	cache_clearmntcache();
 	if ((ncp = mp->mnt_ncmountpt.ncp) != NULL &&
 	    (ncp->nc_refs != 1 || TAILQ_FIRST(&ncp->nc_list))) {
-		allproc_scan(&unmount_allproc_cb, mp);
+		allproc_scan(&unmount_allproc_cb, mp, 0);
 	}
 
 	cache_clearmntcache();
diff --git a/sys/sys/globaldata.h b/sys/sys/globaldata.h
index 65a7bd6e8a..d6a6ee69e8 100644
--- a/sys/sys/globaldata.h
+++ b/sys/sys/globaldata.h
@@ -84,6 +84,9 @@
 #ifndef _SYS_SYSID_H_
 #include <sys/sysid.h>		/* sysid_t */
 #endif
+#ifndef _SYS_CALLOUT_H_
+#include <sys/callout.h>
+#endif
 
 /*
  * This structure maps out the global data that needs to be kept on a
@@ -182,7 +185,10 @@ struct globaldata {
 	uint64_t	gd_cpumask_offset;
 	struct vmstats	gd_vmstats;	/* pcpu local copy of vmstats */
 	struct vmstats	gd_vmstats_adj;	/* pcpu adj for vmstats */
-	uint64_t	gd_reserved64[1];
+	struct callout	gd_loadav_callout;	/* loadavg calc */
+	struct callout	gd_schedcpu_callout;	/* scheduler/stats */
+	uint32_t	gd_loadav_nrunnable;	/* pcpu lwps nrunnable */
+	uint32_t	gd_reserved32[1];
 	void		*gd_preserved[4];	/* future fields */
 	/* extended by <machine/globaldata.h> */
 };
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index efd4f2dd7f..0100050dc8 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -540,8 +540,10 @@ int	enterpgrp (struct proc *p, pid_t pgid, int mksess);
 void	proc_add_allproc(struct proc *p);
 void	proc_move_allproc_zombie(struct proc *);
 void	proc_remove_zombie(struct proc *);
-void	allproc_scan(int (*callback)(struct proc *, void *), void *data);
-void	alllwp_scan(int (*callback)(struct lwp *, void *), void *data);
+void	allproc_scan(int (*callback)(struct proc *, void *), void *data,
+			int segmented);
+void	alllwp_scan(int (*callback)(struct lwp *, void *), void *data,
+			int segmented);
 void	zombproc_scan(int (*callback)(struct proc *, void *), void *data);
 void	fixjobc (struct proc *p, struct pgrp *pgrp, int entering);
 void	updatepcpu(struct lwp *, int, int);
diff --git a/sys/vfs/procfs/procfs_vnops.c b/sys/vfs/procfs/procfs_vnops.c
index 0ba6786bb5..98bd17f3f0 100644
--- a/sys/vfs/procfs/procfs_vnops.c
+++ b/sys/vfs/procfs/procfs_vnops.c
@@ -1000,7 +1000,7 @@ procfs_readdir_root(struct vop_readdir_args *ap)
 			break;
 	}
 	if (res >= 0)
-		allproc_scan(procfs_readdir_root_callback, &info);
+		allproc_scan(procfs_readdir_root_callback, &info, 0);
 
 	uio->uio_offset = (off_t)info.i;
 	return (info.error);
diff --git a/sys/vm/vm_glue.c b/sys/vm/vm_glue.c
index 164086456b..1783ff852e 100644
--- a/sys/vm/vm_glue.c
+++ b/sys/vm/vm_glue.c
@@ -348,10 +348,13 @@ loop:
 
 	/*
 	 * Look for a good candidate to wake up
+	 *
+	 * XXX we should make the schedule thread pcpu and then use a
+	 *     segmented allproc scan.
 	 */
 	info.pp = NULL;
 	info.ppri = INT_MIN;
-	allproc_scan(scheduler_callback, &info);
+	allproc_scan(scheduler_callback, &info, 0);
 
 	/*
 	 * Nothing to do, back to sleep for at least 1/10 of a second.  If
@@ -505,7 +508,7 @@ static int swapout_procs_callback(struct proc *p, void *data);
 void
 swapout_procs(int action)
 {
-	allproc_scan(swapout_procs_callback, &action);
+	allproc_scan(swapout_procs_callback, &action, 0);
 }
 
 static int
diff --git a/sys/vm/vm_meter.c b/sys/vm/vm_meter.c
index 0c95be3308..8e07b91b4b 100644
--- a/sys/vm/vm_meter.c
+++ b/sys/vm/vm_meter.c
@@ -129,7 +129,7 @@ do_vmtotal(SYSCTL_HANDLER_ARGS)
 	/*
 	 * Calculate process statistics.
 	 */
-	allproc_scan(do_vmtotal_callback, &total);
+	allproc_scan(do_vmtotal_callback, &total, 0);
 
 	/*
 	 * Adjust for sysctl return.  Add real memory into virtual memory.
diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c
index 016e60e2bc..843a55579d 100644
--- a/sys/vm/vm_object.c
+++ b/sys/vm/vm_object.c
@@ -3026,7 +3026,7 @@ vm_object_in_map(vm_object_t object)
 
 	info.rv = 0;
 	info.object = object;
-	allproc_scan(vm_object_in_map_callback, &info);
+	allproc_scan(vm_object_in_map_callback, &info, 0);
 	if (info.rv)
 		return 1;
 	if( _vm_object_in_map(&kernel_map, object, 0))
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index d5b1ef18ed..579af7ed51 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -1543,7 +1543,7 @@ vm_pageout_scan_cache(int avail_shortage, int pass,
 		lastkillticks = ticks;
 		info.bigproc = NULL;
 		info.bigsize = 0;
-		allproc_scan(vm_pageout_scan_callback, &info);
+		allproc_scan(vm_pageout_scan_callback, &info, 0);
 		if (info.bigproc != NULL) {
 			kprintf("Try to kill process %d %s\n",
 			    info.bigproc->p_pid, info.bigproc->p_comm);
@@ -2192,7 +2192,7 @@ vm_daemon(void)
 		/*
 		 * scan the processes for exceeding their rlimits or if
 		 * process is swapped out -- deactivate pages
 		 */
-		allproc_scan(vm_daemon_callback, NULL);
+		allproc_scan(vm_daemon_callback, NULL, 0);
 	}
 }
-- 
2.11.4.GIT
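
Note on the segmentation arithmetic (illustrative, not part of the
commit): the per-cpu bucket ranges computed in allproc_scan() and
alllwp_scan() tile the hash table exactly.  The stand-alone sketch
below demonstrates this; ALLPROC_HSIZE and NCPUS are stand-ins for
the kernel's actual values, and the id loop stands in for
mycpu->gd_cpuid on each cpu:

	#include <stdio.h>

	#define ALLPROC_HSIZE	1024	/* stand-in for the kernel hash size */
	#define NCPUS		32	/* stand-in for ncpus */

	int
	main(void)
	{
		int id;

		for (id = 0; id < NCPUS; ++id) {
			/* same arithmetic as the segmented scans */
			int ns = id * ALLPROC_HSIZE / NCPUS;
			int ne = (id + 1) * ALLPROC_HSIZE / NCPUS;

			printf("cpu %2d scans buckets [%4d, %4d)\n",
			    id, ns, ne);
		}
		return 0;
	}

Because ns for cpu id+1 equals ne for cpu id, every bucket lands in
exactly one range, with no bucket scanned twice or skipped, even when
the cpu count does not divide the table size evenly.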
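The load average filter itself is unchanged by the patch: cpu0 merely
sums gd_loadav_nrunnable across cpus before applying the usual
fixed-point exponential decay.  The sketch below shows that filter in
isolation; FSHIFT and the cexp[] values are the traditional BSD
constants and are assumed here, since they do not appear in this diff.
At a constant 400,000 runnable lwps it converges toward figures like
the uptime lines quoted in the commit message:

	#include <stdint.h>
	#include <stdio.h>

	#define FSHIFT	11		/* assumed fixed-point shift */
	#define FSCALE	(1 << FSHIFT)

	/* assumed: exp(-5/60), exp(-5/300), exp(-5/900) in fixed point */
	static const int64_t cexp[3] = { 1884, 2014, 2036 };

	int
	main(void)
	{
		int64_t ldavg[3] = { 0, 0, 0 };	/* 1, 5, 15 min averages */
		int64_t nrun = 400000;		/* rolled-up runnable lwps */
		int tick, i;

		/* simulate the ~5 second callout firing for ~10 minutes */
		for (tick = 0; tick < 120; ++tick) {
			for (i = 0; i < 3; i++) {
				ldavg[i] = (cexp[i] * ldavg[i] +
				    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
			}
		}
		printf("load avg: %.2f, %.2f, %.2f\n",
		    (double)ldavg[0] / FSCALE,
		    (double)ldavg[1] / FSCALE,
		    (double)ldavg[2] / FSCALE);
		return 0;
	}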