/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

/*
 * The System Duty Cycle (SDC) scheduling class
 * --------------------------------------------
 *
 * Kernel threads in Solaris have traditionally not been large consumers
 * of CPU time.  They typically wake up, perform a small amount of
 * work, then go back to sleep waiting for either a timeout or another
 * signal.  On the assumption that the small amount of work that they do
 * is important for the behavior of the whole system, these threads are
 * treated kindly by the dispatcher and the SYS scheduling class: they run
 * without preemption from anything other than real-time and interrupt
 * threads; when preempted, they are put at the front of the queue, so they
 * generally do not migrate between CPUs; and they are allowed to stay
 * running until they voluntarily give up the CPU.
 *
 * As Solaris has evolved, new workloads have emerged which require the
 * kernel to perform significant amounts of CPU-intensive work.  One
 * example of such a workload is ZFS's transaction group sync processing.
 * Each sync operation generates a large batch of I/Os, and each I/O
 * may need to be compressed and/or checksummed before it is written to
 * storage.  The taskq threads which perform the compression and checksums
 * will run nonstop as long as they have work to do; a large sync operation
 * on a compression-heavy dataset can keep them busy for seconds on end.
 * This causes human-time-scale dispatch latency bubbles for any other
 * threads which have the misfortune to share a CPU with the taskq threads.
 *
 * The SDC scheduling class is a solution to this problem.
 *
 * SDC is centered around the concept of a thread's duty cycle (DC):
 *
 *                           ONPROC time
 *     Duty Cycle =    ----------------------
 *                     ONPROC + Runnable time
 *
 * This is the ratio of the time that the thread spent running on a CPU
 * divided by the time it spent running or trying to run.  It is unaffected
 * by any time the thread spent sleeping, stopped, etc.
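 *
 * As an illustration (the numbers here are made up for this example): a
 * thread which, over a recent interval, accumulated 8ms of ONPROC time
 * and spent 2ms waiting on a dispatch queue has a duty cycle of
 * 8 / (8 + 2) = 80%, regardless of how long it slept before or after.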
 *
 * A thread joining the SDC class specifies a "target" DC that it wants
 * to run at.  To implement this policy, the routine sysdc_update() scans
 * the list of active SDC threads every few ticks and uses each thread's
 * microstate data to compute the actual duty cycle that that thread
 * has experienced recently.  If the thread is under its target DC, its
 * priority is increased to the maximum available (sysdc_maxpri, which is
 * 99 by default).  If the thread is over its target DC, its priority is
 * reduced to the minimum available (sysdc_minpri, 0 by default).  This
 * is a fairly primitive approach, in that it doesn't use any of the
 * intermediate priorities, but it's not completely inappropriate.  Even
 * though threads in the SDC class might take a while to do their job, they
 * are by some definition important if they're running inside the kernel,
 * so it is reasonable that they should get to run at priority 99.
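 *
 * A simplified sketch of the per-thread decision (the real logic lives
 * in sysdc_compute_pri(); "target_DC" here stands for the thread's
 * configured target duty cycle):
 *
 *         DC = ONPROC / (ONPROC + Runnable);
 *         pri = (DC < target_DC) ? sysdc_maxpri : sysdc_minpri;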
 *
 * If a thread is running when sysdc_update() calculates its actual duty
 * cycle, and there are other threads of equal or greater priority on its
 * CPU's dispatch queue, sysdc_update() preempts that thread.  The thread
 * acknowledges the preemption by calling sysdc_preempt(), which calls
 * setbackdq(), which gives other threads with the same priority a chance
 * to run.  This creates a de facto time quantum for threads in the SDC
 * scheduling class.
 *
 * An SDC thread which is assigned priority 0 can continue to run if
 * nothing else needs to use the CPU that it's running on.  Similarly, an
 * SDC thread at priority 99 might not get to run as much as it wants to
 * if there are other priority-99 or higher threads on its CPU.  These
 * situations would cause the thread to get ahead of or behind its target
 * DC; the longer the situations lasted, the further ahead or behind the
 * thread would get.  Rather than condemning a thread to a lifetime of
 * paying for its youthful indiscretions, SDC keeps "base" values for
 * ONPROC and Runnable times in each thread's sysdc data, and updates these
 * values periodically.  The duty cycle is then computed using the elapsed
 * amount of ONPROC and Runnable times since those base times.
 *
 * Since sysdc_update() scans SDC threads fairly frequently, it tries to
 * keep the list of "active" threads small by pruning out threads which
 * have been asleep for a brief time.  They are not pruned immediately upon
 * going to sleep, since some threads may bounce back and forth between
 * sleeping and being runnable.
 *
 * Interfaces
 *
 *     void sysdc_thread_enter(t, dc, flags)
 *
 *         Moves a kernel thread from the SYS scheduling class to the
 *         SDC class.  t must have an associated LWP (created by calling
 *         lwp_kernel_create()).  The thread will have a target DC of dc.
 *         Flags should be either 0 or SYSDC_THREAD_BATCH.  If
 *         SYSDC_THREAD_BATCH is specified, the thread is expected to be
 *         doing large amounts of processing.
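 *
 *         For example, a caller which has already used
 *         lwp_kernel_create() to create a kernel thread "t" could ask
 *         for an 80% duty cycle (an illustrative value, not a
 *         recommendation) with:
 *
 *             sysdc_thread_enter(t, 80, 0);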
 *
 * - Run queue balancing
 *
 *     The Solaris dispatcher is biased towards letting a thread run
 *     on the same CPU which it last ran on, if no more than 3 ticks
 *     (i.e. rechoose_interval) have passed since the thread last ran.
 *     This helps to preserve cache warmth.  On the other hand, it also
 *     tries to keep the per-CPU run queues fairly balanced; if the CPU
 *     chosen for a runnable thread has a run queue which is three or
 *     more threads longer than a neighboring CPU's queue, the runnable
 *     thread is dispatched onto the neighboring CPU instead.
 *
 *     These policies work well for some workloads, but not for many SDC
 *     threads.  The taskq client of SDC, for example, has many discrete
 *     units of work to do.  The work units are largely independent, so
 *     cache warmth is not an important consideration.  It is important
 *     that the threads fan out quickly to different CPUs, since the
 *     amount of work these threads have to do (a few seconds worth at a
 *     time) doesn't leave much time to correct thread placement errors
 *     (i.e. two SDC threads being dispatched to the same CPU).
 *
 *     To fix this, SDC uses the TS_RUNQMATCH flag introduced for FSS.
 *     This tells the dispatcher to keep neighboring run queues' lengths
 *     more evenly matched, which allows SDC threads to migrate more
 *     easily.
 *
 * - LWPs and system processes
 *
 *     SDC can only be used for kernel threads.  Since SDC uses microstate
 *     accounting data to compute each thread's actual duty cycle, all
 *     threads entering the SDC class must have associated LWPs (which
 *     store the microstate data).  This means that the threads have to
 *     be associated with an SSYS process, i.e. one created by newproc().
 *     If the microstate accounting information is ever moved into the
 *     kthread_t, this restriction could be lifted.
 *
 * - Dealing with oversubscription
 *
 *     Since SDC duty cycles are per-thread, it is possible that the
 *     aggregate requested duty cycle of all SDC threads in a processor
 *     set could be greater than the total CPU time available in that set.
 *     The FSS scheduling class has an analogous situation, which it deals
 *     with by reducing each thread's allotted CPU time proportionally.
 *     Since SDC doesn't need to be as precise as FSS, it uses a simpler
 *     solution to the oversubscription problem.
 *
 *     sysdc_update() accumulates the amount of time that max-priority SDC
 *     threads have spent on-CPU in each processor set, and uses that sum
 *     to create an implied duty cycle for that processor set:
 *
 *                        accumulated CPU time
 *         pset DC = -----------------------------------
 *                   (# CPUs) * time since last update
 *
 *     If this implied duty cycle is above a maximum pset duty cycle (90%
 *     by default), sysdc_update() sets the priority of all SDC threads
 *     in that processor set to sysdc_minpri for a "break" period.  After
 *     the break period, it waits for a "nobreak" period before trying to
 *     enforce the pset duty cycle limit again.
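 *
 *     As a worked example (assuming duty cycles are expressed as
 *     percentages and using the default 90% limit): on a 4-CPU processor
 *     set with a 20ms update interval, 80ms of CPU time is available per
 *     interval.  If max-priority SDC threads accumulated 76ms of ONPROC
 *     time during that interval, the implied pset DC is 76 / 80 = 95%,
 *     which exceeds 90%, so a break would be taken.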
 *
 * - Processor sets
 *
 *     As the above implies, SDC is processor set aware, but it does not
 *     currently allow threads to change processor sets while in the SDC
 *     class.  Instead, those threads must join the desired processor set
 *     before entering SDC. [1]
 *
 * - Batch flag
 *
 *     A thread joining the SDC class can specify the SYSDC_THREAD_BATCH
 *     flag.  This flag currently has no effect, but marks threads which
 *     do bulk processing.
 *
 * - t_kpri_req
 *
 *     The TS and FSS scheduling classes pay attention to t_kpri_req,
 *     which provides a simple form of priority inheritance for
 *     synchronization primitives (such as rwlocks held as READER) which
 *     cannot be traced to a unique thread.  The SDC class does not honor
 *     t_kpri_req, for a few reasons:
 *
 *     1.  t_kpri_req is notoriously inaccurate.  A measure of its
 *         inaccuracy is that it needs to be cleared every time a thread
 *         returns to user mode, because it is frequently non-zero at that
 *         point.  This can happen because "ownership" of synchronization
 *         primitives that use t_kpri_req can be silently handed off,
 *         leaving no opportunity to will the t_kpri_req inheritance.
 *
 *     2.  Unlike in TS and FSS, threads in SDC *will* eventually run at
 *         kernel priority.  This means that even if an SDC thread
 *         is holding a synchronization primitive and running at low
 *         priority, its priority will eventually be raised above 60,
 *         allowing it to drive on and release the resource.
 *
 *     3.  The first consumer of SDC uses the taskq subsystem, which holds
 *         a reader lock for the duration of the task's execution.  This
 *         would mean that SDC threads would never drop below kernel
 *         priority in practice, which defeats one of the purposes of SDC.
 *
 * - Why not FSS?
 *
 *     It might seem that the existing FSS scheduling class could solve
 *     the problems that SDC is attempting to solve.  FSS's more precise
 *     solution to the oversubscription problem would hardly cause
 *     trouble, as long as it performed well.  SDC is implemented as
 *     a separate scheduling class for two main reasons: the initial
 *     consumer of SDC does not map well onto the "project" abstraction
 *     that is central to FSS, and FSS does not expect to run at kernel
 *     priorities.
 *
 * Tunables
 *
 *     - sysdc_update_interval_msec:  Number of milliseconds between
 *         consecutive thread priority updates.
 *
 *     - sysdc_reset_interval_msec:  Number of milliseconds between
 *         consecutive resets of a thread's base ONPROC and Runnable
 *         times.
 *
 *     - sysdc_prune_interval_msec:  Number of milliseconds of sleeping
 *         before a thread is pruned from the active list.
 *
 *     - sysdc_max_pset_DC:  Allowable percentage of a processor set's
 *         CPU time which SDC can give to its high-priority threads.
 *
 *     - sysdc_break_msec:  Number of milliseconds of "break" taken when
 *         sysdc_max_pset_DC is exceeded.
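 *
 *     These are ordinary kernel globals, so they can be overridden from
 *     /etc/system before the scheduling-class module loads, e.g. (the
 *     module name is assumed here to be SDC; adjust it to match the
 *     module that actually delivers this file):
 *
 *         set SDC:sysdc_update_interval_msec = 40
 *
 *     or inspected on a live system with mdb -k.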
 *
 * Future work (in SDC and related subsystems)
 *
 *     - Per-thread rechoose interval (0 for SDC)
 *
 *         Allow each thread to specify its own rechoose interval.  SDC
 *         threads would specify an interval of zero, which would rechoose
 *         the CPU with the lowest priority once per update.
 *
 *     - Allow threads to change processor sets after joining the SDC class
 *
 *     - Thread groups and per-group DC
 *
 *         It might be nice to be able to specify a duty cycle which applies
 *         to a group of threads in aggregate.
 *
 *     - Per-group DC callback to allow dynamic DC tuning
 *
 *         Currently, DCs are assigned when the thread joins SDC.  Some
 *         workloads could benefit from being able to tune their DC using
 *         subsystem-specific knowledge about the workload.
 *
 *     - Finer-grained priority updates
 *
 *     - More nuanced management of oversubscription
 *
 *     - Moving other CPU-intensive threads into SDC
 *
 *     - Move msacct data into kthread_t
 *
 *         This would allow kernel threads without LWPs to join SDC.
 *
 * [1] The details of doing so are left as an exercise for the reader.
 */

#include <sys/types.h>
#include <sys/sysdc.h>
#include <sys/sysdc_impl.h>

#include <sys/class.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/debug.h>
#include <sys/disp.h>
#include <sys/errno.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/modctl.h>
#include <sys/schedctl.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/var.h>

/*
 * Tunables - loaded into the internal state at module load time
 */
uint_t          sysdc_update_interval_msec = 20;
uint_t          sysdc_reset_interval_msec = 400;
uint_t          sysdc_prune_interval_msec = 100;
uint_t          sysdc_max_pset_DC = 90;
uint_t          sysdc_break_msec = 80;

/*
 * Internal state - constants set up by sysdc_initparam()
 */
static clock_t  sysdc_update_ticks;     /* ticks between updates */
static uint_t   sysdc_prune_updates;    /* updates asleep before pruning */
static uint_t   sysdc_reset_updates;    /* # of updates before reset */
static uint_t   sysdc_break_updates;    /* updates to break */
static uint_t   sysdc_nobreak_updates;  /* updates to not check */
static uint_t   sysdc_minDC;            /* minimum allowed DC */
static uint_t   sysdc_maxDC;            /* maximum allowed DC */
static pri_t    sysdc_minpri;           /* minimum allowed priority */
static pri_t    sysdc_maxpri;           /* maximum allowed priority */

static kmutex_t sysdc_pset_lock;        /* lock protecting pset data */
static list_t   sysdc_psets;            /* list of psets with SDC threads */
static uint_t   sysdc_param_init;       /* sysdc_initparam() has been called */
static uint_t   sysdc_update_timeout_started; /* update timeout is active */
static hrtime_t sysdc_last_update;      /* time of last sysdc_update() */
static sysdc_t  sysdc_dummy;            /* used to terminate active lists */

/*
 * Internal state - active hash table
 */
#define SYSDC_NLISTS    8
#define SYSDC_HASH(sdc) (((uintptr_t)(sdc) >> 6) & (SYSDC_NLISTS - 1))
static sysdc_list_t     sysdc_active[SYSDC_NLISTS];

#define SYSDC_LIST(sdc)         (&sysdc_active[SYSDC_HASH(sdc)])

#ifdef DEBUG
static struct {
        uint64_t        sysdc_update_times_asleep;
        uint64_t        sysdc_update_times_base_ran_backwards;
        uint64_t        sysdc_update_times_already_done;
        uint64_t        sysdc_update_times_cur_ran_backwards;
        uint64_t        sysdc_compute_pri_breaking;
        uint64_t        sysdc_activate_enter;
        uint64_t        sysdc_update_enter;
        uint64_t        sysdc_update_exited;
        uint64_t        sysdc_update_not_sdc;
        uint64_t        sysdc_update_idle;
        uint64_t        sysdc_update_take_break;
        uint64_t        sysdc_update_no_psets;
        uint64_t        sysdc_tick_not_sdc;
        uint64_t        sysdc_tick_quantum_expired;
        uint64_t        sysdc_thread_enter_enter;
} sysdc_stats;

#define SYSDC_INC_STAT(x)       (sysdc_stats.x++)
#else
#define SYSDC_INC_STAT(x)       ((void)0)
#endif

/* macros are UPPER CASE */
#define HOWMANY(a, b)   howmany((a), (b))
#define MSECTOTICKS(a)  HOWMANY((a) * 1000, usec_per_tick)

sysdc_initparam(void)
{
        uint_t sysdc_break_ticks;

        /* update / prune intervals */
        sysdc_update_ticks = MSECTOTICKS(sysdc_update_interval_msec);

        sysdc_prune_updates = HOWMANY(sysdc_prune_interval_msec,
            sysdc_update_interval_msec);
        sysdc_reset_updates = HOWMANY(sysdc_reset_interval_msec,
            sysdc_update_interval_msec);

        /* We must get at least a little time on CPU. */
        sysdc_maxDC = SYSDC_DC_MAX;
        sysdc_maxpri = maxclsyspri - 1;

        /* break parameters */
        if (sysdc_max_pset_DC > SYSDC_DC_MAX) {
                sysdc_max_pset_DC = SYSDC_DC_MAX;
        }
        sysdc_break_ticks = MSECTOTICKS(sysdc_break_msec);
        sysdc_break_updates = HOWMANY(sysdc_break_ticks, sysdc_update_ticks);

        /*
         *      sysdc_max_pset_DC = (nobreak / (break + nobreak))
         *
         *  ==> nobreak = sysdc_max_pset_DC * (break + nobreak)
         *
         *                sysdc_max_pset_DC * break
         *  ==> nobreak = -------------------------
         *                 1 - sysdc_max_pset_DC
         */
        sysdc_nobreak_updates =
            HOWMANY((uint64_t)sysdc_break_updates * sysdc_max_pset_DC,
            (SYSDC_DC_MAX - sysdc_max_pset_DC));
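
        /*
         * A worked example, assuming the default tunables above and that
         * SYSDC_DC_MAX is 100 (i.e. duty cycles are percentages): a 20ms
         * update interval and an 80ms break give sysdc_break_updates = 4,
         * and a 90% sysdc_max_pset_DC gives sysdc_nobreak_updates =
         * HOWMANY(4 * 90, 10) = 36.  A pset which keeps hitting the limit
         * therefore alternates between 4 "break" updates and 36 "nobreak"
         * updates, i.e. it is on break roughly 10% of the time.
         */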

        sysdc_param_init = 1;
}

#define SDC_UPDATE_INITIAL      0x1     /* for the initial update */
#define SDC_UPDATE_TIMEOUT      0x2     /* from sysdc_update() */
#define SDC_UPDATE_TICK         0x4     /* from sysdc_tick(), on expiry */

/*
 * Updates the recorded times in the sdc, and returns the elapsed ONPROC
 * and Runnable times since the last reset.
 *
 * newO is the thread's actual ONPROC time; it's used during sysdc_update()
 * to track processor set usage.
 */
sysdc_update_times(sysdc_t *sdc, uint_t flags,
    hrtime_t *O, hrtime_t *R, hrtime_t *newO)
{
        kthread_t *const t = sdc->sdc_thread;
        const uint_t initial = (flags & SDC_UPDATE_INITIAL);
        const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
        const clock_t now = ddi_get_lbolt();

        ASSERT(THREAD_LOCK_HELD(t));

        /* If we've been sleeping, we know we haven't had any ONPROC time. */
        if (sdc->sdc_sleep_updates != 0 &&
            sdc->sdc_sleep_updates != sdc->sdc_nupdates) {
                *newO = sdc->sdc_last_base_O;
                SYSDC_INC_STAT(sysdc_update_times_asleep);

        /*
         * If this is our first update, or we've hit the reset point,
         * we need to reset our base_{O,R}.  Once we've updated them, we
         * report O and R for the entire prior interval.
         */
        if ((sdc->sdc_nupdates % sysdc_reset_updates) == 0)

        hrtime_t baseO, baseR;

        /*
         * Start off our cycle count somewhere in the middle,
         * to keep the resets from all happening at once.
         *
         * 4999 is a handy prime much larger than
         * sysdc_reset_updates, so that we don't run into
         * trouble if the resolution is a multiple of
         * sysdc_reset_updates.
         */
        sdc->sdc_nupdates = (uint_t)((gethrtime() % 4999) %
            sysdc_reset_updates);

        baseO = sdc->sdc_base_O;
        baseR = sdc->sdc_base_R;

        mstate_systhread_times(t, &sdc->sdc_base_O, &sdc->sdc_base_R);
        *newO = sdc->sdc_base_O;

        sdc->sdc_reset = now;
        sdc->sdc_pri_check = -1;        /* force mismatch below */

        /*
         * See below for rationale.
         */
        if (baseO > sdc->sdc_base_O || baseR > sdc->sdc_base_R) {
                SYSDC_INC_STAT(sysdc_update_times_base_ran_backwards);
                baseO = sdc->sdc_base_O;
                baseR = sdc->sdc_base_R;
        }

        /* compute based on the entire interval */
        *O = (sdc->sdc_base_O - baseO);
        *R = (sdc->sdc_base_R - baseR);

        /*
         * If we're called from sysdc_update(), we *must* return a value
         * for newO, so we always call mstate_systhread_times().
         *
         * Otherwise, if we've already done a pri check this tick,
         * we don't need to do it again.
         */
        if (!update && sdc->sdc_pri_check == now) {
                SYSDC_INC_STAT(sysdc_update_times_already_done);

        /* Get the current times from the thread */
        sdc->sdc_pri_check = now;
        mstate_systhread_times(t, &sdc->sdc_cur_O, &sdc->sdc_cur_R);
        *newO = sdc->sdc_cur_O;

        /*
         * The updating of microstate accounting is not done under a
         * consistent set of locks, particularly the t_waitrq field.  This
         * can lead to narrow windows in which we account for time in the
         * wrong bucket, which on the next read will be accounted for
         * correctly.
         *
         * If our sdc_base_* fields were affected by one of these blips, we
         * throw away the old data, and pretend this tick didn't happen.
         */
        if (sdc->sdc_cur_O < sdc->sdc_base_O ||
            sdc->sdc_cur_R < sdc->sdc_base_R) {
                sdc->sdc_base_O = sdc->sdc_cur_O;
                sdc->sdc_base_R = sdc->sdc_cur_R;
                SYSDC_INC_STAT(sysdc_update_times_cur_ran_backwards);
        }

        *O = sdc->sdc_cur_O - sdc->sdc_base_O;
        *R = sdc->sdc_cur_R - sdc->sdc_base_R;
}

/*
 * sysdc_compute_pri()
 *
 * Recomputes the priority of the thread, leaving the result in
 * sdc->sdc_epri.  Returns 1 if a priority update should occur
 * (which will also trigger a cpu_surrender()), otherwise
 * returns 0.
 */
sysdc_compute_pri(sysdc_t *sdc, uint_t flags)
{
        kthread_t *const t = sdc->sdc_thread;
        const uint_t update = (flags & SDC_UPDATE_TIMEOUT);
        const uint_t tick = (flags & SDC_UPDATE_TICK);
        hrtime_t O, R, newO = -1;

        ASSERT(THREAD_LOCK_HELD(t));

        sysdc_update_times(sdc, flags, &O, &R, &newO);
        ASSERT(!update || newO != -1);

        /* If we have new data, recompute our priority. */
        sdc->sdc_cur_DC = (O * SYSDC_DC_MAX) / (O + R);

        /* Adjust our priority to move our DC closer to the target. */
        if (sdc->sdc_cur_DC < sdc->sdc_target_DC)
                sdc->sdc_pri = sdc->sdc_maxpri;
        else
                sdc->sdc_pri = sdc->sdc_minpri;

        /*
         * If our per-pset duty cycle goes over the max, we will take a break.
         * This forces all sysdc threads in the pset to minimum priority, in
         * order to let everyone else have a chance at the CPU.
         */
        if (sdc->sdc_pset->sdp_need_break) {
                SYSDC_INC_STAT(sysdc_compute_pri_breaking);
                sdc->sdc_epri = sdc->sdc_minpri;
        } else {
                sdc->sdc_epri = sdc->sdc_pri;
        }

        DTRACE_PROBE4(sysdc__compute__pri,
            kthread_t *, t, pri_t, sdc->sdc_epri, uint_t, sdc->sdc_cur_DC,
            uint_t, sdc->sdc_target_DC);

        /*
         * For sysdc_update(), we compute the ONPROC time for high-priority
         * threads, which is used to calculate the per-pset duty cycle.  We
         * will always tell our callers to update the thread's priority,
         * since we want to force a cpu_surrender().
         *
         * We reset sdc_update_ticks so that sysdc_tick() will only update
         * the thread's priority if our timeout is delayed by a tick or
         * more.
         */

        /* SDC threads are not allowed to change cpupart bindings. */
        ASSERT(t->t_cpupart == sdc->sdc_pset->sdp_cpupart);

        /* If we were at MAXPRI, account for our onproc time. */
        if (t->t_pri == sdc->sdc_maxpri &&
            sdc->sdc_last_base_O != 0 &&
            sdc->sdc_last_base_O < newO) {
                sdc->sdc_last_O = newO - sdc->sdc_last_base_O;
                sdc->sdc_pset->sdp_onproc_time +=
                    (uint64_t)sdc->sdc_last_O;
                sdc->sdc_pset->sdp_onproc_threads++;
        }
        sdc->sdc_last_base_O = newO;

        sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks + 1;

        /*
         * Like sysdc_update(), sysdc_tick() always wants to update the
         * thread's priority, so that the CPU is surrendered if necessary.
         * We reset sdc_update_ticks so that if the timeout continues to be
         * delayed, we'll update at the regular interval.
         */
        ASSERT(sdc->sdc_ticks == sdc->sdc_update_ticks);
        sdc->sdc_update_ticks = sdc->sdc_ticks + sysdc_update_ticks;

        /*
         * Otherwise, only tell our callers to update the priority if it has
         * changed.
         */
        return (sdc->sdc_epri != t->t_pri);
}

sysdc_update_pri(sysdc_t *sdc, uint_t flags)
{
        kthread_t *t = sdc->sdc_thread;

        ASSERT(THREAD_LOCK_HELD(t));

        if (sysdc_compute_pri(sdc, flags)) {
                if (!thread_change_pri(t, sdc->sdc_epri, 0)) {
                        cpu_surrender(t);
                }
        }
}

/*
 * Add a thread onto the active list.  It will only be removed by
 * sysdc_update().
 */
sysdc_activate(sysdc_t *sdc)
{
        sysdc_t *volatile *headp = &SYSDC_LIST(sdc)->sdl_list;
        sysdc_t *head;
        kthread_t *t = sdc->sdc_thread;

        SYSDC_INC_STAT(sysdc_activate_enter);

        ASSERT(sdc->sdc_next == NULL);
        ASSERT(THREAD_LOCK_HELD(t));

        do {
                head = *headp;
                sdc->sdc_next = head;
        } while (atomic_cas_ptr(headp, head, sdc) != head);
}

/*
 * sysdc_update() has two jobs:
 *
 *      1.  It updates the priorities of all active SDC threads on the system.
 *      2.  It measures pset CPU usage and enforces sysdc_max_pset_DC.
 */
sysdc_update(void *arg)
{
        sysdc_t *freelist = NULL;
        sysdc_pset_t *cur;
        hrtime_t now, diff;
        int idx;

        SYSDC_INC_STAT(sysdc_update_enter);

        ASSERT(sysdc_update_timeout_started);

        /*
         * If this is our first time through, diff will be gigantic, and
         * no breaks will be necessary.
         */
        now = gethrtime();
        diff = now - sysdc_last_update;
        sysdc_last_update = now;

        mutex_enter(&sysdc_pset_lock);
        for (cur = list_head(&sysdc_psets); cur != NULL;
            cur = list_next(&sysdc_psets, cur)) {
                boolean_t breaking = (cur->sdp_should_break != 0);

                if (cur->sdp_need_break != breaking) {
                        DTRACE_PROBE2(sdc__pset__break, sysdc_pset_t *, cur,
                            boolean_t, breaking);
                }
                cur->sdp_onproc_time = 0;
                cur->sdp_onproc_threads = 0;
                cur->sdp_need_break = breaking;
        }
        mutex_exit(&sysdc_pset_lock);

        for (idx = 0; idx < SYSDC_NLISTS; idx++) {
                sysdc_list_t *sdl = &sysdc_active[idx];
                sysdc_t *volatile *headp = &sdl->sdl_list;
                sysdc_t *head, *tail;

                if (*headp == &sysdc_dummy)

                /* Prevent any threads from exiting while we're poking them. */
                mutex_enter(&sdl->sdl_lock);

                /*
                 * Each sdl_list contains a singly-linked list of active
                 * threads.  Threads which become active while we are
                 * processing the list will be added to sdl_list.  Since we
                 * don't want that to interfere with our own processing, we
                 * swap in an empty list.  Any newly active threads will
                 * go on to this empty list.  When finished, we'll put any
                 * such threads at the end of the processed list.
                 */
                head = atomic_swap_ptr(headp, &sysdc_dummy);
                while (*prevptr != &sysdc_dummy) {
                        sysdc_t *const sdc = *prevptr;
                        kthread_t *const t = sdc->sdc_thread;

                        /*
                         * If the thread has exited, move its sysdc_t onto
                         * freelist, to be freed later.
                         */
                        *prevptr = sdc->sdc_next;
                        SYSDC_INC_STAT(sysdc_update_exited);
                        sdc->sdc_next = freelist;

                        if (t->t_cid != sysdccid) {
                                prevptr = &sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_not_sdc);
                        }
                        ASSERT(t->t_cldata == sdc);

                        /*
                         * If the thread has been sleeping for longer
                         * than sysdc_prune_interval, make it inactive by
                         * removing it from the list.
                         */
                        if (!(t->t_state & (TS_RUN | TS_ONPROC)) &&
                            sdc->sdc_sleep_updates != 0 &&
                            (sdc->sdc_sleep_updates - sdc->sdc_nupdates) >
                            sysdc_prune_updates) {
                                *prevptr = sdc->sdc_next;
                                SYSDC_INC_STAT(sysdc_update_idle);
                                sdc->sdc_next = NULL;
                        }
                        sysdc_update_pri(sdc, SDC_UPDATE_TIMEOUT);

                        prevptr = &sdc->sdc_next;
                }

                /*
                 * Add our list to the bucket, putting any new entries
                 * added while we were working at the tail of the list.
                 */
                } while (atomic_cas_ptr(headp, tail, head) != tail);

                mutex_exit(&sdl->sdl_lock);
        }

        mutex_enter(&sysdc_pset_lock);
        for (cur = list_head(&sysdc_psets); cur != NULL;
            cur = list_next(&sysdc_psets, cur)) {
                cur->sdp_vtime_last_interval =
                    diff * cur->sdp_cpupart->cp_ncpus;
                cur->sdp_DC_last_interval =
                    (cur->sdp_onproc_time * SYSDC_DC_MAX) /
                    cur->sdp_vtime_last_interval;

                if (cur->sdp_should_break > 0) {
                        cur->sdp_should_break--;        /* breaking */
                }
                if (cur->sdp_dont_break > 0) {
                        cur->sdp_dont_break--;  /* waiting before checking */
                }
                if (cur->sdp_DC_last_interval > sysdc_max_pset_DC) {
                        cur->sdp_should_break = sysdc_break_updates;
                        cur->sdp_dont_break = sysdc_nobreak_updates;
                        SYSDC_INC_STAT(sysdc_update_take_break);
                }
        }

        /*
         * If there are no sysdc_psets, there can be no threads, so
         * we can stop doing our timeout.  Since we're holding the
         * sysdc_pset_lock, no new sysdc_psets can come in, which will
         * prevent anyone from racing with this and dropping our timeout
         * on the floor.
         */
        if (list_is_empty(&sysdc_psets)) {
                SYSDC_INC_STAT(sysdc_update_no_psets);
                ASSERT(sysdc_update_timeout_started);
                sysdc_update_timeout_started = 0;
        }
        mutex_exit(&sysdc_pset_lock);

        while (freelist != NULL) {
                sysdc_t *cur = freelist;

                freelist = cur->sdc_next;
                kmem_free(cur, sizeof (*cur));
        }

        (void) timeout(sysdc_update, arg, sysdc_update_ticks);
}

sysdc_preempt(kthread_t *t)
{
        ASSERT(t == curthread);
        ASSERT(THREAD_LOCK_HELD(t));

        setbackdq(t);           /* give others a chance to run */
}

sysdc_tick(kthread_t *t)
{
        if (t->t_cid != sysdccid) {
                SYSDC_INC_STAT(sysdc_tick_not_sdc);
        }
        if (t->t_state == TS_ONPROC &&
            t->t_pri < t->t_disp_queue->disp_maxrunpri) {

        if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
                ASSERT(sdc->sdc_sleep_updates == 0);
        }

        ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
        if (sdc->sdc_ticks == sdc->sdc_update_ticks) {
                SYSDC_INC_STAT(sysdc_tick_quantum_expired);
                sysdc_update_pri(sdc, SDC_UPDATE_TICK);
                ASSERT(sdc->sdc_ticks != sdc->sdc_update_ticks);
        }

sysdc_setrun(kthread_t *t)
{
        sysdc_t *sdc = t->t_cldata;

        ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */

        sdc->sdc_sleep_updates = 0;

        if (sdc->sdc_next == NULL) {
                /*
                 * Since we're in transition, we don't want to use the
                 * full thread_update_pri().
                 */
                if (sysdc_compute_pri(sdc, 0)) {
                        THREAD_CHANGE_PRI(t, sdc->sdc_epri);
                }

                ASSERT(sdc->sdc_next != NULL);
        }

sysdc_wakeup(kthread_t *t)

sysdc_sleep(kthread_t *t)
{
        sysdc_t *sdc = t->t_cldata;

        ASSERT(THREAD_LOCK_HELD(t));    /* t should be in transition */

        sdc->sdc_sleep_updates = sdc->sdc_nupdates;
}

sysdc_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
        cpupart_t *const cpupart = t->t_cpupart;
        sysdc_params_t *sdpp = parmsp;
        sysdc_pset_t *newpset = sdc->sdc_pset;

        if (t->t_cid != syscid)

        ASSERT(ttolwp(t) != NULL);
        ASSERT(sdpp != NULL);
        ASSERT(newpset != NULL);
        ASSERT(sysdc_param_init);

        ASSERT(sdpp->sdp_minpri >= sysdc_minpri);
        ASSERT(sdpp->sdp_maxpri <= sysdc_maxpri);
        ASSERT(sdpp->sdp_DC >= sysdc_minDC);
        ASSERT(sdpp->sdp_DC <= sysdc_maxDC);

        sdc->sdc_pri = sdpp->sdp_maxpri;        /* start off maximally */
        sdc->sdc_minpri = sdpp->sdp_minpri;
        sdc->sdc_maxpri = sdpp->sdp_maxpri;
        sdc->sdc_target_DC = sdpp->sdp_DC;
        sdc->sdc_update_ticks = sysdc_update_ticks + 1;

        /* Assign ourselves to the appropriate pset. */
        sdc->sdc_pset = NULL;
        mutex_enter(&sysdc_pset_lock);
        for (pset = list_head(&sysdc_psets); pset != NULL;
            pset = list_next(&sysdc_psets, pset)) {
                if (pset->sdp_cpupart == cpupart) {

        pset->sdp_cpupart = cpupart;
        list_insert_tail(&sysdc_psets, pset);

        pset->sdp_nthreads++;
        ASSERT(pset->sdp_nthreads > 0);

        sdc->sdc_pset = pset;

        start_timeout = (sysdc_update_timeout_started == 0);
        sysdc_update_timeout_started = 1;
        mutex_exit(&sysdc_pset_lock);

        if (newpset != NULL)
                kmem_free(newpset, sizeof (*newpset));

        /* Update t's scheduling class and priority. */
        t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
        t->t_schedflag |= TS_RUNQMATCH;

        sysdc_update_pri(sdc, SDC_UPDATE_INITIAL);

        /* Kick off the thread timeout if we're the first one in. */
        if (start_timeout) {
                (void) timeout(sysdc_update, NULL, sysdc_update_ticks);
        }

sysdc_leave(sysdc_t *sdc)
{
        sysdc_pset_t *sdp = sdc->sdc_pset;
        sysdc_list_t *sdl = SYSDC_LIST(sdc);

        mutex_enter(&sdl->sdl_lock);            /* block sysdc_update() */
        sdc->sdc_thread = NULL;
        freedc = (sdc->sdc_next == NULL);
        mutex_exit(&sdl->sdl_lock);

        mutex_enter(&sysdc_pset_lock);
        ASSERT(sdp != NULL);
        ASSERT(sdp->sdp_nthreads > 0);
        --sdp->sdp_nthreads;
        if (sdp->sdp_nthreads == 0) {
                list_remove(&sysdc_psets, sdp);
        }
        mutex_exit(&sysdc_pset_lock);

        kmem_free(sdc, sizeof (*sdc));
        kmem_free(sdp, sizeof (*sdp));
}

sysdc_exitclass(void *buf)
{
        sysdc_leave((sysdc_t *)buf);
}

sysdc_canexit(kthread_t *t, cred_t *reqpcredp)
{
        /* Threads cannot exit SDC once joined, except in a body bag. */

sysdc_exit(kthread_t *t)
{
        /* We're exiting, so we just rejoin the SYS class. */
        ASSERT(t->t_cid == sysdccid);
        t->t_clfuncs = &(sclass[syscid].cl_funcs->thread);
        (void) thread_change_pri(t, maxclsyspri, 0);
        t->t_schedflag &= ~TS_RUNQMATCH;
        thread_unlock_nopreempt(t);

        /* Unlink the sdc from everything. */

sysdc_fork(kthread_t *t, kthread_t *ct, void *bufp)
{
        /*
         * Threads cannot be created with SDC as their class; they must
         * be created as SYS and then added with sysdc_thread_enter().
         * Because of this restriction, sysdc_fork() should never be called.
         */
        panic("sysdc cannot be forked");
}

sysdc_forkret(kthread_t *t, kthread_t *ct)
{
        /* SDC threads are part of system processes, which never fork. */
        panic("sysdc cannot be forked");
}

sysdc_globpri(kthread_t *t)

/*
 * Get maximum and minimum priorities enjoyed by SDC threads.
 */
sysdc_getclpri(pcpri_t *pcprip)
{
        pcprip->pc_clpmax = sysdc_maxpri;
        pcprip->pc_clpmin = sysdc_minpri;

sysdc_getclinfo(void *arg)
{
        return (0);             /* no class-specific info */
}

sysdc_alloc(void **p, int flag)
{
        if ((new = kmem_zalloc(sizeof (*new), flag)) == NULL) {

        if ((new->sdc_pset = kmem_zalloc(sizeof (*new->sdc_pset), flag)) ==
            NULL) {
                kmem_free(new, sizeof (*new));

        /*
         * We must have failed CL_ENTERCLASS(), so our pset should be
         * unused.
         */
        ASSERT(sdc->sdc_pset != NULL);
        ASSERT(sdc->sdc_pset->sdp_cpupart == NULL);
        kmem_free(sdc->sdc_pset, sizeof (*sdc->sdc_pset));
        kmem_free(sdc, sizeof (*sdc));

static int sysdc_enosys();      /* Boy, ANSI-C's K&R compatibility is weird. */
static int sysdc_einval();
static void sysdc_nullsys();

static struct classfuncs sysdc_classfuncs = {
        /* messages to class manager */
        sysdc_enosys,   /* admin */
        sysdc_enosys,   /* parmsin */
        sysdc_enosys,   /* parmsout */
        sysdc_enosys,   /* vaparmsin */
        sysdc_enosys,   /* vaparmsout */

        /* operations on threads */
        sysdc_nullsys,  /* parmsget */
        sysdc_enosys,   /* parmsset */
        sysdc_nullsys,  /* stop */
        sysdc_nullsys,  /* active */
        sysdc_nullsys,  /* inactive */
        sysdc_nullsys,  /* trapret */
        sysdc_einval,   /* donice */
        sysdc_nullsys,  /* set_process_group */
        sysdc_nullsys,  /* yield */
        sysdc_einval,   /* doprio */

sysdc_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
{
        list_create(&sysdc_psets, sizeof (sysdc_pset_t),
            offsetof(sysdc_pset_t, sdp_node));

        for (idx = 0; idx < SYSDC_NLISTS; idx++) {
                sysdc_active[idx].sdl_list = &sysdc_dummy;
        }

        *clfuncspp = &sysdc_classfuncs;

        return ((pri_t)v.v_maxsyspri);
}

static struct sclass csw = {

static struct modlsched modlsched = {
        &mod_schedops, "system duty cycle scheduling class", &csw
};

static struct modlinkage modlinkage = {
        MODREV_1, (void *)&modlsched, NULL
};

        return (mod_install(&modlinkage));

        return (EBUSY);         /* can't unload for now */

_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

/* --- consolidation-private interfaces --- */
sysdc_thread_enter(kthread_t *t, uint_t dc, uint_t flags)
{
        SYSDC_INC_STAT(sysdc_thread_enter_enter);

        ASSERT(sysdc_param_init);
        ASSERT(sysdccid >= 0);

        ASSERT((flags & ~SYSDC_THREAD_BATCH) == 0);

        sdp.sdp_minpri = sysdc_minpri;
        sdp.sdp_maxpri = sysdc_maxpri;
        sdp.sdp_DC = MAX(MIN(dc, sysdc_maxDC), sysdc_minDC);

        VERIFY0(CL_ALLOC(&buf, sysdccid, KM_SLEEP));

        ASSERT(t->t_lwp != NULL);
        ASSERT(t->t_cid == syscid);
        ASSERT(t->t_cldata == NULL);
        VERIFY0(CL_CANEXIT(t, NULL));
        VERIFY0(CL_ENTERCLASS(t, sysdccid, &sdp, kcred, buf));
        CL_EXITCLASS(syscid, NULL);
}