kernel/sched: convert to C99 bool
kernel/sched/fss/fss.c
1 /*
2 * CDDL HEADER START
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
19 * CDDL HEADER END
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013, Joyent, Inc. All rights reserved.
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/sysmacros.h>
30 #include <sys/cred.h>
31 #include <sys/proc.h>
32 #include <sys/strsubr.h>
33 #include <sys/priocntl.h>
34 #include <sys/class.h>
35 #include <sys/disp.h>
36 #include <sys/procset.h>
37 #include <sys/debug.h>
38 #include <sys/kmem.h>
39 #include <sys/errno.h>
40 #include <sys/systm.h>
41 #include <sys/schedctl.h>
42 #include <sys/vmsystm.h>
43 #include <sys/atomic.h>
44 #include <sys/project.h>
45 #include <sys/modctl.h>
46 #include <sys/fss.h>
47 #include <sys/fsspriocntl.h>
48 #include <sys/cpupart.h>
49 #include <sys/zone.h>
50 #include <vm/rm.h>
51 #include <vm/seg_kmem.h>
52 #include <sys/tnf_probe.h>
53 #include <sys/policy.h>
54 #include <sys/sdt.h>
55 #include <sys/cpucaps.h>
58 * The fair share scheduling class ensures that collections of processes
59 * (zones and projects) each get their configured share of CPU. This is in
60 * contrast to the TS class which considers individual processes.
62 * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on
63 * projects using the project.cpu-shares rctl. By default the value is 1
64 * and it can range from 0 to 64k. A value of 0 means that processes in the
65 * collection will only get CPU resources when there are no other processes
66 * that need CPU. The cpu-share is used as one of the inputs to calculate a
67 * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls
68 * in the range 0-59. FSS calculates other, internal, priorities which are not
69 * visible outside of the FSS class.
71 * The FSS class should approximate TS behavior when there are excess CPU
72 * resources. When there is a backlog of runnable processes, then the share
73 * is used as input into the runnable process's priority calculation, where
74 * the final umdpri is used by the scheduler to determine when the process runs.
76 * Projects in a zone compete with each other for CPU time, receiving CPU
77 * allocation within a zone proportional to the project's share; at a higher
78 * level zones compete with each other, receiving allocation in a pset
79 * proportional to the zone's share.
81 * The FSS priority calculation consists of several parts.
83 * 1) Once per second the fss_update function runs. The first thing it does is
84 * call fss_decay_usage. This function does three things.
86 * a) fss_decay_usage first decays the maxfsspri value for the pset. This
87 * value is used in the per-process priority calculation described in step
88 * (2b). The maxfsspri is decayed using the following formula:
90 * maxfsspri * fss_nice_decay[NZERO]
91 * maxfsspri = ------------------------------------
92 * FSS_DECAY_BASE
95 * - NZERO is the default process priority (i.e. 20)
97 * The fss_nice_decay array is a fixed set of values used to adjust the
98 * decay rate of processes based on their nice value. Entries in this
99 * array are initialized in fss_init using the following formula:
101 * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i
102 * FSS_DECAY_MIN + -------------------------------------
103 * FSS_NICE_RANGE - 1
105 * - FSS_DECAY_MIN is 83 = approximates 65% (83/128)
106 * - FSS_DECAY_MAX is 108 = approximates 85% (108/128)
107 * - FSS_NICE_RANGE is 40 (range is 0 - 39)
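 *
 * For example, plugging in the endpoints and the default nice value:
 * fss_nice_decay[0] = 83 (~65%, fastest decay, nice -20),
 * fss_nice_decay[NZERO] = 83 + (25 * 20) / 39 = 95 (~74%, nice 0), and
 * fss_nice_decay[39] = 108 (~85%, slowest decay, nice +19).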
109 * b) The second thing fss_decay_usage does is update each project's "usage"
110 * for the last second and then recalculate the project's "share usage".
112 * The usage value is the recent CPU usage for all of the threads in the
113 * project. It is decayed and updated this way:
115 * (usage * FSS_DECAY_USG)
116 * usage = ------------------------- + ticks;
117 * FSS_DECAY_BASE
119 * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide
120 * - FSS_DECAY_USG is 96 - approximates 75% (96/128)
121 * - ticks is updated whenever a process in this project is running
122 * when the scheduler's tick processing fires. This is not a simple
123 * counter, the values are based on the entries in the fss_nice_tick
124 * array (see section 3 below). ticks is then reset to 0 so it can track
125 * the next second's worth of nice-adjusted time for the project.
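 *
 * For example, a project whose decayed usage was 1000 and whose threads
 * accumulated 400 nice-adjusted ticks over the last second ends up with
 * usage = (1000 * 96) / 128 + 400 = 750 + 400 = 1150.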
127 * c) The third thing fss_decay_usage does is update each project's "share
128 * usage" (shusage). This is the normalized usage value for the project and
129 * is calculated this way:
131 * pset_shares^2 zone_int_shares^2
132 * usage * ------------- * ------------------
133 * kpj_shares^2 zone_ext_shares^2
135 * - usage - see (1b) for more details
136 * - pset_shares is the total of all *active* zone shares in the pset (by
137 * default there is only one pset)
138 * - kpj_shares is the individual project's share (project.cpu-shares rctl)
139 * - zone_int_shares is the sum of shares of all active projects within the
140 * zone (the zone-internal total)
141 * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl)
143 * The shusage is used in step (2b) to calculate the thread's new internal
144 * priority. A larger shusage value leads to a lower priority.
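 *
 * For example, with a single active zone (so pset_shares equals
 * zone_ext_shares and those terms cancel), a project holding 1 of the
 * zone's 3 active project shares gets shusage = usage * 9, while a
 * project holding 2 of the 3 gets shusage = usage * 9 / 4; the smaller
 * share yields the larger shusage and hence the lower priority.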
146 * 2) The fss_update function then calls fss_update_list to update the priority
147 * of all threads. This does two things.
149 * a) First the thread's internal priority is decayed using the following
150 * formula:
152 * fsspri * fss_nice_decay[nice_value]
153 * fsspri = ------------------------------------
154 * FSS_DECAY_BASE
156 * - FSS_DECAY_BASE is 128 as described above
158 * b) Second, if the thread is runnable (TS_RUN or TS_WAIT) calls fss_newpri
159 * to update the user-mode priority (umdpri) of the runnable thread.
160 * Threads that are running (TS_ONPROC) or waiting for an event (TS_SLEEP)
161 * are not updated at this time. The updated user-mode priority can cause
162 * threads to change their position in the run queue.
164 * The process's new internal fsspri is calculated using the following
165 * formula. All runnable threads in the project will use the same shusage
166 * and nrunnable values in their calculation.
168 * fsspri += shusage * nrunnable * ticks
170 * - shusage is the project's share usage, calculated in (1c)
171 * - nrunnable is the number of runnable threads in the project
172 * - ticks is the number of ticks this thread ran since the last fss_newpri
173 * invocation.
175 * Finally the process's new user-mode priority is calculated using the
176 * following formula:
178 * (fsspri * umdprirange)
179 * umdpri = maxumdpri - ------------------------
180 * maxfsspri
182 * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59)
183 * - umdprirange is maxumdpri - 1 (i.e. 58)
184 * - maxfsspri is the largest fsspri seen so far, as we're iterating all
185 * runnable processes
187 * Thus, a higher internal priority (fsspri) leads to a lower user-mode
188 * priority which means the thread runs less. The fsspri is higher when
189 * the project's normalized share usage is higher, when the project has
190 * more runnable threads, or when the thread has accumulated more run-time.
192 * This code has various checks to ensure the resulting umdpri is in the
193 * range 1-59. See fss_newpri for more details.
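 *
 * For example, if maxfsspri is 1000000 and a runnable thread's fsspri is
 * 250000, then umdpri = 59 - (250000 * 58) / 1000000 = 59 - 14 = 45.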
195 * To reiterate, the above processing is performed once per second to recompute
196 * the runnable thread user-mode priorities.
198 * 3) The final major component in the priority calculation is the tick
199 * processing which occurs on a thread that is running when the clock
200 * calls fss_tick.
202 * A thread can run continuously in user-land (compute-bound) for the
203 * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties).
204 * The fss_quantum defaults to 11 (i.e. 11 ticks).
206 * Once the quantum has been consumed, the thread will call fss_newpri to
207 * recompute its umdpri priority, as described above in (2b). Threads that
208 * were TS_ONPROC at the one second interval when runnable thread priorities
209 * were recalculated will have their umdpri priority recalculated when their
210 * quanta expire.
212 * To ensure that runnable threads within a project see the expected
213 * round-robin behavior, there is a special case in fss_newpri for a thread
214 * that has run for its quantum within the one second update interval. See
215 * the handling for the quanta_up parameter within fss_newpri.
217 * Also of interest, the fss_tick code increments the project's tick value
218 * using the fss_nice_tick array entry for the thread's nice value. The idea
219 * behind the fss_nice_tick array is that the cost of a tick is lower at
220 * positive nice values (so that it doesn't increase the project's usage
221 * as much as normal) with a 50% drop at the maximum level and a 50%
222 * increase at the minimum level. See (1b). The fss_nice_tick array is
223 * initialized in fss_init using the following formula:
225 * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i)
226 * --------------------------------------------------
227 * FSS_NICE_RANGE
229 * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0
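 *
 * For example, fss_nice_tick[0] = 1000 * 60 / 40 = 1500 (nice -20, 50%
 * more expensive than normal), fss_nice_tick[NZERO] = 1000 * 40 / 40 =
 * 1000 (nice 0), and fss_nice_tick[39] = 1000 * 21 / 40 = 525 (nice +19,
 * roughly half cost).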
231 * FSS Data Structures:
233 *                           fsszone
234 *                    -----             -----
235 *     -----          |   |             |   |
236 *     |   |--------->|   |<----------->|   |<---->...
237 *     |   |          -----             -----
238 *     |   |            ^                 ^      ^
239 *     |   |---         |                  \      \
240 *     -----   |        |                   \      \
241 *    fsspset  |        |                    \      \
242 *             |        |                     \      \
243 *             |      -----        -----        -----
244 *             ------>|   |<------>|   |<------>|   |
245 *                    |   |        |   |        |   |
246 *                    -----        -----        -----
247 *                                fssproj
249 * That is, fsspsets contain a list of fsszone's that are currently active in
250 * the pset, and a list of fssproj's, corresponding to projects with runnable
251 * threads on the pset. fssproj's in turn point to the fsszone which they
252 * are a member of.
254 * An fssproj_t is removed when there are no threads in it.
256 * An fsszone_t is removed when there are no projects with threads in it.
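/*
 * The following is a minimal, illustrative user-space sketch of the
 * once-per-second pipeline described above, for a single pset running a
 * single zone with a single project. It is not part of this file's build;
 * the constants mirror the #defines below and every name in the sketch is
 * hypothetical.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

#define DECAY_BASE	128		/* FSS_DECAY_BASE */
#define DECAY_USG	96		/* FSS_DECAY_USG */
#define NICE_DECAY	95		/* fss_nice_decay[NZERO] (nice 0) */
#define MAXUMDPRI	59		/* fss_maxumdpri */

int
main(void)
{
	uint64_t usage = 0, shusage, fsspri = 0;
	uint64_t maxfsspri = MAXUMDPRI;
	uint64_t kpj_shares = 1, zone_int_shares = 1;
	uint64_t nrunnable = 2, ticks = 1000;
	int sec;

	for (sec = 0; sec < 5; sec++) {
		/* (1a) decay the pset-wide maximum fsspri */
		maxfsspri = maxfsspri * NICE_DECAY / DECAY_BASE;
		if (maxfsspri < MAXUMDPRI)
			maxfsspri = MAXUMDPRI;

		/* (1b) decay project usage, add this second's ticks */
		usage = usage * DECAY_USG / DECAY_BASE + ticks;

		/*
		 * (1c) normalize usage; with one zone and one project all
		 * share ratios are 1, so shusage equals usage here.
		 */
		shusage = usage * (zone_int_shares * zone_int_shares) /
		    (kpj_shares * kpj_shares);

		/* (2a, 2b) decay fsspri, then add the new increment */
		fsspri = fsspri * NICE_DECAY / DECAY_BASE;
		fsspri += shusage * nrunnable * ticks;
		if (fsspri > maxfsspri)
			maxfsspri = fsspri;

		printf("sec %d: usage %ju fsspri %ju umdpri %ju\n", sec,
		    (uintmax_t)usage, (uintmax_t)fsspri,
		    (uintmax_t)(MAXUMDPRI -
		    fsspri * (MAXUMDPRI - 1) / maxfsspri));
	}
	return (0);
}
#endif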
259 static pri_t fss_init(id_t, int, classfuncs_t **);
261 static struct sclass fss = {
262 "FSS",
263 fss_init,
267 extern struct mod_ops mod_schedops;
270 * Module linkage information for the kernel.
272 static struct modlsched modlsched = {
273 &mod_schedops, "fair share scheduling class", &fss
276 static struct modlinkage modlinkage = {
277 MODREV_1, (void *)&modlsched, NULL
280 #define FSS_MAXUPRI 60
283 * The fssproc_t structures are kept in an array of circular doubly linked
284 * lists. A hash on the thread pointer is used to determine which list each
285 * thread should be placed in. Each list has a dummy "head" which is never
286 * removed, so the list is never empty. fss_update traverses these lists to
287 * update the priorities of threads that have been waiting on the run queue.
289 #define FSS_LISTS 16 /* number of lists, must be power of 2 */
290 #define FSS_LIST_HASH(t) (((uintptr_t)(t) >> 9) & (FSS_LISTS - 1))
291 #define FSS_LIST_NEXT(i) (((i) + 1) & (FSS_LISTS - 1))
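/*
 * Note: FSS_LIST_HASH discards the low-order 9 bits of the thread pointer
 * before masking. This is presumably because kthread_t allocations are
 * large and aligned, so the low bits carry little entropy; the choice of
 * 9 is a tuning detail this file does not document.
 */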
293 #define FSS_LIST_INSERT(fssproc) \
295 int index = FSS_LIST_HASH(fssproc->fss_tp); \
296 kmutex_t *lockp = &fss_listlock[index]; \
297 fssproc_t *headp = &fss_listhead[index]; \
298 mutex_enter(lockp); \
299 fssproc->fss_next = headp->fss_next; \
300 fssproc->fss_prev = headp; \
301 headp->fss_next->fss_prev = fssproc; \
302 headp->fss_next = fssproc; \
303 mutex_exit(lockp); \
306 #define FSS_LIST_DELETE(fssproc) \
308 int index = FSS_LIST_HASH(fssproc->fss_tp); \
309 kmutex_t *lockp = &fss_listlock[index]; \
310 mutex_enter(lockp); \
311 fssproc->fss_prev->fss_next = fssproc->fss_next; \
312 fssproc->fss_next->fss_prev = fssproc->fss_prev; \
313 mutex_exit(lockp); \
316 #define FSS_TICK_COST 1000 /* tick cost for threads with nice level = 0 */
319 * Decay rate percentages are based on n/128 rather than n/100 so that
320 * calculations can avoid having to do an integer divide by 100 (divide
321 * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift).
323 * FSS_DECAY_MIN = 83/128 ~= 65%
324 * FSS_DECAY_MAX = 108/128 ~= 85%
325 * FSS_DECAY_USG = 96/128 ~= 75%
327 #define FSS_DECAY_MIN 83 /* fsspri decay pct for threads w/ nice -20 */
328 #define FSS_DECAY_MAX 108 /* fsspri decay pct for threads w/ nice +19 */
329 #define FSS_DECAY_USG 96 /* fssusage decay pct for projects */
330 #define FSS_DECAY_BASE 128 /* base for decay percentages above */
332 #define FSS_NICE_MIN 0
333 #define FSS_NICE_MAX (2 * NZERO - 1)
334 #define FSS_NICE_RANGE (FSS_NICE_MAX - FSS_NICE_MIN + 1)
336 static int fss_nice_tick[FSS_NICE_RANGE];
337 static int fss_nice_decay[FSS_NICE_RANGE];
339 static pri_t fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */
340 static pri_t fss_maxumdpri; /* maximum user mode fss priority */
341 static pri_t fss_maxglobpri; /* maximum global priority used by fss class */
342 static pri_t fss_minglobpri; /* minimum global priority */
344 static fssproc_t fss_listhead[FSS_LISTS];
345 static kmutex_t fss_listlock[FSS_LISTS];
347 static fsspset_t *fsspsets;
348 static kmutex_t fsspsets_lock; /* protects fsspsets */
350 static id_t fss_cid;
352 static int fss_quantum = 11;
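/*
 * At the common hz setting of 100, the default fss_quantum of 11 ticks is
 * roughly 110ms; hz is configurable, so this is only the typical case.
 */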
354 static void fss_newpri(fssproc_t *, bool);
355 static void fss_update(void *);
356 static int fss_update_list(int);
357 static void fss_change_priority(kthread_t *, fssproc_t *);
359 static int fss_admin(caddr_t, cred_t *);
360 static int fss_getclinfo(void *);
361 static int fss_parmsin(void *);
362 static int fss_parmsout(void *, pc_vaparms_t *);
363 static int fss_vaparmsin(void *, pc_vaparms_t *);
364 static int fss_vaparmsout(void *, pc_vaparms_t *);
365 static int fss_getclpri(pcpri_t *);
366 static int fss_alloc(void **, int);
367 static void fss_free(void *);
369 static int fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
370 static void fss_exitclass(void *);
371 static int fss_canexit(kthread_t *, cred_t *);
372 static int fss_fork(kthread_t *, kthread_t *, void *);
373 static void fss_forkret(kthread_t *, kthread_t *);
374 static void fss_parmsget(kthread_t *, void *);
375 static int fss_parmsset(kthread_t *, void *, id_t, cred_t *);
376 static void fss_stop(kthread_t *, int, int);
377 static void fss_exit(kthread_t *);
378 static void fss_active(kthread_t *);
379 static void fss_inactive(kthread_t *);
380 static void fss_trapret(kthread_t *);
381 static void fss_preempt(kthread_t *);
382 static void fss_setrun(kthread_t *);
383 static void fss_sleep(kthread_t *);
384 static void fss_tick(kthread_t *);
385 static void fss_wakeup(kthread_t *);
386 static int fss_donice(kthread_t *, cred_t *, int, int *);
387 static int fss_doprio(kthread_t *, cred_t *, int, int *);
388 static pri_t fss_globpri(kthread_t *);
389 static void fss_yield(kthread_t *);
390 static void fss_nullsys();
392 static struct classfuncs fss_classfuncs = {
393 /* class functions */
394 fss_admin,
395 fss_getclinfo,
396 fss_parmsin,
397 fss_parmsout,
398 fss_vaparmsin,
399 fss_vaparmsout,
400 fss_getclpri,
401 fss_alloc,
402 fss_free,
404 /* thread functions */
405 fss_enterclass,
406 fss_exitclass,
407 fss_canexit,
408 fss_fork,
409 fss_forkret,
410 fss_parmsget,
411 fss_parmsset,
412 fss_stop,
413 fss_exit,
414 fss_active,
415 fss_inactive,
416 fss_trapret,
417 fss_preempt,
418 fss_setrun,
419 fss_sleep,
420 fss_tick,
421 fss_wakeup,
422 fss_donice,
423 fss_globpri,
424 fss_nullsys, /* set_process_group */
425 fss_yield,
426 fss_doprio,
430 _init()
432 return (mod_install(&modlinkage));
436 _fini()
438 return (EBUSY);
442 _info(struct modinfo *modinfop)
444 return (mod_info(&modlinkage, modinfop));
447 /*ARGSUSED*/
448 static int
449 fss_project_walker(kproject_t *kpj, void *buf)
451 return (0);
454 void *
455 fss_allocbuf(int op, int type)
457 fssbuf_t *fssbuf;
458 void **fsslist;
459 int cnt;
460 int i;
461 size_t size;
463 ASSERT(op == FSS_NPSET_BUF || op == FSS_NPROJ_BUF || op == FSS_ONE_BUF);
464 ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE);
465 ASSERT(MUTEX_HELD(&cpu_lock));
467 fssbuf = kmem_zalloc(sizeof (fssbuf_t), KM_SLEEP);
468 switch (op) {
469 case FSS_NPSET_BUF:
470 cnt = cpupart_list(NULL, 0, CP_NONEMPTY);
471 break;
472 case FSS_NPROJ_BUF:
473 cnt = project_walk_all(ALL_ZONES, fss_project_walker, NULL);
474 break;
475 case FSS_ONE_BUF:
476 cnt = 1;
477 break;
480 switch (type) {
481 case FSS_ALLOC_PROJ:
482 size = sizeof (fssproj_t);
483 break;
484 case FSS_ALLOC_ZONE:
485 size = sizeof (fsszone_t);
486 break;
488 fsslist = kmem_zalloc(cnt * sizeof (void *), KM_SLEEP);
489 fssbuf->fssb_size = cnt;
490 fssbuf->fssb_list = fsslist;
491 for (i = 0; i < cnt; i++)
492 fsslist[i] = kmem_zalloc(size, KM_SLEEP);
493 return (fssbuf);
496 void
497 fss_freebuf(fssbuf_t *fssbuf, int type)
499 void **fsslist;
500 int i;
501 size_t size;
503 ASSERT(fssbuf != NULL);
504 ASSERT(type == FSS_ALLOC_PROJ || type == FSS_ALLOC_ZONE);
505 fsslist = fssbuf->fssb_list;
507 switch (type) {
508 case FSS_ALLOC_PROJ:
509 size = sizeof (fssproj_t);
510 break;
511 case FSS_ALLOC_ZONE:
512 size = sizeof (fsszone_t);
513 break;
516 for (i = 0; i < fssbuf->fssb_size; i++) {
517 if (fsslist[i] != NULL)
518 kmem_free(fsslist[i], size);
520 kmem_free(fsslist, sizeof (void *) * fssbuf->fssb_size);
521 kmem_free(fssbuf, sizeof (fssbuf_t));
524 static fsspset_t *
525 fss_find_fsspset(cpupart_t *cpupart)
527 int i;
528 fsspset_t *fsspset = NULL;
529 int found = 0;
531 ASSERT(cpupart != NULL);
532 ASSERT(MUTEX_HELD(&fsspsets_lock));
535 * Search for the cpupart pointer in the array of fsspsets.
537 for (i = 0; i < max_ncpus; i++) {
538 fsspset = &fsspsets[i];
539 if (fsspset->fssps_cpupart == cpupart) {
540 ASSERT(fsspset->fssps_nproj > 0);
541 found = 1;
542 break;
545 if (found == 0) {
547 * If we didn't find anything, then use the first
548 * available slot in the fsspsets array.
550 for (i = 0; i < max_ncpus; i++) {
551 fsspset = &fsspsets[i];
552 if (fsspset->fssps_cpupart == NULL) {
553 ASSERT(fsspset->fssps_nproj == 0);
554 found = 1;
555 break;
558 fsspset->fssps_cpupart = cpupart;
560 ASSERT(found == 1);
561 return (fsspset);
564 static void
565 fss_del_fsspset(fsspset_t *fsspset)
567 ASSERT(MUTEX_HELD(&fsspsets_lock));
568 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
569 ASSERT(fsspset->fssps_nproj == 0);
570 ASSERT(fsspset->fssps_list == NULL);
571 ASSERT(fsspset->fssps_zones == NULL);
572 fsspset->fssps_cpupart = NULL;
573 fsspset->fssps_maxfsspri = 0;
574 fsspset->fssps_shares = 0;
578 * The following routine returns a pointer to the fsszone structure which
579 * belongs to zone "zone" and cpu partition fsspset, if such structure exists.
581 static fsszone_t *
582 fss_find_fsszone(fsspset_t *fsspset, zone_t *zone)
584 fsszone_t *fsszone;
586 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
588 if (fsspset->fssps_list != NULL) {
590 * There are projects/zones active on this cpu partition
591 * already. Try to find our zone among them.
593 fsszone = fsspset->fssps_zones;
594 do {
595 if (fsszone->fssz_zone == zone) {
596 return (fsszone);
598 fsszone = fsszone->fssz_next;
599 } while (fsszone != fsspset->fssps_zones);
601 return (NULL);
605 * The following routine links a new fsszone structure into the doubly
606 * linked list of zones active on the specified cpu partition.
608 static void
609 fss_insert_fsszone(fsspset_t *fsspset, zone_t *zone, fsszone_t *fsszone)
611 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
613 fsszone->fssz_zone = zone;
614 fsszone->fssz_rshares = zone->zone_shares;
616 if (fsspset->fssps_zones == NULL) {
618 * This will be the first fsszone for this fsspset
620 fsszone->fssz_next = fsszone->fssz_prev = fsszone;
621 fsspset->fssps_zones = fsszone;
622 } else {
624 * Insert this fsszone to the doubly linked list.
626 fsszone_t *fssz_head = fsspset->fssps_zones;
628 fsszone->fssz_next = fssz_head;
629 fsszone->fssz_prev = fssz_head->fssz_prev;
630 fssz_head->fssz_prev->fssz_next = fsszone;
631 fssz_head->fssz_prev = fsszone;
632 fsspset->fssps_zones = fsszone;
637 * The following routine removes a single fsszone structure from the doubly
638 * linked list of zones active on the specified cpu partition. Note that
639 * global fsspsets_lock must be held in case this fsszone structure is the last
640 * on the above mentioned list. Also note that the fsszone structure is not
641 * freed here, it is the responsibility of the caller to call kmem_free for it.
643 static void
644 fss_remove_fsszone(fsspset_t *fsspset, fsszone_t *fsszone)
646 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
647 ASSERT(fsszone->fssz_nproj == 0);
648 ASSERT(fsszone->fssz_shares == 0);
649 ASSERT(fsszone->fssz_runnable == 0);
651 if (fsszone->fssz_next != fsszone) {
653 * This is not the last zone in the list.
655 fsszone->fssz_prev->fssz_next = fsszone->fssz_next;
656 fsszone->fssz_next->fssz_prev = fsszone->fssz_prev;
657 if (fsspset->fssps_zones == fsszone)
658 fsspset->fssps_zones = fsszone->fssz_next;
659 } else {
661 * This was the last zone active in this cpu partition.
663 fsspset->fssps_zones = NULL;
668 * The following routine returns a pointer to the fssproj structure
669 * which belongs to project kpj and cpu partition fsspset, if such structure
670 * exists.
672 static fssproj_t *
673 fss_find_fssproj(fsspset_t *fsspset, kproject_t *kpj)
675 fssproj_t *fssproj;
677 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
679 if (fsspset->fssps_list != NULL) {
681 * There are projects running on this cpu partition already.
682 * Try to find our project among them.
684 fssproj = fsspset->fssps_list;
685 do {
686 if (fssproj->fssp_proj == kpj) {
687 ASSERT(fssproj->fssp_pset == fsspset);
688 return (fssproj);
690 fssproj = fssproj->fssp_next;
691 } while (fssproj != fsspset->fssps_list);
693 return (NULL);
697 * The following routine links a new fssproj structure into the doubly
698 * linked list of projects running on the specified cpu partition.
700 static void
701 fss_insert_fssproj(fsspset_t *fsspset, kproject_t *kpj, fsszone_t *fsszone,
702 fssproj_t *fssproj)
704 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
706 fssproj->fssp_pset = fsspset;
707 fssproj->fssp_proj = kpj;
708 fssproj->fssp_shares = kpj->kpj_shares;
710 fsspset->fssps_nproj++;
712 if (fsspset->fssps_list == NULL) {
714 * This will be the first fssproj for this fsspset
716 fssproj->fssp_next = fssproj->fssp_prev = fssproj;
717 fsspset->fssps_list = fssproj;
718 } else {
720 * Insert this fssproj to the doubly linked list.
722 fssproj_t *fssp_head = fsspset->fssps_list;
724 fssproj->fssp_next = fssp_head;
725 fssproj->fssp_prev = fssp_head->fssp_prev;
726 fssp_head->fssp_prev->fssp_next = fssproj;
727 fssp_head->fssp_prev = fssproj;
728 fsspset->fssps_list = fssproj;
730 fssproj->fssp_fsszone = fsszone;
731 fsszone->fssz_nproj++;
732 ASSERT(fsszone->fssz_nproj != 0);
736 * The following routine removes a single fssproj structure from the doubly
737 * linked list of projects running on the specified cpu partition. Note that
738 * global fsspsets_lock must be held in case this fssproj structure is the
739 * last on the above mentioned list. Also note that the fssproj structure is
740 * not freed here, it is the responsibility of the caller to call kmem_free
741 * for it.
743 static void
744 fss_remove_fssproj(fsspset_t *fsspset, fssproj_t *fssproj)
746 fsszone_t *fsszone;
748 ASSERT(MUTEX_HELD(&fsspsets_lock));
749 ASSERT(MUTEX_HELD(&fsspset->fssps_lock));
750 ASSERT(fssproj->fssp_runnable == 0);
752 fsspset->fssps_nproj--;
754 fsszone = fssproj->fssp_fsszone;
755 fsszone->fssz_nproj--;
757 if (fssproj->fssp_next != fssproj) {
759 * This is not the last project in the list.
761 fssproj->fssp_prev->fssp_next = fssproj->fssp_next;
762 fssproj->fssp_next->fssp_prev = fssproj->fssp_prev;
763 if (fsspset->fssps_list == fssproj)
764 fsspset->fssps_list = fssproj->fssp_next;
765 if (fsszone->fssz_nproj == 0)
766 fss_remove_fsszone(fsspset, fsszone);
767 } else {
769 * This was the last project running
770 * on this cpu partition.
772 fsspset->fssps_list = NULL;
773 ASSERT(fsspset->fssps_nproj == 0);
774 ASSERT(fsszone->fssz_nproj == 0);
775 fss_remove_fsszone(fsspset, fsszone);
776 fss_del_fsspset(fsspset);
780 static void
781 fss_inactive(kthread_t *t)
783 fssproc_t *fssproc;
784 fssproj_t *fssproj;
785 fsspset_t *fsspset;
786 fsszone_t *fsszone;
788 ASSERT(THREAD_LOCK_HELD(t));
789 fssproc = FSSPROC(t);
790 fssproj = FSSPROC2FSSPROJ(fssproc);
791 if (fssproj == NULL) /* if this thread already exited */
792 return;
793 fsspset = FSSPROJ2FSSPSET(fssproj);
794 fsszone = fssproj->fssp_fsszone;
795 disp_lock_enter_high(&fsspset->fssps_displock);
796 ASSERT(fssproj->fssp_runnable > 0);
797 if (--fssproj->fssp_runnable == 0) {
798 fsszone->fssz_shares -= fssproj->fssp_shares;
799 if (--fsszone->fssz_runnable == 0)
800 fsspset->fssps_shares -= fsszone->fssz_rshares;
802 ASSERT(fssproc->fss_runnable == 1);
803 fssproc->fss_runnable = 0;
804 disp_lock_exit_high(&fsspset->fssps_displock);
807 static void
808 fss_active(kthread_t *t)
810 fssproc_t *fssproc;
811 fssproj_t *fssproj;
812 fsspset_t *fsspset;
813 fsszone_t *fsszone;
815 ASSERT(THREAD_LOCK_HELD(t));
816 fssproc = FSSPROC(t);
817 fssproj = FSSPROC2FSSPROJ(fssproc);
818 if (fssproj == NULL) /* if this thread already exited */
819 return;
820 fsspset = FSSPROJ2FSSPSET(fssproj);
821 fsszone = fssproj->fssp_fsszone;
822 disp_lock_enter_high(&fsspset->fssps_displock);
823 if (++fssproj->fssp_runnable == 1) {
824 fsszone->fssz_shares += fssproj->fssp_shares;
825 if (++fsszone->fssz_runnable == 1)
826 fsspset->fssps_shares += fsszone->fssz_rshares;
828 ASSERT(fssproc->fss_runnable == 0);
829 fssproc->fss_runnable = 1;
830 disp_lock_exit_high(&fsspset->fssps_displock);
834 * Fair share scheduler initialization. Called by dispinit() at boot time.
835 * We can ignore clparmsz argument since we know that the smallest possible
836 * parameter buffer is big enough for us.
838 /*ARGSUSED*/
839 static pri_t
840 fss_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
842 int i;
844 ASSERT(MUTEX_HELD(&cpu_lock));
846 fss_cid = cid;
847 fss_maxumdpri = minclsyspri - 1;
848 fss_maxglobpri = minclsyspri;
849 fss_minglobpri = 0;
850 fsspsets = kmem_zalloc(sizeof (fsspset_t) * max_ncpus, KM_SLEEP);
853 * Initialize the fssproc hash table.
855 for (i = 0; i < FSS_LISTS; i++)
856 fss_listhead[i].fss_next = fss_listhead[i].fss_prev =
857 &fss_listhead[i];
859 *clfuncspp = &fss_classfuncs;
862 * Fill in fss_nice_tick and fss_nice_decay arrays:
863 * The cost of a tick is lower at positive nice values (so that it
864 * will not increase its project's usage as much as normal) with 50%
865 * drop at the maximum level and 50% increase at the minimum level.
866 * The fsspri decay is slower at positive nice values. fsspri values
867 * of processes with negative nice levels must decay faster to receive
868 * time slices more frequently than normal.
870 for (i = 0; i < FSS_NICE_RANGE; i++) {
871 fss_nice_tick[i] = (FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2)
872 - i)) / FSS_NICE_RANGE;
873 fss_nice_decay[i] = FSS_DECAY_MIN +
874 ((FSS_DECAY_MAX - FSS_DECAY_MIN) * i) /
875 (FSS_NICE_RANGE - 1);
878 return (fss_maxglobpri);
882 * Calculate the new fss_umdpri based on the usage, the normalized share usage
883 * and the number of active threads. Reset the tick counter for this thread.
885 * When calculating the new priority using the standard formula we can hit
886 * a scenario where we don't have good round-robin behavior. This would be
887 * most commonly seen when there is a zone with lots of runnable threads.
888 * In the bad scenario we will see the following behavior when using the
889 * standard formula and these conditions:
891 * - there are multiple runnable threads in the zone (project)
892 * - the fssps_maxfsspri is a very large value
893 * - (we also know all of these threads will use the project's
894 * fssp_shusage)
896 * Under these conditions, a thread with a low fss_fsspri value is chosen
897 * to run and the thread gets a high fss_umdpri. This thread can run for
898 * its full quantum (fss_timeleft), at which time fss_newpri is called to
899 * calculate the thread's new priority.
901 * In this case, because the newly calculated fsspri value is much smaller
902 * (orders of magnitude) than the fssps_maxfsspri value, if we used the
903 * standard formula the thread will still get a high fss_umdpri value and
904 * will run again for another quanta, even though there are other runnable
905 * threads in the project.
907 * For a thread that is runnable for a long time, the thread can continue
908 * to run for many quanta (totaling many seconds) before the thread's fsspri
909 * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back
910 * down to 1. This behavior also keeps the fssps_maxfsspri at a high value,
911 * so that the next runnable thread might repeat this cycle.
913 * This leads to the case where we don't have round-robin behavior at quanta
914 * granularity, but instead, runnable threads within the project only run
915 * at several second intervals.
917 * To prevent this scenario from occurring, when a thread has consumed its
918 * quantum and there are multiple runnable threads in the project, we
919 * immediately cause the thread to hit fssps_maxfsspri so that it gets
920 * reset back to 1 and another runnable thread in the project can run.
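/*
 * For example (illustrative numbers): if fssps_maxfsspri has grown to
 * one trillion and a thread's freshly recomputed fsspri is ten million,
 * then invpri = (10,000,000 * 58) / 1,000,000,000,000 == 0, so the thread
 * keeps the maximum fss_umdpri of 59 and immediately runs again; the
 * quanta_up special case breaks exactly this cycle.
 */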
922 static void
923 fss_newpri(fssproc_t *fssproc, bool quanta_up)
925 kthread_t *tp;
926 fssproj_t *fssproj;
927 fsspset_t *fsspset;
928 fsszone_t *fsszone;
929 fsspri_t fsspri, maxfsspri;
930 uint32_t n_runnable;
931 pri_t invpri;
932 uint32_t ticks;
934 tp = fssproc->fss_tp;
935 ASSERT(tp != NULL);
937 if (tp->t_cid != fss_cid)
938 return;
940 ASSERT(THREAD_LOCK_HELD(tp));
942 fssproj = FSSPROC2FSSPROJ(fssproc);
943 if (fssproj == NULL)
944 * No need to change priority of exited threads.
946 return;
948 fsszone = FSSPROJ2FSSZONE(fssproj);
950 fsspset = FSSPROJ2FSSPSET(fssproj);
951 disp_lock_enter_high(&fsspset->fssps_displock);
953 ticks = fssproc->fss_ticks;
954 fssproc->fss_ticks = 0;
956 if (fssproj->fssp_shares == 0 || fsszone->fssz_rshares == 0) {
958 * Special case: threads with no shares.
960 fssproc->fss_umdpri = fss_minglobpri;
961 disp_lock_exit_high(&fsspset->fssps_displock);
962 return;
965 maxfsspri = fsspset->fssps_maxfsspri;
966 n_runnable = fssproj->fssp_runnable;
968 if (quanta_up && n_runnable > 1) {
969 fsspri = maxfsspri;
970 } else {
972 * fsspri += fssp_shusage * nrunnable * ticks
973 * If all three values are non-0, this typically calculates to
974 * a large number (sometimes > 1M, sometimes > 100B) due to
975 * fssp_shusage which can be > 1T.
977 fsspri = fssproc->fss_fsspri;
978 fsspri += fssproj->fssp_shusage * n_runnable * ticks;
981 fssproc->fss_fsspri = fsspri;
984 * fss_maxumdpri is normally 59, since FSS priorities are 0-59.
985 * If the previous calculation resulted in 0 (e.g. was 0 and added 0
986 * because ticks == 0), then instead of 0, we use the largest priority,
987 * which is still small in comparison to the large numbers we typically
988 * see.
990 if (fsspri < fss_maxumdpri)
991 fsspri = fss_maxumdpri; /* so that maxfsspri is != 0 */
994 * The general priority formula:
996 * (fsspri * umdprirange)
997 * pri = maxumdpri - ------------------------
998 * maxfsspri
1000 * If this thread's fsspri is greater than the previous largest
1001 * fsspri, then record it as the new high and priority for this
1002 * thread will be one (the lowest priority assigned to a thread
1003 * that has non-zero shares). Because of this check, maxfsspri can
1004 * change as this function is called via the
1005 * fss_update -> fss_update_list -> fss_newpri code path to update
1006 * all runnable threads. See the code in fss_update for how we
1007 * mitigate this issue.
1009 * Note that this formula cannot produce out of bounds priority
1010 * values (0-59); if it is changed, additional checks may need to be
1011 * added.
1013 if (fsspri >= maxfsspri) {
1014 fsspset->fssps_maxfsspri = fsspri;
1015 disp_lock_exit_high(&fsspset->fssps_displock);
1016 fssproc->fss_umdpri = 1;
1017 } else {
1018 disp_lock_exit_high(&fsspset->fssps_displock);
1019 invpri = (fsspri * (fss_maxumdpri - 1)) / maxfsspri;
1020 fssproc->fss_umdpri = fss_maxumdpri - invpri;
1025 * Decays usages of all running projects, resets their tick counters and
1026 * calculates the projects' normalized share usage. Called once per second from
1027 * fss_update().
1029 static void
1030 fss_decay_usage()
1032 uint32_t zone_ext_shares, zone_int_shares;
1033 uint32_t kpj_shares, pset_shares;
1034 fsspset_t *fsspset;
1035 fssproj_t *fssproj;
1036 fsszone_t *fsszone;
1037 fsspri_t maxfsspri;
1038 int psetid;
1039 struct zone *zp;
1041 mutex_enter(&fsspsets_lock);
1043 * Go through all active processor sets and decay usages of projects
1044 * running on them.
1046 for (psetid = 0; psetid < max_ncpus; psetid++) {
1047 fsspset = &fsspsets[psetid];
1048 mutex_enter(&fsspset->fssps_lock);
1050 fsspset->fssps_gen++;
1052 if (fsspset->fssps_cpupart == NULL ||
1053 (fssproj = fsspset->fssps_list) == NULL) {
1054 mutex_exit(&fsspset->fssps_lock);
1055 continue;
1059 * Decay maxfsspri for this cpu partition with the
1060 * fastest possible decay rate.
1062 disp_lock_enter(&fsspset->fssps_displock);
1064 pset_shares = fsspset->fssps_shares;
1066 maxfsspri = (fsspset->fssps_maxfsspri *
1067 fss_nice_decay[NZERO]) / FSS_DECAY_BASE;
1068 if (maxfsspri < fss_maxumdpri)
1069 maxfsspri = fss_maxumdpri;
1070 fsspset->fssps_maxfsspri = maxfsspri;
1072 do {
1073 fsszone = fssproj->fssp_fsszone;
1074 zp = fsszone->fssz_zone;
1077 * Reset zone's FSS stats if they are from a
1078 * previous cycle.
1080 if (fsspset->fssps_gen != zp->zone_fss_gen) {
1081 zp->zone_fss_gen = fsspset->fssps_gen;
1082 zp->zone_run_ticks = 0;
1086 * Decay project usage, then add in this cycle's
1087 * nice tick value.
1089 fssproj->fssp_usage =
1090 (fssproj->fssp_usage * FSS_DECAY_USG) /
1091 FSS_DECAY_BASE +
1092 fssproj->fssp_ticks;
1094 fssproj->fssp_ticks = 0;
1095 zp->zone_run_ticks += fssproj->fssp_tick_cnt;
1096 fssproj->fssp_tick_cnt = 0;
1099 * Readjust the project's number of shares if it has
1100 * changed since we checked it last time.
1102 kpj_shares = fssproj->fssp_proj->kpj_shares;
1103 if (fssproj->fssp_shares != kpj_shares) {
1104 if (fssproj->fssp_runnable != 0) {
1105 fsszone->fssz_shares -=
1106 fssproj->fssp_shares;
1107 fsszone->fssz_shares += kpj_shares;
1109 fssproj->fssp_shares = kpj_shares;
1113 * Readjust the zone's number of shares if it
1114 * has changed since we checked it last time.
1116 zone_ext_shares = zp->zone_shares;
1117 if (fsszone->fssz_rshares != zone_ext_shares) {
1118 if (fsszone->fssz_runnable != 0) {
1119 fsspset->fssps_shares -=
1120 fsszone->fssz_rshares;
1121 fsspset->fssps_shares +=
1122 zone_ext_shares;
1123 pset_shares = fsspset->fssps_shares;
1125 fsszone->fssz_rshares = zone_ext_shares;
1127 zone_int_shares = fsszone->fssz_shares;
1130 * If anything is runnable in the project, track the
1131 * overall project share percent for monitoring usage.
1133 if (fssproj->fssp_runnable > 0) {
1134 uint32_t zone_shr_pct;
1135 uint32_t int_shr_pct;
1138 * Times 1000 to get tenths of a percent
1140 * zone_ext_shares
1141 * zone_shr_pct = ---------------
1142 * pset_shares
1144 * kpj_shares
1145 * int_shr_pct = ---------------
1146 * zone_int_shares
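 *
 * For example, with zone_ext_shares 10 and pset_shares 20,
 * zone_shr_pct = 500 (50.0%); with kpj_shares 1 and zone_int_shares 2,
 * int_shr_pct = 500, so fssp_shr_pct = 250, i.e. the project is
 * entitled to 25.0% of the pset.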
1148 if (pset_shares == 0 || zone_int_shares == 0) {
1149 fssproj->fssp_shr_pct = 0;
1150 } else {
1151 zone_shr_pct =
1152 (zone_ext_shares * 1000) /
1153 pset_shares;
1154 int_shr_pct = (kpj_shares * 1000) /
1155 zone_int_shares;
1156 fssproj->fssp_shr_pct =
1157 (zone_shr_pct * int_shr_pct) /
1158 1000;
1160 } else {
1161 DTRACE_PROBE1(fss__prj__norun, fssproj_t *,
1162 fssproj);
1166 * Calculate fssp_shusage value to be used
1167 * for fsspri increments for the next second.
1169 if (kpj_shares == 0 || zone_ext_shares == 0) {
1170 fssproj->fssp_shusage = 0;
1171 } else if (FSSPROJ2KPROJ(fssproj) == proj0p) {
1172 uint32_t zone_shr_pct;
1175 * Project 0 in the global zone has 50%
1176 * of its zone. See calculation above for
1177 * the zone's share percent.
1179 if (pset_shares == 0)
1180 zone_shr_pct = 1000;
1181 else
1182 zone_shr_pct =
1183 (zone_ext_shares * 1000) /
1184 pset_shares;
1186 fssproj->fssp_shr_pct = zone_shr_pct / 2;
1188 fssproj->fssp_shusage = (fssproj->fssp_usage *
1189 zone_int_shares * zone_int_shares) /
1190 (zone_ext_shares * zone_ext_shares);
1191 } else {
1193 * Thread's priority is based on its project's
1194 * normalized usage (shusage) value which gets
1195 * calculated this way:
1197 * pset_shares^2 zone_int_shares^2
1198 * usage * ------------- * ------------------
1199 * kpj_shares^2 zone_ext_shares^2
1201 * Where zone_int_shares is the sum of shares
1202 * of all active projects within the zone (and
1203 * the pset), and zone_ext_shares is the number
1204 * of zone shares (ie, zone.cpu-shares).
1206 * If there is only one zone active on the pset
1207 * the above reduces to:
1209 * zone_int_shares^2
1210 * shusage = usage * ---------------------
1211 * kpj_shares^2
1213 * If there's only one project active in the
1214 * zone this formula reduces to:
1216 * pset_shares^2
1217 * shusage = usage * ----------------------
1218 * zone_ext_shares^2
1220 * shusage is one input to calculating fss_pri
1221 * in fss_newpri(). Larger values tend toward
1222 * lower priorities for processes in the proj.
1224 fssproj->fssp_shusage = fssproj->fssp_usage *
1225 pset_shares * zone_int_shares;
1226 fssproj->fssp_shusage /=
1227 kpj_shares * zone_ext_shares;
1228 fssproj->fssp_shusage *=
1229 pset_shares * zone_int_shares;
1230 fssproj->fssp_shusage /=
1231 kpj_shares * zone_ext_shares;
1233 fssproj = fssproj->fssp_next;
1234 } while (fssproj != fsspset->fssps_list);
1236 disp_lock_exit(&fsspset->fssps_displock);
1237 mutex_exit(&fsspset->fssps_lock);
1239 mutex_exit(&fsspsets_lock);
1242 static void
1243 fss_change_priority(kthread_t *t, fssproc_t *fssproc)
1245 pri_t new_pri;
1247 ASSERT(THREAD_LOCK_HELD(t));
1248 new_pri = fssproc->fss_umdpri;
1249 ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);
1251 t->t_cpri = fssproc->fss_upri;
1252 fssproc->fss_flags &= ~FSSRESTORE;
1253 if (t == curthread || t->t_state == TS_ONPROC) {
1255 * curthread is always onproc
1257 cpu_t *cp = t->t_disp_queue->disp_cpu;
1258 THREAD_CHANGE_PRI(t, new_pri);
1259 if (t == cp->cpu_dispthread)
1260 cp->cpu_dispatch_pri = DISP_PRIO(t);
1261 if (DISP_MUST_SURRENDER(t)) {
1262 fssproc->fss_flags |= FSSBACKQ;
1263 cpu_surrender(t);
1264 } else {
1265 fssproc->fss_timeleft = fss_quantum;
1267 } else {
1269 * When the priority of a thread is changed, it may be
1270 * necessary to adjust its position on a sleep queue or
1271 * dispatch queue. The function thread_change_pri accomplishes
1272 * this.
1274 if (thread_change_pri(t, new_pri, 0)) {
1276 * The thread was on a run queue.
1278 fssproc->fss_timeleft = fss_quantum;
1279 } else {
1280 fssproc->fss_flags |= FSSBACKQ;
1286 * Update priorities of all fair-sharing threads that are currently runnable
1287 * at a user mode priority based on the number of shares and current usage.
1288 * Called once per second via timeout which we reset here.
1290 * There are several lists of fair-sharing threads broken up by a hash on the
1291 * thread pointer. Each list has its own lock. This avoids blocking all
1292 * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs.
1293 * fss_update traverses each list in turn.
1295 * Each time we're run (once/second) we may start at the next list and iterate
1296 * through all of the lists. By starting with a different list, we mitigate any
1297 * effects we would see updating the fssps_maxfsspri value in fss_newpri.
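 *
 * For example, with FSS_LISTS == 16 and fss_update_marker == 5, the lists
 * are visited in the order 5, 6, ..., 15, 0, ..., 4, and the marker then
 * advances to the first visited list on which a thread priority was
 * actually updated.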
1299 static void
1300 fss_update(void *arg)
1302 int i;
1303 int new_marker = -1;
1304 static int fss_update_marker;
1307 * Decay and update usages for all projects.
1309 fss_decay_usage();
1312 * Start with the fss_update_marker list, then do the rest.
1314 i = fss_update_marker;
1317 * Go around all threads, set new priorities and decay
1318 * per-thread CPU usages.
1320 do {
1322 * If this is the first list after the current marker to have
1323 * threads with priority updates, advance the marker to this
1324 * list for the next time fss_update runs.
1326 if (fss_update_list(i) &&
1327 new_marker == -1 && i != fss_update_marker)
1328 new_marker = i;
1329 } while ((i = FSS_LIST_NEXT(i)) != fss_update_marker);
1332 * Advance marker for the next fss_update call
1334 if (new_marker != -1)
1335 fss_update_marker = new_marker;
1337 (void) timeout(fss_update, arg, hz);
1341 * Updates priority for a list of threads. Returns 1 if the priority of one
1342 * of the threads was actually updated, 0 if none were for various reasons
1343 * (thread is no longer in the FSS class, is not runnable, has the preemption
1344 * control no-preempt bit set, etc.)
1346 static int
1347 fss_update_list(int i)
1349 fssproc_t *fssproc;
1350 fssproj_t *fssproj;
1351 fsspri_t fsspri;
1352 pri_t fss_umdpri;
1353 kthread_t *t;
1354 int updated = 0;
1356 mutex_enter(&fss_listlock[i]);
1357 for (fssproc = fss_listhead[i].fss_next; fssproc != &fss_listhead[i];
1358 fssproc = fssproc->fss_next) {
1359 t = fssproc->fss_tp;
1361 * Lock the thread and verify the state.
1363 thread_lock(t);
1365 * Skip the thread if it is no longer in the FSS class or
1366 * is running with kernel mode priority.
1368 if (t->t_cid != fss_cid)
1369 goto next;
1370 if ((fssproc->fss_flags & FSSKPRI) != 0)
1371 goto next;
1373 fssproj = FSSPROC2FSSPROJ(fssproc);
1374 if (fssproj == NULL)
1375 goto next;
1377 if (fssproj->fssp_shares != 0) {
1379 * Decay fsspri value.
1381 fsspri = fssproc->fss_fsspri;
1382 fsspri = (fsspri * fss_nice_decay[fssproc->fss_nice]) /
1383 FSS_DECAY_BASE;
1384 fssproc->fss_fsspri = fsspri;
1387 if (t->t_schedctl && schedctl_get_nopreempt(t))
1388 goto next;
1389 if (t->t_state != TS_RUN && t->t_state != TS_WAIT) {
1391 * Make next syscall/trap call fss_trapret
1393 t->t_trapret = 1;
1394 aston(t);
1395 if (t->t_state == TS_ONPROC)
1396 DTRACE_PROBE1(fss__onproc, fssproc_t *,
1397 fssproc);
1398 goto next;
1400 fss_newpri(fssproc, false);
1401 updated = 1;
1403 fss_umdpri = fssproc->fss_umdpri;
1406 * Only dequeue the thread if it needs to be moved; otherwise
1407 * it should just round-robin here.
1409 if (t->t_pri != fss_umdpri)
1410 fss_change_priority(t, fssproc);
1411 next:
1412 thread_unlock(t);
1414 mutex_exit(&fss_listlock[i]);
1415 return (updated);
1418 /*ARGSUSED*/
1419 static int
1420 fss_admin(caddr_t uaddr, cred_t *reqpcredp)
1422 fssadmin_t fssadmin;
1424 if (copyin(uaddr, &fssadmin, sizeof (fssadmin_t)))
1425 return (EFAULT);
1427 switch (fssadmin.fss_cmd) {
1428 case FSS_SETADMIN:
1429 if (secpolicy_dispadm(reqpcredp) != 0)
1430 return (EPERM);
1431 if (fssadmin.fss_quantum <= 0 || fssadmin.fss_quantum >= hz)
1432 return (EINVAL);
1433 fss_quantum = fssadmin.fss_quantum;
1434 break;
1435 case FSS_GETADMIN:
1436 fssadmin.fss_quantum = fss_quantum;
1437 if (copyout(&fssadmin, uaddr, sizeof (fssadmin_t)))
1438 return (EFAULT);
1439 break;
1440 default:
1441 return (EINVAL);
1443 return (0);
1446 static int
1447 fss_getclinfo(void *infop)
1449 fssinfo_t *fssinfo = (fssinfo_t *)infop;
1450 fssinfo->fss_maxupri = fss_maxupri;
1451 return (0);
1454 static int
1455 fss_parmsin(void *parmsp)
1457 fssparms_t *fssparmsp = (fssparms_t *)parmsp;
1460 * Check validity of parameters.
1462 if ((fssparmsp->fss_uprilim > fss_maxupri ||
1463 fssparmsp->fss_uprilim < -fss_maxupri) &&
1464 fssparmsp->fss_uprilim != FSS_NOCHANGE)
1465 return (EINVAL);
1467 if ((fssparmsp->fss_upri > fss_maxupri ||
1468 fssparmsp->fss_upri < -fss_maxupri) &&
1469 fssparmsp->fss_upri != FSS_NOCHANGE)
1470 return (EINVAL);
1472 return (0);
1475 /*ARGSUSED*/
1476 static int
1477 fss_parmsout(void *parmsp, pc_vaparms_t *vaparmsp)
1479 return (0);
1482 static int
1483 fss_vaparmsin(void *parmsp, pc_vaparms_t *vaparmsp)
1485 fssparms_t *fssparmsp = (fssparms_t *)parmsp;
1486 int priflag = 0;
1487 int limflag = 0;
1488 uint_t cnt;
1489 pc_vaparm_t *vpp = &vaparmsp->pc_parms[0];
1492 * FSS_NOCHANGE (-32768) is outside of the range of values for
1493 * fss_uprilim and fss_upri. If the structure fssparms_t is changed,
1494 * FSS_NOCHANGE should be replaced by a flag word.
1496 fssparmsp->fss_uprilim = FSS_NOCHANGE;
1497 fssparmsp->fss_upri = FSS_NOCHANGE;
1500 * Get the varargs parameter and check validity of parameters.
1502 if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
1503 return (EINVAL);
1505 for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
1506 switch (vpp->pc_key) {
1507 case FSS_KY_UPRILIM:
1508 if (limflag++)
1509 return (EINVAL);
1510 fssparmsp->fss_uprilim = (pri_t)vpp->pc_parm;
1511 if (fssparmsp->fss_uprilim > fss_maxupri ||
1512 fssparmsp->fss_uprilim < -fss_maxupri)
1513 return (EINVAL);
1514 break;
1515 case FSS_KY_UPRI:
1516 if (priflag++)
1517 return (EINVAL);
1518 fssparmsp->fss_upri = (pri_t)vpp->pc_parm;
1519 if (fssparmsp->fss_upri > fss_maxupri ||
1520 fssparmsp->fss_upri < -fss_maxupri)
1521 return (EINVAL);
1522 break;
1523 default:
1524 return (EINVAL);
1528 if (vaparmsp->pc_vaparmscnt == 0) {
1530 * Use default parameters.
1532 fssparmsp->fss_upri = fssparmsp->fss_uprilim = 0;
1535 return (0);
1539 * Copy all selected fair-sharing class parameters to the user. The parameters
1540 * are specified by a key.
1542 static int
1543 fss_vaparmsout(void *parmsp, pc_vaparms_t *vaparmsp)
1545 fssparms_t *fssparmsp = (fssparms_t *)parmsp;
1546 int priflag = 0;
1547 int limflag = 0;
1548 uint_t cnt;
1549 pc_vaparm_t *vpp = &vaparmsp->pc_parms[0];
1551 ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
1553 if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
1554 return (EINVAL);
1556 for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
1557 switch (vpp->pc_key) {
1558 case FSS_KY_UPRILIM:
1559 if (limflag++)
1560 return (EINVAL);
1561 if (copyout(&fssparmsp->fss_uprilim,
1562 (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
1563 return (EFAULT);
1564 break;
1565 case FSS_KY_UPRI:
1566 if (priflag++)
1567 return (EINVAL);
1568 if (copyout(&fssparmsp->fss_upri,
1569 (caddr_t)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
1570 return (EFAULT);
1571 break;
1572 default:
1573 return (EINVAL);
1577 return (0);
1581 * Return the user mode scheduling priority range.
1583 static int
1584 fss_getclpri(pcpri_t *pcprip)
1586 pcprip->pc_clpmax = fss_maxupri;
1587 pcprip->pc_clpmin = -fss_maxupri;
1588 return (0);
1591 static int
1592 fss_alloc(void **p, int flag)
1594 void *bufp;
1596 if ((bufp = kmem_zalloc(sizeof (fssproc_t), flag)) == NULL) {
1597 return (ENOMEM);
1598 } else {
1599 *p = bufp;
1600 return (0);
1604 static void
1605 fss_free(void *bufp)
1607 if (bufp)
1608 kmem_free(bufp, sizeof (fssproc_t));
1612 * Thread functions
1614 static int
1615 fss_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
1616 void *bufp)
1618 fssparms_t *fssparmsp = (fssparms_t *)parmsp;
1619 fssproc_t *fssproc;
1620 pri_t reqfssuprilim;
1621 pri_t reqfssupri;
1622 static uint32_t fssexists = 0;
1623 fsspset_t *fsspset;
1624 fssproj_t *fssproj;
1625 fsszone_t *fsszone;
1626 kproject_t *kpj;
1627 zone_t *zone;
1628 int fsszone_allocated = 0;
1630 fssproc = (fssproc_t *)bufp;
1631 ASSERT(fssproc != NULL);
1633 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
1636 * Only root can move threads to FSS class.
1638 if (reqpcredp != NULL && secpolicy_setpriority(reqpcredp) != 0)
1639 return (EPERM);
1641 * Initialize the fssproc structure.
1643 fssproc->fss_umdpri = fss_maxumdpri / 2;
1645 if (fssparmsp == NULL) {
1647 * Use default values.
1649 fssproc->fss_nice = NZERO;
1650 fssproc->fss_uprilim = fssproc->fss_upri = 0;
1651 } else {
1653 * Use supplied values.
1655 if (fssparmsp->fss_uprilim == FSS_NOCHANGE) {
1656 reqfssuprilim = 0;
1657 } else {
1658 if (fssparmsp->fss_uprilim > 0 &&
1659 secpolicy_setpriority(reqpcredp) != 0)
1660 return (EPERM);
1661 reqfssuprilim = fssparmsp->fss_uprilim;
1663 if (fssparmsp->fss_upri == FSS_NOCHANGE) {
1664 reqfssupri = reqfssuprilim;
1665 } else {
1666 if (fssparmsp->fss_upri > 0 &&
1667 secpolicy_setpriority(reqpcredp) != 0)
1668 return (EPERM);
1670 * Set the user priority to the requested value or
1671 * the upri limit, whichever is lower.
1673 reqfssupri = fssparmsp->fss_upri;
1674 if (reqfssupri > reqfssuprilim)
1675 reqfssupri = reqfssuprilim;
1677 fssproc->fss_uprilim = reqfssuprilim;
1678 fssproc->fss_upri = reqfssupri;
1679 fssproc->fss_nice = NZERO - (NZERO * reqfssupri) / fss_maxupri;
1680 if (fssproc->fss_nice > FSS_NICE_MAX)
1681 fssproc->fss_nice = FSS_NICE_MAX;
1684 fssproc->fss_timeleft = fss_quantum;
1685 fssproc->fss_tp = t;
1686 cpucaps_sc_init(&fssproc->fss_caps);
1689 * Put a lock on our fsspset structure.
1691 mutex_enter(&fsspsets_lock);
1692 fsspset = fss_find_fsspset(t->t_cpupart);
1693 mutex_enter(&fsspset->fssps_lock);
1694 mutex_exit(&fsspsets_lock);
1696 zone = ttoproc(t)->p_zone;
1697 if ((fsszone = fss_find_fsszone(fsspset, zone)) == NULL) {
1698 if ((fsszone = kmem_zalloc(sizeof (fsszone_t), KM_NOSLEEP))
1699 == NULL) {
1700 mutex_exit(&fsspset->fssps_lock);
1701 return (ENOMEM);
1702 } else {
1703 fsszone_allocated = 1;
1704 fss_insert_fsszone(fsspset, zone, fsszone);
1707 kpj = ttoproj(t);
1708 if ((fssproj = fss_find_fssproj(fsspset, kpj)) == NULL) {
1709 if ((fssproj = kmem_zalloc(sizeof (fssproj_t), KM_NOSLEEP))
1710 == NULL) {
1711 if (fsszone_allocated) {
1712 fss_remove_fsszone(fsspset, fsszone);
1713 kmem_free(fsszone, sizeof (fsszone_t));
1715 mutex_exit(&fsspset->fssps_lock);
1716 return (ENOMEM);
1717 } else {
1718 fss_insert_fssproj(fsspset, kpj, fsszone, fssproj);
1721 fssproj->fssp_threads++;
1722 fssproc->fss_proj = fssproj;
1725 * Reset priority. Process goes to a "user mode" priority here
1726 * regardless of whether or not it has slept since entering the kernel.
1728 thread_lock(t);
1729 t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
1730 t->t_cid = cid;
1731 t->t_cldata = (void *)fssproc;
1732 t->t_schedflag |= TS_RUNQMATCH;
1733 fss_change_priority(t, fssproc);
1734 if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
1735 t->t_state == TS_WAIT)
1736 fss_active(t);
1737 thread_unlock(t);
1739 mutex_exit(&fsspset->fssps_lock);
1742 * Link new structure into fssproc list.
1744 FSS_LIST_INSERT(fssproc);
1747 * If this is the first fair-sharing thread to occur since boot,
1748 * we set up the initial call to fss_update() here. Use an atomic
1749 * compare-and-swap since that's easier and faster than a mutex
1750 * (but check with an ordinary load first since most of the time
1751 * this will already be done).
1753 if (fssexists == 0 && atomic_cas_32(&fssexists, 0, 1) == 0)
1754 (void) timeout(fss_update, NULL, hz);
1756 return (0);
1760 * Remove fssproc_t from the list.
1762 static void
1763 fss_exitclass(void *procp)
1765 fssproc_t *fssproc = (fssproc_t *)procp;
1766 fssproj_t *fssproj;
1767 fsspset_t *fsspset;
1768 fsszone_t *fsszone;
1769 kthread_t *t = fssproc->fss_tp;
1772 * We should be either getting this thread off the deathrow or
1773 * this thread has already moved to another scheduling class and
1774 * we're being called with its old cldata buffer pointer. In both
1775 * cases, the content of this buffer can not be changed while we're
1776 * here.
1778 mutex_enter(&fsspsets_lock);
1779 thread_lock(t);
1780 if (t->t_cid != fss_cid) {
1782 * We're being called as a result of the priocntl() system
1783 * call -- someone is trying to move our thread to another
1784 * scheduling class. We can't call fss_inactive() here
1785 * because our thread's t_cldata pointer already points
1786 * to another scheduling class specific data.
1788 ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
1790 fssproj = FSSPROC2FSSPROJ(fssproc);
1791 fsspset = FSSPROJ2FSSPSET(fssproj);
1792 fsszone = fssproj->fssp_fsszone;
1794 if (fssproc->fss_runnable) {
1795 disp_lock_enter_high(&fsspset->fssps_displock);
1796 if (--fssproj->fssp_runnable == 0) {
1797 fsszone->fssz_shares -= fssproj->fssp_shares;
1798 if (--fsszone->fssz_runnable == 0)
1799 fsspset->fssps_shares -=
1800 fsszone->fssz_rshares;
1802 disp_lock_exit_high(&fsspset->fssps_displock);
1804 thread_unlock(t);
1806 mutex_enter(&fsspset->fssps_lock);
1807 if (--fssproj->fssp_threads == 0) {
1808 fss_remove_fssproj(fsspset, fssproj);
1809 if (fsszone->fssz_nproj == 0)
1810 kmem_free(fsszone, sizeof (fsszone_t));
1811 kmem_free(fssproj, sizeof (fssproj_t));
1813 mutex_exit(&fsspset->fssps_lock);
1815 } else {
1816 ASSERT(t->t_state == TS_FREE);
1818 * We're being called from thread_free() when our thread
1819 * is removed from the deathrow. There is nothing we need
1820 * do here since everything should've been done earlier
1821 * in fss_exit().
1823 thread_unlock(t);
1825 mutex_exit(&fsspsets_lock);
1827 FSS_LIST_DELETE(fssproc);
1828 fss_free(fssproc);
1831 /*ARGSUSED*/
1832 static int
1833 fss_canexit(kthread_t *t, cred_t *credp)
1836 * A thread is allowed to exit FSS only if we have sufficient
1837 * privileges.
1839 if (credp != NULL && secpolicy_setpriority(credp) != 0)
1840 return (EPERM);
1841 else
1842 return (0);
1846 * Initialize fair-share class specific proc structure for a child.
1848 static int
1849 fss_fork(kthread_t *pt, kthread_t *ct, void *bufp)
1851 fssproc_t *pfssproc; /* ptr to parent's fssproc structure */
1852 fssproc_t *cfssproc; /* ptr to child's fssproc structure */
1853 fssproj_t *fssproj;
1854 fsspset_t *fsspset;
1856 ASSERT(MUTEX_HELD(&ttoproc(pt)->p_lock));
1857 ASSERT(ct->t_state == TS_STOPPED);
1859 cfssproc = (fssproc_t *)bufp;
1860 ASSERT(cfssproc != NULL);
1861 bzero(cfssproc, sizeof (fssproc_t));
1863 thread_lock(pt);
1864 pfssproc = FSSPROC(pt);
1865 fssproj = FSSPROC2FSSPROJ(pfssproc);
1866 fsspset = FSSPROJ2FSSPSET(fssproj);
1867 thread_unlock(pt);
1869 mutex_enter(&fsspset->fssps_lock);
1871 * Initialize child's fssproc structure.
1873 thread_lock(pt);
1874 ASSERT(FSSPROJ(pt) == fssproj);
1875 cfssproc->fss_proj = fssproj;
1876 cfssproc->fss_timeleft = fss_quantum;
1877 cfssproc->fss_umdpri = pfssproc->fss_umdpri;
1878 cfssproc->fss_fsspri = 0;
1879 cfssproc->fss_uprilim = pfssproc->fss_uprilim;
1880 cfssproc->fss_upri = pfssproc->fss_upri;
1881 cfssproc->fss_tp = ct;
1882 cfssproc->fss_nice = pfssproc->fss_nice;
1883 cpucaps_sc_init(&cfssproc->fss_caps);
1885 cfssproc->fss_flags =
1886 pfssproc->fss_flags & ~(FSSKPRI | FSSBACKQ | FSSRESTORE);
1887 ct->t_cldata = (void *)cfssproc;
1888 ct->t_schedflag |= TS_RUNQMATCH;
1889 thread_unlock(pt);
1891 fssproj->fssp_threads++;
1892 mutex_exit(&fsspset->fssps_lock);
1895 * Link new structure into fssproc hash table.
1897 FSS_LIST_INSERT(cfssproc);
1898 return (0);
/*
 * The child is placed at the back of the dispatcher queue and the parent gives
 * up the processor so that the child runs first after the fork. This allows a
 * child that execs immediately to break the multiple use of copy-on-write
 * pages with no disk home; the parent gets to steal them back rather than
 * uselessly copying them.
 */
static void
fss_forkret(kthread_t *t, kthread_t *ct)
{
	proc_t *pp = ttoproc(t);
	proc_t *cp = ttoproc(ct);
	fssproc_t *fssproc;

	ASSERT(t == curthread);
	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Grab the child's p_lock before dropping pidlock to ensure the
	 * process does not disappear before we set it running.
	 */
	mutex_enter(&cp->p_lock);
	continuelwps(cp);
	mutex_exit(&cp->p_lock);

	mutex_enter(&pp->p_lock);
	mutex_exit(&pidlock);
	continuelwps(pp);

	thread_lock(t);

	fssproc = FSSPROC(t);
	fss_newpri(fssproc, false);
	fssproc->fss_timeleft = fss_quantum;
	t->t_pri = fssproc->fss_umdpri;
	ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
	fssproc->fss_flags &= ~FSSKPRI;
	THREAD_TRANSITION(t);

	/*
	 * We don't want to call fss_setrun(t) here because it may call
	 * fss_active, which we don't need.
	 */
	fssproc->fss_flags &= ~FSSBACKQ;

	if (t->t_disp_time != ddi_get_lbolt())
		setbackdq(t);
	else
		setfrontdq(t);

	thread_unlock(t);

	/*
	 * Safe to drop p_lock now since it is safe to change
	 * the scheduling class after this point.
	 */
	mutex_exit(&pp->p_lock);

	swtch();
}

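/*
 * Sequence sketch for the child-runs-first policy above: the parent
 * recomputes its own priority, marks itself in transition, requeues
 * itself, and only then calls swtch() to give up the CPU. Since the
 * child was already set running (continuelwps(cp) happens first), the
 * child typically gets the processor and can exec before the parent
 * touches the shared copy-on-write pages again.
 */
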
/*
 * Get the fair-sharing parameters of thread t into the buffer
 * pointed to by parmsp.
 */
static void
fss_parmsget(kthread_t *t, void *parmsp)
{
	fssproc_t *fssproc = FSSPROC(t);
	fssparms_t *fssparmsp = (fssparms_t *)parmsp;

	fssparmsp->fss_uprilim = fssproc->fss_uprilim;
	fssparmsp->fss_upri = fssproc->fss_upri;
}

/*ARGSUSED*/
static int
fss_parmsset(kthread_t *t, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
{
	char nice;
	pri_t reqfssuprilim;
	pri_t reqfssupri;
	fssproc_t *fssproc = FSSPROC(t);
	fssparms_t *fssparmsp = (fssparms_t *)parmsp;

	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));

	if (fssparmsp->fss_uprilim == FSS_NOCHANGE)
		reqfssuprilim = fssproc->fss_uprilim;
	else
		reqfssuprilim = fssparmsp->fss_uprilim;

	if (fssparmsp->fss_upri == FSS_NOCHANGE)
		reqfssupri = fssproc->fss_upri;
	else
		reqfssupri = fssparmsp->fss_upri;

	/*
	 * Make sure the user priority doesn't exceed the upri limit.
	 */
	if (reqfssupri > reqfssuprilim)
		reqfssupri = reqfssuprilim;

	/*
	 * Basic permissions enforced by generic kernel code for all classes
	 * require that a thread attempting to change the scheduling parameters
	 * of a target thread be privileged or have a real or effective UID
	 * matching that of the target thread. We are not called unless these
	 * basic permission checks have already passed. The fair-sharing class
	 * requires in addition that the calling thread be privileged if it
	 * is attempting to raise the upri limit above its current value.
	 * This may have been checked previously but if our caller passed us
	 * a non-NULL credential pointer we assume it hasn't and we check it
	 * here.
	 */
	if ((reqpcredp != NULL) &&
	    (reqfssuprilim > fssproc->fss_uprilim) &&
	    secpolicy_raisepriority(reqpcredp) != 0)
		return (EPERM);

	/*
	 * Set fss_nice to the nice value corresponding to the user priority we
	 * are setting. Note that setting the nice field of the parameter
	 * struct won't affect upri or nice.
	 */
	nice = NZERO - (reqfssupri * NZERO) / fss_maxupri;
	if (nice > FSS_NICE_MAX)
		nice = FSS_NICE_MAX;

	thread_lock(t);

	fssproc->fss_uprilim = reqfssuprilim;
	fssproc->fss_upri = reqfssupri;
	fssproc->fss_nice = nice;
	fss_newpri(fssproc, false);

	if ((fssproc->fss_flags & FSSKPRI) != 0) {
		thread_unlock(t);
		return (0);
	}

	fss_change_priority(t, fssproc);
	thread_unlock(t);
	return (0);
}

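/*
 * Parameter-merge sketch for fss_parmsset(): FSS_NOCHANGE in either
 * field keeps the current value, the requested upri is silently clamped
 * to the (possibly new) uprilim, and only raising the limit itself
 * demands privilege. E.g. a caller with uprilim == 0 asking for
 * upri == 10 and uprilim == FSS_NOCHANGE ends up with upri == 0 and no
 * error.
 */
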
/*
 * The thread is being stopped.
 */
/*ARGSUSED*/
static void
fss_stop(kthread_t *t, int why, int what)
{
	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(t == curthread);

	fss_inactive(t);
}

/*
 * The current thread is exiting; make the necessary adjustments to its
 * project.
 */
static void
fss_exit(kthread_t *t)
{
	fsspset_t *fsspset;
	fssproj_t *fssproj;
	fssproc_t *fssproc;
	fsszone_t *fsszone;
	int free = 0;

	/*
	 * Thread t here is either a current thread (in which case we hold
	 * its process' p_lock), or a thread being destroyed by forklwp_fail(),
	 * in which case we hold pidlock and thread is no longer on the
	 * thread list.
	 */
	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock) || MUTEX_HELD(&pidlock));

	fssproc = FSSPROC(t);
	fssproj = FSSPROC2FSSPROJ(fssproc);
	fsspset = FSSPROJ2FSSPSET(fssproj);
	fsszone = fssproj->fssp_fsszone;

	mutex_enter(&fsspsets_lock);
	mutex_enter(&fsspset->fssps_lock);

	thread_lock(t);
	disp_lock_enter_high(&fsspset->fssps_displock);
	if (t->t_state == TS_ONPROC || t->t_state == TS_RUN) {
		if (--fssproj->fssp_runnable == 0) {
			fsszone->fssz_shares -= fssproj->fssp_shares;
			if (--fsszone->fssz_runnable == 0)
				fsspset->fssps_shares -= fsszone->fssz_rshares;
		}
		ASSERT(fssproc->fss_runnable == 1);
		fssproc->fss_runnable = 0;
	}
	if (--fssproj->fssp_threads == 0) {
		fss_remove_fssproj(fsspset, fssproj);
		free = 1;
	}
	disp_lock_exit_high(&fsspset->fssps_displock);
	fssproc->fss_proj = NULL;	/* mark this thread as already exited */
	thread_unlock(t);

	if (free) {
		if (fsszone->fssz_nproj == 0)
			kmem_free(fsszone, sizeof (fsszone_t));
		kmem_free(fssproj, sizeof (fssproj_t));
	}
	mutex_exit(&fsspset->fssps_lock);
	mutex_exit(&fsspsets_lock);

	/*
	 * A thread could be exiting in between clock ticks, so we need to
	 * calculate how much CPU time it used since it was charged last time.
	 *
	 * CPU caps are not enforced on exiting processes - it is usually
	 * desirable to exit as soon as possible to free resources.
	 */
	if (CPUCAPS_ON()) {
		thread_lock(t);
		fssproc = FSSPROC(t);
		(void) cpucaps_charge(t, &fssproc->fss_caps,
		    CPUCAPS_CHARGE_ONLY);
		thread_unlock(t);
	}
}

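/*
 * The share bookkeeping above parallels what fss_inactive() does when a
 * thread stops being runnable: when a project loses its last runnable
 * thread, the project's shares leave the zone's total, and when a zone
 * loses its last runnable project, the zone's shares leave the pset's
 * total. The last thread of a project additionally tears down the
 * fssproj (and, if it was the zone's last project, the fsszone).
 */
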
static void
fss_nullsys()
{
}

/*
 * If thread is currently at a kernel mode priority (has slept) and is
 * returning to userland we assign it the appropriate user mode priority
 * and time quantum here. If we're lowering the thread's priority below that
 * of other runnable threads then we will set runrun via cpu_surrender() to
 * cause preemption.
 */
static void
fss_trapret(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);
	cpu_t *cp = CPU;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(t == curthread);
	ASSERT(cp->cpu_dispthread == t);
	ASSERT(t->t_state == TS_ONPROC);

	t->t_kpri_req = 0;
	if (fssproc->fss_flags & FSSKPRI) {
		/*
		 * If the thread has blocked in the kernel, drop it back
		 * to its user mode priority.
		 */
		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
		cp->cpu_dispatch_pri = DISP_PRIO(t);
		ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
		fssproc->fss_flags &= ~FSSKPRI;

		if (DISP_MUST_SURRENDER(t))
			cpu_surrender(t);
	}
}

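/*
 * FSSKPRI lifecycle in brief: fss_sleep(), fss_wakeup() and
 * fss_preempt() raise a thread to minclsyspri and set FSSKPRI when
 * t_kpri_req asks for a kernel priority; fss_trapret() is where that
 * boost is undone on the way back to userland and umdpri takes over
 * again.
 */
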
/*
 * Arrange for thread to be placed in appropriate location on dispatcher queue.
 * This is called with the current thread in TS_ONPROC and locked.
 */
static void
fss_preempt(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);
	klwp_t *lwp;
	uint_t flags;

	ASSERT(t == curthread);
	ASSERT(THREAD_LOCK_HELD(curthread));
	ASSERT(t->t_state == TS_ONPROC);

	/*
	 * If preempted in the kernel, make sure the thread has a kernel
	 * priority if needed.
	 */
	lwp = curthread->t_lwp;
	if (!(fssproc->fss_flags & FSSKPRI) && lwp != NULL && t->t_kpri_req) {
		fssproc->fss_flags |= FSSKPRI;
		THREAD_CHANGE_PRI(t, minclsyspri);
		ASSERT(t->t_pri >= 0 && t->t_pri <= fss_maxglobpri);
		t->t_trapret = 1;	/* so that fss_trapret will run */
		aston(t);
	}

	/*
	 * This thread may be placed on a wait queue by CPU caps. In this case
	 * we do not need to do anything until it is removed from the wait
	 * queue. Do not enforce CPU caps on threads running at a kernel
	 * priority.
	 */
	if (CPUCAPS_ON()) {
		(void) cpucaps_charge(t, &fssproc->fss_caps,
		    CPUCAPS_CHARGE_ENFORCE);

		if (!(fssproc->fss_flags & FSSKPRI) && CPUCAPS_ENFORCE(t))
			return;
	}

	/*
	 * Check to see if we're doing "preemption control" here. If
	 * we are, and if the user has requested that this thread not
	 * be preempted, and if preemptions haven't been put off for
	 * too long, let the preemption happen here but try to make
	 * sure the thread is rescheduled as soon as possible. We do
	 * this by putting it on the front of the highest priority run
	 * queue in the FSS class. If the preemption has been put off
	 * for too long, clear the "nopreempt" bit and let the thread
	 * be preempted.
	 */
	if (t->t_schedctl && schedctl_get_nopreempt(t)) {
		if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
			DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
			if (!(fssproc->fss_flags & FSSKPRI)) {
				/*
				 * If not already remembered, remember current
				 * priority for restoration in fss_yield().
				 */
				if (!(fssproc->fss_flags & FSSRESTORE)) {
					fssproc->fss_scpri = t->t_pri;
					fssproc->fss_flags |= FSSRESTORE;
				}
				THREAD_CHANGE_PRI(t, fss_maxumdpri);
			}
			schedctl_set_yield(t, 1);
			setfrontdq(t);
			return;
		} else {
			if (fssproc->fss_flags & FSSRESTORE) {
				THREAD_CHANGE_PRI(t, fssproc->fss_scpri);
				fssproc->fss_flags &= ~FSSRESTORE;
			}
			schedctl_set_nopreempt(t, 0);
			DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
			/*
			 * Fall through and be preempted below.
			 */
		}
	}

	flags = fssproc->fss_flags & (FSSBACKQ | FSSKPRI);

	if (flags == FSSBACKQ) {
		fssproc->fss_timeleft = fss_quantum;
		fssproc->fss_flags &= ~FSSBACKQ;
		setbackdq(t);
	} else if (flags == (FSSBACKQ | FSSKPRI)) {
		fssproc->fss_flags &= ~FSSBACKQ;
		setbackdq(t);
	} else {
		setfrontdq(t);
	}
}

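/*
 * The final placement above reads as a small decision table:
 *
 *	FSSBACKQ only		back of queue, fresh quantum
 *	FSSBACKQ | FSSKPRI	back of queue, quantum left alone
 *				(kernel-priority threads are not
 *				time-sliced)
 *	neither			front of queue, keeping whatever is
 *				left of the current quantum
 */
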
/*
 * Called when a thread is waking up and is to be placed on the run queue.
 */
static void
fss_setrun(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);

	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */

	if (t->t_state == TS_SLEEP || t->t_state == TS_STOPPED)
		fss_active(t);

	fssproc->fss_timeleft = fss_quantum;

	fssproc->fss_flags &= ~FSSBACKQ;
	/*
	 * If the thread was previously running at the kernel priority then
	 * keep that priority; fss_timeleft doesn't matter in that case.
	 */
	if ((fssproc->fss_flags & FSSKPRI) == 0)
		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);

	if (t->t_disp_time != ddi_get_lbolt())
		setbackdq(t);
	else
		setfrontdq(t);
}

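/*
 * The t_disp_time check above (also used by fss_forkret() and
 * fss_wakeup()) appears to be a small fairness heuristic: a thread
 * whose last dispatch-queue placement happened during the current tick
 * (t_disp_time == lbolt) goes back to the front of its queue rather
 * than losing its turn again, while anything older queues at the back.
 */
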
/*
 * Prepare thread for sleep. We reset the thread priority so it will run at the
 * kernel priority level when it wakes up.
 */
static void
fss_sleep(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);

	ASSERT(t == curthread);
	ASSERT(THREAD_LOCK_HELD(t));

	ASSERT(t->t_state == TS_ONPROC);

	/*
	 * Account for time spent on CPU before going to sleep.
	 */
	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);

	fss_inactive(t);

	/*
	 * Assign a system priority to the thread and arrange for it to be
	 * retained when the thread is next placed on the run queue (i.e.,
	 * when it wakes up) instead of being given a new pri. Also arrange
	 * for trapret processing as the thread leaves the system call so it
	 * will drop back to normal priority range.
	 */
	if (t->t_kpri_req) {
		THREAD_CHANGE_PRI(t, minclsyspri);
		fssproc->fss_flags |= FSSKPRI;
		t->t_trapret = 1;	/* so that fss_trapret will run */
		aston(t);
	} else if (fssproc->fss_flags & FSSKPRI) {
		/*
		 * The thread has done a THREAD_KPRI_REQUEST(), slept, then
		 * done THREAD_KPRI_RELEASE() (so now t_kpri_req is 0 again),
		 * then slept again all without finishing the current system
		 * call, so trapret won't have cleared FSSKPRI.
		 */
		fssproc->fss_flags &= ~FSSKPRI;
		THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
		if (DISP_MUST_SURRENDER(curthread))
			cpu_surrender(t);
	}
}

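/*
 * The else-if branch above covers this interleaving (sketch):
 *
 *	THREAD_KPRI_REQUEST()	-> t_kpri_req set
 *	fss_sleep()		-> FSSKPRI set, pri = minclsyspri
 *	wake up, run in kernel
 *	THREAD_KPRI_RELEASE()	-> t_kpri_req is 0 again
 *	fss_sleep() again	-> FSSKPRI still set from before, so
 *				   drop back to umdpri here
 */
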
/*
 * A tick interrupt has occurred on a running thread. Check to see if our
 * time slice has expired.
 */
static void
fss_tick(kthread_t *t)
{
	fssproc_t *fssproc;
	fssproj_t *fssproj;
	bool call_cpu_surrender = false;
	bool cpucaps_enforce = false;

	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));

	/*
	 * It's safe to access fsspset and fssproj structures because we're
	 * holding our p_lock here.
	 */
	thread_lock(t);
	fssproc = FSSPROC(t);
	fssproj = FSSPROC2FSSPROJ(fssproc);
	if (fssproj != NULL) {
		fsspset_t *fsspset = FSSPROJ2FSSPSET(fssproj);
		disp_lock_enter_high(&fsspset->fssps_displock);
		fssproj->fssp_ticks += fss_nice_tick[fssproc->fss_nice];
		fssproj->fssp_tick_cnt++;
		fssproc->fss_ticks++;
		disp_lock_exit_high(&fsspset->fssps_displock);
	}

	/*
	 * Keep track of thread's project CPU usage. Note that projects
	 * get charged even when threads are running in the kernel.
	 * Do not surrender CPU if running in the SYS class.
	 */
	if (CPUCAPS_ON()) {
		cpucaps_enforce = cpucaps_charge(t,
		    &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE) &&
		    !(fssproc->fss_flags & FSSKPRI);
	}

	/*
	 * A thread's execution time for threads running in the SYS class
	 * is not tracked.
	 */
	if ((fssproc->fss_flags & FSSKPRI) == 0) {
		/*
		 * If thread is not in kernel mode, decrement its
		 * fss_timeleft.
		 */
		if (--fssproc->fss_timeleft <= 0) {
			pri_t new_pri;

			/*
			 * If we're doing preemption control and trying to
			 * avoid preempting this thread, just note that the
			 * thread should yield soon and let it keep running
			 * (unless it's been a while).
			 */
			if (t->t_schedctl && schedctl_get_nopreempt(t)) {
				if (fssproc->fss_timeleft > -SC_MAX_TICKS) {
					DTRACE_SCHED1(schedctl__nopreempt,
					    kthread_t *, t);
					schedctl_set_yield(t, 1);
					thread_unlock_nopreempt(t);
					return;
				}
			}
			fssproc->fss_flags &= ~FSSRESTORE;

			fss_newpri(fssproc, true);
			new_pri = fssproc->fss_umdpri;
			ASSERT(new_pri >= 0 && new_pri <= fss_maxglobpri);

			/*
			 * When the priority of a thread is changed, it may
			 * be necessary to adjust its position on a sleep queue
			 * or dispatch queue. The function thread_change_pri
			 * accomplishes this.
			 */
			if (thread_change_pri(t, new_pri, 0)) {
				fssproc->fss_timeleft = fss_quantum;
			} else {
				call_cpu_surrender = true;
			}
		} else if (t->t_state == TS_ONPROC &&
		    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
			/*
			 * If there is a higher-priority thread which is
			 * waiting for a processor, then thread surrenders
			 * the processor.
			 */
			call_cpu_surrender = true;
		}
	}

	if (cpucaps_enforce && 2 * fssproc->fss_timeleft > fss_quantum) {
		/*
		 * The thread used more than half of its quantum, so assume
		 * that it used the whole quantum.
		 *
		 * Update thread's priority just before putting it on the wait
		 * queue so that it gets charged for the CPU time from its
		 * quantum even before that quantum expires.
		 */
		fss_newpri(fssproc, false);
		if (t->t_pri != fssproc->fss_umdpri)
			fss_change_priority(t, fssproc);

		/*
		 * We need to call cpu_surrender for this thread due to cpucaps
		 * enforcement, but fss_change_priority may have already done
		 * so. In this case FSSBACKQ is set and there is no need to
		 * call cpu_surrender again.
		 */
		if (!(fssproc->fss_flags & FSSBACKQ))
			call_cpu_surrender = true;
	}

	if (call_cpu_surrender) {
		fssproc->fss_flags |= FSSBACKQ;
		cpu_surrender(t);
	}

	thread_unlock_nopreempt(t);	/* clock thread can't be preempted */
}

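/*
 * Per-tick accounting in one place: fssp_ticks is the nice-weighted
 * charge the project accumulates (fss_nice_tick[] scales a tick by the
 * thread's nice value, so threads at different nice levels charge their
 * project at different rates), fssp_tick_cnt counts raw project ticks,
 * and fss_ticks counts raw ticks for the thread itself. These feed the
 * once-a-second usage decay and priority update.
 */
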
/*
 * Processes waking up go to the back of their queue. We don't need to assign
 * a time quantum here because thread is still at a kernel mode priority and
 * the time slicing is not done for threads running in the kernel after
 * sleeping. The proper time quantum will be assigned by fss_trapret before the
 * thread returns to user mode.
 */
static void
fss_wakeup(kthread_t *t)
{
	fssproc_t *fssproc;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(t->t_state == TS_SLEEP);

	fss_active(t);

	fssproc = FSSPROC(t);
	fssproc->fss_flags &= ~FSSBACKQ;

	if (fssproc->fss_flags & FSSKPRI) {
		/*
		 * If we already have a kernel priority assigned, then we
		 * just use it.
		 */
		setbackdq(t);
	} else if (t->t_kpri_req) {
		/*
		 * Give thread a priority boost if we were asked.
		 */
		fssproc->fss_flags |= FSSKPRI;
		THREAD_CHANGE_PRI(t, minclsyspri);
		setbackdq(t);
		t->t_trapret = 1;	/* so that fss_trapret will run */
		aston(t);
	} else {
		/*
		 * Otherwise, we recalculate the priority.
		 */
		if (t->t_disp_time == ddi_get_lbolt()) {
			setfrontdq(t);
		} else {
			fssproc->fss_timeleft = fss_quantum;
			THREAD_CHANGE_PRI(t, fssproc->fss_umdpri);
			setbackdq(t);
		}
	}
}

/*
 * fss_donice() is called when a nice(1) command is issued on the thread to
 * alter the priority. The nice(1) command exists in Solaris for compatibility.
 * Thread priority adjustments should be done via priocntl(1).
 */
static int
fss_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp)
{
	int newnice;
	fssproc_t *fssproc = FSSPROC(t);
	fssparms_t fssparms;

	/*
	 * If there is no change to priority, just return current setting.
	 */
	if (incr == 0) {
		if (retvalp)
			*retvalp = fssproc->fss_nice - NZERO;
		return (0);
	}

	if ((incr < 0 || incr > 2 * NZERO) && secpolicy_raisepriority(cr) != 0)
		return (EPERM);

	/*
	 * Specifying a nice increment greater than the upper limit of
	 * FSS_NICE_MAX (== 2 * NZERO - 1) will result in the thread's nice
	 * value being set to the upper limit. We check for this before
	 * computing the new value because otherwise we could get overflow
	 * if a privileged user specified some ridiculous increment.
	 */
	if (incr > FSS_NICE_MAX)
		incr = FSS_NICE_MAX;

	newnice = fssproc->fss_nice + incr;
	if (newnice > FSS_NICE_MAX)
		newnice = FSS_NICE_MAX;
	else if (newnice < FSS_NICE_MIN)
		newnice = FSS_NICE_MIN;

	fssparms.fss_uprilim = fssparms.fss_upri =
	    -((newnice - NZERO) * fss_maxupri) / NZERO;

	/*
	 * Reset the uprilim and upri values of the thread.
	 */
	(void) fss_parmsset(t, (void *)&fssparms, (id_t)0, (cred_t *)NULL);

	/*
	 * Although fss_parmsset already reset fss_nice it may not have been
	 * set to precisely the value calculated above because fss_parmsset
	 * determines the nice value from the user priority and we may have
	 * truncated during the integer conversion from nice value to user
	 * priority and back. We reset fss_nice to the value we calculated
	 * above.
	 */
	fssproc->fss_nice = (char)newnice;

	if (retvalp)
		*retvalp = newnice - NZERO;
	return (0);
}

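/*
 * Round-trip example (illustrative; assumes NZERO == 20 and the default
 * fss_maxupri of 60): "nice +4" on a thread at the neutral nice of 20
 * gives newnice = 24, which maps to upri = -((24 - 20) * 60) / 20 =
 * -12. fss_parmsset() then derives nice = 20 - (-12 * 20) / 60 = 24
 * again; when those divisions truncate instead of landing exactly, the
 * explicit fss_nice assignment above restores the intended value.
 */
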
/*
 * Increment the priority of the specified thread by incr and
 * return the new value in *retvalp.
 */
static int
fss_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
{
	int newpri;
	fssproc_t *fssproc = FSSPROC(t);
	fssparms_t fssparms;

	/*
	 * If there is no change to priority, just return current setting.
	 */
	if (incr == 0) {
		*retvalp = fssproc->fss_upri;
		return (0);
	}

	newpri = fssproc->fss_upri + incr;
	if (newpri > fss_maxupri || newpri < -fss_maxupri)
		return (EINVAL);

	*retvalp = newpri;
	fssparms.fss_uprilim = fssparms.fss_upri = newpri;

	/*
	 * Reset the uprilim and upri values of the thread.
	 */
	return (fss_parmsset(t, &fssparms, (id_t)0, cr));
}

/*
 * Return the global scheduling priority that would be assigned to a thread
 * entering the fair-sharing class with the fss_upri.
 */
/*ARGSUSED*/
static pri_t
fss_globpri(kthread_t *t)
{
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	return (fss_maxumdpri / 2);
}

/*
 * Called from the yield(2) system call when a thread is yielding (surrendering)
 * the processor. The kernel thread is placed at the back of a dispatch queue.
 */
static void
fss_yield(kthread_t *t)
{
	fssproc_t *fssproc = FSSPROC(t);

	ASSERT(t == curthread);
	ASSERT(THREAD_LOCK_HELD(t));

	/*
	 * Collect CPU usage spent before yielding.
	 */
	(void) CPUCAPS_CHARGE(t, &fssproc->fss_caps, CPUCAPS_CHARGE_ENFORCE);

	/*
	 * Clear the preemption control "yield" bit since the user is
	 * doing a yield.
	 */
	if (t->t_schedctl)
		schedctl_set_yield(t, 0);
	/*
	 * If fss_preempt() artificially increased the thread's priority
	 * to avoid preemption, restore the original priority now.
	 */
	if (fssproc->fss_flags & FSSRESTORE) {
		THREAD_CHANGE_PRI(t, fssproc->fss_scpri);
		fssproc->fss_flags &= ~FSSRESTORE;
	}

	if (fssproc->fss_timeleft < 0) {
		/*
		 * Time slice was artificially extended to avoid preemption,
		 * so pretend we're preempting it now.
		 */
		DTRACE_SCHED1(schedctl__yield, int, -fssproc->fss_timeleft);
		fssproc->fss_timeleft = fss_quantum;
	}

	fssproc->fss_flags &= ~FSSBACKQ;
	setbackdq(t);
}

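/*
 * Preemption-control lifecycle, pieced together from fss_preempt(),
 * fss_tick() and this function: while the schedctl "nopreempt" flag is
 * set, a thread may run its fss_timeleft negative, down to
 * -SC_MAX_TICKS at worst. The yield that schedctl then requests lands
 * here, where the borrowed priority (FSSRESTORE) and the overdrawn
 * time slice are both settled before the thread goes to the back of
 * the queue.
 */
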
void
fss_changeproj(kthread_t *t, void *kp, void *zp, fssbuf_t *projbuf,
    fssbuf_t *zonebuf)
{
	kproject_t *kpj_new = kp;
	zone_t *zone = zp;
	fssproj_t *fssproj_old, *fssproj_new;
	fsspset_t *fsspset;
	kproject_t *kpj_old;
	fssproc_t *fssproc;
	fsszone_t *fsszone_old, *fsszone_new;
	int free = 0;
	int id;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	if (t->t_cid != fss_cid)
		return;

	fssproc = FSSPROC(t);
	mutex_enter(&fsspsets_lock);
	fssproj_old = FSSPROC2FSSPROJ(fssproc);
	if (fssproj_old == NULL) {
		mutex_exit(&fsspsets_lock);
		return;
	}

	fsspset = FSSPROJ2FSSPSET(fssproj_old);
	mutex_enter(&fsspset->fssps_lock);
	kpj_old = FSSPROJ2KPROJ(fssproj_old);
	fsszone_old = fssproj_old->fssp_fsszone;

	ASSERT(t->t_cpupart == fsspset->fssps_cpupart);

	if (kpj_old == kpj_new) {
		mutex_exit(&fsspset->fssps_lock);
		mutex_exit(&fsspsets_lock);
		return;
	}

	if ((fsszone_new = fss_find_fsszone(fsspset, zone)) == NULL) {
		/*
		 * If the zone for the new project is not currently active on
		 * the cpu partition we're on, get one of the pre-allocated
		 * buffers and link it in our per-pset zone list. Such buffers
		 * should already exist.
		 */
		for (id = 0; id < zonebuf->fssb_size; id++) {
			if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) {
				fss_insert_fsszone(fsspset, zone, fsszone_new);
				zonebuf->fssb_list[id] = NULL;
				break;
			}
		}
	}
	ASSERT(fsszone_new != NULL);
	if ((fssproj_new = fss_find_fssproj(fsspset, kpj_new)) == NULL) {
		/*
		 * If our new project is not currently running
		 * on the cpu partition we're on, get one of the
		 * pre-allocated buffers and link it in our new cpu
		 * partition doubly linked list. Such buffers should already
		 * exist.
		 */
		for (id = 0; id < projbuf->fssb_size; id++) {
			if ((fssproj_new = projbuf->fssb_list[id]) != NULL) {
				fss_insert_fssproj(fsspset, kpj_new,
				    fsszone_new, fssproj_new);
				projbuf->fssb_list[id] = NULL;
				break;
			}
		}
	}
	ASSERT(fssproj_new != NULL);

	thread_lock(t);
	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
	    t->t_state == TS_WAIT)
		fss_inactive(t);
	ASSERT(fssproj_old->fssp_threads > 0);
	if (--fssproj_old->fssp_threads == 0) {
		fss_remove_fssproj(fsspset, fssproj_old);
		free = 1;
	}
	fssproc->fss_proj = fssproj_new;
	fssproc->fss_fsspri = 0;
	fssproj_new->fssp_threads++;
	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
	    t->t_state == TS_WAIT)
		fss_active(t);
	thread_unlock(t);
	if (free) {
		if (fsszone_old->fssz_nproj == 0)
			kmem_free(fsszone_old, sizeof (fsszone_t));
		kmem_free(fssproj_old, sizeof (fssproj_t));
	}

	mutex_exit(&fsspset->fssps_lock);
	mutex_exit(&fsspsets_lock);
}

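/*
 * The projbuf/zonebuf arguments follow a pre-allocation pattern: the
 * caller allocates candidate fssproj_t and fsszone_t buffers before the
 * scheduler locks are taken, this function consumes one (NULLing out
 * its slot) only if the target project or zone is not yet represented
 * on the pset, and any unconsumed buffers presumably remain the
 * caller's to free. That keeps kmem allocation out of the locked
 * region.
 */
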
void
fss_changepset(kthread_t *t, void *newcp, fssbuf_t *projbuf,
    fssbuf_t *zonebuf)
{
	fsspset_t *fsspset_old, *fsspset_new;
	fssproj_t *fssproj_old, *fssproj_new;
	fsszone_t *fsszone_old, *fsszone_new;
	fssproc_t *fssproc;
	kproject_t *kpj;
	zone_t *zone;
	int id;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

	if (t->t_cid != fss_cid)
		return;

	fssproc = FSSPROC(t);
	zone = ttoproc(t)->p_zone;
	mutex_enter(&fsspsets_lock);
	fssproj_old = FSSPROC2FSSPROJ(fssproc);
	if (fssproj_old == NULL) {
		mutex_exit(&fsspsets_lock);
		return;
	}
	fsszone_old = fssproj_old->fssp_fsszone;
	fsspset_old = FSSPROJ2FSSPSET(fssproj_old);
	kpj = FSSPROJ2KPROJ(fssproj_old);

	if (fsspset_old->fssps_cpupart == newcp) {
		mutex_exit(&fsspsets_lock);
		return;
	}

	ASSERT(ttoproj(t) == kpj);

	fsspset_new = fss_find_fsspset(newcp);

	mutex_enter(&fsspset_new->fssps_lock);
	if ((fsszone_new = fss_find_fsszone(fsspset_new, zone)) == NULL) {
		for (id = 0; id < zonebuf->fssb_size; id++) {
			if ((fsszone_new = zonebuf->fssb_list[id]) != NULL) {
				fss_insert_fsszone(fsspset_new, zone,
				    fsszone_new);
				zonebuf->fssb_list[id] = NULL;
				break;
			}
		}
	}
	ASSERT(fsszone_new != NULL);
	if ((fssproj_new = fss_find_fssproj(fsspset_new, kpj)) == NULL) {
		for (id = 0; id < projbuf->fssb_size; id++) {
			if ((fssproj_new = projbuf->fssb_list[id]) != NULL) {
				fss_insert_fssproj(fsspset_new, kpj,
				    fsszone_new, fssproj_new);
				projbuf->fssb_list[id] = NULL;
				break;
			}
		}
	}
	ASSERT(fssproj_new != NULL);

	fssproj_new->fssp_threads++;
	thread_lock(t);
	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
	    t->t_state == TS_WAIT)
		fss_inactive(t);
	fssproc->fss_proj = fssproj_new;
	fssproc->fss_fsspri = 0;
	if (t->t_state == TS_RUN || t->t_state == TS_ONPROC ||
	    t->t_state == TS_WAIT)
		fss_active(t);
	thread_unlock(t);
	mutex_exit(&fsspset_new->fssps_lock);

	mutex_enter(&fsspset_old->fssps_lock);
	if (--fssproj_old->fssp_threads == 0) {
		fss_remove_fssproj(fsspset_old, fssproj_old);
		if (fsszone_old->fssz_nproj == 0)
			kmem_free(fsszone_old, sizeof (fsszone_t));
		kmem_free(fssproj_old, sizeof (fssproj_t));
	}
	mutex_exit(&fsspset_old->fssps_lock);

	mutex_exit(&fsspsets_lock);
}