4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright 2013, Joyent, Inc. All rights reserved.
27 #include <sys/types.h>
28 #include <sys/param.h>
29 #include <sys/sysmacros.h>
32 #include <sys/strsubr.h>
33 #include <sys/priocntl.h>
34 #include <sys/class.h>
36 #include <sys/procset.h>
37 #include <sys/debug.h>
39 #include <sys/errno.h>
40 #include <sys/systm.h>
41 #include <sys/schedctl.h>
42 #include <sys/vmsystm.h>
43 #include <sys/atomic.h>
44 #include <sys/project.h>
45 #include <sys/modctl.h>
47 #include <sys/fsspriocntl.h>
48 #include <sys/cpupart.h>
51 #include <vm/seg_kmem.h>
52 #include <sys/tnf_probe.h>
53 #include <sys/policy.h>
55 #include <sys/cpucaps.h>
58 * The fair share scheduling class ensures that collections of processes
59 * (zones and projects) each get their configured share of CPU. This is in
60 * contrast to the TS class which considers individual processes.
62 * The FSS cpu-share is set on zones using the zone.cpu-shares rctl and on
63 * projects using the project.cpu-shares rctl. By default the value is 1
64 * and it can range from 0 - 64k. A value of 0 means that processes in the
65 * collection will only get CPU resources when there are no other processes
66 * that need CPU. The cpu-share is used as one of the inputs to calculate a
67 * thread's "user-mode" priority (umdpri) for the scheduler. The umdpri falls
68 * in the range 0-59. FSS calculates other, internal, priorities which are not
69 * visible outside of the FSS class.
71 * The FSS class should approximate TS behavior when there are excess CPU
72 * resources. When there is a backlog of runnable processes, then the share
73 * is used as input into the runnable process's priority calculation, where
74 * the final umdpri is used by the scheduler to determine when the process runs.
76 * Projects in a zone compete with each other for CPU time, receiving CPU
77 * allocation within a zone proportional to the project's share; at a higher
78 * level zones compete with each other, receiving allocation in a pset
79 * proportional to the zone's share.
81 * The FSS priority calculation consists of several parts.
83 * 1) Once per second the fss_update function runs. The first thing it does is
84 * call fss_decay_usage. This function does three things.
86 * a) fss_decay_usage first decays the maxfsspri value for the pset. This
87 * value is used in the per-process priority calculation described in step
88 * (2b). The maxfsspri is decayed using the following formula:
90 * maxfsspri * fss_nice_decay[NZERO])
91 * maxfsspri = ------------------------------------
95 * - NZERO is the default process priority (i.e. 20)
97 * The fss_nice_decay array is a fixed set of values used to adjust the
98 * decay rate of processes based on their nice value. Entries in this
99 * array are initialized in fss_init using the following formula:
101 * (FSS_DECAY_MAX - FSS_DECAY_MIN) * i
102 * FSS_DECAY_MIN + -------------------------------------
105 * - FSS_DECAY_MIN is 83 = approximates 65% (83/128)
106 * - FSS_DECAY_MAX is 108 = approximates 85% (108/128)
107 * - FSS_NICE_RANGE is 40 (range is 0 - 39)
109 * b) The second thing fss_decay_usage does is update each project's "usage"
110 * for the last second and then recalculates the project's "share usage".
112 * The usage value is the recent CPU usage for all of the threads in the
113 * project. It is decayed and updated this way:
115 * (usage * FSS_DECAY_USG)
116 * usage = ------------------------- + ticks;
119 * - FSS_DECAY_BASE is 128 - used instead of 100 so we can shift vs divide
120 * - FSS_DECAY_USG is 96 - approximates 75% (96/128)
121 * - ticks is updated whenever a process in this project is running
122 * when the scheduler's tick processing fires. This is not a simple
123 * counter, the values are based on the entries in the fss_nice_tick
124 * array (see section 3 below). ticks is then reset to 0 so it can track
125 * the next second's worth of nice-adjusted time for the project.
127 * c) The third thing fss_decay_usage does is update each project's "share
128 * usage" (shusage). This is the normalized usage value for the project and
129 * is calculated this way:
131 * pset_shares^2 zone_int_shares^2
132 * usage * ------------- * ------------------
133 * kpj_shares^2 zone_ext_shares^2
135 * - usage - see (1b) for more details
136 * - pset_shares is the total of all *active* zone shares in the pset (by
137 * default there is only one pset)
138 * - kpj_shares is the individual project's share (project.cpu-shares rctl)
139 * - zone_int_shares is the sum of shares of all active projects within the
140 * zone (the zone-internal total)
141 * - zone_ext_shares is the share value for the zone (zone.cpu-shares rctl)
143 * The shusage is used in step (2b) to calculate the thread's new internal
144 * priority. A larger shusage value leads to a lower priority.
146 * 2) The fss_update function then calls fss_update_list to update the priority
147 * of all threads. This does two things.
149 * a) First the thread's internal priority is decayed using the following
152 * fsspri * fss_nice_decay[nice_value])
153 * fsspri = ------------------------------------
156 * - FSS_DECAY_BASE is 128 as described above
158 * b) Second, if the thread is runnable (TS_RUN or TS_WAIT) calls fss_newpri
159 * to update the user-mode priority (umdpri) of the runnable thread.
160 * Threads that are running (TS_ONPROC) or waiting for an event (TS_SLEEP)
161 * are not updated at this time. The updated user-mode priority can cause
162 * threads to change their position in the run queue.
164 * The process's new internal fsspri is calculated using the following
165 * formula. All runnable threads in the project will use the same shusage
166 * and nrunnable values in their calculation.
168 * fsspri += shusage * nrunnable * ticks
170 * - shusage is the project's share usage, calculated in (1c)
171 * - nrunnable is the number of runnable threads in the project
172 * - ticks is the number of ticks this thread ran since the last fss_newpri
175 * Finally the process's new user-mode priority is calculated using the
178 * (fsspri * umdprirange)
179 * umdpri = maxumdpri - ------------------------
182 * - maxumdpri is MINCLSYSPRI - 1 (i.e. 59)
183 * - umdprirange is maxumdpri - 1 (i.e. 58)
184 * - maxfsspri is the largest fsspri seen so far, as we're iterating all
187 * Thus, a higher internal priority (fsspri) leads to a lower user-mode
188 * priority which means the thread runs less. The fsspri is higher when
189 * the project's normalized share usage is higher, when the project has
190 * more runnable threads, or when the thread has accumulated more run-time.
192 * This code has various checks to ensure the resulting umdpri is in the
193 * range 1-59. See fss_newpri for more details.
195 * To reiterate, the above processing is performed once per second to recompute
196 * the runnable thread user-mode priorities.
198 * 3) The final major component in the priority calculation is the tick
199 * processing which occurs on a thread that is running when the clock
202 * A thread can run continuously in user-land (compute-bound) for the
203 * fss_quantum (see "dispadmin -c FSS -g" for the configurable properties).
204 * The fss_quantum defaults to 11 (i.e. 11 ticks).
206 * Once the quantum has been consumed, the thread will call fss_newpri to
207 * recompute its umdpri priority, as described above in (2b). Threads that
208 * were T_ONPROC at the one second interval when runnable thread priorities
209 * were recalculated will have their umdpri priority recalculated when their
212 * To ensure that runnable threads within a project see the expected
213 * round-robin behavior, there is a special case in fss_newpri for a thread
214 * that has run for its quanta within the one second update interval. See
215 * the handling for the quanta_up parameter within fss_newpri.
217 * Also of interest, the fss_tick code increments the project's tick value
218 * using the fss_nice_tick array entry for the thread's nice value. The idea
219 * behind the fss_nice_tick array is that the cost of a tick is lower at
220 * positive nice values (so that it doesn't increase the project's usage
221 * as much as normal) with a 50% drop at the maximum level and a 50%
222 * increase at the minimum level. See (1b). The fss_nice_tick array is
223 * initialized in fss_init using the following formula:
225 * FSS_TICK_COST * (((3 * FSS_NICE_RANGE) / 2) - i)
226 * --------------------------------------------------
229 * - FSS_TICK_COST is 1000, the tick cost for threads with nice level 0
231 * FSS Data Structures:
236 * | |-------->| |<------->| |<---->...
243 * | ----- ----- -----
244 * -->| |<--->| |<--->| |
249 * That is, fsspsets contain a list of fsszone's that are currently active in
250 * the pset, and a list of fssproj's, corresponding to projects with runnable
251 * threads on the pset. fssproj's in turn point to the fsszone which they
254 * An fssproj_t is removed when there are no threads in it.
256 * An fsszone_t is removed when there are no projects with threads in it.
259 static pri_t
fss_init(id_t
, int, classfuncs_t
**);
261 static struct sclass fss
= {
267 extern struct mod_ops mod_schedops
;
270 * Module linkage information for the kernel.
272 static struct modlsched modlsched
= {
273 &mod_schedops
, "fair share scheduling class", &fss
276 static struct modlinkage modlinkage
= {
277 MODREV_1
, (void *)&modlsched
, NULL
#define	FSS_MAXUPRI	60	/* largest FSS user priority (fss_maxupri) */

/*
 * The fssproc_t structures are kept in an array of circular doubly linked
 * lists. A hash on the thread pointer is used to determine which list each
 * thread should be placed in. Each list has a dummy "head" which is never
 * removed, so the list is never empty. fss_update traverses these lists to
 * update the priorities of threads that have been waiting on the run queue.
 */
#define	FSS_LISTS		16 /* number of lists, must be power of 2 */
#define	FSS_LIST_HASH(t)	(((uintptr_t)(t) >> 9) & (FSS_LISTS - 1))
#define	FSS_LIST_NEXT(i)	(((i) + 1) & (FSS_LISTS - 1))

/*
 * Link fssproc in just after its hash bucket's dummy head.  The bucket
 * lock is acquired and released entirely within the macro.
 */
#define	FSS_LIST_INSERT(fssproc)				\
{								\
	int index = FSS_LIST_HASH(fssproc->fss_tp);		\
	kmutex_t *lockp = &fss_listlock[index];			\
	fssproc_t *headp = &fss_listhead[index];		\
	mutex_enter(lockp);					\
	fssproc->fss_next = headp->fss_next;			\
	fssproc->fss_prev = headp;				\
	headp->fss_next->fss_prev = fssproc;			\
	headp->fss_next = fssproc;				\
	mutex_exit(lockp);					\
}

/*
 * Unlink fssproc from its hash bucket under the bucket lock.
 */
#define	FSS_LIST_DELETE(fssproc)				\
{								\
	int index = FSS_LIST_HASH(fssproc->fss_tp);		\
	kmutex_t *lockp = &fss_listlock[index];			\
	mutex_enter(lockp);					\
	fssproc->fss_prev->fss_next = fssproc->fss_next;	\
	fssproc->fss_next->fss_prev = fssproc->fss_prev;	\
	mutex_exit(lockp);					\
}
#define	FSS_TICK_COST	1000	/* tick cost for threads with nice level = 0 */

/*
 * Decay rate percentages are based on n/128 rather than n/100 so that
 * calculations can avoid having to do an integer divide by 100 (divide
 * by FSS_DECAY_BASE == 128 optimizes to an arithmetic shift).
 *
 * FSS_DECAY_MIN =  83/128 ~= 65%
 * FSS_DECAY_MAX = 108/128 ~= 85%
 * FSS_DECAY_USG =  96/128 ~= 75%
 */
#define	FSS_DECAY_MIN	83	/* fsspri decay pct for threads w/ nice -20 */
#define	FSS_DECAY_MAX	108	/* fsspri decay pct for threads w/ nice +19 */
#define	FSS_DECAY_USG	96	/* fssusage decay pct for projects */
#define	FSS_DECAY_BASE	128	/* base for decay percentages above */

/* nice values handled by FSS span [0, 2*NZERO - 1]; NZERO is the default */
#define	FSS_NICE_MIN	0
#define	FSS_NICE_MAX	(2 * NZERO - 1)
#define	FSS_NICE_RANGE	(FSS_NICE_MAX - FSS_NICE_MIN + 1)

/* per-nice-level tick cost and fsspri decay tables, filled in by fss_init */
static int	fss_nice_tick[FSS_NICE_RANGE];
static int	fss_nice_decay[FSS_NICE_RANGE];

static pri_t	fss_maxupri = FSS_MAXUPRI; /* maximum FSS user priority */
static pri_t	fss_maxumdpri; /* maximum user mode fss priority */
static pri_t	fss_maxglobpri;	/* maximum global priority used by fss class */
static pri_t	fss_minglobpri;	/* minimum global priority */

/* hashed lists of all FSS threads and the per-bucket locks protecting them */
static fssproc_t fss_listhead[FSS_LISTS];
static kmutex_t	fss_listlock[FSS_LISTS];

/* array of per-cpu-partition FSS state, sized max_ncpus in fss_init */
static fsspset_t *fsspsets;
static kmutex_t	fsspsets_lock;	/* protects fsspsets */

/* default FSS time quantum, in ticks */
static int	fss_quantum = 11;
/*
 * Internal support routines.
 */
static void	fss_newpri(fssproc_t *, bool);
static void	fss_update(void *);
static int	fss_update_list(int);
static void	fss_change_priority(kthread_t *, fssproc_t *);

/*
 * Class-wide operations vectored through the sclass table.
 */
static int	fss_admin(caddr_t, cred_t *);
static int	fss_getclinfo(void *);
static int	fss_parmsin(void *);
static int	fss_parmsout(void *, pc_vaparms_t *);
static int	fss_vaparmsin(void *, pc_vaparms_t *);
static int	fss_vaparmsout(void *, pc_vaparms_t *);
static int	fss_getclpri(pcpri_t *);
static int	fss_alloc(void **, int);
static void	fss_free(void *);

/*
 * Per-thread operations vectored through the classfuncs table.
 */
static int	fss_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static void	fss_exitclass(void *);
static int	fss_canexit(kthread_t *, cred_t *);
static int	fss_fork(kthread_t *, kthread_t *, void *);
static void	fss_forkret(kthread_t *, kthread_t *);
static void	fss_parmsget(kthread_t *, void *);
static int	fss_parmsset(kthread_t *, void *, id_t, cred_t *);
static void	fss_stop(kthread_t *, int, int);
static void	fss_exit(kthread_t *);
static void	fss_active(kthread_t *);
static void	fss_inactive(kthread_t *);
static void	fss_trapret(kthread_t *);
static void	fss_preempt(kthread_t *);
static void	fss_setrun(kthread_t *);
static void	fss_sleep(kthread_t *);
static void	fss_tick(kthread_t *);
static void	fss_wakeup(kthread_t *);
static int	fss_donice(kthread_t *, cred_t *, int, int *);
static int	fss_doprio(kthread_t *, cred_t *, int, int *);
static pri_t	fss_globpri(kthread_t *);
static void	fss_yield(kthread_t *);
static void	fss_nullsys();
392 static struct classfuncs fss_classfuncs
= {
393 /* class functions */
404 /* thread functions */
424 fss_nullsys
, /* set_process_group */
432 return (mod_install(&modlinkage
));
442 _info(struct modinfo
*modinfop
)
444 return (mod_info(&modlinkage
, modinfop
));
449 fss_project_walker(kproject_t
*kpj
, void *buf
)
455 fss_allocbuf(int op
, int type
)
463 ASSERT(op
== FSS_NPSET_BUF
|| op
== FSS_NPROJ_BUF
|| op
== FSS_ONE_BUF
);
464 ASSERT(type
== FSS_ALLOC_PROJ
|| type
== FSS_ALLOC_ZONE
);
465 ASSERT(MUTEX_HELD(&cpu_lock
));
467 fssbuf
= kmem_zalloc(sizeof (fssbuf_t
), KM_SLEEP
);
470 cnt
= cpupart_list(NULL
, 0, CP_NONEMPTY
);
473 cnt
= project_walk_all(ALL_ZONES
, fss_project_walker
, NULL
);
482 size
= sizeof (fssproj_t
);
485 size
= sizeof (fsszone_t
);
488 fsslist
= kmem_zalloc(cnt
* sizeof (void *), KM_SLEEP
);
489 fssbuf
->fssb_size
= cnt
;
490 fssbuf
->fssb_list
= fsslist
;
491 for (i
= 0; i
< cnt
; i
++)
492 fsslist
[i
] = kmem_zalloc(size
, KM_SLEEP
);
497 fss_freebuf(fssbuf_t
*fssbuf
, int type
)
503 ASSERT(fssbuf
!= NULL
);
504 ASSERT(type
== FSS_ALLOC_PROJ
|| type
== FSS_ALLOC_ZONE
);
505 fsslist
= fssbuf
->fssb_list
;
509 size
= sizeof (fssproj_t
);
512 size
= sizeof (fsszone_t
);
516 for (i
= 0; i
< fssbuf
->fssb_size
; i
++) {
517 if (fsslist
[i
] != NULL
)
518 kmem_free(fsslist
[i
], size
);
520 kmem_free(fsslist
, sizeof (void *) * fssbuf
->fssb_size
);
521 kmem_free(fssbuf
, sizeof (fssbuf_t
));
525 fss_find_fsspset(cpupart_t
*cpupart
)
528 fsspset_t
*fsspset
= NULL
;
531 ASSERT(cpupart
!= NULL
);
532 ASSERT(MUTEX_HELD(&fsspsets_lock
));
535 * Search for the cpupart pointer in the array of fsspsets.
537 for (i
= 0; i
< max_ncpus
; i
++) {
538 fsspset
= &fsspsets
[i
];
539 if (fsspset
->fssps_cpupart
== cpupart
) {
540 ASSERT(fsspset
->fssps_nproj
> 0);
547 * If we didn't find anything, then use the first
548 * available slot in the fsspsets array.
550 for (i
= 0; i
< max_ncpus
; i
++) {
551 fsspset
= &fsspsets
[i
];
552 if (fsspset
->fssps_cpupart
== NULL
) {
553 ASSERT(fsspset
->fssps_nproj
== 0);
558 fsspset
->fssps_cpupart
= cpupart
;
565 fss_del_fsspset(fsspset_t
*fsspset
)
567 ASSERT(MUTEX_HELD(&fsspsets_lock
));
568 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
569 ASSERT(fsspset
->fssps_nproj
== 0);
570 ASSERT(fsspset
->fssps_list
== NULL
);
571 ASSERT(fsspset
->fssps_zones
== NULL
);
572 fsspset
->fssps_cpupart
= NULL
;
573 fsspset
->fssps_maxfsspri
= 0;
574 fsspset
->fssps_shares
= 0;
578 * The following routine returns a pointer to the fsszone structure which
579 * belongs to zone "zone" and cpu partition fsspset, if such structure exists.
582 fss_find_fsszone(fsspset_t
*fsspset
, zone_t
*zone
)
586 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
588 if (fsspset
->fssps_list
!= NULL
) {
590 * There are projects/zones active on this cpu partition
591 * already. Try to find our zone among them.
593 fsszone
= fsspset
->fssps_zones
;
595 if (fsszone
->fssz_zone
== zone
) {
598 fsszone
= fsszone
->fssz_next
;
599 } while (fsszone
!= fsspset
->fssps_zones
);
605 * The following routine links new fsszone structure into doubly linked list of
606 * zones active on the specified cpu partition.
609 fss_insert_fsszone(fsspset_t
*fsspset
, zone_t
*zone
, fsszone_t
*fsszone
)
611 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
613 fsszone
->fssz_zone
= zone
;
614 fsszone
->fssz_rshares
= zone
->zone_shares
;
616 if (fsspset
->fssps_zones
== NULL
) {
618 * This will be the first fsszone for this fsspset
620 fsszone
->fssz_next
= fsszone
->fssz_prev
= fsszone
;
621 fsspset
->fssps_zones
= fsszone
;
624 * Insert this fsszone to the doubly linked list.
626 fsszone_t
*fssz_head
= fsspset
->fssps_zones
;
628 fsszone
->fssz_next
= fssz_head
;
629 fsszone
->fssz_prev
= fssz_head
->fssz_prev
;
630 fssz_head
->fssz_prev
->fssz_next
= fsszone
;
631 fssz_head
->fssz_prev
= fsszone
;
632 fsspset
->fssps_zones
= fsszone
;
637 * The following routine removes a single fsszone structure from the doubly
638 * linked list of zones active on the specified cpu partition. Note that
639 * global fsspsets_lock must be held in case this fsszone structure is the last
640 * on the above mentioned list. Also note that the fsszone structure is not
641 * freed here, it is the responsibility of the caller to call kmem_free for it.
644 fss_remove_fsszone(fsspset_t
*fsspset
, fsszone_t
*fsszone
)
646 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
647 ASSERT(fsszone
->fssz_nproj
== 0);
648 ASSERT(fsszone
->fssz_shares
== 0);
649 ASSERT(fsszone
->fssz_runnable
== 0);
651 if (fsszone
->fssz_next
!= fsszone
) {
653 * This is not the last zone in the list.
655 fsszone
->fssz_prev
->fssz_next
= fsszone
->fssz_next
;
656 fsszone
->fssz_next
->fssz_prev
= fsszone
->fssz_prev
;
657 if (fsspset
->fssps_zones
== fsszone
)
658 fsspset
->fssps_zones
= fsszone
->fssz_next
;
661 * This was the last zone active in this cpu partition.
663 fsspset
->fssps_zones
= NULL
;
668 * The following routine returns a pointer to the fssproj structure
669 * which belongs to project kpj and cpu partition fsspset, if such structure
673 fss_find_fssproj(fsspset_t
*fsspset
, kproject_t
*kpj
)
677 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
679 if (fsspset
->fssps_list
!= NULL
) {
681 * There are projects running on this cpu partition already.
682 * Try to find our project among them.
684 fssproj
= fsspset
->fssps_list
;
686 if (fssproj
->fssp_proj
== kpj
) {
687 ASSERT(fssproj
->fssp_pset
== fsspset
);
690 fssproj
= fssproj
->fssp_next
;
691 } while (fssproj
!= fsspset
->fssps_list
);
697 * The following routine links new fssproj structure into doubly linked list
698 * of projects running on the specified cpu partition.
701 fss_insert_fssproj(fsspset_t
*fsspset
, kproject_t
*kpj
, fsszone_t
*fsszone
,
704 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
706 fssproj
->fssp_pset
= fsspset
;
707 fssproj
->fssp_proj
= kpj
;
708 fssproj
->fssp_shares
= kpj
->kpj_shares
;
710 fsspset
->fssps_nproj
++;
712 if (fsspset
->fssps_list
== NULL
) {
714 * This will be the first fssproj for this fsspset
716 fssproj
->fssp_next
= fssproj
->fssp_prev
= fssproj
;
717 fsspset
->fssps_list
= fssproj
;
720 * Insert this fssproj to the doubly linked list.
722 fssproj_t
*fssp_head
= fsspset
->fssps_list
;
724 fssproj
->fssp_next
= fssp_head
;
725 fssproj
->fssp_prev
= fssp_head
->fssp_prev
;
726 fssp_head
->fssp_prev
->fssp_next
= fssproj
;
727 fssp_head
->fssp_prev
= fssproj
;
728 fsspset
->fssps_list
= fssproj
;
730 fssproj
->fssp_fsszone
= fsszone
;
731 fsszone
->fssz_nproj
++;
732 ASSERT(fsszone
->fssz_nproj
!= 0);
736 * The following routine removes a single fssproj structure from the doubly
737 * linked list of projects running on the specified cpu partition. Note that
738 * global fsspsets_lock must be held in case if this fssproj structure is the
739 * last on the above mentioned list. Also note that the fssproj structure is
740 * not freed here, it is the responsibility of the caller to call kmem_free
744 fss_remove_fssproj(fsspset_t
*fsspset
, fssproj_t
*fssproj
)
748 ASSERT(MUTEX_HELD(&fsspsets_lock
));
749 ASSERT(MUTEX_HELD(&fsspset
->fssps_lock
));
750 ASSERT(fssproj
->fssp_runnable
== 0);
752 fsspset
->fssps_nproj
--;
754 fsszone
= fssproj
->fssp_fsszone
;
755 fsszone
->fssz_nproj
--;
757 if (fssproj
->fssp_next
!= fssproj
) {
759 * This is not the last part in the list.
761 fssproj
->fssp_prev
->fssp_next
= fssproj
->fssp_next
;
762 fssproj
->fssp_next
->fssp_prev
= fssproj
->fssp_prev
;
763 if (fsspset
->fssps_list
== fssproj
)
764 fsspset
->fssps_list
= fssproj
->fssp_next
;
765 if (fsszone
->fssz_nproj
== 0)
766 fss_remove_fsszone(fsspset
, fsszone
);
769 * This was the last project part running
770 * at this cpu partition.
772 fsspset
->fssps_list
= NULL
;
773 ASSERT(fsspset
->fssps_nproj
== 0);
774 ASSERT(fsszone
->fssz_nproj
== 0);
775 fss_remove_fsszone(fsspset
, fsszone
);
776 fss_del_fsspset(fsspset
);
781 fss_inactive(kthread_t
*t
)
788 ASSERT(THREAD_LOCK_HELD(t
));
789 fssproc
= FSSPROC(t
);
790 fssproj
= FSSPROC2FSSPROJ(fssproc
);
791 if (fssproj
== NULL
) /* if this thread already exited */
793 fsspset
= FSSPROJ2FSSPSET(fssproj
);
794 fsszone
= fssproj
->fssp_fsszone
;
795 disp_lock_enter_high(&fsspset
->fssps_displock
);
796 ASSERT(fssproj
->fssp_runnable
> 0);
797 if (--fssproj
->fssp_runnable
== 0) {
798 fsszone
->fssz_shares
-= fssproj
->fssp_shares
;
799 if (--fsszone
->fssz_runnable
== 0)
800 fsspset
->fssps_shares
-= fsszone
->fssz_rshares
;
802 ASSERT(fssproc
->fss_runnable
== 1);
803 fssproc
->fss_runnable
= 0;
804 disp_lock_exit_high(&fsspset
->fssps_displock
);
808 fss_active(kthread_t
*t
)
815 ASSERT(THREAD_LOCK_HELD(t
));
816 fssproc
= FSSPROC(t
);
817 fssproj
= FSSPROC2FSSPROJ(fssproc
);
818 if (fssproj
== NULL
) /* if this thread already exited */
820 fsspset
= FSSPROJ2FSSPSET(fssproj
);
821 fsszone
= fssproj
->fssp_fsszone
;
822 disp_lock_enter_high(&fsspset
->fssps_displock
);
823 if (++fssproj
->fssp_runnable
== 1) {
824 fsszone
->fssz_shares
+= fssproj
->fssp_shares
;
825 if (++fsszone
->fssz_runnable
== 1)
826 fsspset
->fssps_shares
+= fsszone
->fssz_rshares
;
828 ASSERT(fssproc
->fss_runnable
== 0);
829 fssproc
->fss_runnable
= 1;
830 disp_lock_exit_high(&fsspset
->fssps_displock
);
834 * Fair share scheduler initialization. Called by dispinit() at boot time.
835 * We can ignore clparmsz argument since we know that the smallest possible
836 * parameter buffer is big enough for us.
840 fss_init(id_t cid
, int clparmsz
, classfuncs_t
**clfuncspp
)
844 ASSERT(MUTEX_HELD(&cpu_lock
));
847 fss_maxumdpri
= minclsyspri
- 1;
848 fss_maxglobpri
= minclsyspri
;
850 fsspsets
= kmem_zalloc(sizeof (fsspset_t
) * max_ncpus
, KM_SLEEP
);
853 * Initialize the fssproc hash table.
855 for (i
= 0; i
< FSS_LISTS
; i
++)
856 fss_listhead
[i
].fss_next
= fss_listhead
[i
].fss_prev
=
859 *clfuncspp
= &fss_classfuncs
;
862 * Fill in fss_nice_tick and fss_nice_decay arrays:
863 * The cost of a tick is lower at positive nice values (so that it
864 * will not increase its project's usage as much as normal) with 50%
865 * drop at the maximum level and 50% increase at the minimum level.
866 * The fsspri decay is slower at positive nice values. fsspri values
867 * of processes with negative nice levels must decay faster to receive
868 * time slices more frequently than normal.
870 for (i
= 0; i
< FSS_NICE_RANGE
; i
++) {
871 fss_nice_tick
[i
] = (FSS_TICK_COST
* (((3 * FSS_NICE_RANGE
) / 2)
872 - i
)) / FSS_NICE_RANGE
;
873 fss_nice_decay
[i
] = FSS_DECAY_MIN
+
874 ((FSS_DECAY_MAX
- FSS_DECAY_MIN
) * i
) /
875 (FSS_NICE_RANGE
- 1);
878 return (fss_maxglobpri
);
882 * Calculate the new fss_umdpri based on the usage, the normalized share usage
883 * and the number of active threads. Reset the tick counter for this thread.
885 * When calculating the new priority using the standard formula we can hit
886 * a scenario where we don't have good round-robin behavior. This would be
887 * most commonly seen when there is a zone with lots of runnable threads.
888 * In the bad scenario we will see the following behavior when using the
889 * standard formula and these conditions:
891 * - there are multiple runnable threads in the zone (project)
892 * - the fssps_maxfsspri is a very large value
893 * - (we also know all of these threads will use the project's
896 * Under these conditions, a thread with a low fss_fsspri value is chosen
897 * to run and the thread gets a high fss_umdpri. This thread can run for
898 * its full quanta (fss_timeleft) at which time fss_newpri is called to
899 * calculate the thread's new priority.
901 * In this case, because the newly calculated fsspri value is much smaller
902 * (orders of magnitude) than the fssps_maxfsspri value, if we used the
903 * standard formula the thread will still get a high fss_umdpri value and
904 * will run again for another quanta, even though there are other runnable
905 * threads in the project.
907 * For a thread that is runnable for a long time, the thread can continue
908 * to run for many quanta (totaling many seconds) before the thread's fsspri
909 * exceeds the fssps_maxfsspri and the thread's fss_umdpri is reset back
910 * down to 1. This behavior also keeps the fssps_maxfsspr at a high value,
911 * so that the next runnable thread might repeat this cycle.
913 * This leads to the case where we don't have round-robin behavior at quanta
914 * granularity, but instead, runnable threads within the project only run
915 * at several second intervals.
917 * To prevent this scenario from occuring, when a thread has consumed its
918 * quanta and there are multiple runnable threads in the project, we
919 * immediately cause the thread to hit fssps_maxfsspri so that it gets
920 * reset back to 1 and another runnable thread in the project can run.
923 fss_newpri(fssproc_t
*fssproc
, bool quanta_up
)
929 fsspri_t fsspri
, maxfsspri
;
934 tp
= fssproc
->fss_tp
;
937 if (tp
->t_cid
!= fss_cid
)
940 ASSERT(THREAD_LOCK_HELD(tp
));
942 fssproj
= FSSPROC2FSSPROJ(fssproc
);
943 fsszone
= FSSPROJ2FSSZONE(fssproj
);
946 * No need to change priority of exited threads.
950 fsspset
= FSSPROJ2FSSPSET(fssproj
);
951 disp_lock_enter_high(&fsspset
->fssps_displock
);
953 ticks
= fssproc
->fss_ticks
;
954 fssproc
->fss_ticks
= 0;
956 if (fssproj
->fssp_shares
== 0 || fsszone
->fssz_rshares
== 0) {
958 * Special case: threads with no shares.
960 fssproc
->fss_umdpri
= fss_minglobpri
;
961 disp_lock_exit_high(&fsspset
->fssps_displock
);
965 maxfsspri
= fsspset
->fssps_maxfsspri
;
966 n_runnable
= fssproj
->fssp_runnable
;
968 if (quanta_up
&& n_runnable
> 1) {
972 * fsspri += fssp_shusage * nrunnable * ticks
973 * If all three values are non-0, this typically calculates to
974 * a large number (sometimes > 1M, sometimes > 100B) due to
975 * fssp_shusage which can be > 1T.
977 fsspri
= fssproc
->fss_fsspri
;
978 fsspri
+= fssproj
->fssp_shusage
* n_runnable
* ticks
;
981 fssproc
->fss_fsspri
= fsspri
;
984 * fss_maxumdpri is normally 59, since FSS priorities are 0-59.
985 * If the previous calculation resulted in 0 (e.g. was 0 and added 0
986 * because ticks == 0), then instead of 0, we use the largest priority,
987 * which is still small in comparison to the large numbers we typically
990 if (fsspri
< fss_maxumdpri
)
991 fsspri
= fss_maxumdpri
; /* so that maxfsspri is != 0 */
994 * The general priority formula:
996 * (fsspri * umdprirange)
997 * pri = maxumdpri - ------------------------
1000 * If this thread's fsspri is greater than the previous largest
1001 * fsspri, then record it as the new high and priority for this
1002 * thread will be one (the lowest priority assigned to a thread
1003 * that has non-zero shares). Because of this check, maxfsspri can
1004 * change as this function is called via the
1005 * fss_update -> fss_update_list -> fss_newpri code path to update
1006 * all runnable threads. See the code in fss_update for how we
1007 * mitigate this issue.
1009 * Note that this formula cannot produce out of bounds priority
1010 * values (0-59); if it is changed, additional checks may need to be
1013 if (fsspri
>= maxfsspri
) {
1014 fsspset
->fssps_maxfsspri
= fsspri
;
1015 disp_lock_exit_high(&fsspset
->fssps_displock
);
1016 fssproc
->fss_umdpri
= 1;
1018 disp_lock_exit_high(&fsspset
->fssps_displock
);
1019 invpri
= (fsspri
* (fss_maxumdpri
- 1)) / maxfsspri
;
1020 fssproc
->fss_umdpri
= fss_maxumdpri
- invpri
;
1025 * Decays usages of all running projects, resets their tick counters and
1026 * calcluates the projects normalized share usage. Called once per second from
1032 uint32_t zone_ext_shares
, zone_int_shares
;
1033 uint32_t kpj_shares
, pset_shares
;
1041 mutex_enter(&fsspsets_lock
);
1043 * Go through all active processor sets and decay usages of projects
1046 for (psetid
= 0; psetid
< max_ncpus
; psetid
++) {
1047 fsspset
= &fsspsets
[psetid
];
1048 mutex_enter(&fsspset
->fssps_lock
);
1050 fsspset
->fssps_gen
++;
1052 if (fsspset
->fssps_cpupart
== NULL
||
1053 (fssproj
= fsspset
->fssps_list
) == NULL
) {
1054 mutex_exit(&fsspset
->fssps_lock
);
1059 * Decay maxfsspri for this cpu partition with the
1060 * fastest possible decay rate.
1062 disp_lock_enter(&fsspset
->fssps_displock
);
1064 pset_shares
= fsspset
->fssps_shares
;
1066 maxfsspri
= (fsspset
->fssps_maxfsspri
*
1067 fss_nice_decay
[NZERO
]) / FSS_DECAY_BASE
;
1068 if (maxfsspri
< fss_maxumdpri
)
1069 maxfsspri
= fss_maxumdpri
;
1070 fsspset
->fssps_maxfsspri
= maxfsspri
;
1073 fsszone
= fssproj
->fssp_fsszone
;
1074 zp
= fsszone
->fssz_zone
;
1077 * Reset zone's FSS stats if they are from a
1080 if (fsspset
->fssps_gen
!= zp
->zone_fss_gen
) {
1081 zp
->zone_fss_gen
= fsspset
->fssps_gen
;
1082 zp
->zone_run_ticks
= 0;
1086 * Decay project usage, then add in this cycle's
1089 fssproj
->fssp_usage
=
1090 (fssproj
->fssp_usage
* FSS_DECAY_USG
) /
1092 fssproj
->fssp_ticks
;
1094 fssproj
->fssp_ticks
= 0;
1095 zp
->zone_run_ticks
+= fssproj
->fssp_tick_cnt
;
1096 fssproj
->fssp_tick_cnt
= 0;
1099 * Readjust the project's number of shares if it has
1100 * changed since we checked it last time.
1102 kpj_shares
= fssproj
->fssp_proj
->kpj_shares
;
1103 if (fssproj
->fssp_shares
!= kpj_shares
) {
1104 if (fssproj
->fssp_runnable
!= 0) {
1105 fsszone
->fssz_shares
-=
1106 fssproj
->fssp_shares
;
1107 fsszone
->fssz_shares
+= kpj_shares
;
1109 fssproj
->fssp_shares
= kpj_shares
;
1113 * Readjust the zone's number of shares if it
1114 * has changed since we checked it last time.
1116 zone_ext_shares
= zp
->zone_shares
;
1117 if (fsszone
->fssz_rshares
!= zone_ext_shares
) {
1118 if (fsszone
->fssz_runnable
!= 0) {
1119 fsspset
->fssps_shares
-=
1120 fsszone
->fssz_rshares
;
1121 fsspset
->fssps_shares
+=
1123 pset_shares
= fsspset
->fssps_shares
;
1125 fsszone
->fssz_rshares
= zone_ext_shares
;
1127 zone_int_shares
= fsszone
->fssz_shares
;
1130 * If anything is runnable in the project, track the
1131 * overall project share percent for monitoring useage.
1133 if (fssproj
->fssp_runnable
> 0) {
1134 uint32_t zone_shr_pct
;
1135 uint32_t int_shr_pct
;
1138 * Times 1000 to get tenths of a percent
1141 * zone_shr_pct = ---------------
1145 * int_shr_pct = ---------------
1148 if (pset_shares
== 0 || zone_int_shares
== 0) {
1149 fssproj
->fssp_shr_pct
= 0;
1152 (zone_ext_shares
* 1000) /
1154 int_shr_pct
= (kpj_shares
* 1000) /
1156 fssproj
->fssp_shr_pct
=
1157 (zone_shr_pct
* int_shr_pct
) /
1161 DTRACE_PROBE1(fss__prj__norun
, fssproj_t
*,
1166 * Calculate fssp_shusage value to be used
1167 * for fsspri increments for the next second.
1169 if (kpj_shares
== 0 || zone_ext_shares
== 0) {
1170 fssproj
->fssp_shusage
= 0;
1171 } else if (FSSPROJ2KPROJ(fssproj
) == proj0p
) {
1172 uint32_t zone_shr_pct
;
1175 * Project 0 in the global zone has 50%
1176 * of its zone. See calculation above for
1177 * the zone's share percent.
1179 if (pset_shares
== 0)
1180 zone_shr_pct
= 1000;
1183 (zone_ext_shares
* 1000) /
1186 fssproj
->fssp_shr_pct
= zone_shr_pct
/ 2;
1188 fssproj
->fssp_shusage
= (fssproj
->fssp_usage
*
1189 zone_int_shares
* zone_int_shares
) /
1190 (zone_ext_shares
* zone_ext_shares
);
1193 * Thread's priority is based on its project's
1194 * normalized usage (shusage) value which gets
1195 * calculated this way:
1197 * pset_shares^2 zone_int_shares^2
1198 * usage * ------------- * ------------------
1199 * kpj_shares^2 zone_ext_shares^2
1201 * Where zone_int_shares is the sum of shares
1202 * of all active projects within the zone (and
1203 * the pset), and zone_ext_shares is the number
1204 * of zone shares (ie, zone.cpu-shares).
1206 * If there is only one zone active on the pset
1207 * the above reduces to:
1210 * shusage = usage * ---------------------
1213 * If there's only one project active in the
1214 * zone this formula reduces to:
1217 * shusage = usage * ----------------------
1220 * shusage is one input to calculating fss_pri
1221 * in fss_newpri(). Larger values tend toward
1222 * lower priorities for processes in the proj.
1224 fssproj
->fssp_shusage
= fssproj
->fssp_usage
*
1225 pset_shares
* zone_int_shares
;
1226 fssproj
->fssp_shusage
/=
1227 kpj_shares
* zone_ext_shares
;
1228 fssproj
->fssp_shusage
*=
1229 pset_shares
* zone_int_shares
;
1230 fssproj
->fssp_shusage
/=
1231 kpj_shares
* zone_ext_shares
;
1233 fssproj
= fssproj
->fssp_next
;
1234 } while (fssproj
!= fsspset
->fssps_list
);
1236 disp_lock_exit(&fsspset
->fssps_displock
);
1237 mutex_exit(&fsspset
->fssps_lock
);
1239 mutex_exit(&fsspsets_lock
);
1243 fss_change_priority(kthread_t
*t
, fssproc_t
*fssproc
)
1247 ASSERT(THREAD_LOCK_HELD(t
));
1248 new_pri
= fssproc
->fss_umdpri
;
1249 ASSERT(new_pri
>= 0 && new_pri
<= fss_maxglobpri
);
1251 t
->t_cpri
= fssproc
->fss_upri
;
1252 fssproc
->fss_flags
&= ~FSSRESTORE
;
1253 if (t
== curthread
|| t
->t_state
== TS_ONPROC
) {
1255 * curthread is always onproc
1257 cpu_t
*cp
= t
->t_disp_queue
->disp_cpu
;
1258 THREAD_CHANGE_PRI(t
, new_pri
);
1259 if (t
== cp
->cpu_dispthread
)
1260 cp
->cpu_dispatch_pri
= DISP_PRIO(t
);
1261 if (DISP_MUST_SURRENDER(t
)) {
1262 fssproc
->fss_flags
|= FSSBACKQ
;
1265 fssproc
->fss_timeleft
= fss_quantum
;
1269 * When the priority of a thread is changed, it may be
1270 * necessary to adjust its position on a sleep queue or
1271 * dispatch queue. The function thread_change_pri accomplishes
1274 if (thread_change_pri(t
, new_pri
, 0)) {
1276 * The thread was on a run queue.
1278 fssproc
->fss_timeleft
= fss_quantum
;
1280 fssproc
->fss_flags
|= FSSBACKQ
;
1286 * Update priorities of all fair-sharing threads that are currently runnable
1287 * at a user mode priority based on the number of shares and current usage.
1288 * Called once per second via timeout which we reset here.
1290 * There are several lists of fair-sharing threads broken up by a hash on the
1291 * thread pointer. Each list has its own lock. This avoids blocking all
1292 * fss_enterclass, fss_fork, and fss_exitclass operations while fss_update runs.
1293 * fss_update traverses each list in turn.
1295 * Each time we're run (once/second) we may start at the next list and iterate
1296 * through all of the lists. By starting with a different list, we mitigate any
1297 * effects we would see updating the fssps_maxfsspri value in fss_newpri.
1300 fss_update(void *arg
)
1303 int new_marker
= -1;
1304 static int fss_update_marker
;
1307 * Decay and update usages for all projects.
1312 * Start with the fss_update_marker list, then do the rest.
1314 i
= fss_update_marker
;
1317 * Go around all threads, set new priorities and decay
1318 * per-thread CPU usages.
1322 * If this is the first list after the current marker to have
1323 * threads with priority updates, advance the marker to this
1324 * list for the next time fss_update runs.
1326 if (fss_update_list(i
) &&
1327 new_marker
== -1 && i
!= fss_update_marker
)
1329 } while ((i
= FSS_LIST_NEXT(i
)) != fss_update_marker
);
1332 * Advance marker for the next fss_update call
1334 if (new_marker
!= -1)
1335 fss_update_marker
= new_marker
;
1337 (void) timeout(fss_update
, arg
, hz
);
1341 * Updates priority for a list of threads. Returns 1 if the priority of one
1342 * of the threads was actually updated, 0 if none were for various reasons
1343 * (thread is no longer in the FSS class, is not runnable, has the preemption
1344 * control no-preempt bit set, etc.)
1347 fss_update_list(int i
)
1356 mutex_enter(&fss_listlock
[i
]);
1357 for (fssproc
= fss_listhead
[i
].fss_next
; fssproc
!= &fss_listhead
[i
];
1358 fssproc
= fssproc
->fss_next
) {
1359 t
= fssproc
->fss_tp
;
1361 * Lock the thread and verify the state.
1365 * Skip the thread if it is no longer in the FSS class or
1366 * is running with kernel mode priority.
1368 if (t
->t_cid
!= fss_cid
)
1370 if ((fssproc
->fss_flags
& FSSKPRI
) != 0)
1373 fssproj
= FSSPROC2FSSPROJ(fssproc
);
1374 if (fssproj
== NULL
)
1377 if (fssproj
->fssp_shares
!= 0) {
1379 * Decay fsspri value.
1381 fsspri
= fssproc
->fss_fsspri
;
1382 fsspri
= (fsspri
* fss_nice_decay
[fssproc
->fss_nice
]) /
1384 fssproc
->fss_fsspri
= fsspri
;
1387 if (t
->t_schedctl
&& schedctl_get_nopreempt(t
))
1389 if (t
->t_state
!= TS_RUN
&& t
->t_state
!= TS_WAIT
) {
1391 * Make next syscall/trap call fss_trapret
1395 if (t
->t_state
== TS_ONPROC
)
1396 DTRACE_PROBE1(fss__onproc
, fssproc_t
*,
1400 fss_newpri(fssproc
, false);
1403 fss_umdpri
= fssproc
->fss_umdpri
;
1406 * Only dequeue the thread if it needs to be moved; otherwise
1407 * it should just round-robin here.
1409 if (t
->t_pri
!= fss_umdpri
)
1410 fss_change_priority(t
, fssproc
);
1414 mutex_exit(&fss_listlock
[i
]);
1420 fss_admin(caddr_t uaddr
, cred_t
*reqpcredp
)
1422 fssadmin_t fssadmin
;
1424 if (copyin(uaddr
, &fssadmin
, sizeof (fssadmin_t
)))
1427 switch (fssadmin
.fss_cmd
) {
1429 if (secpolicy_dispadm(reqpcredp
) != 0)
1431 if (fssadmin
.fss_quantum
<= 0 || fssadmin
.fss_quantum
>= hz
)
1433 fss_quantum
= fssadmin
.fss_quantum
;
1436 fssadmin
.fss_quantum
= fss_quantum
;
1437 if (copyout(&fssadmin
, uaddr
, sizeof (fssadmin_t
)))
1447 fss_getclinfo(void *infop
)
1449 fssinfo_t
*fssinfo
= (fssinfo_t
*)infop
;
1450 fssinfo
->fss_maxupri
= fss_maxupri
;
1455 fss_parmsin(void *parmsp
)
1457 fssparms_t
*fssparmsp
= (fssparms_t
*)parmsp
;
1460 * Check validity of parameters.
1462 if ((fssparmsp
->fss_uprilim
> fss_maxupri
||
1463 fssparmsp
->fss_uprilim
< -fss_maxupri
) &&
1464 fssparmsp
->fss_uprilim
!= FSS_NOCHANGE
)
1467 if ((fssparmsp
->fss_upri
> fss_maxupri
||
1468 fssparmsp
->fss_upri
< -fss_maxupri
) &&
1469 fssparmsp
->fss_upri
!= FSS_NOCHANGE
)
1477 fss_parmsout(void *parmsp
, pc_vaparms_t
*vaparmsp
)
1483 fss_vaparmsin(void *parmsp
, pc_vaparms_t
*vaparmsp
)
1485 fssparms_t
*fssparmsp
= (fssparms_t
*)parmsp
;
1489 pc_vaparm_t
*vpp
= &vaparmsp
->pc_parms
[0];
1492 * FSS_NOCHANGE (-32768) is outside of the range of values for
1493 * fss_uprilim and fss_upri. If the structure fssparms_t is changed,
1494 * FSS_NOCHANGE should be replaced by a flag word.
1496 fssparmsp
->fss_uprilim
= FSS_NOCHANGE
;
1497 fssparmsp
->fss_upri
= FSS_NOCHANGE
;
1500 * Get the varargs parameter and check validity of parameters.
1502 if (vaparmsp
->pc_vaparmscnt
> PC_VAPARMCNT
)
1505 for (cnt
= 0; cnt
< vaparmsp
->pc_vaparmscnt
; cnt
++, vpp
++) {
1506 switch (vpp
->pc_key
) {
1507 case FSS_KY_UPRILIM
:
1510 fssparmsp
->fss_uprilim
= (pri_t
)vpp
->pc_parm
;
1511 if (fssparmsp
->fss_uprilim
> fss_maxupri
||
1512 fssparmsp
->fss_uprilim
< -fss_maxupri
)
1518 fssparmsp
->fss_upri
= (pri_t
)vpp
->pc_parm
;
1519 if (fssparmsp
->fss_upri
> fss_maxupri
||
1520 fssparmsp
->fss_upri
< -fss_maxupri
)
1528 if (vaparmsp
->pc_vaparmscnt
== 0) {
1530 * Use default parameters.
1532 fssparmsp
->fss_upri
= fssparmsp
->fss_uprilim
= 0;
1539 * Copy all selected fair-sharing class parameters to the user. The parameters
1540 * are specified by a key.
1543 fss_vaparmsout(void *parmsp
, pc_vaparms_t
*vaparmsp
)
1545 fssparms_t
*fssparmsp
= (fssparms_t
*)parmsp
;
1549 pc_vaparm_t
*vpp
= &vaparmsp
->pc_parms
[0];
1551 ASSERT(MUTEX_NOT_HELD(&curproc
->p_lock
));
1553 if (vaparmsp
->pc_vaparmscnt
> PC_VAPARMCNT
)
1556 for (cnt
= 0; cnt
< vaparmsp
->pc_vaparmscnt
; cnt
++, vpp
++) {
1557 switch (vpp
->pc_key
) {
1558 case FSS_KY_UPRILIM
:
1561 if (copyout(&fssparmsp
->fss_uprilim
,
1562 (caddr_t
)(uintptr_t)vpp
->pc_parm
, sizeof (pri_t
)))
1568 if (copyout(&fssparmsp
->fss_upri
,
1569 (caddr_t
)(uintptr_t)vpp
->pc_parm
, sizeof (pri_t
)))
1581 * Return the user mode scheduling priority range.
1584 fss_getclpri(pcpri_t
*pcprip
)
1586 pcprip
->pc_clpmax
= fss_maxupri
;
1587 pcprip
->pc_clpmin
= -fss_maxupri
;
1592 fss_alloc(void **p
, int flag
)
1596 if ((bufp
= kmem_zalloc(sizeof (fssproc_t
), flag
)) == NULL
) {
1605 fss_free(void *bufp
)
1608 kmem_free(bufp
, sizeof (fssproc_t
));
1615 fss_enterclass(kthread_t
*t
, id_t cid
, void *parmsp
, cred_t
*reqpcredp
,
1618 fssparms_t
*fssparmsp
= (fssparms_t
*)parmsp
;
1620 pri_t reqfssuprilim
;
1622 static uint32_t fssexists
= 0;
1628 int fsszone_allocated
= 0;
1630 fssproc
= (fssproc_t
*)bufp
;
1631 ASSERT(fssproc
!= NULL
);
1633 ASSERT(MUTEX_HELD(&ttoproc(t
)->p_lock
));
1636 * Only root can move threads to FSS class.
1638 if (reqpcredp
!= NULL
&& secpolicy_setpriority(reqpcredp
) != 0)
1641 * Initialize the fssproc structure.
1643 fssproc
->fss_umdpri
= fss_maxumdpri
/ 2;
1645 if (fssparmsp
== NULL
) {
1647 * Use default values.
1649 fssproc
->fss_nice
= NZERO
;
1650 fssproc
->fss_uprilim
= fssproc
->fss_upri
= 0;
1653 * Use supplied values.
1655 if (fssparmsp
->fss_uprilim
== FSS_NOCHANGE
) {
1658 if (fssparmsp
->fss_uprilim
> 0 &&
1659 secpolicy_setpriority(reqpcredp
) != 0)
1661 reqfssuprilim
= fssparmsp
->fss_uprilim
;
1663 if (fssparmsp
->fss_upri
== FSS_NOCHANGE
) {
1664 reqfssupri
= reqfssuprilim
;
1666 if (fssparmsp
->fss_upri
> 0 &&
1667 secpolicy_setpriority(reqpcredp
) != 0)
1670 * Set the user priority to the requested value or
1671 * the upri limit, whichever is lower.
1673 reqfssupri
= fssparmsp
->fss_upri
;
1674 if (reqfssupri
> reqfssuprilim
)
1675 reqfssupri
= reqfssuprilim
;
1677 fssproc
->fss_uprilim
= reqfssuprilim
;
1678 fssproc
->fss_upri
= reqfssupri
;
1679 fssproc
->fss_nice
= NZERO
- (NZERO
* reqfssupri
) / fss_maxupri
;
1680 if (fssproc
->fss_nice
> FSS_NICE_MAX
)
1681 fssproc
->fss_nice
= FSS_NICE_MAX
;
1684 fssproc
->fss_timeleft
= fss_quantum
;
1685 fssproc
->fss_tp
= t
;
1686 cpucaps_sc_init(&fssproc
->fss_caps
);
1689 * Put a lock on our fsspset structure.
1691 mutex_enter(&fsspsets_lock
);
1692 fsspset
= fss_find_fsspset(t
->t_cpupart
);
1693 mutex_enter(&fsspset
->fssps_lock
);
1694 mutex_exit(&fsspsets_lock
);
1696 zone
= ttoproc(t
)->p_zone
;
1697 if ((fsszone
= fss_find_fsszone(fsspset
, zone
)) == NULL
) {
1698 if ((fsszone
= kmem_zalloc(sizeof (fsszone_t
), KM_NOSLEEP
))
1700 mutex_exit(&fsspset
->fssps_lock
);
1703 fsszone_allocated
= 1;
1704 fss_insert_fsszone(fsspset
, zone
, fsszone
);
1708 if ((fssproj
= fss_find_fssproj(fsspset
, kpj
)) == NULL
) {
1709 if ((fssproj
= kmem_zalloc(sizeof (fssproj_t
), KM_NOSLEEP
))
1711 if (fsszone_allocated
) {
1712 fss_remove_fsszone(fsspset
, fsszone
);
1713 kmem_free(fsszone
, sizeof (fsszone_t
));
1715 mutex_exit(&fsspset
->fssps_lock
);
1718 fss_insert_fssproj(fsspset
, kpj
, fsszone
, fssproj
);
1721 fssproj
->fssp_threads
++;
1722 fssproc
->fss_proj
= fssproj
;
1725 * Reset priority. Process goes to a "user mode" priority here
1726 * regardless of whether or not it has slept since entering the kernel.
1729 t
->t_clfuncs
= &(sclass
[cid
].cl_funcs
->thread
);
1731 t
->t_cldata
= (void *)fssproc
;
1732 t
->t_schedflag
|= TS_RUNQMATCH
;
1733 fss_change_priority(t
, fssproc
);
1734 if (t
->t_state
== TS_RUN
|| t
->t_state
== TS_ONPROC
||
1735 t
->t_state
== TS_WAIT
)
1739 mutex_exit(&fsspset
->fssps_lock
);
1742 * Link new structure into fssproc list.
1744 FSS_LIST_INSERT(fssproc
);
1747 * If this is the first fair-sharing thread to occur since boot,
1748 * we set up the initial call to fss_update() here. Use an atomic
1749 * compare-and-swap since that's easier and faster than a mutex
1750 * (but check with an ordinary load first since most of the time
1751 * this will already be done).
1753 if (fssexists
== 0 && atomic_cas_32(&fssexists
, 0, 1) == 0)
1754 (void) timeout(fss_update
, NULL
, hz
);
1760 * Remove fssproc_t from the list.
1763 fss_exitclass(void *procp
)
1765 fssproc_t
*fssproc
= (fssproc_t
*)procp
;
1769 kthread_t
*t
= fssproc
->fss_tp
;
1772 * We should be either getting this thread off the deathrow or
1773 * this thread has already moved to another scheduling class and
1774 * we're being called with its old cldata buffer pointer. In both
1775 * cases, the content of this buffer can not be changed while we're
1778 mutex_enter(&fsspsets_lock
);
1780 if (t
->t_cid
!= fss_cid
) {
1782 * We're being called as a result of the priocntl() system
1783 * call -- someone is trying to move our thread to another
1784 * scheduling class. We can't call fss_inactive() here
1785 * because our thread's t_cldata pointer already points
1786 * to another scheduling class specific data.
1788 ASSERT(MUTEX_HELD(&ttoproc(t
)->p_lock
));
1790 fssproj
= FSSPROC2FSSPROJ(fssproc
);
1791 fsspset
= FSSPROJ2FSSPSET(fssproj
);
1792 fsszone
= fssproj
->fssp_fsszone
;
1794 if (fssproc
->fss_runnable
) {
1795 disp_lock_enter_high(&fsspset
->fssps_displock
);
1796 if (--fssproj
->fssp_runnable
== 0) {
1797 fsszone
->fssz_shares
-= fssproj
->fssp_shares
;
1798 if (--fsszone
->fssz_runnable
== 0)
1799 fsspset
->fssps_shares
-=
1800 fsszone
->fssz_rshares
;
1802 disp_lock_exit_high(&fsspset
->fssps_displock
);
1806 mutex_enter(&fsspset
->fssps_lock
);
1807 if (--fssproj
->fssp_threads
== 0) {
1808 fss_remove_fssproj(fsspset
, fssproj
);
1809 if (fsszone
->fssz_nproj
== 0)
1810 kmem_free(fsszone
, sizeof (fsszone_t
));
1811 kmem_free(fssproj
, sizeof (fssproj_t
));
1813 mutex_exit(&fsspset
->fssps_lock
);
1816 ASSERT(t
->t_state
== TS_FREE
);
1818 * We're being called from thread_free() when our thread
1819 * is removed from the deathrow. There is nothing we need
1820 * do here since everything should've been done earlier
1825 mutex_exit(&fsspsets_lock
);
1827 FSS_LIST_DELETE(fssproc
);
1833 fss_canexit(kthread_t
*t
, cred_t
*credp
)
1836 * A thread is allowed to exit FSS only if we have sufficient
1839 if (credp
!= NULL
&& secpolicy_setpriority(credp
) != 0)
1846 * Initialize fair-share class specific proc structure for a child.
1849 fss_fork(kthread_t
*pt
, kthread_t
*ct
, void *bufp
)
1851 fssproc_t
*pfssproc
; /* ptr to parent's fssproc structure */
1852 fssproc_t
*cfssproc
; /* ptr to child's fssproc structure */
1856 ASSERT(MUTEX_HELD(&ttoproc(pt
)->p_lock
));
1857 ASSERT(ct
->t_state
== TS_STOPPED
);
1859 cfssproc
= (fssproc_t
*)bufp
;
1860 ASSERT(cfssproc
!= NULL
);
1861 bzero(cfssproc
, sizeof (fssproc_t
));
1864 pfssproc
= FSSPROC(pt
);
1865 fssproj
= FSSPROC2FSSPROJ(pfssproc
);
1866 fsspset
= FSSPROJ2FSSPSET(fssproj
);
1869 mutex_enter(&fsspset
->fssps_lock
);
1871 * Initialize child's fssproc structure.
1874 ASSERT(FSSPROJ(pt
) == fssproj
);
1875 cfssproc
->fss_proj
= fssproj
;
1876 cfssproc
->fss_timeleft
= fss_quantum
;
1877 cfssproc
->fss_umdpri
= pfssproc
->fss_umdpri
;
1878 cfssproc
->fss_fsspri
= 0;
1879 cfssproc
->fss_uprilim
= pfssproc
->fss_uprilim
;
1880 cfssproc
->fss_upri
= pfssproc
->fss_upri
;
1881 cfssproc
->fss_tp
= ct
;
1882 cfssproc
->fss_nice
= pfssproc
->fss_nice
;
1883 cpucaps_sc_init(&cfssproc
->fss_caps
);
1885 cfssproc
->fss_flags
=
1886 pfssproc
->fss_flags
& ~(FSSKPRI
| FSSBACKQ
| FSSRESTORE
);
1887 ct
->t_cldata
= (void *)cfssproc
;
1888 ct
->t_schedflag
|= TS_RUNQMATCH
;
1891 fssproj
->fssp_threads
++;
1892 mutex_exit(&fsspset
->fssps_lock
);
1895 * Link new structure into fssproc hash table.
1897 FSS_LIST_INSERT(cfssproc
);
1902 * Child is placed at back of dispatcher queue and parent gives up processor
1903 * so that the child runs first after the fork. This allows the child
1904 * immediately execing to break the multiple use of copy on write pages with no
1905 * disk home. The parent will get to steal them back rather than uselessly
1909 fss_forkret(kthread_t
*t
, kthread_t
*ct
)
1911 proc_t
*pp
= ttoproc(t
);
1912 proc_t
*cp
= ttoproc(ct
);
1915 ASSERT(t
== curthread
);
1916 ASSERT(MUTEX_HELD(&pidlock
));
1919 * Grab the child's p_lock before dropping pidlock to ensure the
1920 * process does not disappear before we set it running.
1922 mutex_enter(&cp
->p_lock
);
1924 mutex_exit(&cp
->p_lock
);
1926 mutex_enter(&pp
->p_lock
);
1927 mutex_exit(&pidlock
);
1932 fssproc
= FSSPROC(t
);
1933 fss_newpri(fssproc
, false);
1934 fssproc
->fss_timeleft
= fss_quantum
;
1935 t
->t_pri
= fssproc
->fss_umdpri
;
1936 ASSERT(t
->t_pri
>= 0 && t
->t_pri
<= fss_maxglobpri
);
1937 fssproc
->fss_flags
&= ~FSSKPRI
;
1938 THREAD_TRANSITION(t
);
1941 * We don't want to call fss_setrun(t) here because it may call
1942 * fss_active, which we don't need.
1944 fssproc
->fss_flags
&= ~FSSBACKQ
;
1946 if (t
->t_disp_time
!= ddi_get_lbolt())
1953 * Safe to drop p_lock now since it is safe to change
1954 * the scheduling class after this point.
1956 mutex_exit(&pp
->p_lock
);
1962 * Get the fair-sharing parameters of the thread pointed to by fssprocp into
1963 * the buffer pointed by fssparmsp.
1966 fss_parmsget(kthread_t
*t
, void *parmsp
)
1968 fssproc_t
*fssproc
= FSSPROC(t
);
1969 fssparms_t
*fssparmsp
= (fssparms_t
*)parmsp
;
1971 fssparmsp
->fss_uprilim
= fssproc
->fss_uprilim
;
1972 fssparmsp
->fss_upri
= fssproc
->fss_upri
;
1977 fss_parmsset(kthread_t
*t
, void *parmsp
, id_t reqpcid
, cred_t
*reqpcredp
)
1980 pri_t reqfssuprilim
;
1982 fssproc_t
*fssproc
= FSSPROC(t
);
1983 fssparms_t
*fssparmsp
= (fssparms_t
*)parmsp
;
1985 ASSERT(MUTEX_HELD(&(ttoproc(t
))->p_lock
));
1987 if (fssparmsp
->fss_uprilim
== FSS_NOCHANGE
)
1988 reqfssuprilim
= fssproc
->fss_uprilim
;
1990 reqfssuprilim
= fssparmsp
->fss_uprilim
;
1992 if (fssparmsp
->fss_upri
== FSS_NOCHANGE
)
1993 reqfssupri
= fssproc
->fss_upri
;
1995 reqfssupri
= fssparmsp
->fss_upri
;
1998 * Make sure the user priority doesn't exceed the upri limit.
2000 if (reqfssupri
> reqfssuprilim
)
2001 reqfssupri
= reqfssuprilim
;
2004 * Basic permissions enforced by generic kernel code for all classes
2005 * require that a thread attempting to change the scheduling parameters
2006 * of a target thread be privileged or have a real or effective UID
2007 * matching that of the target thread. We are not called unless these
2008 * basic permission checks have already passed. The fair-sharing class
2009 * requires in addition that the calling thread be privileged if it
2010 * is attempting to raise the upri limit above its current value.
2011 * This may have been checked previously but if our caller passed us
2012 * a non-NULL credential pointer we assume it hasn't and we check it
2015 if ((reqpcredp
!= NULL
) &&
2016 (reqfssuprilim
> fssproc
->fss_uprilim
) &&
2017 secpolicy_raisepriority(reqpcredp
) != 0)
2021 * Set fss_nice to the nice value corresponding to the user priority we
2022 * are setting. Note that setting the nice field of the parameter
2023 * struct won't affect upri or nice.
2025 nice
= NZERO
- (reqfssupri
* NZERO
) / fss_maxupri
;
2026 if (nice
> FSS_NICE_MAX
)
2027 nice
= FSS_NICE_MAX
;
2031 fssproc
->fss_uprilim
= reqfssuprilim
;
2032 fssproc
->fss_upri
= reqfssupri
;
2033 fssproc
->fss_nice
= nice
;
2034 fss_newpri(fssproc
, false);
2036 if ((fssproc
->fss_flags
& FSSKPRI
) != 0) {
2041 fss_change_priority(t
, fssproc
);
2048 * The thread is being stopped.
2052 fss_stop(kthread_t
*t
, int why
, int what
)
2054 ASSERT(THREAD_LOCK_HELD(t
));
2055 ASSERT(t
== curthread
);
2061 * The current thread is exiting, do necessary adjustments to its project
2064 fss_exit(kthread_t
*t
)
2073 * Thread t here is either a current thread (in which case we hold
2074 * its process' p_lock), or a thread being destroyed by forklwp_fail(),
2075 * in which case we hold pidlock and thread is no longer on the
2078 ASSERT(MUTEX_HELD(&(ttoproc(t
))->p_lock
) || MUTEX_HELD(&pidlock
));
2080 fssproc
= FSSPROC(t
);
2081 fssproj
= FSSPROC2FSSPROJ(fssproc
);
2082 fsspset
= FSSPROJ2FSSPSET(fssproj
);
2083 fsszone
= fssproj
->fssp_fsszone
;
2085 mutex_enter(&fsspsets_lock
);
2086 mutex_enter(&fsspset
->fssps_lock
);
2089 disp_lock_enter_high(&fsspset
->fssps_displock
);
2090 if (t
->t_state
== TS_ONPROC
|| t
->t_state
== TS_RUN
) {
2091 if (--fssproj
->fssp_runnable
== 0) {
2092 fsszone
->fssz_shares
-= fssproj
->fssp_shares
;
2093 if (--fsszone
->fssz_runnable
== 0)
2094 fsspset
->fssps_shares
-= fsszone
->fssz_rshares
;
2096 ASSERT(fssproc
->fss_runnable
== 1);
2097 fssproc
->fss_runnable
= 0;
2099 if (--fssproj
->fssp_threads
== 0) {
2100 fss_remove_fssproj(fsspset
, fssproj
);
2103 disp_lock_exit_high(&fsspset
->fssps_displock
);
2104 fssproc
->fss_proj
= NULL
; /* mark this thread as already exited */
2108 if (fsszone
->fssz_nproj
== 0)
2109 kmem_free(fsszone
, sizeof (fsszone_t
));
2110 kmem_free(fssproj
, sizeof (fssproj_t
));
2112 mutex_exit(&fsspset
->fssps_lock
);
2113 mutex_exit(&fsspsets_lock
);
2116 * A thread could be exiting in between clock ticks, so we need to
2117 * calculate how much CPU time it used since it was charged last time.
2119 * CPU caps are not enforced on exiting processes - it is usually
2120 * desirable to exit as soon as possible to free resources.
2124 fssproc
= FSSPROC(t
);
2125 (void) cpucaps_charge(t
, &fssproc
->fss_caps
,
2126 CPUCAPS_CHARGE_ONLY
);
2137 * If thread is currently at a kernel mode priority (has slept) and is
2138 * returning to the userland we assign it the appropriate user mode priority
2139 * and time quantum here. If we're lowering the thread's priority below that
2140 * of other runnable threads then we will set runrun via cpu_surrender() to
2144 fss_trapret(kthread_t
*t
)
2146 fssproc_t
*fssproc
= FSSPROC(t
);
2149 ASSERT(THREAD_LOCK_HELD(t
));
2150 ASSERT(t
== curthread
);
2151 ASSERT(cp
->cpu_dispthread
== t
);
2152 ASSERT(t
->t_state
== TS_ONPROC
);
2155 if (fssproc
->fss_flags
& FSSKPRI
) {
2157 * If thread has blocked in the kernel
2159 THREAD_CHANGE_PRI(t
, fssproc
->fss_umdpri
);
2160 cp
->cpu_dispatch_pri
= DISP_PRIO(t
);
2161 ASSERT(t
->t_pri
>= 0 && t
->t_pri
<= fss_maxglobpri
);
2162 fssproc
->fss_flags
&= ~FSSKPRI
;
2164 if (DISP_MUST_SURRENDER(t
))
2170 * Arrange for thread to be placed in appropriate location on dispatcher queue.
2171 * This is called with the current thread in TS_ONPROC and locked.
2174 fss_preempt(kthread_t
*t
)
2176 fssproc_t
*fssproc
= FSSPROC(t
);
2180 ASSERT(t
== curthread
);
2181 ASSERT(THREAD_LOCK_HELD(curthread
));
2182 ASSERT(t
->t_state
== TS_ONPROC
);
2185 * If preempted in the kernel, make sure the thread has a kernel
2186 * priority if needed.
2188 lwp
= curthread
->t_lwp
;
2189 if (!(fssproc
->fss_flags
& FSSKPRI
) && lwp
!= NULL
&& t
->t_kpri_req
) {
2190 fssproc
->fss_flags
|= FSSKPRI
;
2191 THREAD_CHANGE_PRI(t
, minclsyspri
);
2192 ASSERT(t
->t_pri
>= 0 && t
->t_pri
<= fss_maxglobpri
);
2193 t
->t_trapret
= 1; /* so that fss_trapret will run */
2198 * This thread may be placed on wait queue by CPU Caps. In this case we
2199 * do not need to do anything until it is removed from the wait queue.
2200 * Do not enforce CPU caps on threads running at a kernel priority
2203 (void) cpucaps_charge(t
, &fssproc
->fss_caps
,
2204 CPUCAPS_CHARGE_ENFORCE
);
2206 if (!(fssproc
->fss_flags
& FSSKPRI
) && CPUCAPS_ENFORCE(t
))
2211 * Check to see if we're doing "preemption control" here. If
2212 * we are, and if the user has requested that this thread not
2213 * be preempted, and if preemptions haven't been put off for
2214 * too long, let the preemption happen here but try to make
2215 * sure the thread is rescheduled as soon as possible. We do
2216 * this by putting it on the front of the highest priority run
2217 * queue in the FSS class. If the preemption has been put off
2218 * for too long, clear the "nopreempt" bit and let the thread
2221 if (t
->t_schedctl
&& schedctl_get_nopreempt(t
)) {
2222 if (fssproc
->fss_timeleft
> -SC_MAX_TICKS
) {
2223 DTRACE_SCHED1(schedctl__nopreempt
, kthread_t
*, t
);
2224 if (!(fssproc
->fss_flags
& FSSKPRI
)) {
2226 * If not already remembered, remember current
2227 * priority for restoration in fss_yield().
2229 if (!(fssproc
->fss_flags
& FSSRESTORE
)) {
2230 fssproc
->fss_scpri
= t
->t_pri
;
2231 fssproc
->fss_flags
|= FSSRESTORE
;
2233 THREAD_CHANGE_PRI(t
, fss_maxumdpri
);
2235 schedctl_set_yield(t
, 1);
2239 if (fssproc
->fss_flags
& FSSRESTORE
) {
2240 THREAD_CHANGE_PRI(t
, fssproc
->fss_scpri
);
2241 fssproc
->fss_flags
&= ~FSSRESTORE
;
2243 schedctl_set_nopreempt(t
, 0);
2244 DTRACE_SCHED1(schedctl__preempt
, kthread_t
*, t
);
2246 * Fall through and be preempted below.
2251 flags
= fssproc
->fss_flags
& (FSSBACKQ
| FSSKPRI
);
2253 if (flags
== FSSBACKQ
) {
2254 fssproc
->fss_timeleft
= fss_quantum
;
2255 fssproc
->fss_flags
&= ~FSSBACKQ
;
2257 } else if (flags
== (FSSBACKQ
| FSSKPRI
)) {
2258 fssproc
->fss_flags
&= ~FSSBACKQ
;
2266 * Called when a thread is waking up and is to be placed on the run queue.
2269 fss_setrun(kthread_t
*t
)
2271 fssproc_t
*fssproc
= FSSPROC(t
);
2273 ASSERT(THREAD_LOCK_HELD(t
)); /* t should be in transition */
2275 if (t
->t_state
== TS_SLEEP
|| t
->t_state
== TS_STOPPED
)
2278 fssproc
->fss_timeleft
= fss_quantum
;
2280 fssproc
->fss_flags
&= ~FSSBACKQ
;
2282 * If previously were running at the kernel priority then keep that
2283 * priority and the fss_timeleft doesn't matter.
2285 if ((fssproc
->fss_flags
& FSSKPRI
) == 0)
2286 THREAD_CHANGE_PRI(t
, fssproc
->fss_umdpri
);
2288 if (t
->t_disp_time
!= ddi_get_lbolt())
2295 * Prepare thread for sleep. We reset the thread priority so it will run at the
2296 * kernel priority level when it wakes up.
2299 fss_sleep(kthread_t
*t
)
2301 fssproc_t
*fssproc
= FSSPROC(t
);
2303 ASSERT(t
== curthread
);
2304 ASSERT(THREAD_LOCK_HELD(t
));
2306 ASSERT(t
->t_state
== TS_ONPROC
);
2309 * Account for time spent on CPU before going to sleep.
2311 (void) CPUCAPS_CHARGE(t
, &fssproc
->fss_caps
, CPUCAPS_CHARGE_ENFORCE
);
2316 * Assign a system priority to the thread and arrange for it to be
2317 * retained when the thread is next placed on the run queue (i.e.,
2318 * when it wakes up) instead of being given a new pri. Also arrange
2319 * for trapret processing as the thread leaves the system call so it
2320 * will drop back to normal priority range.
2322 if (t
->t_kpri_req
) {
2323 THREAD_CHANGE_PRI(t
, minclsyspri
);
2324 fssproc
->fss_flags
|= FSSKPRI
;
2325 t
->t_trapret
= 1; /* so that fss_trapret will run */
2327 } else if (fssproc
->fss_flags
& FSSKPRI
) {
2329 * The thread has done a THREAD_KPRI_REQUEST(), slept, then
2330 * done THREAD_KPRI_RELEASE() (so no t_kpri_req is 0 again),
2331 * then slept again all without finishing the current system
2332 * call so trapret won't have cleared FSSKPRI
2334 fssproc
->fss_flags
&= ~FSSKPRI
;
2335 THREAD_CHANGE_PRI(t
, fssproc
->fss_umdpri
);
2336 if (DISP_MUST_SURRENDER(curthread
))
2342 * A tick interrupt has ocurrend on a running thread. Check to see if our
2343 * time slice has expired.
2346 fss_tick(kthread_t
*t
)
2350 bool call_cpu_surrender
= false;
2351 bool cpucaps_enforce
= false;
2353 ASSERT(MUTEX_HELD(&(ttoproc(t
))->p_lock
));
2356 * It's safe to access fsspset and fssproj structures because we're
2357 * holding our p_lock here.
2360 fssproc
= FSSPROC(t
);
2361 fssproj
= FSSPROC2FSSPROJ(fssproc
);
2362 if (fssproj
!= NULL
) {
2363 fsspset_t
*fsspset
= FSSPROJ2FSSPSET(fssproj
);
2364 disp_lock_enter_high(&fsspset
->fssps_displock
);
2365 fssproj
->fssp_ticks
+= fss_nice_tick
[fssproc
->fss_nice
];
2366 fssproj
->fssp_tick_cnt
++;
2367 fssproc
->fss_ticks
++;
2368 disp_lock_exit_high(&fsspset
->fssps_displock
);
2372 * Keep track of thread's project CPU usage. Note that projects
2373 * get charged even when threads are running in the kernel.
2374 * Do not surrender CPU if running in the SYS class.
2377 cpucaps_enforce
= cpucaps_charge(t
,
2378 &fssproc
->fss_caps
, CPUCAPS_CHARGE_ENFORCE
) &&
2379 !(fssproc
->fss_flags
& FSSKPRI
);
2383 * A thread's execution time for threads running in the SYS class
2386 if ((fssproc
->fss_flags
& FSSKPRI
) == 0) {
2388 * If thread is not in kernel mode, decrement its fss_timeleft
2390 if (--fssproc
->fss_timeleft
<= 0) {
2394 * If we're doing preemption control and trying to
2395 * avoid preempting this thread, just note that the
2396 * thread should yield soon and let it keep running
2397 * (unless it's been a while).
2399 if (t
->t_schedctl
&& schedctl_get_nopreempt(t
)) {
2400 if (fssproc
->fss_timeleft
> -SC_MAX_TICKS
) {
2401 DTRACE_SCHED1(schedctl__nopreempt
,
2403 schedctl_set_yield(t
, 1);
2404 thread_unlock_nopreempt(t
);
2408 fssproc
->fss_flags
&= ~FSSRESTORE
;
2410 fss_newpri(fssproc
, true);
2411 new_pri
= fssproc
->fss_umdpri
;
2412 ASSERT(new_pri
>= 0 && new_pri
<= fss_maxglobpri
);
2415 * When the priority of a thread is changed, it may
2416 * be necessary to adjust its position on a sleep queue
2417 * or dispatch queue. The function thread_change_pri
2418 * accomplishes this.
2420 if (thread_change_pri(t
, new_pri
, 0)) {
2421 fssproc
->fss_timeleft
= fss_quantum
;
2423 call_cpu_surrender
= true;
2425 } else if (t
->t_state
== TS_ONPROC
&&
2426 t
->t_pri
< t
->t_disp_queue
->disp_maxrunpri
) {
2428 * If there is a higher-priority thread which is
2429 * waiting for a processor, then thread surrenders
2432 call_cpu_surrender
= true;
2436 if (cpucaps_enforce
&& 2 * fssproc
->fss_timeleft
> fss_quantum
) {
2438 * The thread used more than half of its quantum, so assume that
2439 * it used the whole quantum.
2441 * Update thread's priority just before putting it on the wait
2442 * queue so that it gets charged for the CPU time from its
2443 * quantum even before that quantum expires.
2445 fss_newpri(fssproc
, false);
2446 if (t
->t_pri
!= fssproc
->fss_umdpri
)
2447 fss_change_priority(t
, fssproc
);
2450 * We need to call cpu_surrender for this thread due to cpucaps
2451 * enforcement, but fss_change_priority may have already done
2452 * so. In this case FSSBACKQ is set and there is no need to call
2453 * cpu-surrender again.
2455 if (!(fssproc
->fss_flags
& FSSBACKQ
))
2456 call_cpu_surrender
= true;
2459 if (call_cpu_surrender
) {
2460 fssproc
->fss_flags
|= FSSBACKQ
;
2464 thread_unlock_nopreempt(t
); /* clock thread can't be preempted */
2468 * Processes waking up go to the back of their queue. We don't need to assign
2469 * a time quantum here because thread is still at a kernel mode priority and
2470 * the time slicing is not done for threads running in the kernel after
2471 * sleeping. The proper time quantum will be assigned by fss_trapret before the
2472 * thread returns to user mode.
2475 fss_wakeup(kthread_t
*t
)
2479 ASSERT(THREAD_LOCK_HELD(t
));
2480 ASSERT(t
->t_state
== TS_SLEEP
);
2484 fssproc
= FSSPROC(t
);
2485 fssproc
->fss_flags
&= ~FSSBACKQ
;
2487 if (fssproc
->fss_flags
& FSSKPRI
) {
2489 * If we already have a kernel priority assigned, then we
2493 } else if (t
->t_kpri_req
) {
2495 * Give thread a priority boost if we were asked.
2497 fssproc
->fss_flags
|= FSSKPRI
;
2498 THREAD_CHANGE_PRI(t
, minclsyspri
);
2500 t
->t_trapret
= 1; /* so that fss_trapret will run */
2504 * Otherwise, we recalculate the priority.
2506 if (t
->t_disp_time
== ddi_get_lbolt()) {
2509 fssproc
->fss_timeleft
= fss_quantum
;
2510 THREAD_CHANGE_PRI(t
, fssproc
->fss_umdpri
);
2517 * fss_donice() is called when a nice(1) command is issued on the thread to
2518 * alter the priority. The nice(1) command exists in Solaris for compatibility.
2519 * Thread priority adjustments should be done via priocntl(1).
2522 fss_donice(kthread_t
*t
, cred_t
*cr
, int incr
, int *retvalp
)
2525 fssproc_t
*fssproc
= FSSPROC(t
);
2526 fssparms_t fssparms
;
2529 * If there is no change to priority, just return current setting.
2533 *retvalp
= fssproc
->fss_nice
- NZERO
;
2537 if ((incr
< 0 || incr
> 2 * NZERO
) && secpolicy_raisepriority(cr
) != 0)
2541 * Specifying a nice increment greater than the upper limit of
2542 * FSS_NICE_MAX (== 2 * NZERO - 1) will result in the thread's nice
2543 * value being set to the upper limit. We check for this before
2544 * computing the new value because otherwise we could get overflow
2545 * if a privileged user specified some ridiculous increment.
2547 if (incr
> FSS_NICE_MAX
)
2548 incr
= FSS_NICE_MAX
;
2550 newnice
= fssproc
->fss_nice
+ incr
;
2551 if (newnice
> FSS_NICE_MAX
)
2552 newnice
= FSS_NICE_MAX
;
2553 else if (newnice
< FSS_NICE_MIN
)
2554 newnice
= FSS_NICE_MIN
;
2556 fssparms
.fss_uprilim
= fssparms
.fss_upri
=
2557 -((newnice
- NZERO
) * fss_maxupri
) / NZERO
;
2560 * Reset the uprilim and upri values of the thread.
2562 (void) fss_parmsset(t
, (void *)&fssparms
, (id_t
)0, (cred_t
*)NULL
);
2565 * Although fss_parmsset already reset fss_nice it may not have been
2566 * set to precisely the value calculated above because fss_parmsset
2567 * determines the nice value from the user priority and we may have
2568 * truncated during the integer conversion from nice value to user
2569 * priority and back. We reset fss_nice to the value we calculated
2572 fssproc
->fss_nice
= (char)newnice
;
2575 *retvalp
= newnice
- NZERO
;
2580 * Increment the priority of the specified thread by incr and
2581 * return the new value in *retvalp.
2584 fss_doprio(kthread_t
*t
, cred_t
*cr
, int incr
, int *retvalp
)
2587 fssproc_t
*fssproc
= FSSPROC(t
);
2588 fssparms_t fssparms
;
2591 * If there is no change to priority, just return current setting.
2594 *retvalp
= fssproc
->fss_upri
;
2598 newpri
= fssproc
->fss_upri
+ incr
;
2599 if (newpri
> fss_maxupri
|| newpri
< -fss_maxupri
)
2603 fssparms
.fss_uprilim
= fssparms
.fss_upri
= newpri
;
2606 * Reset the uprilim and upri values of the thread.
2608 return (fss_parmsset(t
, &fssparms
, (id_t
)0, cr
));
2612 * Return the global scheduling priority that would be assigned to a thread
2613 * entering the fair-sharing class with the fss_upri.
2617 fss_globpri(kthread_t
*t
)
2619 ASSERT(MUTEX_HELD(&ttoproc(t
)->p_lock
));
2621 return (fss_maxumdpri
/ 2);
2625 * Called from the yield(2) system call when a thread is yielding (surrendering)
2626 * the processor. The kernel thread is placed at the back of a dispatch queue.
2629 fss_yield(kthread_t
*t
)
2631 fssproc_t
*fssproc
= FSSPROC(t
);
2633 ASSERT(t
== curthread
);
2634 ASSERT(THREAD_LOCK_HELD(t
));
2637 * Collect CPU usage spent before yielding
2639 (void) CPUCAPS_CHARGE(t
, &fssproc
->fss_caps
, CPUCAPS_CHARGE_ENFORCE
);
2642 * Clear the preemption control "yield" bit since the user is
2646 schedctl_set_yield(t
, 0);
2648 * If fss_preempt() artifically increased the thread's priority
2649 * to avoid preemption, restore the original priority now.
2651 if (fssproc
->fss_flags
& FSSRESTORE
) {
2652 THREAD_CHANGE_PRI(t
, fssproc
->fss_scpri
);
2653 fssproc
->fss_flags
&= ~FSSRESTORE
;
2655 if (fssproc
->fss_timeleft
< 0) {
2657 * Time slice was artificially extended to avoid preemption,
2658 * so pretend we're preempting it now.
2660 DTRACE_SCHED1(schedctl__yield
, int, -fssproc
->fss_timeleft
);
2661 fssproc
->fss_timeleft
= fss_quantum
;
2663 fssproc
->fss_flags
&= ~FSSBACKQ
;
2668 fss_changeproj(kthread_t
*t
, void *kp
, void *zp
, fssbuf_t
*projbuf
,
2671 kproject_t
*kpj_new
= kp
;
2673 fssproj_t
*fssproj_old
, *fssproj_new
;
2675 kproject_t
*kpj_old
;
2677 fsszone_t
*fsszone_old
, *fsszone_new
;
2681 ASSERT(MUTEX_HELD(&cpu_lock
));
2682 ASSERT(MUTEX_HELD(&pidlock
));
2683 ASSERT(MUTEX_HELD(&ttoproc(t
)->p_lock
));
2685 if (t
->t_cid
!= fss_cid
)
2688 fssproc
= FSSPROC(t
);
2689 mutex_enter(&fsspsets_lock
);
2690 fssproj_old
= FSSPROC2FSSPROJ(fssproc
);
2691 if (fssproj_old
== NULL
) {
2692 mutex_exit(&fsspsets_lock
);
2696 fsspset
= FSSPROJ2FSSPSET(fssproj_old
);
2697 mutex_enter(&fsspset
->fssps_lock
);
2698 kpj_old
= FSSPROJ2KPROJ(fssproj_old
);
2699 fsszone_old
= fssproj_old
->fssp_fsszone
;
2701 ASSERT(t
->t_cpupart
== fsspset
->fssps_cpupart
);
2703 if (kpj_old
== kpj_new
) {
2704 mutex_exit(&fsspset
->fssps_lock
);
2705 mutex_exit(&fsspsets_lock
);
2709 if ((fsszone_new
= fss_find_fsszone(fsspset
, zone
)) == NULL
) {
2711 * If the zone for the new project is not currently active on
2712 * the cpu partition we're on, get one of the pre-allocated
2713 * buffers and link it in our per-pset zone list. Such buffers
2714 * should already exist.
2716 for (id
= 0; id
< zonebuf
->fssb_size
; id
++) {
2717 if ((fsszone_new
= zonebuf
->fssb_list
[id
]) != NULL
) {
2718 fss_insert_fsszone(fsspset
, zone
, fsszone_new
);
2719 zonebuf
->fssb_list
[id
] = NULL
;
2724 ASSERT(fsszone_new
!= NULL
);
2725 if ((fssproj_new
= fss_find_fssproj(fsspset
, kpj_new
)) == NULL
) {
2727 * If our new project is not currently running
2728 * on the cpu partition we're on, get one of the
2729 * pre-allocated buffers and link it in our new cpu
2730 * partition doubly linked list. Such buffers should already
2733 for (id
= 0; id
< projbuf
->fssb_size
; id
++) {
2734 if ((fssproj_new
= projbuf
->fssb_list
[id
]) != NULL
) {
2735 fss_insert_fssproj(fsspset
, kpj_new
,
2736 fsszone_new
, fssproj_new
);
2737 projbuf
->fssb_list
[id
] = NULL
;
2742 ASSERT(fssproj_new
!= NULL
);
2745 if (t
->t_state
== TS_RUN
|| t
->t_state
== TS_ONPROC
||
2746 t
->t_state
== TS_WAIT
)
2748 ASSERT(fssproj_old
->fssp_threads
> 0);
2749 if (--fssproj_old
->fssp_threads
== 0) {
2750 fss_remove_fssproj(fsspset
, fssproj_old
);
2753 fssproc
->fss_proj
= fssproj_new
;
2754 fssproc
->fss_fsspri
= 0;
2755 fssproj_new
->fssp_threads
++;
2756 if (t
->t_state
== TS_RUN
|| t
->t_state
== TS_ONPROC
||
2757 t
->t_state
== TS_WAIT
)
2761 if (fsszone_old
->fssz_nproj
== 0)
2762 kmem_free(fsszone_old
, sizeof (fsszone_t
));
2763 kmem_free(fssproj_old
, sizeof (fssproj_t
));
2766 mutex_exit(&fsspset
->fssps_lock
);
2767 mutex_exit(&fsspsets_lock
);
2771 fss_changepset(kthread_t
*t
, void *newcp
, fssbuf_t
*projbuf
,
2774 fsspset_t
*fsspset_old
, *fsspset_new
;
2775 fssproj_t
*fssproj_old
, *fssproj_new
;
2776 fsszone_t
*fsszone_old
, *fsszone_new
;
2782 ASSERT(MUTEX_HELD(&cpu_lock
));
2783 ASSERT(MUTEX_HELD(&pidlock
));
2784 ASSERT(MUTEX_HELD(&ttoproc(t
)->p_lock
));
2786 if (t
->t_cid
!= fss_cid
)
2789 fssproc
= FSSPROC(t
);
2790 zone
= ttoproc(t
)->p_zone
;
2791 mutex_enter(&fsspsets_lock
);
2792 fssproj_old
= FSSPROC2FSSPROJ(fssproc
);
2793 if (fssproj_old
== NULL
) {
2794 mutex_exit(&fsspsets_lock
);
2797 fsszone_old
= fssproj_old
->fssp_fsszone
;
2798 fsspset_old
= FSSPROJ2FSSPSET(fssproj_old
);
2799 kpj
= FSSPROJ2KPROJ(fssproj_old
);
2801 if (fsspset_old
->fssps_cpupart
== newcp
) {
2802 mutex_exit(&fsspsets_lock
);
2806 ASSERT(ttoproj(t
) == kpj
);
2808 fsspset_new
= fss_find_fsspset(newcp
);
2810 mutex_enter(&fsspset_new
->fssps_lock
);
2811 if ((fsszone_new
= fss_find_fsszone(fsspset_new
, zone
)) == NULL
) {
2812 for (id
= 0; id
< zonebuf
->fssb_size
; id
++) {
2813 if ((fsszone_new
= zonebuf
->fssb_list
[id
]) != NULL
) {
2814 fss_insert_fsszone(fsspset_new
, zone
,
2816 zonebuf
->fssb_list
[id
] = NULL
;
2821 ASSERT(fsszone_new
!= NULL
);
2822 if ((fssproj_new
= fss_find_fssproj(fsspset_new
, kpj
)) == NULL
) {
2823 for (id
= 0; id
< projbuf
->fssb_size
; id
++) {
2824 if ((fssproj_new
= projbuf
->fssb_list
[id
]) != NULL
) {
2825 fss_insert_fssproj(fsspset_new
, kpj
,
2826 fsszone_new
, fssproj_new
);
2827 projbuf
->fssb_list
[id
] = NULL
;
2832 ASSERT(fssproj_new
!= NULL
);
2834 fssproj_new
->fssp_threads
++;
2836 if (t
->t_state
== TS_RUN
|| t
->t_state
== TS_ONPROC
||
2837 t
->t_state
== TS_WAIT
)
2839 fssproc
->fss_proj
= fssproj_new
;
2840 fssproc
->fss_fsspri
= 0;
2841 if (t
->t_state
== TS_RUN
|| t
->t_state
== TS_ONPROC
||
2842 t
->t_state
== TS_WAIT
)
2845 mutex_exit(&fsspset_new
->fssps_lock
);
2847 mutex_enter(&fsspset_old
->fssps_lock
);
2848 if (--fssproj_old
->fssp_threads
== 0) {
2849 fss_remove_fssproj(fsspset_old
, fssproj_old
);
2850 if (fsszone_old
->fssz_nproj
== 0)
2851 kmem_free(fsszone_old
, sizeof (fsszone_t
));
2852 kmem_free(fssproj_old
, sizeof (fssproj_t
));
2854 mutex_exit(&fsspset_old
->fssps_lock
);
2856 mutex_exit(&fsspsets_lock
);