4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
28 #include <sys/param.h>
29 #include <sys/systm.h>
30 #include <sys/sysmacros.h>
31 #include <sys/atomic.h>
32 #include <sys/cpucaps_impl.h>
33 #include <sys/dtrace.h>
35 #include <sys/debug.h>
37 #include <sys/errno.h>
40 * CPU Caps implementation
41 * =======================
43 * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU
44 * usage for all projects running inside the zone. If the zone CPU cap is set
45 * below the project CPU cap, the latter will have no effect.
47 * When CPU usage of projects and/or zones reaches specified caps, threads in
48 * them do not get scheduled and instead are placed on wait queues associated
49 * with a cap. Such threads will start running again only when CPU usage drops
50 * below the cap level. Each zone and each project has its own wait queue.
52 * When CPU cap is set, the kernel continuously keeps track of CPU time used by
53 * capped zones and/or projects over a short time interval and calculates their
54 * current CPU usage as a percentage. When the accumulated usage reaches the CPU
55 * cap, LWPs running in the user-land (when they are not holding any critical
56 * kernel locks) are placed on special wait queues until their project's or
57 * zone's CPU usage drops below the cap.
59 * The system maintains a list of all capped projects and all capped zones. On
60 * every clock tick every active thread belonging to a capped project adds its
61 * CPU usage to its project. Usage from all projects belonging to a capped zone
62 * is aggregated to get the zone usage.
64 * When the current CPU usage is above the cap, a project or zone is considered
65 * over-capped. Every user thread caught running in an over-capped project or
66 * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and
67 * is requested to surrender its CPU. This causes scheduling class specific
68 * CL_PREEMPT() callback to be invoked. The callback function places threads
69 * marked as TS_PROJWAITQ on a wait queue and calls swtch().
71 * Threads are only placed on wait queues after trapping from user-land
72 * (they could be holding some user locks, but no kernel locks) and while
73 * returning from the trap back to the user-land when no kernel locks are held.
74 * Putting threads on wait queues in random places while running in the
75 * kernel might lead to all kinds of locking problems.
80 * Accounting of CPU usage is based on per-thread micro-state accounting data.
81 * On every clock tick clock() adds new on-CPU time for every thread found on
82 * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
83 * New times means time since it was last accounted for. On-CPU times greater
84 * than 1 tick are truncated to 1 tick.
86 * Project CPU usage is aggregated from all threads within the project.
87 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
88 * CPU usage is calculated on every clock tick by walking list of projects and
89 * adding their usage together.
94 * CPU usage is decayed by the caps_update() routine which is called once
95 * every clock tick. It walks lists of project caps and decays their usages by
96 * one per cent. If CPU usage drops below cap levels, threads on the wait queue
97 * are made runnable again, one thread per clock tick.
102 * The CPU Caps facility provides the following interfaces to the rest of the
105 * cpucaps_project_add(kproject_t *)
107 * Notifies the framework of a new project. It should be put on the
108 * capped_projects list if its zone has a cap.
110 * cpucaps_project_remove(kproject_t *)
112 * Remove the association between the specified project and its cap.
113 * Called right before the project is destroyed.
115 * cpucaps_project_set(kproject_t *, rctl_qty_t)
117 * Set project cap of the specified project to the specified value. Setting the
118 * value to NOCAP is equivalent to removing the cap.
120 * cpucaps_zone_set(zone_t *, rctl_qty_t)
122 * Set zone cap of the specified zone to the specified value. Setting the value
123 * to NOCAP is equivalent to removing the cap.
125 * cpucaps_zone_remove(zone_t *)
127 * Remove the association between the zone and its cap.
129 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
131 * Charges specified thread's project the amount of on-CPU time that it used.
132 * If the third argument is CPUCAPS_CHARGE_ONLY returns False.
133 * Otherwise returns True if project or zone should be penalized because its
134 * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ
135 * bits in t_schedflag in this case.
137 * CPUCAPS_ENFORCE(kthread_id_t *)
139 * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER
140 * state on project or zone wait queues, as requested by TS_PROJWAITQ or
141 * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a
142 * wait queue or False otherwise.
144 * cpucaps_sc_init(caps_sc_t *)
146 * Initializes the scheduling-class specific CPU Caps data for a thread.
151 * all the individual caps structures and their lists are protected by a global
152 * caps_lock mutex. The lock is grabbed either by clock() or by events modifying
153 * caps, so it is usually uncontended. We avoid all blocking memory allocations
154 * while holding caps_lock to prevent clock() from blocking.
156 * Thread state is protected by the thread lock. It protects the association
157 * between a thread and its project and, as a consequence, to its zone. The
158 * association can not break while thread lock is held, so the project or zone
159 * cap are not going to disappear while thread lock is held.
161 * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is
162 * grabbed by scheduling classes already holding thread lock at high PIL and by
163 * clock thread performing usage decay. We should do as little work as possible
164 * while holding the lock since it may be very hot. All threads in the project
165 * contend for the same cache line doing cap usage updates.
169 * caps_lock protects list of capped projects and zones, changes in the cap
170 * state and changes of the global cpucaps_enabled flag.
172 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
173 * modified in parallel. This can be per-zone cap flag, but we don't keep any
176 static kmutex_t caps_lock
; /* lock to protect: */
177 static list_t capped_zones
; /* - list of zones with caps */
178 static list_t capped_projects
; /* - list of projects with caps */
179 boolean_t cpucaps_enabled
; /* - are there any caps defined? */
180 boolean_t cpucaps_busy
; /* - is framework busy? */
183 * The accounting is based on the number of nanoseconds threads spend running
184 * during a tick which is kept in the cap_tick_cost variable.
186 static hrtime_t cap_tick_cost
;
189 * How much of the usage value is decayed every clock tick
190 * Decay one per cent of value per tick
192 #define CAP_DECAY_FACTOR 100
195 * Scale the value and round it to the closest integer value
197 #define ROUND_SCALE(x, y) (((x) + (y) / 2) / (y))
199 static void caps_update();
205 kstat_named_t cap_value
;
206 kstat_named_t cap_usage
;
207 kstat_named_t cap_nwait
;
208 kstat_named_t cap_below
;
209 kstat_named_t cap_above
;
210 kstat_named_t cap_maxusage
;
211 kstat_named_t cap_zonename
;
213 { "value", KSTAT_DATA_UINT64
},
214 { "usage", KSTAT_DATA_UINT64
},
215 { "nwait", KSTAT_DATA_UINT64
},
216 { "below_sec", KSTAT_DATA_UINT64
},
217 { "above_sec", KSTAT_DATA_UINT64
},
218 { "maxusage", KSTAT_DATA_UINT64
},
219 { "zonename", KSTAT_DATA_STRING
},
223 static kmutex_t cap_kstat_lock
;
224 static int cap_kstat_update(kstat_t
*, int);
227 * Initialize CPU caps infrastructure.
228 * - Initialize lists of capped zones and capped projects
229 * - Set cpucaps_clock_callout to NULL
235 * Initialize global variables
237 cap_tick_cost
= TICK_TO_NSEC((hrtime_t
)1);
239 list_create(&capped_zones
, sizeof (cpucap_t
),
240 offsetof(cpucap_t
, cap_link
));
241 list_create(&capped_projects
, sizeof (cpucap_t
),
242 offsetof(cpucap_t
, cap_link
));
244 cpucaps_enabled
= B_FALSE
;
245 cpucaps_busy
= B_FALSE
;
246 cpucaps_clock_callout
= NULL
;
250 * Initialize scheduling-class specific CPU Caps data.
253 cpucaps_sc_init(caps_sc_t
*csc
)
255 csc
->csc_cputime
= 0;
259 * Allocate and initialize cpucap structure
264 cpucap_t
*cap
= kmem_zalloc(sizeof (cpucap_t
), KM_SLEEP
);
266 DISP_LOCK_INIT(&cap
->cap_usagelock
);
267 waitq_init(&cap
->cap_waitq
);
273 * Free cpucap structure
276 cap_free(cpucap_t
*cap
)
282 * This cap should not be active
284 ASSERT(!list_link_active(&cap
->cap_link
));
285 ASSERT(cap
->cap_value
== 0);
286 ASSERT(!DISP_LOCK_HELD(&cap
->cap_usagelock
));
288 waitq_fini(&cap
->cap_waitq
);
289 DISP_LOCK_DESTROY(&cap
->cap_usagelock
);
291 kmem_free(cap
, sizeof (cpucap_t
));
295 * Activate cap - insert into active list and unblock its
296 * wait queue. Should be called with caps_lock held.
297 * The cap_value field is set to the value supplied.
300 cap_enable(list_t
*l
, cpucap_t
*cap
, hrtime_t value
)
302 ASSERT(MUTEX_HELD(&caps_lock
));
305 * Cap can not be already enabled
307 ASSERT(!CAP_ENABLED(cap
));
308 ASSERT(!list_link_active(&cap
->cap_link
));
310 list_insert_tail(l
, cap
);
311 cap
->cap_below
= cap
->cap_above
= 0;
312 cap
->cap_maxusage
= 0;
314 cap
->cap_value
= value
;
315 waitq_unblock(&cap
->cap_waitq
);
317 cpucaps_enabled
= B_TRUE
;
318 cpucaps_clock_callout
= caps_update
;
324 * - Block its wait queue. This prevents any new threads from being
325 * enqueued there and moves all enqueued threads to the run queue.
326 * - Remove cap from list l.
327 * - Disable CPU caps globally if there are no capped projects or zones
329 * Should be called with caps_lock held.
332 cap_disable(list_t
*l
, cpucap_t
*cap
)
334 ASSERT(MUTEX_HELD(&caps_lock
));
336 * Cap should be currently active
338 ASSERT(CPUCAPS_ON());
339 ASSERT(list_link_active(&cap
->cap_link
));
340 ASSERT(CAP_ENABLED(cap
));
342 waitq_block(&cap
->cap_waitq
);
344 if (list_is_empty(&capped_projects
) && list_is_empty(&capped_zones
)) {
345 cpucaps_enabled
= B_FALSE
;
346 cpucaps_clock_callout
= NULL
;
349 cap
->cap_project
= NULL
;
350 cap
->cap_zone
= NULL
;
351 if (cap
->cap_kstat
!= NULL
) {
352 kstat_delete(cap
->cap_kstat
);
353 cap
->cap_kstat
= NULL
;
359 * Enable cap for a project kpj
360 * It is safe to enable already enabled project cap.
361 * Should be called with caps_lock held.
364 cap_project_enable(kproject_t
*kpj
, hrtime_t value
)
366 cpucap_t
*cap
= kpj
->kpj_cpucap
;
368 ASSERT(MUTEX_HELD(&caps_lock
));
371 if (CAP_DISABLED(cap
)) {
372 ASSERT(cap
->cap_kstat
== NULL
);
373 cap_enable(&capped_projects
, cap
, value
);
374 cap
->cap_project
= kpj
;
375 cap
->cap_zone
= kpj
->kpj_zone
;
380 if ((cap
->cap_kstat
= rctl_kstat_create_project(kpj
, "cpucaps",
382 sizeof (cap_kstat
) / sizeof (kstat_named_t
),
383 KSTAT_FLAG_VIRTUAL
)) != NULL
) {
384 cap
->cap_kstat
->ks_data_size
+=
385 strlen(cap
->cap_zone
->zone_name
) + 1;
386 cap
->cap_kstat
->ks_lock
= &cap_kstat_lock
;
387 cap
->cap_kstat
->ks_data
= &cap_kstat
;
388 cap
->cap_kstat
->ks_update
= cap_kstat_update
;
389 cap
->cap_kstat
->ks_private
= cap
;
390 kstat_install(cap
->cap_kstat
);
396 * Disable project cap.
397 * It is safe to disable already disabled project cap.
398 * Should be called with caps_lock held.
401 cap_project_disable(kproject_t
*kpj
)
403 cpucap_t
*cap
= kpj
->kpj_cpucap
;
405 ASSERT(MUTEX_HELD(&caps_lock
));
407 ASSERT(cap
->cap_project
== kpj
);
409 if (CAP_ENABLED(cap
))
410 cap_disable(&capped_projects
, cap
);
414 * Enable cap for a zone
415 * It is safe to enable already enabled zone cap.
416 * Should be called with caps_lock held.
419 cap_zone_enable(zone_t
*zone
, hrtime_t value
)
421 cpucap_t
*cap
= zone
->zone_cpucap
;
423 ASSERT(MUTEX_HELD(&caps_lock
));
426 if (CAP_DISABLED(cap
)) {
427 ASSERT(cap
->cap_kstat
== NULL
);
428 cap_enable(&capped_zones
, cap
, value
);
429 cap
->cap_zone
= zone
;
434 if ((cap
->cap_kstat
= rctl_kstat_create_zone(zone
, "cpucaps",
436 sizeof (cap_kstat
) / sizeof (kstat_named_t
),
437 KSTAT_FLAG_VIRTUAL
)) != NULL
) {
438 cap
->cap_kstat
->ks_data_size
+=
439 strlen(cap
->cap_zone
->zone_name
) + 1;
440 cap
->cap_kstat
->ks_lock
= &cap_kstat_lock
;
441 cap
->cap_kstat
->ks_data
= &cap_kstat
;
442 cap
->cap_kstat
->ks_update
= cap_kstat_update
;
443 cap
->cap_kstat
->ks_private
= cap
;
444 kstat_install(cap
->cap_kstat
);
451 * It is safe to disable already disabled zone cap.
452 * Should be called with caps_lock held.
455 cap_zone_disable(zone_t
*zone
)
457 cpucap_t
*cap
= zone
->zone_cpucap
;
459 ASSERT(MUTEX_HELD(&caps_lock
));
461 ASSERT(cap
->cap_zone
== zone
);
463 if (CAP_ENABLED(cap
))
464 cap_disable(&capped_zones
, cap
);
468 * Apply specified callback to all caps contained in the list `l'.
471 cap_walk(list_t
*l
, void (*cb
)(cpucap_t
*, int64_t))
473 static uint64_t cpucap_walk_gen
;
476 ASSERT(MUTEX_HELD(&caps_lock
));
478 for (cap
= list_head(l
); cap
!= NULL
; cap
= list_next(l
, cap
)) {
479 (*cb
)(cap
, cpucap_walk_gen
);
482 atomic_inc_64(&cpucap_walk_gen
);
486 * If cap limit is not reached, make one thread from wait queue runnable.
487 * The waitq_isempty check is performed without the waitq lock. If a new thread
488 * is placed on the waitq right after the check, it will be picked up during the
489 * next invocation of cap_poke_waitq().
493 cap_poke_waitq(cpucap_t
*cap
, int64_t gen
)
495 ASSERT(MUTEX_HELD(&caps_lock
));
497 if (cap
->cap_usage
>= cap
->cap_value
) {
500 waitq_t
*wq
= &cap
->cap_waitq
;
504 if (!waitq_isempty(wq
))
510 * The callback function called for every cap on capped_projects list.
511 * Decay cap usage by CAP_DECAY_FACTOR
512 * Add this cap project usage to its zone usage.
513 * Kick off a thread from the cap waitq if cap is not reached.
516 cap_project_usage_walker(cpucap_t
*cap
, int64_t gen
)
518 zone_t
*zone
= cap
->cap_zone
;
519 hrtime_t cap_usage
= cap
->cap_usage
;
521 ASSERT(MUTEX_HELD(&caps_lock
));
522 ASSERT(cap
->cap_project
->kpj_cpucap
== cap
);
523 ASSERT(zone
== cap
->cap_project
->kpj_zone
);
524 ASSERT(CAP_ENABLED(cap
));
527 * Set or clear the CAP_REACHED flag based on the current usage.
528 * Only projects having their own caps are ever marked as CAP_REACHED.
530 cap_poke_waitq(cap
, 0);
533 * Add project's CPU usage to our zone's CPU usage.
535 if (ZONE_IS_CAPPED(zone
)) {
536 cpucap_t
*zcap
= zone
->zone_cpucap
;
538 ASSERT(zcap
->cap_zone
== zone
);
541 * If we haven't reset this zone's usage during this clock tick
542 * yet, then do it now. The cap_gen field is used to check
543 * whether this is the first zone's project we see during this
544 * tick or a subsequent one.
546 if (zcap
->cap_gen
!= gen
) {
547 if (zcap
->cap_usage
> zcap
->cap_maxusage
)
548 zcap
->cap_maxusage
= zcap
->cap_usage
;
552 DTRACE_PROBE2(cpucaps__zusage
, cpucap_t
*, zcap
,
553 hrtime_t
, cap_usage
);
554 zcap
->cap_usage
+= cap_usage
;
555 /* Check for overflows */
556 if (zcap
->cap_usage
< 0)
557 zcap
->cap_usage
= MAX_USAGE
- 1;
561 * Decay project usage.
563 disp_lock_enter(&cap
->cap_usagelock
);
564 cap
->cap_usage
-= ROUND_SCALE(cap_usage
, CAP_DECAY_FACTOR
);
565 disp_lock_exit(&cap
->cap_usagelock
);
569 * On every clock tick walk the list of project caps and update the CPU usage.
570 * Also walk the list of zone caps checking whether any threads should
571 * transition from wait queue to run queue.
573 * This function gets called by the clock thread directly when there are any
574 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
575 * caps_lock for long periods of time, so there should be almost no contention
581 mutex_enter(&caps_lock
);
582 cap_walk(&capped_projects
, cap_project_usage_walker
);
583 cap_walk(&capped_zones
, cap_poke_waitq
);
584 mutex_exit(&caps_lock
);
588 * The function is called for each project in a zone when the zone cap is
589 * modified. It enables project caps if zone cap is enabled and disables if the
590 * zone cap is disabled and project doesn't have its own cap.
592 * For each project that does not have cpucap structure allocated it allocates a
593 * new structure and assigns to kpj->cpu_cap. The allocation is performed
594 * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock
598 cap_project_zone_modify_walker(kproject_t
*kpj
, void *arg
)
600 cpucap_t
*project_cap
= NULL
;
601 cpucap_t
*zone_cap
= (cpucap_t
*)arg
;
603 ASSERT(zone_cap
!= NULL
);
605 if (kpj
->kpj_cpucap
== NULL
) {
607 * This is the first time any cap was established for this
608 * project. Allocate a new cpucap structure for it.
610 project_cap
= cap_alloc();
613 mutex_enter(&caps_lock
);
616 * Double-check that kpj_cpucap is still NULL - now with caps_lock held
617 * and assign the newly allocated cpucap structure to it.
619 if (kpj
->kpj_cpucap
== NULL
) {
620 kpj
->kpj_cpucap
= project_cap
;
621 } else if (project_cap
!= NULL
) {
622 cap_free(project_cap
);
625 project_cap
= kpj
->kpj_cpucap
;
627 if (CAP_DISABLED(zone_cap
)) {
629 * Remove all projects in this zone without caps
630 * from the capped_projects list.
632 if (project_cap
->cap_value
== MAX_USAGE
) {
633 cap_project_disable(kpj
);
635 } else if (CAP_DISABLED(project_cap
)) {
637 * Add the project to capped_projects list.
639 ASSERT(project_cap
->cap_value
== 0);
640 cap_project_enable(kpj
, MAX_USAGE
);
642 mutex_exit(&caps_lock
);
648 * Set zone cap to cap_val
649 * If cap_val is equal to NOCAP, disable zone cap.
651 * If this is the first time a cap is set on a zone, allocate cpucap structure
652 * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held.
655 cpucaps_zone_set(zone_t
*zone
, rctl_qty_t cap_val
)
657 cpucap_t
*cap
= NULL
;
663 ASSERT(cap_val
<= MAXCAP
);
664 if (cap_val
> MAXCAP
)
668 * Nothing to do if trying to disable a cap on a zone when caps are off
669 * or a zone which does not have a cap yet.
671 if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone
)) && (cap_val
== NOCAP
))
674 if (zone
->zone_cpucap
== NULL
)
677 mutex_enter(&caps_lock
);
680 mutex_exit(&caps_lock
);
685 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
686 * held. If it is still NULL, assign a newly allocated cpucap to it.
688 if (zone
->zone_cpucap
== NULL
) {
689 zone
->zone_cpucap
= cap
;
690 } else if (cap
!= NULL
) {
694 cap
= zone
->zone_cpucap
;
695 value
= cap_val
* cap_tick_cost
;
699 /* Nothing to do if the value is staying the same */
700 if (value
== cap
->cap_value
) {
701 mutex_exit(&caps_lock
);
706 * Clear cap statistics since the cap value itself changes.
708 cap
->cap_above
= cap
->cap_below
= 0;
711 if (cap_val
== NOCAP
) {
712 if (CAP_ENABLED(cap
)) {
714 * Remove cap for the zone
716 cap_zone_disable(zone
);
717 cpucaps_busy
= B_TRUE
;
718 mutex_exit(&caps_lock
);
720 * Disable caps for all project belonging to this zone
721 * unless they have their own cap.
723 (void) project_walk_all(zone
->zone_id
,
724 cap_project_zone_modify_walker
, cap
);
726 mutex_enter(&caps_lock
);
727 cpucaps_busy
= B_FALSE
;
729 } else if (CAP_DISABLED(cap
)) {
731 * Set a cap on a zone which previously was not capped.
733 cap_zone_enable(zone
, value
);
734 cpucaps_busy
= B_TRUE
;
735 mutex_exit(&caps_lock
);
738 * Enable cap for all projects belonging to this zone.
740 (void) project_walk_all(zone
->zone_id
,
741 cap_project_zone_modify_walker
, cap
);
743 mutex_enter(&caps_lock
);
744 cpucaps_busy
= B_FALSE
;
747 * No state transitions, just change the value
749 cap
->cap_value
= value
;
752 ASSERT(MUTEX_HELD(&caps_lock
));
753 ASSERT(!cpucaps_busy
);
754 mutex_exit(&caps_lock
);
760 * The project is going away so disable its cap.
763 cpucaps_project_remove(kproject_t
*kpj
)
765 mutex_enter(&caps_lock
);
766 if (PROJECT_IS_CAPPED(kpj
))
767 cap_project_disable(kpj
);
768 if (kpj
->kpj_cpucap
!= NULL
) {
769 cap_free(kpj
->kpj_cpucap
);
770 kpj
->kpj_cpucap
= NULL
;
772 mutex_exit(&caps_lock
);
776 * The zone is going away, so disable its cap.
779 cpucaps_zone_remove(zone_t
*zone
)
781 mutex_enter(&caps_lock
);
782 while (ZONE_IS_CAPPED(zone
)) {
783 mutex_exit(&caps_lock
);
784 (void) cpucaps_zone_set(zone
, NOCAP
);
785 mutex_enter(&caps_lock
);
787 if (zone
->zone_cpucap
!= NULL
) {
788 cap_free(zone
->zone_cpucap
);
789 zone
->zone_cpucap
= NULL
;
791 mutex_exit(&caps_lock
);
795 * New project was created. It should be put on the capped_projects list if
796 * its zone has a cap.
799 cpucaps_project_add(kproject_t
*kpj
)
801 cpucap_t
*cap
= NULL
;
803 if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj
->kpj_zone
))
807 * This project was never capped before, so allocate its cap structure.
809 if (kpj
->kpj_cpucap
== NULL
)
812 mutex_enter(&caps_lock
);
814 * Double-check with caps_lock held
816 if (kpj
->kpj_cpucap
== NULL
) {
817 kpj
->kpj_cpucap
= cap
;
818 } else if (cap
!= NULL
) {
822 if (ZONE_IS_CAPPED(kpj
->kpj_zone
))
823 cap_project_enable(kpj
, MAX_USAGE
);
825 mutex_exit(&caps_lock
);
829 * Set project cap to cap_val
830 * If cap_val is equal to NOCAP, disable project cap.
832 * If this is the first time a cap is set on a project, allocate cpucap
833 * structure without holding caps_lock to avoid KM_SLEEP allocation with
837 cpucaps_project_set(kproject_t
*kpj
, rctl_qty_t cap_val
)
839 cpucap_t
*cap
= NULL
;
845 ASSERT(cap_val
<= MAXCAP
);
846 if (cap_val
> MAXCAP
)
850 * Nothing to do if trying to disable project cap and caps are not
851 * enabled or if trying to disable cap on a project that does not have
854 if ((cap_val
== NOCAP
) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj
)))
857 if (kpj
->kpj_cpucap
== NULL
) {
859 * This project was never capped before, so allocate its cap
865 mutex_enter(&caps_lock
);
868 * Double-check with caps_lock held.
870 if (kpj
->kpj_cpucap
== NULL
) {
871 kpj
->kpj_cpucap
= cap
;
872 } else if (cap
!= NULL
) {
877 * Get the actual pointer to the project cap.
879 cap
= kpj
->kpj_cpucap
;
880 value
= cap_val
* cap_tick_cost
;
885 * Nothing to do if the value is not changing
887 if (value
== cap
->cap_value
) {
888 mutex_exit(&caps_lock
);
893 * Clear cap statistics since the cap value itself changes.
895 cap
->cap_above
= cap
->cap_below
= 0;
896 cap
->cap_maxusage
= 0;
898 if (cap_val
!= NOCAP
) {
900 * Enable this cap if it is not already enabled.
902 if (CAP_DISABLED(cap
))
903 cap_project_enable(kpj
, value
);
905 cap
->cap_value
= value
;
906 } else if (CAP_ENABLED(cap
)) {
908 * User requested to drop a cap on the project. If it is part of
909 * capped zone, keep the cap and set the value to MAX_USAGE,
910 * otherwise disable the cap.
912 if (ZONE_IS_CAPPED(kpj
->kpj_zone
)) {
913 cap
->cap_value
= MAX_USAGE
;
915 cap_project_disable(kpj
);
918 mutex_exit(&caps_lock
);
927 cap_get(cpucap_t
*cap
)
929 return (cap
!= NULL
? (rctl_qty_t
)(cap
->cap_usage
/ cap_tick_cost
) : 0);
933 * Get current project usage.
936 cpucaps_project_get(kproject_t
*kpj
)
938 return (cap_get(kpj
->kpj_cpucap
));
942 * Get current zone usage.
945 cpucaps_zone_get(zone_t
*zone
)
947 return (cap_get(zone
->zone_cpucap
));
951 * Charge project of thread t the time thread t spent on CPU since previously
954 * Record the current on-CPU time in the csc structure.
956 * Do not adjust for more than one tick worth of time.
958 * It is possible that the project cap is being disabled while this routine is
959 * executed. This should not cause any issues since the association between the
960 * thread and its project is protected by thread lock.
963 caps_charge_adjust(kthread_id_t t
, caps_sc_t
*csc
)
965 kproject_t
*kpj
= ttoproj(t
);
967 hrtime_t usage_delta
;
969 ASSERT(THREAD_LOCK_HELD(t
));
970 ASSERT(kpj
->kpj_cpucap
!= NULL
);
972 /* Get on-CPU time since birth of a thread */
973 new_usage
= mstate_thread_onproc_time(t
);
975 /* Time spent on CPU since last checked */
976 usage_delta
= new_usage
- csc
->csc_cputime
;
978 /* Save the accumulated on-CPU time */
979 csc
->csc_cputime
= new_usage
;
981 /* Charge at most one tick worth of on-CPU time */
982 if (usage_delta
> cap_tick_cost
)
983 usage_delta
= cap_tick_cost
;
985 /* Add usage_delta to the project usage value. */
986 if (usage_delta
> 0) {
987 cpucap_t
*cap
= kpj
->kpj_cpucap
;
989 DTRACE_PROBE2(cpucaps__project__charge
,
990 kthread_id_t
, t
, hrtime_t
, usage_delta
);
992 disp_lock_enter_high(&cap
->cap_usagelock
);
993 cap
->cap_usage
+= usage_delta
;
995 /* Check for overflows */
996 if (cap
->cap_usage
< 0)
997 cap
->cap_usage
= MAX_USAGE
- 1;
999 disp_lock_exit_high(&cap
->cap_usagelock
);
1002 * cap_maxusage is only kept for observability. Move it outside
1003 * the lock to reduce the time spent while holding the lock.
1005 if (cap
->cap_usage
> cap
->cap_maxusage
)
1006 cap
->cap_maxusage
= cap
->cap_usage
;
1011 * Charge thread's project and return True if project or zone should be
1012 * penalized because its project or zone is exceeding its cap. Also sets
1013 * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
1015 * It is possible that the project cap is being disabled while this routine is
1016 * executed. This should not cause any issues since the association between the
1017 * thread and its project is protected by thread lock. It will still set
1018 * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place
1019 * anything on the blocked wait queue.
1023 cpucaps_charge(kthread_id_t t
, caps_sc_t
*csc
, cpucaps_charge_t charge_type
)
1025 kproject_t
*kpj
= ttoproj(t
);
1026 klwp_t
*lwp
= t
->t_lwp
;
1028 cpucap_t
*project_cap
;
1029 boolean_t rc
= B_FALSE
;
1031 ASSERT(THREAD_LOCK_HELD(t
));
1033 /* Nothing to do for projects that are not capped. */
1034 if (lwp
== NULL
|| !PROJECT_IS_CAPPED(kpj
))
1037 caps_charge_adjust(t
, csc
);
1040 * The caller only requested to charge the project usage, no enforcement
1043 if (charge_type
== CPUCAPS_CHARGE_ONLY
)
1046 project_cap
= kpj
->kpj_cpucap
;
1048 if (project_cap
->cap_usage
>= project_cap
->cap_value
) {
1049 t
->t_schedflag
|= TS_PROJWAITQ
;
1051 } else if (t
->t_schedflag
& TS_PROJWAITQ
) {
1052 t
->t_schedflag
&= ~TS_PROJWAITQ
;
1056 if (!ZONE_IS_CAPPED(zone
)) {
1057 if (t
->t_schedflag
& TS_ZONEWAITQ
)
1058 t
->t_schedflag
&= ~TS_ZONEWAITQ
;
1060 cpucap_t
*zone_cap
= zone
->zone_cpucap
;
1062 if (zone_cap
->cap_usage
>= zone_cap
->cap_value
) {
1063 t
->t_schedflag
|= TS_ZONEWAITQ
;
1065 } else if (t
->t_schedflag
& TS_ZONEWAITQ
) {
1066 t
->t_schedflag
&= ~TS_ZONEWAITQ
;
1075 * Enforce CPU caps. If got preempted in the user-land, we know that thread does
1076 * not hold any kernel locks, so enqueue ourselves on the waitq, if needed.
1078 * CPU Caps are only enforced for user threads.
1080 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and
1081 * threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
1083 * It is possible that by the time we enter cpucaps_enforce() the cap is already
1084 * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We
1085 * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer
1089 cpucaps_enforce(kthread_t
*t
)
1091 klwp_t
*lwp
= t
->t_lwp
;
1093 ASSERT(THREAD_LOCK_HELD(t
));
1095 if (lwp
!= NULL
&& lwp
->lwp_state
== LWP_USER
) {
1096 if (t
->t_schedflag
& TS_PROJWAITQ
) {
1097 ASSERT(ttoproj(t
)->kpj_cpucap
!= NULL
);
1098 t
->t_schedflag
&= ~TS_ANYWAITQ
;
1099 if (waitq_enqueue(&(ttoproj(t
)->kpj_cpucap
->cap_waitq
),
1104 if (t
->t_schedflag
& TS_ZONEWAITQ
) {
1105 ASSERT(ttozone(t
)->zone_cpucap
!= NULL
);
1106 t
->t_schedflag
&= ~TS_ZONEWAITQ
;
1107 if (waitq_enqueue(&(ttozone(t
)->zone_cpucap
->cap_waitq
),
1115 * The thread is not enqueued on the wait queue.
1121 * Convert internal cap statistics into values exported by cap kstat.
1124 cap_kstat_update(kstat_t
*ksp
, int rw
)
1126 struct cap_kstat
*capsp
= &cap_kstat
;
1127 cpucap_t
*cap
= ksp
->ks_private
;
1128 clock_t tick_sec
= SEC_TO_TICK(1);
1129 char *zonename
= cap
->cap_zone
->zone_name
;
1131 if (rw
== KSTAT_WRITE
)
1134 capsp
->cap_value
.value
.ui64
=
1135 ROUND_SCALE(cap
->cap_value
, cap_tick_cost
);
1136 capsp
->cap_usage
.value
.ui64
=
1137 ROUND_SCALE(cap
->cap_usage
, cap_tick_cost
);
1138 capsp
->cap_maxusage
.value
.ui64
=
1139 ROUND_SCALE(cap
->cap_maxusage
, cap_tick_cost
);
1140 capsp
->cap_nwait
.value
.ui64
= cap
->cap_waitq
.wq_count
;
1141 capsp
->cap_below
.value
.ui64
= ROUND_SCALE(cap
->cap_below
, tick_sec
);
1142 capsp
->cap_above
.value
.ui64
= ROUND_SCALE(cap
->cap_above
, tick_sec
);
1143 kstat_named_setstr(&capsp
->cap_zonename
, zonename
);