4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
27 #include <sys/pool_impl.h>
28 #include <sys/pool_pset.h>
29 #include <sys/id_space.h>
30 #include <sys/mutex.h>
31 #include <sys/nvpair.h>
32 #include <sys/cpuvar.h>
33 #include <sys/errno.h>
34 #include <sys/cmn_err.h>
35 #include <sys/systm.h>
38 #include <sys/class.h>
39 #include <sys/exacct.h>
40 #include <sys/utsname.h>
41 #include <sys/procset.h>
42 #include <sys/atomic.h>
44 #include <sys/policy.h>
45 #include <sys/schedctl.h>
46 #include <sys/taskq.h>
51 * The resource pools facility brings together process-bindable resources into
52 * a common abstraction called a pool. Processor sets and other entities can
53 * be configured, grouped, and labelled such that workload components can be
54 * associated with a subset of a system's total resources.
56 * When disabled, the pools facility is "invisible". All processes belong
57 * to the same pool (pool_default), and processor sets can be managed through
58 * the old pset() system call. When enabled, processor sets can only be
59 * managed via the pools facility. New pools can be created and associated
60 * with processor sets. Processes can be bound to pools which have non-empty
63 * Locking: pool_lock() protects global pools state and must be called
64 * before modifying the configuration, or when taking a snapshot of the
65 * configuration. If pool_lock_intr() is used, the operation may be
66 * interrupted by a signal or a request.
68 * To prevent processes from being rebound between pools while they are
69 * in the middle of an operation which affects resource set bindings, such
70 * operations must be surrounded by calls to pool_barrier_enter() and
71 * pool_barrier_exit(). This mechanism guarantees that such processes will
72 * be stopped either at the beginning or at the end of the barrier so that
73 * the rebind operation can atomically bind the process and its threads
74 * to new resource sets, and then let the process run again.
76 * Lock ordering with respect to other locks is as follows:
78 * pool_lock() -> cpu_lock -> pidlock -> p_lock -> pool_barrier_lock
80 * Most static and global variables defined in this file are protected
81 * by calling pool_lock().
83 * The operation that binds tasks and projects to pools is atomic. That is,
84 * either all processes in a given task or a project will be bound to a
85 * new pool, or (in case of an error) they will be all left bound to the
86 * old pool. Processes in a given task or a given project can only be bound to
87 * different pools if they were rebound individually one by one as single
88 * processes. Threads or LWPs of the same process do not have pool bindings,
89 * and are bound to the same resource sets associated with the resource pool
92 * The following picture shows one possible pool configuration with three
93 * pools and three processor sets. Note that processor set "foo" is not
94 * associated with any pools and therefore cannot have any processes
95 * bound to it. Two pools (default and foo) are associated with the
96 * same processor set (default). Also, note that processes in Task 2
97 * are bound to different pools.
102 * +--------------+========================>| default |
110 * t| | +------>| bar |
114 * +---------+ +---------+ +---------+
115 * Pools | default |======| foo |======| bar |
116 * +---------+ +---------+ +---------+
120 * u| +-----+ | +-------+ | +---+
122 * ....d|........|......|......|.........|.......|....
123 * : | :: | | | :: | | :
124 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ :
125 * Processes : | p | :: | p | | p | | p | :: | p |...| p | :
126 * : +---+ :: +---+ +---+ +---+ :: +---+ +---+ :
127 * :........::......................::...............:
128 * Task 1 Task 2 Task N
131 * | +-----------+ | +-----------+
132 * +--| Project 1 |--+ | Project N |
133 * +-----------+ +-----------+
135 * This is just an illustration of relationships between processes, tasks,
136 * projects, pools, and processor sets. New types of resource sets will be
137 * added in the future.
140 pool_t
*pool_default
; /* default pool which always exists */
141 int pool_count
; /* number of pools created on this system */
142 int pool_state
; /* pools state -- enabled/disabled */
143 void *pool_buf
; /* pre-commit snapshot of the pools state */
144 size_t pool_bufsz
; /* size of pool_buf */
145 static hrtime_t pool_pool_mod
; /* last modification time for pools */
146 static hrtime_t pool_sys_mod
; /* last modification time for system */
147 static nvlist_t
*pool_sys_prop
; /* system properties */
148 static id_space_t
*pool_ids
; /* pool ID space */
149 static list_t pool_list
; /* doubly-linked list of pools */
150 static kmutex_t pool_mutex
; /* protects pool_busy_* */
151 static kcondvar_t pool_busy_cv
; /* waiting for "pool_lock" */
152 static kthread_t
*pool_busy_thread
; /* thread holding "pool_lock" */
153 static kmutex_t pool_barrier_lock
; /* synch. with pool_barrier_* */
154 static kcondvar_t pool_barrier_cv
; /* synch. with pool_barrier_* */
155 static int pool_barrier_count
; /* synch. with pool_barrier_* */
156 static list_t pool_event_cb_list
; /* pool event callbacks */
157 static boolean_t pool_event_cb_init
= B_FALSE
; /* NOTE(review): presumably set once pool_event_cb_list is set up -- confirm */
158 static kmutex_t pool_event_cb_lock
; /* NOTE(review): presumably protects pool_event_cb_list -- confirm */
159 static taskq_t
*pool_event_cb_taskq
= NULL
; /* NOTE(review): presumably the taskq that runs event callbacks -- confirm */
/* Forward declaration; callers in this file pass a POOL_E_* event and a pool ID. */
161 void pool_event_dispatch(pool_event_t
, poolid_t
);
164 * Boot-time pool initialization.
169 pool_ids
= id_space_create("pool_ids", POOL_DEFAULT
+ 1, POOL_MAXID
);
172 * Initialize default pool.
174 pool_default
= kmem_zalloc(sizeof (pool_t
), KM_SLEEP
);
175 pool_default
->pool_id
= POOL_DEFAULT
;
176 list_create(&pool_list
, sizeof (pool_t
), offsetof(pool_t
, pool_link
));
177 list_insert_head(&pool_list
, pool_default
);
180 * Initialize plugins for resource sets.
184 p0
.p_pool
= pool_default
;
185 global_zone
->zone_pool
= pool_default
;
186 pool_default
->pool_ref
= 1;
190 * Synchronization routines.
192 * pool_lock is only called from syscall-level routines (processor_bind(),
193 * pset_*(), and /dev/pool ioctls). The pool "lock" may be held for long
194 * periods of time, including across sleeping operations, so we allow its
195 * acquisition to be interruptible.
197 * The current thread that owns the "lock" is stored in the variable
198 * pool_busy_thread, both to let pool_lock_held() work and to aid debugging.
203 mutex_enter(&pool_mutex
);
204 ASSERT(!pool_lock_held());
205 while (pool_busy_thread
!= NULL
)
206 cv_wait(&pool_busy_cv
, &pool_mutex
);
207 pool_busy_thread
= curthread
;
208 mutex_exit(&pool_mutex
);
214 mutex_enter(&pool_mutex
);
215 ASSERT(!pool_lock_held());
216 while (pool_busy_thread
!= NULL
) {
217 if (cv_wait_sig(&pool_busy_cv
, &pool_mutex
) == 0) {
218 cv_signal(&pool_busy_cv
);
219 mutex_exit(&pool_mutex
);
223 pool_busy_thread
= curthread
;
224 mutex_exit(&pool_mutex
);
231 return (pool_busy_thread
== curthread
);
237 mutex_enter(&pool_mutex
);
238 ASSERT(pool_lock_held());
239 pool_busy_thread
= NULL
;
240 cv_signal(&pool_busy_cv
);
241 mutex_exit(&pool_mutex
);
245 * Routines allowing fork(), exec(), exit(), and lwp_create() to synchronize
246 * with pool_do_bind().
248 * Calls to pool_barrier_enter() and pool_barrier_exit() must bracket all
249 * operations which modify pool or pset associations. They can be called
250 * while the process is multi-threaded. In the common case, when current
251 * process is not being rebound (PBWAIT flag is not set), these functions
252 * will be just incrementing and decrementing reference counts.
255 pool_barrier_enter(void)
259 ASSERT(MUTEX_HELD(&p
->p_lock
));
260 while (p
->p_poolflag
& PBWAIT
)
261 cv_wait(&p
->p_poolcv
, &p
->p_lock
);
266 pool_barrier_exit(void)
270 ASSERT(MUTEX_HELD(&p
->p_lock
));
271 ASSERT(p
->p_poolcnt
> 0);
273 if (p
->p_poolflag
& PBWAIT
) {
274 mutex_enter(&pool_barrier_lock
);
275 ASSERT(pool_barrier_count
> 0);
276 pool_barrier_count
--;
277 if (pool_barrier_count
== 0)
278 cv_signal(&pool_barrier_cv
);
279 mutex_exit(&pool_barrier_lock
);
280 while (p
->p_poolflag
& PBWAIT
)
281 cv_wait(&p
->p_poolcv
, &p
->p_lock
);
286 * Enable pools facility.
293 ASSERT(pool_lock_held());
294 ASSERT(pool_count
== 1);
296 ret
= pool_pset_enable();
299 (void) nvlist_alloc(&pool_sys_prop
, NV_UNIQUE_NAME
, KM_SLEEP
);
300 (void) nvlist_add_string(pool_sys_prop
, "system.name",
302 (void) nvlist_add_string(pool_sys_prop
, "system.comment", "");
303 (void) nvlist_add_int64(pool_sys_prop
, "system.version", 1);
304 (void) nvlist_add_byte(pool_sys_prop
, "system.bind-default", 1);
305 (void) nvlist_add_string(pool_sys_prop
, "system.poold.objectives",
308 (void) nvlist_alloc(&pool_default
->pool_props
,
309 NV_UNIQUE_NAME
, KM_SLEEP
);
310 (void) nvlist_add_string(pool_default
->pool_props
,
311 "pool.name", "pool_default");
312 (void) nvlist_add_string(pool_default
->pool_props
, "pool.comment", "");
313 (void) nvlist_add_byte(pool_default
->pool_props
, "pool.default", 1);
314 (void) nvlist_add_byte(pool_default
->pool_props
, "pool.active", 1);
315 (void) nvlist_add_int64(pool_default
->pool_props
,
316 "pool.importance", 1);
317 (void) nvlist_add_int64(pool_default
->pool_props
, "pool.sys_id",
318 pool_default
->pool_id
);
320 pool_sys_mod
= pool_pool_mod
= gethrtime();
326 * Disable pools facility.
333 ASSERT(pool_lock_held());
335 if (pool_count
> 1) /* must destroy all pools first */
338 ret
= pool_pset_disable();
341 if (pool_sys_prop
!= NULL
) {
342 nvlist_free(pool_sys_prop
);
343 pool_sys_prop
= NULL
;
345 if (pool_default
->pool_props
!= NULL
) {
346 nvlist_free(pool_default
->pool_props
);
347 pool_default
->pool_props
= NULL
;
353 pool_lookup_pool_by_name(char *name
)
355 pool_t
*pool
= pool_default
;
358 ASSERT(pool_lock_held());
359 for (pool
= list_head(&pool_list
); pool
;
360 pool
= list_next(&pool_list
, pool
)) {
361 if (nvlist_lookup_string(pool
->pool_props
,
362 "pool.name", &p
) == 0 && strcmp(name
, p
) == 0)
369 pool_lookup_pool_by_id(poolid_t poolid
)
371 pool_t
*pool
= pool_default
;
373 ASSERT(pool_lock_held());
374 for (pool
= list_head(&pool_list
); pool
;
375 pool
= list_next(&pool_list
, pool
)) {
376 if (pool
->pool_id
== poolid
)
383 pool_lookup_pool_by_pset(int id
)
385 pool_t
*pool
= pool_default
;
386 psetid_t psetid
= (psetid_t
)id
;
388 ASSERT(pool_lock_held());
389 for (pool
= list_head(&pool_list
); pool
!= NULL
;
390 pool
= list_next(&pool_list
, pool
)) {
391 if (pool
->pool_pset
->pset_id
== psetid
)
398 * Create new pool, associate it with default resource sets, and give
399 * it a temporary name.
402 pool_pool_create(poolid_t
*poolid
)
407 ASSERT(pool_lock_held());
409 pool
= kmem_zalloc(sizeof (pool_t
), KM_SLEEP
);
410 pool
->pool_id
= *poolid
= id_alloc(pool_ids
);
411 pool
->pool_pset
= pool_pset_default
;
412 pool_pset_default
->pset_npools
++;
413 list_insert_tail(&pool_list
, pool
);
414 (void) nvlist_alloc(&pool
->pool_props
, NV_UNIQUE_NAME
, KM_SLEEP
);
415 (void) nvlist_add_int64(pool
->pool_props
, "pool.sys_id", pool
->pool_id
);
416 (void) nvlist_add_byte(pool
->pool_props
, "pool.default", 0);
417 pool_pool_mod
= gethrtime();
418 (void) snprintf(pool_name
, sizeof (pool_name
), "pool_%lld",
420 (void) nvlist_add_string(pool
->pool_props
, "pool.name", pool_name
);
425 struct destroy_zone_arg
{
431 * Update pool pointers for zones that are currently bound to pool "old"
432 * to be bound to pool "new".
435 pool_destroy_zone_cb(zone_t
*zone
, void *arg
)
437 struct destroy_zone_arg
*dza
= arg
;
439 ASSERT(pool_lock_held());
440 ASSERT(MUTEX_HELD(&cpu_lock
));
442 if (zone_pool_get(zone
) == dza
->old
)
443 zone_pool_set(zone
, dza
->new);
448 * Destroy specified pool, and rebind all processes in it
449 * to the default pool.
452 pool_pool_destroy(poolid_t poolid
)
457 ASSERT(pool_lock_held());
459 if (poolid
== POOL_DEFAULT
)
461 if ((pool
= pool_lookup_pool_by_id(poolid
)) == NULL
)
463 ret
= pool_do_bind(pool_default
, P_POOLID
, poolid
, POOL_BIND_ALL
);
465 struct destroy_zone_arg dzarg
;
468 dzarg
.new = pool_default
;
469 mutex_enter(&cpu_lock
);
470 ret
= zone_walk(pool_destroy_zone_cb
, &dzarg
);
471 mutex_exit(&cpu_lock
);
473 ASSERT(pool
->pool_ref
== 0);
474 (void) nvlist_free(pool
->pool_props
);
475 id_free(pool_ids
, pool
->pool_id
);
476 pool
->pool_pset
->pset_npools
--;
477 list_remove(&pool_list
, pool
);
479 pool_pool_mod
= gethrtime();
480 kmem_free(pool
, sizeof (pool_t
));
486 * Create new pool or resource set.
489 pool_create(int class, int subclass
, id_t
*id
)
493 ASSERT(pool_lock_held());
494 if (pool_state
== POOL_DISABLED
)
498 ret
= pool_pool_create((poolid_t
*)id
);
503 ret
= pool_pset_create((psetid_t
*)id
);
519 * Destroy an existing pool or resource set.
522 pool_destroy(int class, int subclass
, id_t id
)
526 ASSERT(pool_lock_held());
527 if (pool_state
== POOL_DISABLED
)
531 ret
= pool_pool_destroy((poolid_t
)id
);
536 ret
= pool_pset_destroy((psetid_t
)id
);
552 * Enable or disable pools.
555 pool_status(int status
)
559 ASSERT(pool_lock_held());
561 if (pool_state
== status
)
568 pool_state
= POOL_ENABLED
;
569 pool_event_dispatch(POOL_E_ENABLE
, 0);
572 ret
= pool_disable();
575 pool_state
= POOL_DISABLED
;
576 pool_event_dispatch(POOL_E_DISABLE
, 0);
585 * Associate pool with resource set.
588 pool_assoc(poolid_t poolid
, int idtype
, id_t id
)
592 ASSERT(pool_lock_held());
593 if (pool_state
== POOL_DISABLED
)
597 ret
= pool_pset_assoc(poolid
, (psetid_t
)id
);
599 pool_event_dispatch(POOL_E_CHANGE
, poolid
);
605 pool_pool_mod
= gethrtime();
610 * Disassociate resource set from pool.
613 pool_dissoc(poolid_t poolid
, int idtype
)
617 ASSERT(pool_lock_held());
618 if (pool_state
== POOL_DISABLED
)
622 ret
= pool_pset_assoc(poolid
, PS_NONE
);
624 pool_event_dispatch(POOL_E_CHANGE
, poolid
);
630 pool_pool_mod
= gethrtime();
635 * Transfer specified quantity of resources between resource sets.
639 pool_transfer(int type
, id_t src
, id_t dst
, uint64_t qty
)
647 pool_lookup_id_by_pset(int id
)
649 pool_t
*pool
= pool_default
;
650 psetid_t psetid
= (psetid_t
)id
;
652 ASSERT(pool_lock_held());
653 for (pool
= list_head(&pool_list
); pool
!= NULL
;
654 pool
= list_next(&pool_list
, pool
)) {
655 if (pool
->pool_pset
->pset_id
== psetid
)
656 return (pool
->pool_id
);
658 return (POOL_INVALID
);
662 * Transfer resources specified by their IDs between resource sets.
665 pool_xtransfer(int type
, id_t src_pset
, id_t dst_pset
, uint_t size
, id_t
*ids
)
668 poolid_t src_pool
, dst_pool
;
670 ASSERT(pool_lock_held());
671 if (pool_state
== POOL_DISABLED
)
675 ret
= pool_pset_xtransfer((psetid_t
)src_pset
,
676 (psetid_t
)dst_pset
, size
, ids
);
678 if ((src_pool
= pool_lookup_id_by_pset(src_pset
)) !=
680 pool_event_dispatch(POOL_E_CHANGE
, src_pool
);
681 if ((dst_pool
= pool_lookup_id_by_pset(dst_pset
)) !=
683 pool_event_dispatch(POOL_E_CHANGE
, dst_pool
);
693 * Bind processes to pools.
696 pool_bind(poolid_t poolid
, idtype_t idtype
, id_t id
)
700 ASSERT(pool_lock_held());
702 if (pool_state
== POOL_DISABLED
)
704 if ((pool
= pool_lookup_pool_by_id(poolid
)) == NULL
)
716 return (pool_do_bind(pool
, idtype
, id
, POOL_BIND_ALL
));
720 * Query pool binding of the specified process.
723 pool_query_binding(idtype_t idtype
, id_t id
, id_t
*poolid
)
732 ASSERT(pool_lock_held());
734 mutex_enter(&pidlock
);
735 if ((p
= prfind((pid_t
)id
)) == NULL
) {
736 mutex_exit(&pidlock
);
739 mutex_enter(&p
->p_lock
);
741 * In local zones, lie about pool bindings of processes from
744 if (!INGLOBALZONE(curproc
) && INGLOBALZONE(p
)) {
747 pool
= zone_pool_get(curproc
->p_zone
);
748 *poolid
= pool
->pool_id
;
750 *poolid
= p
->p_pool
->pool_id
;
752 mutex_exit(&p
->p_lock
);
753 mutex_exit(&pidlock
);
758 pool_system_pack(void)
760 ea_object_t
*eo_system
;
764 ASSERT(pool_lock_held());
766 eo_system
= ea_alloc_group(EXT_GROUP
| EXC_LOCAL
| EXD_GROUP_SYSTEM
);
767 (void) ea_attach_item(eo_system
, &pool_sys_mod
, sizeof (hrtime_t
),
768 EXC_LOCAL
| EXD_SYSTEM_TSTAMP
| EXT_UINT64
);
769 if (INGLOBALZONE(curproc
))
770 (void) ea_attach_item(eo_system
, &pool_pool_mod
,
772 EXC_LOCAL
| EXD_POOL_TSTAMP
| EXT_UINT64
);
774 (void) ea_attach_item(eo_system
,
775 &curproc
->p_zone
->zone_pool_mod
,
777 EXC_LOCAL
| EXD_POOL_TSTAMP
| EXT_UINT64
);
778 (void) ea_attach_item(eo_system
, &pool_pset_mod
, sizeof (hrtime_t
),
779 EXC_LOCAL
| EXD_PSET_TSTAMP
| EXT_UINT64
);
780 (void) ea_attach_item(eo_system
, &pool_cpu_mod
, sizeof (hrtime_t
),
781 EXC_LOCAL
| EXD_CPU_TSTAMP
| EXT_UINT64
);
782 (void) nvlist_pack(pool_sys_prop
, &buf
, &bufsz
, NV_ENCODE_NATIVE
, 0);
783 (void) ea_attach_item(eo_system
, buf
, bufsz
,
784 EXC_LOCAL
| EXD_SYSTEM_PROP
| EXT_RAW
);
785 kmem_free(buf
, bufsz
);
790 * Pack information about pools and attach it to specified exacct group.
793 pool_pool_pack(ea_object_t
*eo_system
)
795 ea_object_t
*eo_pool
;
801 ASSERT(pool_lock_held());
802 myzonepool
= zone_pool_get(curproc
->p_zone
);
803 for (pool
= list_head(&pool_list
); pool
;
804 pool
= list_next(&pool_list
, pool
)) {
805 if (!INGLOBALZONE(curproc
) && myzonepool
!= pool
)
809 eo_pool
= ea_alloc_group(EXT_GROUP
|
810 EXC_LOCAL
| EXD_GROUP_POOL
);
811 (void) ea_attach_item(eo_pool
, &pool
->pool_id
, sizeof (id_t
),
812 EXC_LOCAL
| EXD_POOL_POOLID
| EXT_UINT32
);
813 (void) ea_attach_item(eo_pool
, &pool
->pool_pset
->pset_id
,
814 sizeof (id_t
), EXC_LOCAL
| EXD_POOL_PSETID
| EXT_UINT32
);
815 (void) nvlist_pack(pool
->pool_props
, &buf
, &bufsz
,
816 NV_ENCODE_NATIVE
, 0);
817 (void) ea_attach_item(eo_pool
, buf
, bufsz
,
818 EXC_LOCAL
| EXD_POOL_PROP
| EXT_RAW
);
819 kmem_free(buf
, bufsz
);
820 (void) ea_attach_to_group(eo_system
, eo_pool
);
826 * Pack the whole pool configuration in the specified buffer.
829 pool_pack_conf(void *kbuf
, size_t kbufsz
, size_t *asize
)
831 ea_object_t
*eo_system
;
835 ASSERT(pool_lock_held());
837 eo_system
= pool_system_pack(); /* 1. pack system */
838 (void) pool_pool_pack(eo_system
); /* 2. pack all pools */
839 (void) pool_pset_pack(eo_system
); /* 3. pack all psets */
840 ksize
= ea_pack_object(eo_system
, NULL
, 0);
841 if (kbuf
== NULL
|| kbufsz
== 0)
843 else if (ksize
> kbufsz
)
846 *asize
= ea_pack_object(eo_system
, kbuf
, kbufsz
);
847 ea_free_object(eo_system
, EUP_ALLOC
);
852 * Start/end the commit transaction. If commit transaction is currently
853 * in progress, then all POOL_QUERY ioctls will return pools configuration
854 * at the beginning of transaction.
857 pool_commit(int state
)
859 ea_object_t
*eo_system
;
862 ASSERT(pool_lock_held());
864 if (pool_state
== POOL_DISABLED
)
869 * Beginning commit transaction.
871 if (pool_buf
!= NULL
) /* transaction in progress */
873 eo_system
= pool_system_pack(); /* 1. pack system */
874 (void) pool_pool_pack(eo_system
); /* 2. pack all pools */
875 (void) pool_pset_pack(eo_system
); /* 3. pack all psets */
876 pool_bufsz
= ea_pack_object(eo_system
, NULL
, 0);
877 pool_buf
= kmem_alloc(pool_bufsz
, KM_SLEEP
);
878 pool_bufsz
= ea_pack_object(eo_system
, pool_buf
, pool_bufsz
);
879 ea_free_object(eo_system
, EUP_ALLOC
);
883 * Finishing commit transaction.
885 if (pool_buf
!= NULL
) {
886 kmem_free(pool_buf
, pool_bufsz
);
898 * Check if the specified property is special
900 static pool_property_t
*
901 pool_property_find(char *name
, pool_property_t
*list
)
903 pool_property_t
*prop
;
905 for (prop
= list
; prop
->pp_name
!= NULL
; prop
++)
906 if (strcmp(prop
->pp_name
, name
) == 0)
911 static pool_property_t pool_prop_sys
[] = {
912 { "system.name", DATA_TYPE_STRING
, PP_RDWR
},
913 { "system.comment", DATA_TYPE_STRING
, PP_RDWR
},
914 { "system.version", DATA_TYPE_UINT64
, PP_READ
},
915 { "system.bind-default", DATA_TYPE_BYTE
, PP_RDWR
},
916 { "system.allocate-method", DATA_TYPE_STRING
,
917 PP_RDWR
| PP_OPTIONAL
},
918 { "system.poold.log-level", DATA_TYPE_STRING
,
919 PP_RDWR
| PP_OPTIONAL
},
920 { "system.poold.log-location", DATA_TYPE_STRING
,
921 PP_RDWR
| PP_OPTIONAL
},
922 { "system.poold.monitor-interval", DATA_TYPE_UINT64
,
923 PP_RDWR
| PP_OPTIONAL
},
924 { "system.poold.history-file", DATA_TYPE_STRING
,
925 PP_RDWR
| PP_OPTIONAL
},
926 { "system.poold.objectives", DATA_TYPE_STRING
,
927 PP_RDWR
| PP_OPTIONAL
},
931 static pool_property_t pool_prop_pool
[] = {
932 { "pool.sys_id", DATA_TYPE_UINT64
, PP_READ
},
933 { "pool.name", DATA_TYPE_STRING
, PP_RDWR
},
934 { "pool.default", DATA_TYPE_BYTE
, PP_READ
},
935 { "pool.active", DATA_TYPE_BYTE
, PP_RDWR
},
936 { "pool.importance", DATA_TYPE_INT64
, PP_RDWR
},
937 { "pool.comment", DATA_TYPE_STRING
, PP_RDWR
},
938 { "pool.scheduler", DATA_TYPE_STRING
,
939 PP_RDWR
| PP_OPTIONAL
},
944 * Common routine to put new property on the specified list
947 pool_propput_common(nvlist_t
*nvlist
, nvpair_t
*pair
, pool_property_t
*props
)
949 pool_property_t
*prop
;
951 if ((prop
= pool_property_find(nvpair_name(pair
), props
)) != NULL
) {
953 * No read-only properties or properties with bad types
955 if (!(prop
->pp_perm
& PP_WRITE
) ||
956 prop
->pp_type
!= nvpair_type(pair
))
959 return (nvlist_add_nvpair(nvlist
, pair
));
963 * Common routine to remove property from the given list
966 pool_proprm_common(nvlist_t
*nvlist
, char *name
, pool_property_t
*props
)
968 pool_property_t
*prop
;
970 if ((prop
= pool_property_find(name
, props
)) != NULL
) {
971 if (!(prop
->pp_perm
& PP_OPTIONAL
))
974 return (nvlist_remove_all(nvlist
, name
));
978 pool_system_propput(nvpair_t
*pair
)
982 ASSERT(pool_lock_held());
983 ret
= pool_propput_common(pool_sys_prop
, pair
, pool_prop_sys
);
985 pool_sys_mod
= gethrtime();
990 pool_system_proprm(char *name
)
994 ASSERT(pool_lock_held());
995 ret
= pool_proprm_common(pool_sys_prop
, name
, pool_prop_sys
);
997 pool_sys_mod
= gethrtime();
1002 pool_pool_propput(poolid_t poolid
, nvpair_t
*pair
)
1007 ASSERT(pool_lock_held());
1008 if ((pool
= pool_lookup_pool_by_id(poolid
)) == NULL
)
1010 ret
= pool_propput_common(pool
->pool_props
, pair
, pool_prop_pool
);
1012 pool_pool_mod
= gethrtime();
1017 pool_pool_proprm(poolid_t poolid
, char *name
)
1022 ASSERT(pool_lock_held());
1023 if ((pool
= pool_lookup_pool_by_id(poolid
)) == NULL
)
1025 ret
= pool_proprm_common(pool
->pool_props
, name
, pool_prop_pool
);
1027 pool_pool_mod
= gethrtime();
1032 pool_propput(int class, int subclass
, id_t id
, nvpair_t
*pair
)
1036 ASSERT(pool_lock_held());
1037 if (pool_state
== POOL_DISABLED
)
1038 return (ENOTACTIVE
);
1041 ret
= pool_system_propput(pair
);
1044 ret
= pool_pool_propput((poolid_t
)id
, pair
);
1049 ret
= pool_pset_propput((psetid_t
)id
, pair
);
1061 ret
= pool_cpu_propput((processorid_t
)id
, pair
);
1074 pool_proprm(int class, int subclass
, id_t id
, char *name
)
1078 ASSERT(pool_lock_held());
1079 if (pool_state
== POOL_DISABLED
)
1080 return (ENOTACTIVE
);
1083 ret
= pool_system_proprm(name
);
1086 ret
= pool_pool_proprm((poolid_t
)id
, name
);
1091 ret
= pool_pset_proprm((psetid_t
)id
, name
);
1103 ret
= pool_cpu_proprm((processorid_t
)id
, name
);
1116 pool_propget(char *name
, int class, int subclass
, id_t id
, nvlist_t
**nvlp
)
1121 ASSERT(pool_lock_held());
1122 if (pool_state
== POOL_DISABLED
)
1123 return (ENOTACTIVE
);
1125 (void) nvlist_alloc(&nvl
, NV_UNIQUE_NAME
, KM_SLEEP
);
1135 ret
= pool_pset_propget((psetid_t
)id
, name
, nvl
);
1147 ret
= pool_cpu_propget((processorid_t
)id
, name
, nvl
);
1164 * pool_bind_wake and pool_bind_wakeall are helper functions to undo PBWAITs
1165 * in case of failure in pool_do_bind().
1168 pool_bind_wake(proc_t
*p
)
1170 ASSERT(pool_lock_held());
1172 mutex_enter(&p
->p_lock
);
1173 ASSERT(p
->p_poolflag
& PBWAIT
);
1174 if (p
->p_poolcnt
> 0) {
1175 mutex_enter(&pool_barrier_lock
);
1176 pool_barrier_count
-= p
->p_poolcnt
;
1177 mutex_exit(&pool_barrier_lock
);
1179 p
->p_poolflag
&= ~PBWAIT
;
1180 cv_signal(&p
->p_poolcv
);
1181 mutex_exit(&p
->p_lock
);
1185 pool_bind_wakeall(proc_t
**procs
)
1189 ASSERT(pool_lock_held());
1190 for (pp
= procs
; (p
= *pp
) != NULL
; pp
++)
1195 * Return the scheduling class for this pool, or
1196 * POOL_CLASS_UNSET if not set
1197 * POOL_CLASS_INVAL if set to an invalid class ID.
1200 pool_get_class(pool_t
*pool
)
1205 ASSERT(pool_lock_held());
1207 if (nvlist_lookup_string(pool
->pool_props
, "pool.scheduler",
1209 if (getcidbyname(name
, &cid
) == 0)
1212 return (POOL_CLASS_INVAL
);
1214 return (POOL_CLASS_UNSET
);
1218 * Move process to the new scheduling class.
1221 pool_change_class(proc_t
*p
, id_t cid
)
1233 * Do not move kernel processes (such as zsched).
1235 if (p
->p_flag
& SSYS
)
1238 * This process is in the pool barrier, so it can't possibly be
1239 * adding new threads and we can use p_lwpcnt + p_zombcnt + 1
1240 * (for possible agent LWP which doesn't use pool barrier) as
1243 nlwp
= p
->p_lwpcnt
+ p
->p_zombcnt
+ 1;
1246 * Pre-allocate scheduling class specific buffers before
1249 bufs
= kmem_zalloc(nlwp
* sizeof (void *), KM_SLEEP
);
1250 for (i
= 0, buf
= bufs
; i
< nlwp
; i
++, buf
++) {
1251 ret
= CL_ALLOC(buf
, cid
, KM_SLEEP
);
1256 * Move threads one by one to the new scheduling class.
1257 * This never fails because we have all the right
1260 mutex_enter(&p
->p_lock
);
1261 ASSERT(p
->p_poolflag
& PBWAIT
);
1266 if (t
->t_cid
!= cid
) {
1268 cldata
= t
->t_cldata
;
1269 ret
= CL_ENTERCLASS(t
, cid
, NULL
, NULL
, *buf
);
1271 CL_EXITCLASS(oldcid
, cldata
);
1272 schedctl_set_cidpri(t
);
1275 } while ((t
= t
->t_forw
) != p
->p_tlist
);
1276 mutex_exit(&p
->p_lock
);
1278 * Free unused scheduling class specific buffers.
1280 for (i
= 0, buf
= bufs
; i
< nlwp
; i
++, buf
++) {
1286 kmem_free(bufs
, nlwp
* sizeof (void *));
1290 pool_get_name(pool_t
*pool
, char **name
)
1292 ASSERT(pool_lock_held());
1294 (void) nvlist_lookup_string(pool
->pool_props
, "pool.name", name
);
1296 ASSERT(strlen(*name
) != 0);
1301 * The meat of the bind operation. The steps in pool_do_bind are:
1303 * 1) Set PBWAIT in the p_poolflag of any process of interest, and add all
1304 * such processes to an array. For any interesting process that has
1305 * threads inside the pool barrier set, increment a counter by the
1306 * count of such threads. Once PBWAIT is set on a process, that process
1307 * will not disappear.
1309 * 2) Wait for the counter from step 1 to drop to zero. Any process which
1310 * calls pool_barrier_exit() and notices that PBWAIT has been set on it
1311 * will decrement that counter before going to sleep, and the process
1312 * calling pool_barrier_exit() which does the final decrement will wake us.
1314 * 3) For each interesting process, perform a calculation on it to see if
1315 * the bind will actually succeed. This uses the following three
1316 * resource-set-specific functions:
1318 * - int set_bind_start(procs, pool)
1320 * Determine whether the given array of processes can be bound to the
1321 * resource set associated with the given pool. If it can, take and hold
1322 * any locks necessary to ensure that the operation will succeed, and
1323 * make any necessary reservations in the target resource set. If it
1324 * can't, return failure with no reservations made and no new locks held.
1326 * - void set_bind_abort(procs, pool)
1328 * set_bind_start() has completed successfully, but another resource set's
1329 * set_bind_start() has failed, and we haven't begun the bind yet. Undo
1330 * any reservations made and drop any locks acquired by our
1333 * - void set_bind_finish(void)
1335 * The bind has completed successfully. The processes have been released,
1336 * and the reservation acquired in set_bind_start() has been depleted as
1337 * the processes have finished their bindings. Drop any locks acquired by
1340 * 4) If we've decided that we can proceed with the bind, iterate through
1341 * the list of interesting processes, grab the necessary locks (which
1342 * may differ per resource set), perform the bind, and ASSERT that it
1343 * succeeds. Once a process has been rebound, it can be awakened.
1345 * The operations from step 4 must be kept in sync with anything which might
1346 * cause the bind operations (e.g., cpupart_bind_thread()) to fail, and
1347 * are thus located in the same source files as the associated bind operations.
1350 pool_do_bind(pool_t
*pool
, idtype_t idtype
, id_t id
, int flags
)
1352 extern uint_t nproc
;
1353 klwp_t
*lwp
= ttolwp(curthread
);
1354 proc_t
**pp
, **procs
;
1356 int procs_count
= 0;
1365 ASSERT(pool_lock_held());
1367 if ((cid
= pool_get_class(pool
)) == POOL_CLASS_INVAL
)
1370 if (idtype
== P_ZONEID
) {
1371 zone
= zone_find_by_id(id
);
1374 if (zone_status_get(zone
) > ZONE_IS_RUNNING
) {
1380 if (idtype
== P_PROJID
) {
1381 kpj
= project_hold_by_id(id
, global_zone
, PROJECT_HOLD_FIND
);
1384 mutex_enter(&kpj
->kpj_poolbind
);
1387 if (idtype
== P_PID
) {
1389 * Fast-path for a single process case.
1391 procs_size
= 2; /* procs is NULL-terminated */
1392 procs
= kmem_zalloc(procs_size
* sizeof (proc_t
*), KM_SLEEP
);
1393 mutex_enter(&pidlock
);
1396 * We will need enough slots for proc_t pointers for as many as
1397 * twice the number of currently running processes (assuming
1398 * that each one could be in fork() creating a new child).
1401 procs_size
= nproc
* 2;
1402 procs
= kmem_zalloc(procs_size
* sizeof (proc_t
*),
1404 mutex_enter(&pidlock
);
1406 if (nproc
* 2 <= procs_size
)
1409 * If nproc has changed, try again.
1411 mutex_exit(&pidlock
);
1412 kmem_free(procs
, procs_size
* sizeof (proc_t
*));
1417 id
= getmyid(idtype
);
1418 setprocset(&set
, POP_AND
, idtype
, id
, P_ALL
, 0);
1421 * Do a first scan, and select target processes.
1423 if (idtype
== P_PID
)
1424 prstart
= prfind(id
);
1427 for (p
= prstart
, pp
= procs
; p
!= NULL
; p
= p
->p_next
) {
1428 mutex_enter(&p
->p_lock
);
1430 * Skip processes that don't match our (id, idtype) set or
1431 * are on their way to becoming zombies. Skip kernel processes
1432 * from the global zone.
1434 if (procinset(p
, &set
) == 0 ||
1435 p
->p_poolflag
& PEXITED
||
1436 ((p
->p_flag
& SSYS
) && INGLOBALZONE(p
))) {
1437 mutex_exit(&p
->p_lock
);
1440 if (!INGLOBALZONE(p
)) {
1445 * Can't bind processes or tasks
1446 * in local zones to pools.
1448 mutex_exit(&p
->p_lock
);
1449 mutex_exit(&pidlock
);
1450 pool_bind_wakeall(procs
);
1455 * Only projects in the global
1456 * zone can be rebound.
1458 mutex_exit(&p
->p_lock
);
1462 * When rebinding pools, processes can be
1463 * in different zones.
1469 p
->p_poolflag
|= PBWAIT
;
1471 * If some threads in this process are inside the pool
1472 * barrier, add them to pool_barrier_count, as we have
1473 * to wait for all of them to exit the barrier.
1475 if (p
->p_poolcnt
> 0) {
1476 mutex_enter(&pool_barrier_lock
);
1477 pool_barrier_count
+= p
->p_poolcnt
;
1478 mutex_exit(&pool_barrier_lock
);
1480 ASSERT(pp
< &procs
[procs_size
]);
1483 mutex_exit(&p
->p_lock
);
1486 * We just found our process, so if we're only rebinding a
1487 * single process then get out of this loop.
1489 if (idtype
== P_PID
)
1492 *pp
= NULL
; /* cap off the end of the array */
1493 mutex_exit(&pidlock
);
1496 * Wait for relevant processes to stop before they try to enter the
1497 * barrier or at the exit from the barrier. Make sure that we do
1498 * not get stopped here while we're holding pool_lock. If we were
1499 * requested to stop, or got a signal then return EAGAIN to let the
1500 * library know that it needs to retry.
1502 mutex_enter(&pool_barrier_lock
);
1504 while (pool_barrier_count
> 0) {
1505 (void) cv_wait_sig(&pool_barrier_cv
, &pool_barrier_lock
);
1506 if (pool_barrier_count
> 0) {
1508 * We either got a signal or were requested to
1509 * stop by /proc. Bail out with EAGAIN. If we were
1510 * requested to stop, we'll stop in post_syscall()
1511 * on our way back to userland.
1513 mutex_exit(&pool_barrier_lock
);
1514 pool_bind_wakeall(procs
);
1521 mutex_exit(&pool_barrier_lock
);
1523 if (idtype
== P_PID
) {
1524 if ((p
= *procs
) == NULL
)
1526 mutex_enter(&p
->p_lock
);
1527 /* Drop the process if it is exiting */
1528 if (p
->p_poolflag
& PEXITED
) {
1529 mutex_exit(&p
->p_lock
);
1533 mutex_exit(&p
->p_lock
);
1538 * Do another run, and drop processes that were inside the barrier
1539 * in exit(), but when they have dropped to pool_barrier_exit
1540 * they have become of no interest to us. Pick up child processes that
1541 * were created by fork() but didn't exist during our first scan.
1542 * Their parents are now stopped at pool_barrier_exit in cfork().
1544 mutex_enter(&pidlock
);
1545 for (pp
= procs
; (p
= *pp
) != NULL
; pp
++) {
1546 mutex_enter(&p
->p_lock
);
1547 if (p
->p_poolflag
& PEXITED
) {
1548 ASSERT(p
->p_lwpcnt
== 0);
1549 mutex_exit(&p
->p_lock
);
1551 /* flip w/last non-NULL slot */
1552 *pp
= procs
[procs_count
- 1];
1553 procs
[procs_count
- 1] = NULL
;
1555 pp
--; /* try this slot again */
1558 mutex_exit(&p
->p_lock
);
1560 * Look at the child and check if it should be rebound also.
1561 * We're holding pidlock, so it is safe to reference p_child.
1563 if ((p
= p
->p_child
) == NULL
)
1566 mutex_enter(&p
->p_lock
);
1569 * Skip system processes and make sure that the child is in
1570 * the same task/project/pool/zone as the parent.
1572 if ((!INGLOBALZONE(p
) && idtype
!= P_ZONEID
&&
1573 idtype
!= P_POOLID
) || p
->p_flag
& SSYS
) {
1574 mutex_exit(&p
->p_lock
);
1579 * If the child process has been already created by fork(), has
1580 * not exited, and has not been added to the list already,
1581 * then add it now. We will hit this process again (since we
 * stick it at the end of the procs list) but it will be ignored
1583 * because it will have the PBWAIT flag set.
1585 if (procinset(p
, &set
) &&
1586 !(p
->p_poolflag
& PEXITED
) &&
1587 !(p
->p_poolflag
& PBWAIT
)) {
1588 ASSERT(p
->p_child
== NULL
); /* no child of a child */
1589 procs
[procs_count
] = p
;
1590 procs
[procs_count
+ 1] = NULL
;
1592 p
->p_poolflag
|= PBWAIT
;
1594 mutex_exit(&p
->p_lock
);
1596 mutex_exit(&pidlock
);
1599 * If there's no processes to rebind then return ESRCH, unless
1600 * we're associating a pool with new resource set, destroying it,
1601 * or binding a zone to a pool.
1603 if (procs_count
== 0) {
1604 if (idtype
== P_POOLID
|| idtype
== P_ZONEID
)
1613 * All processes in the array should have PBWAIT set, and none
1614 * should be in the critical section. Thus, although p_poolflag
1615 * and p_poolcnt are protected by p_lock, their ASSERTions below
1616 * should be stable without it. procinset(), however, ASSERTs that
1617 * the p_lock is held upon entry.
1619 for (pp
= procs
; (p
= *pp
) != NULL
; pp
++) {
1622 mutex_enter(&p
->p_lock
);
1623 in_set
= procinset(p
, &set
);
1624 mutex_exit(&p
->p_lock
);
1627 ASSERT(p
->p_poolflag
& PBWAIT
);
1628 ASSERT(p
->p_poolcnt
== 0);
1633 * Do the check if processor set rebinding is going to succeed or not.
1635 if ((flags
& POOL_BIND_PSET
) &&
1636 (rv
= pset_bind_start(procs
, pool
)) != 0) {
1637 pool_bind_wakeall(procs
);
1642 * At this point, all bind operations should succeed.
1644 for (pp
= procs
; (p
= *pp
) != NULL
; pp
++) {
1645 if (flags
& POOL_BIND_PSET
) {
1646 psetid_t psetid
= pool
->pool_pset
->pset_id
;
1651 * Pre-allocate one buffer for FSS (per-project
 * buffer for a new pset) in case this is the
1653 * first thread from its current project getting
1654 * bound to this processor set.
1656 projbuf
= fss_allocbuf(FSS_ONE_BUF
, FSS_ALLOC_PROJ
);
1657 zonebuf
= fss_allocbuf(FSS_ONE_BUF
, FSS_ALLOC_ZONE
);
1659 mutex_enter(&pidlock
);
1660 mutex_enter(&p
->p_lock
);
1661 pool_pset_bind(p
, psetid
, projbuf
, zonebuf
);
1662 mutex_exit(&p
->p_lock
);
1663 mutex_exit(&pidlock
);
1665 * Free buffers pre-allocated above if it
1666 * wasn't actually used.
1668 fss_freebuf(projbuf
, FSS_ALLOC_PROJ
);
1669 fss_freebuf(zonebuf
, FSS_ALLOC_ZONE
);
1672 * Now let's change the scheduling class of this
1673 * process if our target pool has it defined.
1675 if (cid
!= POOL_CLASS_UNSET
)
1676 pool_change_class(p
, cid
);
1679 * It is safe to reference p_pool here without holding
1680 * p_lock because it cannot change underneath of us.
1681 * We're holding pool_lock here, so nobody else can be
1682 * moving this process between pools. If process "p"
1683 * would be exiting, we're guaranteed that it would be blocked
1684 * at pool_barrier_enter() in exit(). Otherwise, it would've
1685 * been skipped by one of our scans of the practive list
1686 * as a process with PEXITED flag set.
1688 if (p
->p_pool
!= pool
) {
1689 ASSERT(p
->p_pool
->pool_ref
> 0);
1690 atomic_dec_32(&p
->p_pool
->pool_ref
);
1692 atomic_inc_32(&p
->p_pool
->pool_ref
);
1695 * Okay, we've tortured this guy enough.
1696 * Let this poor process go now.
1700 if (flags
& POOL_BIND_PSET
)
1703 out
: switch (idtype
) {
1705 ASSERT(kpj
!= NULL
);
1706 mutex_exit(&kpj
->kpj_poolbind
);
1711 mutex_enter(&cpu_lock
);
1712 zone_pool_set(zone
, pool
);
1713 mutex_exit(&cpu_lock
);
1715 zone
->zone_pool_mod
= gethrtime();
1720 kmem_free(procs
, procs_size
* sizeof (proc_t
*));
1721 ASSERT(pool_barrier_count
== 0);
1726 pool_event_cb_register(pool_event_cb_t
*cb
)
1728 ASSERT(!pool_lock_held() || panicstr
);
1729 ASSERT(cb
->pec_func
!= NULL
);
1731 mutex_enter(&pool_event_cb_lock
);
1732 if (!pool_event_cb_init
) {
1733 list_create(&pool_event_cb_list
, sizeof (pool_event_cb_t
),
1734 offsetof(pool_event_cb_t
, pec_list
));
1735 pool_event_cb_init
= B_TRUE
;
1737 list_insert_tail(&pool_event_cb_list
, cb
);
1738 mutex_exit(&pool_event_cb_lock
);
1742 pool_event_cb_unregister(pool_event_cb_t
*cb
)
1744 ASSERT(!pool_lock_held() || panicstr
);
1746 mutex_enter(&pool_event_cb_lock
);
1747 list_remove(&pool_event_cb_list
, cb
);
1748 mutex_exit(&pool_event_cb_lock
);
1752 pool_event_t tqd_what
;
1757 pool_event_notify(void *arg
)
1759 pool_tqd_t
*tqd
= (pool_tqd_t
*)arg
;
1760 pool_event_cb_t
*cb
;
1762 ASSERT(!pool_lock_held() || panicstr
);
1764 mutex_enter(&pool_event_cb_lock
);
1765 for (cb
= list_head(&pool_event_cb_list
); cb
!= NULL
;
1766 cb
= list_next(&pool_event_cb_list
, cb
)) {
1767 cb
->pec_func(tqd
->tqd_what
, tqd
->tqd_id
, cb
->pec_arg
);
1769 mutex_exit(&pool_event_cb_lock
);
1770 kmem_free(tqd
, sizeof (*tqd
));
1774 pool_event_dispatch(pool_event_t what
, poolid_t id
)
1776 pool_tqd_t
*tqd
= NULL
;
1778 ASSERT(pool_lock_held());
1780 if (pool_event_cb_taskq
== NULL
) {
1781 pool_event_cb_taskq
= taskq_create("pool_event_cb_taskq", 1,
1782 -1, 1, 1, TASKQ_PREPOPULATE
);
1785 tqd
= kmem_alloc(sizeof (*tqd
), KM_SLEEP
);
1786 tqd
->tqd_what
= what
;
1789 (void) taskq_dispatch(pool_event_cb_taskq
, pool_event_notify
, tqd
,