/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
#include <sys/thread.h>
#include <sys/cmn_err.h>
#include <sys/class.h>
#include <sys/atomic.h>
#include <sys/clock_tick.h>
#include <sys/clock_impl.h>
#include <sys/sysmacros.h>
/*
 * This file contains the implementation of clock tick accounting for threads.
 * Every tick, user threads running on various CPUs are located and charged
 * with a tick to account for their use of CPU time.
 *
 * Every tick, the clock() handler calls clock_tick_schedule() to perform tick
 * accounting for all the threads in the system. Tick accounting is done in
 * two phases:
 *
 * Tick scheduling	Done in clock_tick_schedule(). In this phase, cross
 *			calls are scheduled to multiple CPUs to perform
 *			multi-threaded tick accounting. The CPUs are chosen
 *			on a rotational basis so as to distribute the tick
 *			accounting load evenly across all CPUs.
 *
 * Tick execution	Done in clock_tick_execute(). In this phase, tick
 *			accounting is actually performed by softint handlers
 *			on multiple CPUs.
 *
 * This implementation gives us a multi-threaded tick processing facility that
 * is suitable for configurations with a large number of CPUs. On smaller
 * configurations it may be desirable to let the processing be single-threaded
 * and just allow clock() to do it as it has been done traditionally. To
 * facilitate this, a variable, clock_tick_threshold, is defined. Platforms
 * that desire multi-threading should set this variable to something
 * appropriate. A recommended value may be found in clock_tick.h. At boot time,
 * if the number of CPUs is greater than clock_tick_threshold, multi-threading
 * kicks in. Note that this is a decision made at boot time. If more CPUs
 * are dynamically added later on to exceed the threshold, no attempt is made
 * to switch to multi-threaded. Similarly, if CPUs are removed dynamically,
 * no attempt is made to switch to single-threaded. This is to keep the
 * implementation simple. Also note that the threshold can be changed for a
 * specific customer configuration via /etc/system.
 *
 * The boot time decision is reflected in clock_tick_single_threaded.
 */
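/*
 * For illustration only (the values below are examples, not shipped
 * defaults): a configuration that wants multi-threaded tick accounting to
 * kick in on systems with more than 64 CPUs, using 16 CPUs per tick set,
 * could add the following to /etc/system and reboot:
 *
 *	set clock_tick_threshold = 64
 *	set clock_tick_ncpus = 16
 */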
/*
 * clock_tick_threshold
 *	If the number of CPUs at boot time exceeds this threshold,
 *	multi-threaded tick accounting kicks in.
 *
 * clock_tick_ncpus
 *	The number of CPUs in a set. Each set is scheduled for tick execution
 *	on a separate processor.
 *
 * clock_tick_single_threaded
 *	Indicates whether or not tick accounting is single-threaded.
 *
 * clock_tick_total_cpus
 *	Total number of online CPUs.
 *
 * clock_tick_cpus
 *	Array of online CPU pointers.
 *
 * clock_tick_cpu
 *	Per-CPU, cache-aligned data structures to facilitate multi-threading.
 *
 * clock_tick_active
 *	Counter that indicates the number of active tick processing softints
 *	in the system.
 *
 * clock_tick_pending
 *	Number of pending ticks that need to be accounted by the softint
 *	handlers.
 *
 * clock_tick_lock
 *	Mutex to synchronize between clock_tick_schedule() and
 *	CPU online/offline.
 *
 * clock_cpu_id
 *	CPU id of the clock() CPU. Used to detect when the clock CPU
 *	is offlined.
 *
 * clock_tick_online_cpuset
 *	CPU set of all online processors that can be X-called.
 *
 * clock_tick_proc_max
 *	Each process is allowed to accumulate a few ticks before checking
 *	for the task CPU time resource limit. We lower the number of calls
 *	to rctl_test() to make tick accounting more scalable. The tradeoff
 *	is that the limit may not get enforced in a timely manner. This is
 *	typically not a problem.
 *
 * clock_tick_set
 *	Per-set structures. Each structure contains the range of CPUs
 *	to be processed for the set.
 *
 * clock_tick_nsets
 *	Number of sets.
 *
 * clock_tick_scan
 *	Where to begin the scan for single-threaded mode. In multi-threaded
 *	mode, the clock_tick_set itself contains a field for this.
 */
int			clock_tick_threshold;
int			clock_tick_ncpus;
int			clock_tick_single_threaded;
int			clock_tick_total_cpus;
cpu_t			*clock_tick_cpus[NCPU];
clock_tick_cpu_t	*clock_tick_cpu[NCPU];
ulong_t			clock_tick_active;
int			clock_tick_pending;
kmutex_t		clock_tick_lock;
processorid_t		clock_cpu_id;
cpuset_t		clock_tick_online_cpuset;
clock_t			clock_tick_proc_max;
clock_tick_set_t	*clock_tick_set;
int			clock_tick_nsets;
int			clock_tick_scan;
ulong_t			clock_tick_intr;

static uint_t	clock_tick_execute(caddr_t, caddr_t);
static void	clock_tick_execute_common(int, int, int, clock_t, int);

#define	CLOCK_TICK_ALIGN	64	/* cache alignment */
/*
 * Clock tick initialization is done in two phases:
 *
 * 1. Before clock_init() is called, clock_tick_init_pre() is called to set
 *    up single-threading so that clock() can begin to do its job.
 *
 * 2. After the slave CPUs are initialized at boot time, we know the number
 *    of CPUs. clock_tick_init_post() is called to set up multi-threading if
 *    required.
 */
void
clock_tick_init_pre(void)
{
	clock_tick_cpu_t	*ctp;
	int			i, n;
	clock_tick_set_t	*csp;
	uintptr_t		buf;
	size_t			size;

	clock_tick_single_threaded = 1;

	size = P2ROUNDUP(sizeof (clock_tick_cpu_t), CLOCK_TICK_ALIGN);
	buf = (uintptr_t)kmem_zalloc(size * NCPU + CLOCK_TICK_ALIGN, KM_SLEEP);
	buf = P2ROUNDUP(buf, CLOCK_TICK_ALIGN);

	/*
	 * Perform initialization in case multi-threading is chosen later.
	 */
	if (&create_softint != NULL) {
		clock_tick_intr = create_softint(LOCK_LEVEL,
		    clock_tick_execute, NULL);
	}

	for (i = 0; i < NCPU; i++, buf += size) {
		ctp = (clock_tick_cpu_t *)buf;
		clock_tick_cpu[i] = ctp;
		mutex_init(&ctp->ct_lock, NULL, MUTEX_DEFAULT, NULL);
		if (&create_softint != NULL) {
			ctp->ct_intr = clock_tick_intr;
		}
	}

	mutex_init(&clock_tick_lock, NULL, MUTEX_DEFAULT, NULL);

	/*
	 * Compute clock_tick_ncpus here. We need it to compute the
	 * maximum number of tick sets we need to support.
	 */
	ASSERT(clock_tick_ncpus >= 0);
	if (clock_tick_ncpus == 0)
		clock_tick_ncpus = CLOCK_TICK_NCPUS;
	if (clock_tick_ncpus > max_ncpus)
		clock_tick_ncpus = max_ncpus;

	/*
	 * Allocate and initialize the tick sets.
	 */
	n = (max_ncpus + clock_tick_ncpus - 1) / clock_tick_ncpus;
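	/*
	 * Illustrative example (numbers chosen for this comment only): with
	 * max_ncpus of 20 and clock_tick_ncpus of 8, n works out to
	 * (20 + 8 - 1) / 8 = 3 tick sets, whose ct_start values are 0, 8 and
	 * 16; once all 20 CPUs are online the sets cover CPU indices 0-7,
	 * 8-15 and 16-19 respectively.
	 */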
	clock_tick_set = kmem_zalloc(sizeof (clock_tick_set_t) * n, KM_SLEEP);
	for (i = 0; i < n; i++) {
		csp = &clock_tick_set[i];
		csp->ct_start = i * clock_tick_ncpus;
		csp->ct_scan = csp->ct_start;
		csp->ct_end = csp->ct_start;
	}
}
void
clock_tick_init_post(void)
{
	/*
	 * If a platform does not provide create_softint() and
	 * invoke_softint(), then we assume single-threaded operation.
	 */
	if (&invoke_softint == NULL)
		clock_tick_threshold = 0;

	ASSERT(clock_tick_threshold >= 0);

	if (clock_tick_threshold == 0)
		clock_tick_threshold = max_ncpus;

	/*
	 * If a platform does not specify a threshold or if the number of CPUs
	 * at boot time does not exceed the threshold, tick accounting remains
	 * single-threaded.
	 */
	if (ncpus <= clock_tick_threshold) {
		clock_tick_ncpus = max_ncpus;
		clock_tick_proc_max = 1;
		return;
	}

	/*
	 * OK. Multi-thread tick processing. If a platform has not specified
	 * the CPU set size for multi-threading, then use the default value.
	 * This value has been arrived at through measurements on large
	 * configuration systems.
	 */
	clock_tick_single_threaded = 0;
	if (clock_tick_proc_max == 0) {
		clock_tick_proc_max = CLOCK_TICK_PROC_MAX;
		if (hires_tick)
			clock_tick_proc_max *= 10;
	}
}
static void
clock_tick_schedule_one(clock_tick_set_t *csp, int pending, processorid_t cid)
{
	clock_tick_cpu_t	*ctp;

	ASSERT(&invoke_softint != NULL);

	atomic_inc_ulong(&clock_tick_active);

	/*
	 * Schedule tick accounting for a set of CPUs.
	 */
	ctp = clock_tick_cpu[cid];
	mutex_enter(&ctp->ct_lock);
	ctp->ct_lbolt = LBOLT_NO_ACCOUNT;
	ctp->ct_pending += pending;
	ctp->ct_start = csp->ct_start;
	ctp->ct_end = csp->ct_end;
	ctp->ct_scan = csp->ct_scan;
	mutex_exit(&ctp->ct_lock);

	invoke_softint(cid, ctp->ct_intr);
	/*
	 * Return without waiting for the softint to finish.
	 */
}
static void
clock_tick_process(cpu_t *cp, clock_t mylbolt, int pending)
{
	kthread_t	*t;
	kmutex_t	*plockp;
	int		notick, intr;
	klwp_id_t	lwp;

	/*
	 * The locking here is rather tricky. thread_free_prevent()
	 * prevents the thread returned from being freed while we
	 * are looking at it. We can then check if the thread
	 * is exiting and get the appropriate p_lock if it
	 * is not. We have to be careful, though, because
	 * the _process_ can still be freed while we've
	 * prevented thread free. To avoid touching the
	 * proc structure we put a pointer to the p_lock in the
	 * thread structure. The p_lock is persistent so we
	 * can acquire it even if the process is gone. At that
	 * point we can check (again) if the thread is exiting
	 * and either drop the lock or do the tick processing.
	 */
	t = cp->cpu_thread;	/* Current running thread */
	if (CPU == cp) {
		/*
		 * 't' will be the tick processing thread on this
		 * CPU. Use the pinned thread (if any) on this CPU
		 * as the target of the clock tick.
		 */
		if (t->t_intr != NULL)
			t = t->t_intr;
	}

	/*
	 * We use thread_free_prevent to keep the currently running
	 * thread from being freed or recycled while we're
	 * looking at it.
	 */
	thread_free_prevent(t);
	/*
	 * We cannot hold the cpu_lock to prevent the
	 * cpu_active list from changing in the clock interrupt.
	 * As long as we don't block (or don't get pre-empted)
	 * the cpu_list will not change (all threads are paused
	 * before list modification).
	 */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		thread_free_allow(t);
		return;
	}

	/*
	 * Make sure the thread is still on the CPU.
	 */
	if ((t != cp->cpu_thread) &&
	    ((cp != CPU) || (t != cp->cpu_thread->t_intr))) {
		/*
		 * We could not locate the thread. Skip this CPU. Race
		 * conditions while performing these checks are benign.
		 * These checks are not perfect and they don't need
		 * to be.
		 */
		thread_free_allow(t);
		return;
	}

	intr = t->t_flag & T_INTR_THREAD;
	lwp = ttolwp(t);
	if (lwp == NULL || (t->t_proc_flag & TP_LWPEXIT) || intr) {
		/*
		 * Thread is exiting (or uninteresting) so don't
		 * do tick processing.
		 */
		thread_free_allow(t);
		return;
	}

	/*
	 * OK, try to grab the process lock. See
	 * comments above for why we're not using
	 * ttoproc(t)->p_lockp here.
	 */
	plockp = t->t_plockp;
	mutex_enter(plockp);
	/* See above comment. */
	if (CLOCK_TICK_CPU_OFFLINE(cp)) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * The thread may have exited between when we
	 * checked above, and when we got the p_lock.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		mutex_exit(plockp);
		thread_free_allow(t);
		return;
	}

	/*
	 * Either we have the p_lock for the thread's process,
	 * or we don't care about the thread structure any more.
	 * Either way we can allow thread free.
	 */
	thread_free_allow(t);

	/*
	 * If we haven't done tick processing for this
	 * lwp, then do it now. Since we don't hold the
	 * lwp down on a CPU it can migrate and show up
	 * more than once, hence the lbolt check. mylbolt
	 * is copied at the time of tick scheduling to prevent
	 * lbolt changes.
	 *
	 * Also, make sure that it's okay to perform the
	 * tick processing before calling clock_tick.
	 * Setting notick to a TRUE value (ie. not 0)
	 * results in tick processing not being performed for
	 * this thread.
	 */
	notick = ((cp->cpu_flags & CPU_QUIESCED) || CPU_ON_INTR(cp) ||
	    (cp->cpu_dispthread == cp->cpu_idle_thread));

	if ((!notick) && (t->t_lbolt < mylbolt)) {
		t->t_lbolt = mylbolt;
		clock_tick(t, pending);
	}

	mutex_exit(plockp);
}
void
clock_tick_schedule(int one_sec)
{
	ulong_t			active;
	int			i, end;
	clock_tick_set_t	*csp;
	cpu_t			*cp;

	if (clock_cpu_id != CPU->cpu_id)
		clock_cpu_id = CPU->cpu_id;

	if (clock_tick_single_threaded) {
		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		end = clock_tick_total_cpus;
		clock_tick_scan++;
		if (clock_tick_scan >= end)
			clock_tick_scan = 0;

		clock_tick_execute_common(0, clock_tick_scan, end,
		    LBOLT_NO_ACCOUNT, 1);

		return;
	}

	/*
	 * If the previous invocation of handlers is not yet finished, then
	 * simply increment a pending count and return. Eventually when they
	 * finish, the pending count is passed down to the next set of
	 * handlers to process. This way, ticks that have already elapsed
	 * in the past are handled as quickly as possible to minimize the
	 * chances of threads getting away before their pending ticks are
	 * accounted. The other benefit is that if the pending count is
	 * more than one, it can be handled by a single invocation of
	 * clock_tick(). This is a good optimization for large configuration
	 * busy systems where tick accounting can get backed up for various
	 * reasons.
	 */
	clock_tick_pending++;
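	/*
	 * Hypothetical example of the above: if two consecutive ticks find
	 * the previously posted handlers still active, the third tick sees
	 * clock_tick_pending == 3 and hands that count down, so a single
	 * clock_tick(t, 3) call accounts for all three elapsed ticks.
	 */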
	active = clock_tick_active;
	active = atomic_cas_ulong(&clock_tick_active, active, active);
	if (active)
		return;

	/*
	 * We want to handle the clock CPU here. If we
	 * scheduled the accounting for the clock CPU to another
	 * processor, that processor will find only the clock() thread
	 * running and not account for any user thread below it. Also,
	 * we want to handle this before we block on anything and allow
	 * the pinned thread below the current thread to escape.
	 */
	clock_tick_process(CPU, LBOLT_NO_ACCOUNT, clock_tick_pending);

	mutex_enter(&clock_tick_lock);

	/*
	 * Schedule each set on a separate processor.
	 */
	cp = clock_cpu_list;
	for (i = 0; i < clock_tick_nsets; i++) {
		csp = &clock_tick_set[i];

		/*
		 * Pick the next online CPU in list for scheduling tick
		 * accounting. The clock_tick_lock is held by the caller.
		 * So, CPU online/offline cannot muck with this while
		 * we are picking our CPU to X-call.
		 */
		if (cp->cpu_id == clock_cpu_id)
			cp = cp->cpu_next_onln;

		/*
		 * Each tick cycle, start the scan from a different
		 * CPU for the sake of fairness.
		 */
		csp->ct_scan++;
		if (csp->ct_scan >= csp->ct_end)
			csp->ct_scan = csp->ct_start;

		clock_tick_schedule_one(csp, clock_tick_pending, cp->cpu_id);

		cp = cp->cpu_next_onln;
	}

	if (one_sec) {
		/*
		 * Move the CPU pointer around every second. This is so
		 * all the CPUs can be X-called in a round-robin fashion
		 * to evenly distribute the X-calls. We don't do this
		 * at a faster rate than this because we don't want
		 * to affect cache performance negatively.
		 */
		clock_cpu_list = clock_cpu_list->cpu_next_onln;
	}

	mutex_exit(&clock_tick_lock);

	clock_tick_pending = 0;
}
static void
clock_tick_execute_common(int start, int scan, int end, clock_t mylbolt,
	int pending)
{
	cpu_t	*cp;
	int	i;

	ASSERT((start <= scan) && (scan <= end));

	/*
	 * Handle the thread on current CPU first. This is to prevent a
	 * pinned thread from escaping if we ever block on something.
	 * Note that in the single-threaded mode, this handles the clock
	 * CPU.
	 */
	clock_tick_process(CPU, mylbolt, pending);

	/*
	 * Perform tick accounting for the threads running on
	 * the scheduled CPUs.
	 */
	for (i = scan; i < end; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}

	for (i = start; i < scan; i++) {
		cp = clock_tick_cpus[i];
		if ((cp == NULL) || (cp == CPU) || (cp->cpu_id == clock_cpu_id))
			continue;
		clock_tick_process(cp, mylbolt, pending);
	}
}
/*ARGSUSED*/
static uint_t
clock_tick_execute(caddr_t arg1, caddr_t arg2)
{
	clock_tick_cpu_t	*ctp;
	int			start, scan, end, pending;
	clock_t			mylbolt;

	/*
	 * We could have raced with cpu offline. We don't want to
	 * process anything on an offlined CPU. If we got blocked
	 * on anything, we may not get scheduled when we wakeup
	 * later on.
	 */
	if (!CLOCK_TICK_XCALL_SAFE(CPU))
		goto out;

	ctp = clock_tick_cpu[CPU->cpu_id];

	mutex_enter(&ctp->ct_lock);
	pending = ctp->ct_pending;
	if (pending == 0) {
		/*
		 * If a CPU is busy at LOCK_LEVEL, then an invocation
		 * of this softint may be queued for some time. In that case,
		 * clock_tick_active will not be incremented.
		 * clock_tick_schedule() will then assume that the previous
		 * invocation is done and post a new softint. The first one
		 * that gets in will reset the pending count so the
		 * second one is a noop.
		 */
		mutex_exit(&ctp->ct_lock);
		goto out;
	}
	ctp->ct_pending = 0;
	start = ctp->ct_start;
	end = ctp->ct_end;
	scan = ctp->ct_scan;
	mylbolt = ctp->ct_lbolt;
	mutex_exit(&ctp->ct_lock);

	clock_tick_execute_common(start, scan, end, mylbolt, pending);

out:
	/*
	 * Signal completion to the clock handler.
	 */
	atomic_dec_ulong(&clock_tick_active);

	return (1);
}
/*ARGSUSED*/
static int
clock_tick_cpu_setup(cpu_setup_t what, int cid, void *arg)
{
	cpu_t			*cp, *ncp;
	int			i, set;
	clock_tick_set_t	*csp;

	/*
	 * This function performs some computations at CPU offline/online
	 * time. The computed values are used during tick scheduling and
	 * execution phases. This avoids having to compute things on
	 * an every tick basis. The other benefit is that we perform the
	 * computations only for onlined CPUs (not offlined ones). As a
	 * result, no tick processing is attempted for offlined CPUs.
	 *
	 * Also, cpu_offline() calls this function before checking for
	 * active interrupt threads. This allows us to avoid posting
	 * cross calls to CPUs that are being offlined.
	 */
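	/*
	 * Worked example with illustrative numbers: with clock_tick_ncpus set
	 * to 8, onlining the 17th CPU stores it at clock_tick_cpus[16], maps
	 * it to set 16 / 8 = 2, and raises clock_tick_nsets to
	 * (17 + 8 - 1) / 8 = 3.
	 */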
	cp = cpu[cid];

	mutex_enter(&clock_tick_lock);

	switch (what) {
	case CPU_ON:
		clock_tick_cpus[clock_tick_total_cpus] = cp;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end++;
		clock_tick_total_cpus++;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		CPUSET_ADD(clock_tick_online_cpuset, cp->cpu_id);
		break;

	case CPU_OFF:
		if (&sync_softint != NULL)
			sync_softint(clock_tick_online_cpuset);
		CPUSET_DEL(clock_tick_online_cpuset, cp->cpu_id);
		clock_tick_total_cpus--;
		clock_tick_cpus[clock_tick_total_cpus] = NULL;
		clock_tick_nsets =
		    (clock_tick_total_cpus + clock_tick_ncpus - 1) /
		    clock_tick_ncpus;
		set = clock_tick_total_cpus / clock_tick_ncpus;
		csp = &clock_tick_set[set];
		csp->ct_end--;

		/*
		 * Rebuild the list of online CPUs for tick processing.
		 */
		i = 0;
		ncp = cpu_active;
		do {
			clock_tick_cpus[i] = ncp;
			i++;
		} while ((ncp = ncp->cpu_next_onln) != cpu_active);
		ASSERT(i == clock_tick_total_cpus);
		break;

	default:
		break;
	}

	mutex_exit(&clock_tick_lock);

	return (0);
}
void
clock_tick_mp_init(void)
{
	cpu_t	*cp;

	mutex_enter(&cpu_lock);

	cp = cpu_active;
	do {
		(void) clock_tick_cpu_setup(CPU_ON, cp->cpu_id, NULL);
	} while ((cp = cp->cpu_next_onln) != cpu_active);

	register_cpu_setup_func(clock_tick_cpu_setup, NULL);

	mutex_exit(&cpu_lock);
}