/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/x_call.h>
#include <sys/xc_levels.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/mutex_impl.h>
#include <sys/stack.h>
#include <sys/promif.h>
#include <sys/x86_archext.h>
/*
 * Implementation for cross-processor calls via interprocessor interrupts
 *
 * This implementation uses a message passing architecture to allow multiple
 * concurrent cross calls to be in flight at any given time. We use the cmpxchg
 * instruction, aka atomic_cas_ptr(), to implement simple efficient work
 * queues for message passing between CPUs with almost no need for regular
 * locking. See xc_extract() and xc_insert() below.
 *
 * The general idea is that initiating a cross call means putting a message
 * on a target(s) CPU's work queue. Any synchronization is handled by passing
 * the message back and forth between initiator and target(s).
 *
 * Every CPU has xc_work_cnt, which indicates it has messages to process.
 * This value is incremented as message traffic is initiated and decremented
 * with every message that finishes all processing.
 *
 * The code needs no mfence or other membar_*() calls. The uses of
 * atomic_cas_ptr(), atomic_cas_32() and atomic_dec_32() for the message
 * passing are implemented with LOCK prefix instructions which are
 * equivalent to mfence.
 *
 * One interesting aspect of this implementation is that it allows 2 or more
 * CPUs to initiate cross calls to intersecting sets of CPUs at the same time.
 * The cross call processing by the CPUs will happen in any order with only
 * a guarantee, for xc_call() and xc_sync(), that an initiator won't return
 * from cross calls before all slaves have invoked the function.
 *
 * The reason for this asynchronous approach is to allow for fast global
 * TLB shootdowns. If all CPUs, say N, tried to do a global TLB invalidation
 * on a different Virtual Address at the same time, the old code required
 * N squared IPIs. With this method, depending on timing, it could happen
 * with as few as N IPIs.
 */
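
/*
 * To make the message flow concrete, here is a sketch (informal, derived
 * from xc_common() and xc_serv() below) of a single xc_call() to one
 * remote CPU:
 *
 *   initiator:	take a FREE message from its own xc_free list, mark it
 *		CALL, bump both work counts, insert it into the target's
 *		xc_msgbox and send an IPI if the target had no pending work.
 *   target:	extract the message in xc_serv(), invoke the function, mark
 *		the message DONE and insert it back into the initiator's
 *		xc_msgbox.
 *   initiator:	spin in xc_serv() until its own work count drains,
 *		collecting the DONE message and returning it to xc_free.
 */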
/*
 * The default is to not enable collecting counts of IPI information, since
 * the updating of shared cachelines could cause excess bus traffic.
 */
uint_t xc_collect_enable = 0;
uint64_t xc_total_cnt = 0;	/* total #IPIs sent for cross calls */
uint64_t xc_multi_cnt = 0;	/* # times we piggy backed on another IPI */
/*
 * Values for message states. Here are the normal transitions. A transition
 * of "->" happens in the slave cpu and "=>" happens in the master cpu as
 * the messages are passed back and forth.
 *
 * FREE => ASYNC -> DONE => FREE
 * FREE => CALL -> DONE => FREE
 * FREE => SYNC -> WAITING => RELEASED -> DONE => FREE
 *
 * The interesting one above is ASYNC. You might ask, why not go directly
 * to FREE, instead of DONE? If it did that, it might be possible to exhaust
 * the master's xc_free list if a master can generate ASYNC messages faster
 * than the slave can process them. That could be handled with more
 * complicated handling. However since nothing important uses ASYNC, I've
 * not bothered.
 */
#define	XC_MSG_FREE	(0)	/* msg in xc_free queue */
#define	XC_MSG_ASYNC	(1)	/* msg in slave xc_msgbox */
#define	XC_MSG_CALL	(2)	/* msg in slave xc_msgbox */
#define	XC_MSG_SYNC	(3)	/* msg in slave xc_msgbox */
#define	XC_MSG_WAITING	(4)	/* msg in master xc_msgbox or xc_waiters */
#define	XC_MSG_RELEASED	(5)	/* msg in slave xc_msgbox */
#define	XC_MSG_DONE	(6)	/* msg in master xc_msgbox */
/*
 * We allow for one high priority message at a time to happen in the system.
 * This is used for panic, kmdb, etc., so no locking is done.
 */
static volatile cpuset_t xc_priority_set_store;
static volatile ulong_t *xc_priority_set = CPUSET2BV(xc_priority_set_store);
static xc_data_t xc_priority_data;
/*
 * Wrappers to avoid C compiler warnings due to volatile. The atomic bit
 * operations don't accept volatile bit vectors - which is a bit silly.
 */
#define	XC_BT_SET(vector, b)	BT_ATOMIC_SET((ulong_t *)(vector), (b))
#define	XC_BT_CLEAR(vector, b)	BT_ATOMIC_CLEAR((ulong_t *)(vector), (b))
/*
 * Decrement a CPU's work count
 */
static void
xc_decrement(struct machcpu *mcpu)
{
	atomic_dec_32(&mcpu->xc_work_cnt);
}

/*
 * Increment a CPU's work count and return the old value
 */
static int
xc_increment(struct machcpu *mcpu)
{
	int old;

	do {
		old = mcpu->xc_work_cnt;
	} while (atomic_cas_32(&mcpu->xc_work_cnt, old, old + 1) != old);
	return (old);
}
/*
 * Put a message into a queue. The insertion is atomic no matter
 * how many different inserts/extracts to the same queue happen.
 */
static void
xc_insert(void *queue, xc_msg_t *msg)
{
	xc_msg_t *old_head;

	/*
	 * FREE messages should only ever be getting inserted into
	 * the xc_master CPU's xc_free queue.
	 */
	ASSERT(msg->xc_command != XC_MSG_FREE ||
	    cpu[msg->xc_master] == NULL || /* possible only during init */
	    queue == &cpu[msg->xc_master]->cpu_m.xc_free);

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		msg->xc_next = old_head;
	} while (atomic_cas_ptr(queue, old_head, msg) != old_head);
}
/*
 * Extract a message from a queue. The extraction is atomic only
 * when just one thread does extractions from the queue.
 * If the queue is empty, NULL is returned.
 */
static xc_msg_t *
xc_extract(xc_msg_t **queue)
{
	xc_msg_t *old_head;

	do {
		old_head = (xc_msg_t *)*(volatile xc_msg_t **)queue;
		if (old_head == NULL)
			return (old_head);
	} while (atomic_cas_ptr(queue, old_head, old_head->xc_next) !=
	    old_head);
	old_head->xc_next = NULL;
	return (old_head);
}
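
/*
 * Usage sketch (illustrative): any CPU may xc_insert() a message onto
 * another CPU's xc_msgbox, but only the owning CPU ever xc_extract()s from
 * its own msgbox (see xc_serv() below), which is what satisfies the
 * single-extractor restriction noted above:
 *
 *	xc_insert(&cpup->cpu_m.xc_msgbox, msg);		sender, any CPU
 *	msg = xc_extract(&mcpup->xc_msgbox);		receiver, owner only
 */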
/*
 * Initialize the machcpu fields used for cross calls
 */
static uint_t xc_initialized = 0;
void
xc_init_cpu(struct cpu *cpup)
{
	xc_msg_t *msg;
	int c;

	/*
	 * Allocate message buffers for the new CPU.
	 */
	for (c = 0; c < max_ncpus; ++c) {
		if (plat_dr_support_cpu()) {
			/*
			 * Allocate a message buffer for every CPU possible
			 * in system, including our own, and add them to our xc
			 * message queue.
			 */
			msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
			msg->xc_command = XC_MSG_FREE;
			msg->xc_master = cpup->cpu_id;
			xc_insert(&cpup->cpu_m.xc_free, msg);
		} else if (cpu[c] != NULL && cpu[c] != cpup) {
			/*
			 * Add a new message buffer to each existing CPU's free
			 * list, as well as one for my list for each of them.
			 * Note: cpu0 is statically inserted into cpu[] array,
			 * so need to check cpu[c] isn't cpup itself to avoid
			 * allocating extra message buffers for cpu0.
			 */
			msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
			msg->xc_command = XC_MSG_FREE;
			msg->xc_master = c;
			xc_insert(&cpu[c]->cpu_m.xc_free, msg);

			msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
			msg->xc_command = XC_MSG_FREE;
			msg->xc_master = cpup->cpu_id;
			xc_insert(&cpup->cpu_m.xc_free, msg);
		}
	}

	if (!plat_dr_support_cpu()) {
		/*
		 * Add one for self messages if CPU hotplug is disabled.
		 */
		msg = kmem_zalloc(sizeof (*msg), KM_SLEEP);
		msg->xc_command = XC_MSG_FREE;
		msg->xc_master = cpup->cpu_id;
		xc_insert(&cpup->cpu_m.xc_free, msg);
	}

	if (!xc_initialized)
		xc_initialized = 1;
}
void
xc_fini_cpu(struct cpu *cpup)
{
	xc_msg_t *msg;

	ASSERT((cpup->cpu_flags & CPU_READY) == 0);
	ASSERT(cpup->cpu_m.xc_msgbox == NULL);
	ASSERT(cpup->cpu_m.xc_work_cnt == 0);

	while ((msg = xc_extract(&cpup->cpu_m.xc_free)) != NULL) {
		kmem_free(msg, sizeof (*msg));
	}
}
#define	XC_FLUSH_MAX_WAITS	1000

/* Flush inflight message buffers. */
int
xc_flush_cpu(struct cpu *cpup)
{
	int i;

	ASSERT((cpup->cpu_flags & CPU_READY) == 0);

	/*
	 * Pause all working CPUs, which ensures that there's no CPU in
	 * function xc_common().
	 * This is used to work around a race condition window in xc_common()
	 * between checking CPU_READY flag and increasing working item count.
	 */
	pause_cpus(cpup);
	start_cpus();

	for (i = 0; i < XC_FLUSH_MAX_WAITS; i++) {
		if (cpup->cpu_m.xc_work_cnt == 0) {
			break;
		}
		DELAY(1);
	}
	for (; i < XC_FLUSH_MAX_WAITS; i++) {
		if (!BT_TEST(xc_priority_set, cpup->cpu_id)) {
			break;
		}
		DELAY(1);
	}

	return (i >= XC_FLUSH_MAX_WAITS ? ETIME : 0);
}
/*
 * X-call message processing routine. Note that this is used by both
 * senders and recipients of messages.
 *
 * We're protected against changing CPUs by either being in a high-priority
 * interrupt, having preemption disabled or by having a raised SPL.
 */
/*ARGSUSED*/
uint_t
xc_serv(caddr_t arg1, caddr_t arg2)
{
	struct machcpu *mcpup = &(CPU->cpu_m);
	xc_msg_t *msg;
	xc_data_t *data;
	xc_msg_t *xc_waiters = NULL;
	uint32_t num_waiting = 0;
	xc_func_t func;
	xc_arg_t a1;
	xc_arg_t a2;
	xc_arg_t a3;
	uint_t rc = DDI_INTR_UNCLAIMED;
	while (mcpup->xc_work_cnt != 0) {
		rc = DDI_INTR_CLAIMED;

		/*
		 * We may have to wait for a message to arrive.
		 */
		for (msg = NULL; msg == NULL;
		    msg = xc_extract(&mcpup->xc_msgbox)) {

			/*
			 * Always check for and handle a priority message.
			 */
			if (BT_TEST(xc_priority_set, CPU->cpu_id)) {
				func = xc_priority_data.xc_func;
				a1 = xc_priority_data.xc_a1;
				a2 = xc_priority_data.xc_a2;
				a3 = xc_priority_data.xc_a3;
				XC_BT_CLEAR(xc_priority_set, CPU->cpu_id);
				xc_decrement(mcpup);
				func(a1, a2, a3);
				if (mcpup->xc_work_cnt == 0)
					return (rc);
			}

			/*
			 * wait for a message to arrive
			 */
			SMT_PAUSE();
		}

		/*
		 * process the message
		 */
		switch (msg->xc_command) {
		/*
		 * ASYNC gives back the message immediately, then we do the
		 * function and return with no more waiting.
		 */
		case XC_MSG_ASYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			func = data->xc_func;
			a1 = data->xc_a1;
			a2 = data->xc_a2;
			a3 = data->xc_a3;
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			if (func != NULL)
				(void) (*func)(a1, a2, a3);
			xc_decrement(mcpup);
			break;
		/*
		 * SYNC messages do the call, then send it back to the master
		 * in WAITING mode.
		 */
		case XC_MSG_SYNC:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			msg->xc_command = XC_MSG_WAITING;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			break;
		/*
		 * WAITING messages are collected by the master until all
		 * have arrived. Once all arrive, we release them back to
		 * the slaves.
		 */
		case XC_MSG_WAITING:
			xc_insert(&xc_waiters, msg);
			if (++num_waiting < mcpup->xc_wait_cnt)
				break;
			while ((msg = xc_extract(&xc_waiters)) != NULL) {
				msg->xc_command = XC_MSG_RELEASED;
				xc_insert(&cpu[msg->xc_slave]->cpu_m.xc_msgbox,
				    msg);
				--num_waiting;
			}
			if (num_waiting != 0)
				panic("wrong number waiting");
			mcpup->xc_wait_cnt = 0;
			break;
		/*
		 * CALL messages do the function and then, like RELEASED,
		 * send the message back to the master as DONE.
		 */
		case XC_MSG_CALL:
			data = &cpu[msg->xc_master]->cpu_m.xc_data;
			if (data->xc_func != NULL)
				(void) (*data->xc_func)(data->xc_a1,
				    data->xc_a2, data->xc_a3);
			/*FALLTHROUGH*/
		case XC_MSG_RELEASED:
			msg->xc_command = XC_MSG_DONE;
			xc_insert(&cpu[msg->xc_master]->cpu_m.xc_msgbox, msg);
			xc_decrement(mcpup);
			break;
		/*
		 * DONE means a slave has completely finished up.
		 * Once we collect all the DONE messages, we'll exit
		 * processing too.
		 */
		case XC_MSG_DONE:
			msg->xc_command = XC_MSG_FREE;
			xc_insert(&mcpup->xc_free, msg);
			xc_decrement(mcpup);
			break;

		case XC_MSG_FREE:
			panic("free message 0x%p in msgbox", (void *)msg);
			break;

		default:
			panic("bad message 0x%p in msgbox", (void *)msg);
			break;
		}
	}
	return (rc);
}
/*
 * Initiate cross call processing.
 */
static void
xc_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	uint_t command)
{
	int c;
	struct cpu *cpup;
	xc_msg_t *msg;
	xc_data_t *data;
	int cnt;
	int save_spl;

	if (!xc_initialized) {
		if (BT_TEST(set, CPU->cpu_id) && (CPU->cpu_flags & CPU_READY) &&
		    func != NULL)
			(void) (*func)(arg1, arg2, arg3);
		return;
	}

	save_spl = splr(ipltospl(XC_HI_PIL));
	/*
	 * fill in cross call data
	 */
	data = &CPU->cpu_m.xc_data;
	data->xc_func = func;
	data->xc_a1 = arg1;
	data->xc_a2 = arg2;
	data->xc_a3 = arg3;
	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 */
	CPU->cpu_m.xc_wait_cnt = 0;
	for (c = 0; c < max_ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * Fill out a new message.
		 */
		msg = xc_extract(&CPU->cpu_m.xc_free);
		if (msg == NULL)
			panic("Ran out of free xc_msg_t's");
		msg->xc_command = command;
		if (msg->xc_master != CPU->cpu_id)
			panic("msg %p has wrong xc_master", (void *)msg);
		msg->xc_slave = c;
		/*
		 * Increment my work count for all messages that I'll
		 * transition from DONE to FREE.
		 * Also remember how many XC_MSG_WAITINGs to look for
		 */
		(void) xc_increment(&CPU->cpu_m);
		if (command == XC_MSG_SYNC)
			++CPU->cpu_m.xc_wait_cnt;
		/*
		 * Increment the target CPU work count then insert the message
		 * in the target msgbox. If I post the first bit of work
		 * for the target to do, send an IPI to the target CPU.
		 */
		cnt = xc_increment(&cpup->cpu_m);
		xc_insert(&cpup->cpu_m.xc_msgbox, msg);
		if (cpup != CPU) {
			if (cnt == 0) {
				CPU_STATS_ADDQ(CPU, sys, xcalls, 1);
				send_dirint(c, XC_HI_PIL);
				if (xc_collect_enable)
					++xc_total_cnt;
			} else if (xc_collect_enable) {
				++xc_multi_cnt;
			}
		}
	}
	/*
	 * Now drop into the message handler until all work is done
	 */
	(void) xc_serv(NULL, NULL);
	splx(save_spl);
}
/*
 * Push out a priority cross call.
 */
static void
xc_priority_common(
	xc_func_t func,
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set)
{
	int i;
	int c;
	struct cpu *cpup;

	/*
	 * Wait briefly for any previous xc_priority to have finished.
	 */
	for (c = 0; c < max_ncpus; ++c) {
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY))
			continue;

		/*
		 * The value of 40000 here is from old kernel code. It
		 * really should be changed to some time based value, since
		 * under a hypervisor, there's no guarantee a remote CPU
		 * is even scheduled.
		 */
		for (i = 0; BT_TEST(xc_priority_set, c) && i < 40000; ++i)
			SMT_PAUSE();

		/*
		 * Some CPU did not respond to a previous priority request. It's
		 * probably deadlocked with interrupts blocked or some such
		 * problem. We'll just erase the previous request - which was
		 * most likely a kmdb_enter that has already expired - and plow
		 * ahead.
		 */
		if (BT_TEST(xc_priority_set, c)) {
			XC_BT_CLEAR(xc_priority_set, c);
			if (cpup->cpu_m.xc_work_cnt > 0)
				xc_decrement(&cpup->cpu_m);
		}
	}
	/*
	 * fill in cross call data
	 */
	xc_priority_data.xc_func = func;
	xc_priority_data.xc_a1 = arg1;
	xc_priority_data.xc_a2 = arg2;
	xc_priority_data.xc_a3 = arg3;
	/*
	 * Post messages to all CPUs involved that are CPU_READY
	 * We'll always IPI, plus bang on the xc_msgbox for i86_mwait()
	 */
	for (c = 0; c < max_ncpus; ++c) {
		if (!BT_TEST(set, c))
			continue;
		cpup = cpu[c];
		if (cpup == NULL || !(cpup->cpu_flags & CPU_READY) ||
		    cpup == CPU)
			continue;
		(void) xc_increment(&cpup->cpu_m);
		XC_BT_SET(xc_priority_set, c);
		send_dirint(c, XC_HI_PIL);
		for (i = 0; i < 10; ++i) {
			(void) atomic_cas_ptr(&cpup->cpu_m.xc_msgbox,
			    cpup->cpu_m.xc_msgbox, cpup->cpu_m.xc_msgbox);
		}
	}
}
/*
 * Do cross call to all other CPUs with absolutely no waiting or handshaking.
 * This should only be used for extraordinary operations, like panic(), which
 * need to work, in some fashion, in a not completely functional system.
 * All other uses that want minimal waiting should use xc_call_nowait().
 */
void
xc_priority(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	extern int IGNORE_KERNEL_PREEMPTION;
	int save_spl = splr(ipltospl(XC_HI_PIL));
	int save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;

	IGNORE_KERNEL_PREEMPTION = 1;
	xc_priority_common((xc_func_t)func, arg1, arg2, arg3, set);
	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
	splx(save_spl);
}
/*
 * Wrapper for kmdb to capture other CPUs, causing them to enter the debugger.
 */
void
kdi_xc_others(int this_cpu, void (*func)(void))
{
	extern int IGNORE_KERNEL_PREEMPTION;
	int save_kernel_preemption;
	cpuset_t set;

	if (!xc_initialized)
		return;

	save_kernel_preemption = IGNORE_KERNEL_PREEMPTION;
	IGNORE_KERNEL_PREEMPTION = 1;
	CPUSET_ALL_BUT(set, this_cpu);
	xc_priority_common((xc_func_t)func, 0, 0, 0, CPUSET2BV(set));
	IGNORE_KERNEL_PREEMPTION = save_kernel_preemption;
}
/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call_nowait() may return immediately too.
 */
void
xc_call_nowait(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_ASYNC);
}
/*
 * Invoke function on specified processors. Remotes may continue after
 * service with no waiting. xc_call() returns only after remotes have finished.
 */
void
xc_call(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_CALL);
}
/*
 * Invoke function on specified processors. Remotes wait until all have
 * finished. xc_sync() also waits until all remotes have finished.
 */
void
xc_sync(
	xc_arg_t arg1,
	xc_arg_t arg2,
	xc_arg_t arg3,
	ulong_t *set,
	xc_func_t func)
{
	xc_common(func, arg1, arg2, arg3, set, XC_MSG_SYNC);
}
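
/*
 * Usage sketch (illustrative): callers typically build a cpuset_t of the
 * target CPUs and pass it through CPUSET2BV(). For example, to run a
 * hypothetical handler my_func() on every CPU in the set and wait for all
 * of them to finish:
 *
 *	cpuset_t set;
 *
 *	CPUSET_ALL(set);
 *	xc_call((xc_arg_t)arg1, 0, 0, CPUSET2BV(set), my_func);
 *
 * xc_call_nowait() and xc_sync() take the same arguments; the three differ
 * only in the message type (XC_MSG_ASYNC, XC_MSG_CALL, XC_MSG_SYNC) and
 * therefore in how much handshaking is done with the remote CPUs.
 */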