/*
 * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.10 2005/04/18 01:02:58 dillon Exp $
 */

/*
 * This module implements IPI message queueing and the MI portion of IPI
 * message processing.
 */

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/queue.h>
#include <sys/thread2.h>
#include <sys/sysctl.h>
#include <sys/kthread.h>
#include <machine/cpu.h>
#include <sys/lock.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pager.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <machine/stdarg.h>
#include <machine/ipl.h>
#include <machine/smp.h>
#include <machine/atomic.h>

#define THREAD_STACK	(UPAGES * PAGE_SIZE)

#else	/* !_KERNEL: userland libcaps build */

#include <sys/stdint.h>
#include <libcaps/thread.h>
#include <sys/thread.h>
#include <sys/msgport.h>
#include <sys/errno.h>
#include <libcaps/globaldata.h>
#include <machine/cpufunc.h>
#include <sys/thread2.h>
#include <sys/msgport2.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <machine/lock.h>
#include <machine/cpu.h>
#include <machine/atomic.h>

#endif	/* _KERNEL */

#ifdef SMP

static __int64_t ipiq_count;	/* total calls to lwkt_send_ipiq*() */
static __int64_t ipiq_fifofull;	/* number of fifo full conditions detected */
static __int64_t ipiq_avoided;	/* interlock with target avoids cpu ipi */
static __int64_t ipiq_passive;	/* passive IPI messages */
static __int64_t ipiq_cscount;	/* number of cpu synchronizations */
static int ipiq_optimized = 1;	/* XXX temporary sysctl */

#ifdef _KERNEL
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_avoided, CTLFLAG_RW, &ipiq_avoided, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_passive, CTLFLAG_RW, &ipiq_passive, 0, "");
SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0, "");
#endif

static int lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame);
static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);

/*
 * Send a function execution request to another cpu.  The request is queued
 * on the cpu<->cpu ipiq matrix.  Each cpu owns a unique ipiq FIFO for every
 * possible target cpu.  The FIFO can be written.
 *
 * If the FIFO fills up we have to enable interrupts to avoid an APIC
 * deadlock and process pending IPIQs while waiting for it to empty.
 * Otherwise we may soft-deadlock with another cpu whose FIFO is also full.
 *
 * We can safely bump gd_intr_nesting_level because our crit_exit() at the
 * end will take care of any pending interrupts.
 *
 * The actual hardware IPI is avoided if the target cpu is already processing
 * the queue from a prior IPI.  It is possible to pipeline IPI messages
 * very quickly between cpus due to the FIFO hysteresis.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    if (target == gd) {
	func(arg);
	return(0);
    }
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * Do not allow the FIFO to become full.  Interrupts must be physically
     * enabled while we liveloop to avoid deadlocking the APIC.
     */
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	unsigned int eflags = read_eflags();

	if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0)
	    cpu_send_ipiq(target->gd_cpuid);
	cpu_enable_intr();
	++ipiq_fifofull;
	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	    lwkt_process_ipiq();
	}
	write_eflags(eflags);
    }

    /*
     * Queue the new message
     */
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    --gd->gd_intr_nesting_level;

    /*
     * Signal the target cpu that there is work pending.
     */
    if (atomic_poll_acquire_int(&ip->ip_npoll)) {
	cpu_send_ipiq(target->gd_cpuid);
    } else {
	if (ipiq_optimized == 0)
	    cpu_send_ipiq(target->gd_cpuid);
	else
	    ++ipiq_avoided;
    }
    crit_exit();
    return(ip->ip_windex);
}
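
/*
 * Example usage (an illustrative sketch only; ipi_bump and my_counter are
 * hypothetical names, not part of this module).  The handler runs on the
 * target cpu in IPI context:
 *
 *	static void
 *	ipi_bump(void *arg)
 *	{
 *	    ++*(long *)arg;
 *	}
 *
 *	lwkt_send_ipiq(globaldata_find(1), ipi_bump, &my_counter);
 */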

/*
 * Similar to lwkt_send_ipiq() but this function does not actually initiate
 * the IPI to the target cpu unless the FIFO has become too full, so it is
 * very fast.
 *
 * This function is used for non-critical IPI messages, such as memory
 * deallocations.  The queue will typically be flushed by the target cpu at
 * the next clock interrupt.
 *
 * Need not be called from a critical section.
 */
int
lwkt_send_ipiq_passive(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    KKASSERT(target != gd);
    crit_enter();
    ++gd->gd_intr_nesting_level;
#ifdef INVARIANTS
    if (gd->gd_intr_nesting_level > 20)
	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
#endif
    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    ++ipiq_count;
    ++ipiq_passive;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    /*
     * Do not allow the FIFO to become full.  Interrupts must be physically
     * enabled while we liveloop to avoid deadlocking the APIC.
     */
    if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	unsigned int eflags = read_eflags();

	if (atomic_poll_acquire_int(&ip->ip_npoll) || ipiq_optimized == 0)
	    cpu_send_ipiq(target->gd_cpuid);
	cpu_enable_intr();
	++ipiq_fifofull;
	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	    KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	    lwkt_process_ipiq();
	}
	write_eflags(eflags);
    }

    /*
     * Queue the new message
     */
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;
    --gd->gd_intr_nesting_level;

    /*
     * Do not signal the target cpu, it will pick up the IPI when it next
     * polls (typically on the next tick).
     */
    crit_exit();
    return(ip->ip_windex);
}
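
/*
 * Example usage (illustrative sketch; remote_free, ptr_owner_gd and ptr are
 * hypothetical).  Passive sends suit deferrable work such as returning
 * memory to its owning cpu; the target flushes the queue on its next poll:
 *
 *	lwkt_send_ipiq_passive(ptr_owner_gd, remote_free, ptr);
 */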

/*
 * Send an IPI request without blocking, return 0 on success, ENOENT on
 * failure.  The actual queueing of the hardware IPI may still force us
 * to spin and process incoming IPIs but that will eventually go away
 * when we've gotten rid of the other general IPIs.
 */
int
lwkt_send_ipiq_nowait(globaldata_t target, ipifunc_t func, void *arg)
{
    lwkt_ipiq_t ip;
    int windex;
    struct globaldata *gd = mycpu;

    KKASSERT(curthread->td_pri >= TDPRI_CRIT);
    if (target == gd) {
	func(arg);
	return(0);
    }
    ++ipiq_count;
    ip = &gd->gd_ipiq[target->gd_cpuid];

    if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3)
	return(ENOENT);
    windex = ip->ip_windex & MAXCPUFIFO_MASK;
    ip->ip_func[windex] = (ipifunc2_t)func;
    ip->ip_arg[windex] = arg;
    cpu_mb1();
    ++ip->ip_windex;

    /*
     * This isn't a passive IPI, we still have to signal the target cpu.
     */
    if (atomic_poll_acquire_int(&ip->ip_npoll)) {
	cpu_send_ipiq(target->gd_cpuid);
    } else {
	if (ipiq_optimized == 0)
	    cpu_send_ipiq(target->gd_cpuid);
	else
	    ++ipiq_avoided;
    }
    return(0);
}
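
/*
 * Example usage (illustrative sketch; try_remote_wakeup and td are
 * hypothetical).  Because the nowait form fails with ENOENT when the FIFO
 * is nearly full, callers must check the return value and fall back:
 *
 *	if (lwkt_send_ipiq_nowait(target, try_remote_wakeup, td) == ENOENT)
 *	    lwkt_send_ipiq(target, try_remote_wakeup, td);
 */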

/*
 * deprecated, used only by fast int forwarding.
 */
int
lwkt_send_ipiq_bycpu(int dcpu, ipifunc_t func, void *arg)
{
    return(lwkt_send_ipiq(globaldata_find(dcpu), func, arg));
}

/*
 * Send a message to several target cpus.  Typically used for scheduling.
 * The message will not be sent to stopped cpus.
 */
int
lwkt_send_ipiq_mask(u_int32_t mask, ipifunc_t func, void *arg)
{
    int cpuid;
    int count = 0;

    mask &= ~stopped_cpus;
    while (mask) {
	cpuid = bsfl(mask);
	lwkt_send_ipiq(globaldata_find(cpuid), func, arg);
	mask &= ~(1 << cpuid);
	++count;
    }
    return(count);
}
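
/*
 * Example usage (illustrative sketch; need_resched_remote is hypothetical).
 * Broadcast to every other running cpu:
 *
 *	lwkt_send_ipiq_mask(mycpu->gd_other_cpus & smp_active_mask,
 *			    need_resched_remote, NULL);
 */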

/*
 * Wait for the remote cpu to finish processing a function.
 *
 * YYY we have to enable interrupts and process the IPIQ while waiting
 * for it to empty or we may deadlock with another cpu.  Create a CPU_*()
 * function to do this!  YYY we really should 'block' here.
 *
 * MUST be called from a critical section.  This routine may be called
 * from an interrupt (for example, if an interrupt wakes a foreign thread
 * up).
 */
void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    lwkt_ipiq_t ip;
    int maxc = 100000000;

    if (target != mycpu) {
	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	if ((int)(ip->ip_xindex - seq) < 0) {
	    unsigned int eflags = read_eflags();

	    cpu_enable_intr();
	    while ((int)(ip->ip_xindex - seq) < 0) {
		crit_enter();
		lwkt_process_ipiq();
		crit_exit();
		if (--maxc == 0)
		    printf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n",
			mycpu->gd_cpuid, target->gd_cpuid,
			ip->ip_xindex - seq);
		if (maxc < -1000000)
		    panic("LWKT_WAIT_IPIQ");
	    }
	    write_eflags(eflags);
	}
    }
}
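
/*
 * Example usage (illustrative sketch; some_func and some_arg are
 * hypothetical).  The sequence number returned by lwkt_send_ipiq()
 * identifies the queued message, so a caller can block until the target
 * has actually executed it:
 *
 *	int seq = lwkt_send_ipiq(target, some_func, some_arg);
 *	lwkt_wait_ipiq(target, seq);
 */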

int
lwkt_seq_ipiq(globaldata_t target)
{
    lwkt_ipiq_t ip;

    ip = &mycpu->gd_ipiq[target->gd_cpuid];
    return(ip->ip_windex);
}

/*
 * Called from IPI interrupt (like a fast interrupt), which has placed
 * us in a critical section.  The MP lock may or may not be held.
 * May also be called from doreti or splz, or be reentrantly called
 * indirectly through the ip_func[] we run.
 *
 * There are two versions, one where no interrupt frame is available (when
 * called from the send code and from splz), and one where an interrupt
 * frame is available.
 */
void
lwkt_process_ipiq(void)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
	if (n != gd->gd_cpuid) {
	    ip = globaldata_find(n)->gd_ipiq;
	    if (ip != NULL) {
		while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], NULL))
		    ;
	    }
	}
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq1(&gd->gd_cpusyncq, NULL)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	}
    }
}

#ifdef _KERNEL
void
lwkt_process_ipiq_frame(struct intrframe frame)
{
    globaldata_t gd = mycpu;
    lwkt_ipiq_t ip;
    int n;

again:
    for (n = 0; n < ncpus; ++n) {
	if (n != gd->gd_cpuid) {
	    ip = globaldata_find(n)->gd_ipiq;
	    if (ip != NULL) {
		while (lwkt_process_ipiq1(&ip[gd->gd_cpuid], &frame))
		    ;
	    }
	}
    }
    if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	if (lwkt_process_ipiq1(&gd->gd_cpusyncq, &frame)) {
	    if (gd->gd_curthread->td_cscount == 0)
		goto again;
	}
    }
}
#endif

static int
lwkt_process_ipiq1(lwkt_ipiq_t ip, struct intrframe *frame)
{
    int ri;
    int wi = ip->ip_windex;

    /*
     * Note: xindex is only updated after we are sure the function has
     * finished execution.  Beware lwkt_process_ipiq() reentrancy!  The
     * function may send an IPI which may block/drain.
     */
    while ((ri = ip->ip_rindex) != wi) {
	ip->ip_rindex = ri + 1;
	ri &= MAXCPUFIFO_MASK;
	ip->ip_func[ri](ip->ip_arg[ri], frame);
	/* YYY memory barrier */
	ip->ip_xindex = ip->ip_rindex;
    }

    /*
     * Return non-zero if there are more IPI messages pending on this
     * ipiq.  ip_npoll is left set as long as possible to reduce the
     * number of IPIs queued by the originating cpu, but must be cleared
     * *BEFORE* checking windex.
     */
    atomic_poll_release_int(&ip->ip_npoll);
    return(wi != ip->ip_windex);
}

#else	/* !SMP */

/*
 * !SMP dummy routines
 */
int
lwkt_send_ipiq(globaldata_t target, ipifunc_t func, void *arg)
{
    panic("lwkt_send_ipiq: UP box! (%d,%p,%p)", target->gd_cpuid, func, arg);
    return(0); /* NOT REACHED */
}

void
lwkt_wait_ipiq(globaldata_t target, int seq)
{
    panic("lwkt_wait_ipiq: UP box! (%d,%d)", target->gd_cpuid, seq);
}

#endif	/* SMP */

/*
 * CPU Synchronization Support
 *
 * lwkt_cpusync_simple()
 *
 *	The function is executed synchronously before return on remote cpus.
 *	A lwkt_cpusync_t pointer is passed as an argument.  The data can
 *	be accessed via arg->cs_data.
 *
 *	XXX should I just pass the data as an argument to be consistent?
 */
void
lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = func;
    cmd.cs_fin2_func = NULL;
    cmd.cs_data = data;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
	func(&cmd);
    lwkt_cpusync_finish(&cmd);
}
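
/*
 * Example usage (illustrative sketch; sync_invltlb is a hypothetical
 * handler).  Run a function synchronously on a set of cpus, reading any
 * shared data through the lwkt_cpusync_t argument's cs_data field:
 *
 *	static void
 *	sync_invltlb(lwkt_cpusync_t info)
 *	{
 *	    cpu_invltlb();
 *	}
 *
 *	lwkt_cpusync_simple(smp_active_mask, sync_invltlb, NULL);
 */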

/*
 * lwkt_cpusync_fastdata()
 *
 *	The function is executed in tandem with return on remote cpus.
 *	The data is directly passed as an argument.  Do not pass pointers
 *	to temporary storage as the storage might have gone poof by the
 *	time the target cpu executes the function.
 *
 *	At the moment lwkt_cpusync is declared on the stack and we must wait
 *	for all remote cpus to ack in lwkt_cpusync_finish(), but as a future
 *	optimization we should be able to put a counter in the globaldata
 *	structure (if it is not otherwise being used) and just poke it and
 *	return without waiting. XXX
 */
void
lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
{
    struct lwkt_cpusync cmd;

    cmd.cs_run_func = NULL;
    cmd.cs_fin1_func = NULL;
    cmd.cs_fin2_func = func;
    cmd.cs_data = data;
    lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
    if (mask & (1 << mycpu->gd_cpuid))
	func(data);
    lwkt_cpusync_finish(&cmd);
}
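
/*
 * Example usage (illustrative sketch; set_remote_mode is hypothetical).
 * The data word is passed by value to the handler, so no stack storage
 * needs to survive past the return:
 *
 *	lwkt_cpusync_fastdata(mask, set_remote_mode, (void *)(intptr_t)1);
 */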

/*
 * lwkt_cpusync_start()
 *
 *	Start synchronization with a set of target cpus, return once they are
 *	known to be in a synchronization loop.  The target cpus will execute
 *	poll->cs_run_func() IN TANDEM WITH THE RETURN.
 *
 *	XXX future: add lwkt_cpusync_start_quick() and require a call to
 *	lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
 *	potentially absorb the IPI latency doing something useful.
 */
void
lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = 0;
    poll->cs_mask = mask;
#ifdef SMP
    poll->cs_maxcount = lwkt_send_ipiq_mask(
		mask & gd->gd_other_cpus & smp_active_mask,
		(ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & gd->gd_cpumask) {
	if (poll->cs_run_func)
	    poll->cs_run_func(poll);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
	++ipiq_cscount;
	++gd->gd_curthread->td_cscount;
	while (poll->cs_count != poll->cs_maxcount) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
    }
#endif
}
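
/*
 * Example usage (illustrative sketch).  The start/finish pair brackets a
 * critical update; remote cpus spin between the two calls, so the master
 * can modify structures the other cpus would otherwise race on:
 *
 *	struct lwkt_cpusync cmd;
 *
 *	cmd.cs_run_func = NULL;
 *	cmd.cs_fin1_func = NULL;
 *	cmd.cs_fin2_func = NULL;
 *	cmd.cs_data = NULL;
 *	lwkt_cpusync_start(mask, &cmd);
 *	(perform the protected operation here)
 *	lwkt_cpusync_finish(&cmd);
 */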

void
lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;
#ifdef SMP
    int count;
#endif

    mask &= ~poll->cs_mask;
    poll->cs_mask |= mask;
#ifdef SMP
    count = lwkt_send_ipiq_mask(
		mask & gd->gd_other_cpus & smp_active_mask,
		(ipifunc_t)lwkt_cpusync_remote1, poll);
#endif
    if (mask & gd->gd_cpumask) {
	if (poll->cs_run_func)
	    poll->cs_run_func(poll);
    }
#ifdef SMP
    poll->cs_maxcount += count;
    if (poll->cs_maxcount) {
	if (poll->cs_maxcount == count)
	    ++gd->gd_curthread->td_cscount;
	while (poll->cs_count != poll->cs_maxcount) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
    }
#endif
}

/*
 * lwkt_cpusync_finish()
 *
 *	Finish synchronization with a set of target cpus.  The target cpus
 *	will execute cs_fin1_func(poll) prior to this function returning, and
 *	will execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
 *
 *	If cs_maxcount is non-zero then we are mastering a cpusync with one
 *	or more remote cpus and must account for it in our thread structure.
 */
void
lwkt_cpusync_finish(lwkt_cpusync_t poll)
{
    globaldata_t gd = mycpu;

    poll->cs_count = -1;
    if (poll->cs_mask & gd->gd_cpumask) {
	if (poll->cs_fin1_func)
	    poll->cs_fin1_func(poll);
	if (poll->cs_fin2_func)
	    poll->cs_fin2_func(poll->cs_data);
    }
#ifdef SMP
    if (poll->cs_maxcount) {
	while (poll->cs_count != -(poll->cs_maxcount + 1)) {
	    crit_enter();
	    lwkt_process_ipiq();
	    crit_exit();
	}
	--gd->gd_curthread->td_cscount;
    }
#endif
}

#ifdef SMP

/*
 * helper IPI remote messaging function.
 *
 * Called on remote cpu when a new cpu synchronization request has been
 * sent to us.  Execute the run function and adjust cs_count, then requeue
 * the request so we spin on it.
 */
static void
lwkt_cpusync_remote1(lwkt_cpusync_t poll)
{
    atomic_add_int(&poll->cs_count, 1);
    if (poll->cs_run_func)
	poll->cs_run_func(poll);
    lwkt_cpusync_remote2(poll);
}

/*
 * helper IPI remote messaging function.
 *
 * Poll for the originator telling us to finish.  If it hasn't, requeue
 * our request so we spin on it.  When the originator requests that we
 * finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
 * in tandem with the release.
 */
static void
lwkt_cpusync_remote2(lwkt_cpusync_t poll)
{
    if (poll->cs_count < 0) {
	cpusync_func2_t savef;
	void *saved;

	if (poll->cs_fin1_func)
	    poll->cs_fin1_func(poll);
	if (poll->cs_fin2_func) {
	    savef = poll->cs_fin2_func;
	    saved = poll->cs_data;
	    atomic_add_int(&poll->cs_count, -1);
	    savef(saved);
	} else {
	    atomic_add_int(&poll->cs_count, -1);
	}
    } else {
	globaldata_t gd = mycpu;
	lwkt_ipiq_t ip;
	int wi;

	ip = &gd->gd_cpusyncq;
	wi = ip->ip_windex & MAXCPUFIFO_MASK;
	ip->ip_func[wi] = (ipifunc2_t)lwkt_cpusync_remote2;
	ip->ip_arg[wi] = poll;