2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36 * a streaming response. See subr_diskiocom()'s diskiodone().
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/sysctl.h>
53 #include <sys/thread.h>
54 #include <sys/globaldata.h>
55 #include <sys/limits.h>
59 RB_GENERATE(kdmsg_state_tree
, kdmsg_state
, rbnode
, kdmsg_state_cmp
);
61 SYSCTL_NODE(, OID_AUTO
, kdmsg
, CTLFLAG_RW
, 0, "kdmsg");
62 static int kdmsg_debug
= 1;
63 SYSCTL_INT(_kdmsg
, OID_AUTO
, debug
, CTLFLAG_RW
, &kdmsg_debug
, 0,
64 "Set debug level for kernel dmsg layer");
66 #define kd_printf(level, ctl, ...) \
67 if (kdmsg_debug >= (level)) kprintf("kdmsg: " ctl, __VA_ARGS__)
69 #define kdio_printf(iocom, level, ctl, ...) \
70 if (kdmsg_debug >= (level)) kprintf("kdmsg: " ctl, __VA_ARGS__)
72 static int kdmsg_msg_receive_handling(kdmsg_msg_t
*msg
);
73 static int kdmsg_state_msgrx(kdmsg_msg_t
*msg
);
74 static int kdmsg_state_msgtx(kdmsg_msg_t
*msg
);
75 static void kdmsg_msg_write_locked(kdmsg_iocom_t
*iocom
, kdmsg_msg_t
*msg
);
76 static void kdmsg_state_cleanuprx(kdmsg_msg_t
*msg
);
77 static void kdmsg_state_cleanuptx(kdmsg_msg_t
*msg
);
78 static void kdmsg_subq_delete(kdmsg_state_t
*state
);
79 static void kdmsg_simulate_failure(kdmsg_state_t
*state
, int meto
, int error
);
80 static void kdmsg_state_abort(kdmsg_state_t
*state
);
81 static void kdmsg_state_dying(kdmsg_state_t
*state
);
82 static void kdmsg_state_free(kdmsg_state_t
*state
);
83 static void kdmsg_drain_msg(kdmsg_msg_t
*msg
);
86 #define KDMSG_DEBUG_ARGS , const char *file, int line
87 #define kdmsg_state_hold(state) _kdmsg_state_hold(state, __FILE__, __LINE__)
88 #define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
91 #define KDMSG_DEBUG_ARGS
92 #define kdmsg_state_hold(state) _kdmsg_state_hold(state)
93 #define kdmsg_state_drop(state) _kdmsg_state_drop(state)
95 static void _kdmsg_state_hold(kdmsg_state_t
*state KDMSG_DEBUG_ARGS
);
96 static void _kdmsg_state_drop(kdmsg_state_t
*state KDMSG_DEBUG_ARGS
);
98 static void kdmsg_iocom_thread_rd(void *arg
);
99 static void kdmsg_iocom_thread_wr(void *arg
);
100 static int kdmsg_autorxmsg(kdmsg_msg_t
*msg
);
102 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
105 * Initialize the roll-up communications structure for a network
106 * messaging session. This function does not install the socket.
109 kdmsg_iocom_init(kdmsg_iocom_t
*iocom
, void *handle
, uint32_t flags
,
110 struct malloc_type
*mmsg
,
111 int (*rcvmsg
)(kdmsg_msg_t
*msg
))
113 bzero(iocom
, sizeof(*iocom
));
114 iocom
->handle
= handle
;
116 iocom
->rcvmsg
= rcvmsg
;
117 iocom
->flags
= flags
;
118 lockinit(&iocom
->msglk
, "h2msg", 0, 0);
119 TAILQ_INIT(&iocom
->msgq
);
120 RB_INIT(&iocom
->staterd_tree
);
121 RB_INIT(&iocom
->statewr_tree
);
123 iocom
->state0
.iocom
= iocom
;
124 iocom
->state0
.parent
= &iocom
->state0
;
125 TAILQ_INIT(&iocom
->state0
.subq
);
129 * [Re]connect using the passed file pointer. The caller must ref the
130 * fp for us. We own that ref now.
133 kdmsg_iocom_reconnect(kdmsg_iocom_t
*iocom
, struct file
*fp
,
134 const char *subsysname
)
137 * Destroy the current connection
139 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
140 atomic_set_int(&iocom
->msg_ctl
, KDMSG_CLUSTERCTL_KILLRX
);
141 while (iocom
->msgrd_td
|| iocom
->msgwr_td
) {
142 wakeup(&iocom
->msg_ctl
);
143 lksleep(iocom
, &iocom
->msglk
, 0, "clstrkl", hz
);
147 * Drop communications descriptor
150 fdrop(iocom
->msg_fp
);
151 iocom
->msg_fp
= NULL
;
155 * Setup new communications descriptor
160 iocom
->flags
&= ~KDMSG_IOCOMF_EXITNOACC
;
162 lwkt_create(kdmsg_iocom_thread_rd
, iocom
, &iocom
->msgrd_td
,
163 NULL
, 0, -1, "%s-msgrd", subsysname
);
164 lwkt_create(kdmsg_iocom_thread_wr
, iocom
, &iocom
->msgwr_td
,
165 NULL
, 0, -1, "%s-msgwr", subsysname
);
166 lockmgr(&iocom
->msglk
, LK_RELEASE
);
170 * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
171 * this function to handle the state machine for LNK_CONN and LNK_SPAN.
173 static int kdmsg_lnk_conn_reply(kdmsg_state_t
*state
, kdmsg_msg_t
*msg
);
174 static int kdmsg_lnk_span_reply(kdmsg_state_t
*state
, kdmsg_msg_t
*msg
);
177 kdmsg_iocom_autoinitiate(kdmsg_iocom_t
*iocom
,
178 void (*auto_callback
)(kdmsg_msg_t
*msg
))
182 iocom
->auto_callback
= auto_callback
;
184 msg
= kdmsg_msg_alloc(&iocom
->state0
,
185 DMSG_LNK_CONN
| DMSGF_CREATE
,
186 kdmsg_lnk_conn_reply
, NULL
);
187 iocom
->auto_lnk_conn
.head
= msg
->any
.head
;
188 msg
->any
.lnk_conn
= iocom
->auto_lnk_conn
;
189 iocom
->conn_state
= msg
->state
;
190 kdmsg_state_hold(msg
->state
); /* iocom->conn_state */
191 kdmsg_msg_write(msg
);
196 kdmsg_lnk_conn_reply(kdmsg_state_t
*state
, kdmsg_msg_t
*msg
)
198 kdmsg_iocom_t
*iocom
= state
->iocom
;
202 * Upon receipt of the LNK_CONN acknowledgement initiate an
203 * automatic SPAN if we were asked to. Used by e.g. xdisk, but
204 * not used by HAMMER2 which must manage more than one transmitted
207 if ((msg
->any
.head
.cmd
& DMSGF_CREATE
) &&
208 (iocom
->flags
& KDMSG_IOCOMF_AUTOTXSPAN
)) {
209 rmsg
= kdmsg_msg_alloc(&iocom
->state0
,
210 DMSG_LNK_SPAN
| DMSGF_CREATE
,
211 kdmsg_lnk_span_reply
, NULL
);
212 iocom
->auto_lnk_span
.head
= rmsg
->any
.head
;
213 rmsg
->any
.lnk_span
= iocom
->auto_lnk_span
;
214 kdmsg_msg_write(rmsg
);
218 * Process shim after the CONN is acknowledged and before the CONN
219 * transaction is deleted. For deletions this gives device drivers
220 * the ability to interlock new operations on the circuit before
221 * it becomes illegal and panics.
223 if (iocom
->auto_callback
)
224 iocom
->auto_callback(msg
);
226 if ((state
->txcmd
& DMSGF_DELETE
) == 0 &&
227 (msg
->any
.head
.cmd
& DMSGF_DELETE
)) {
229 * iocom->conn_state has a state ref, drop it when clearing.
231 if (iocom
->conn_state
)
232 kdmsg_state_drop(iocom
->conn_state
);
233 iocom
->conn_state
= NULL
;
234 kdmsg_msg_reply(msg
, 0);
242 kdmsg_lnk_span_reply(kdmsg_state_t
*state
, kdmsg_msg_t
*msg
)
245 * Be sure to process shim before terminating the SPAN
246 * transaction. Gives device drivers the ability to
247 * interlock new operations on the circuit before it
248 * becomes illegal and panics.
250 if (state
->iocom
->auto_callback
)
251 state
->iocom
->auto_callback(msg
);
253 if ((state
->txcmd
& DMSGF_DELETE
) == 0 &&
254 (msg
->any
.head
.cmd
& DMSGF_DELETE
)) {
255 kdmsg_msg_reply(msg
, 0);
261 * Disconnect and clean up
264 kdmsg_iocom_uninit(kdmsg_iocom_t
*iocom
)
266 kdmsg_state_t
*state
;
271 * Ask the cluster controller to go away by setting
272 * KILLRX. Send a PING to get a response to unstick reading
275 * After 10 seconds shitcan the pipe and do an unclean shutdown.
277 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
279 atomic_set_int(&iocom
->msg_ctl
, KDMSG_CLUSTERCTL_KILLRX
);
280 msg
= kdmsg_msg_alloc(&iocom
->state0
, DMSG_LNK_PING
, NULL
, NULL
);
281 kdmsg_msg_write_locked(iocom
, msg
);
284 while (iocom
->msgrd_td
|| iocom
->msgwr_td
) {
285 wakeup(&iocom
->msg_ctl
);
286 lksleep(iocom
, &iocom
->msglk
, 0, "clstrkl", hz
);
287 if (--retries
== 0 && iocom
->msg_fp
) {
288 kdio_printf(iocom
, 0, "%s\n",
290 "shitcanning unresponsive pipe");
291 fp_shutdown(iocom
->msg_fp
, SHUT_RDWR
);
292 /* retries allowed to go negative, keep looping */
299 if ((state
= iocom
->freerd_state
) != NULL
) {
300 iocom
->freerd_state
= NULL
;
301 kdmsg_state_drop(state
);
304 if ((state
= iocom
->freewr_state
) != NULL
) {
305 iocom
->freewr_state
= NULL
;
306 kdmsg_state_drop(state
);
310 * Drop communications descriptor
313 fdrop(iocom
->msg_fp
);
314 iocom
->msg_fp
= NULL
;
316 lockmgr(&iocom
->msglk
, LK_RELEASE
);
320 * Cluster controller thread. Perform messaging functions. We have one
321 * thread for the reader and one for the writer. The writer handles
322 * shutdown requests (which should break the reader thread).
326 kdmsg_iocom_thread_rd(void *arg
)
328 kdmsg_iocom_t
*iocom
= arg
;
330 kdmsg_msg_t
*msg
= NULL
;
335 while ((iocom
->msg_ctl
& KDMSG_CLUSTERCTL_KILLRX
) == 0) {
337 * Retrieve the message from the pipe or socket.
339 error
= fp_read(iocom
->msg_fp
, &hdr
, sizeof(hdr
),
340 NULL
, 1, UIO_SYSSPACE
);
343 if (hdr
.magic
!= DMSG_HDR_MAGIC
) {
344 kdio_printf(iocom
, 1, "bad magic: %04x\n", hdr
.magic
);
348 hbytes
= (hdr
.cmd
& DMSGF_SIZE
) * DMSG_ALIGN
;
349 if (hbytes
< sizeof(hdr
) || hbytes
> DMSG_HDR_MAX
) {
350 kdio_printf(iocom
, 1, "bad header size %zd\n", hbytes
);
355 /* XXX messy: mask cmd to avoid allocating state */
356 msg
= kdmsg_msg_alloc(&iocom
->state0
,
357 hdr
.cmd
& DMSGF_BASECMDMASK
,
360 msg
->hdr_size
= hbytes
;
361 if (hbytes
> sizeof(hdr
)) {
362 error
= fp_read(iocom
->msg_fp
, &msg
->any
.head
+ 1,
363 hbytes
- sizeof(hdr
),
364 NULL
, 1, UIO_SYSSPACE
);
366 kdio_printf(iocom
, 1, "%s\n",
367 "short msg received");
372 msg
->aux_size
= hdr
.aux_bytes
;
373 if (msg
->aux_size
> DMSG_AUX_MAX
) {
374 kdio_printf(iocom
, 1,
375 "illegal msg payload size %zd\n",
381 abytes
= DMSG_DOALIGN(msg
->aux_size
);
382 msg
->aux_data
= kmalloc(abytes
, iocom
->mmsg
, M_WAITOK
);
383 msg
->flags
|= KDMSG_FLAG_AUXALLOC
;
384 error
= fp_read(iocom
->msg_fp
, msg
->aux_data
,
385 abytes
, NULL
, 1, UIO_SYSSPACE
);
387 kdio_printf(iocom
, 1, "%s\n",
388 "short msg payload received");
393 error
= kdmsg_msg_receive_handling(msg
);
398 kdio_printf(iocom
, 1, "read thread terminating error=%d\n", error
);
401 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
406 * Shutdown the socket and set KILLRX for consistency in case the
407 * shutdown was not commanded. Signal the transmit side to shutdown
408 * by setting KILLTX and waking it up.
410 fp_shutdown(iocom
->msg_fp
, SHUT_RDWR
);
411 atomic_set_int(&iocom
->msg_ctl
, KDMSG_CLUSTERCTL_KILLRX
|
412 KDMSG_CLUSTERCTL_KILLTX
);
413 iocom
->msgrd_td
= NULL
;
414 lockmgr(&iocom
->msglk
, LK_RELEASE
);
415 wakeup(&iocom
->msg_ctl
);
418 * iocom can be ripped out at any time once the lock is
419 * released with msgrd_td set to NULL. The wakeup()s are safe but
428 kdmsg_iocom_thread_wr(void *arg
)
430 kdmsg_iocom_t
*iocom
= arg
;
442 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
444 while ((iocom
->msg_ctl
& KDMSG_CLUSTERCTL_KILLTX
) == 0 && error
== 0) {
446 * Sleep if no messages pending. Interlock with flag while
449 if (TAILQ_EMPTY(&iocom
->msgq
)) {
450 atomic_set_int(&iocom
->msg_ctl
,
451 KDMSG_CLUSTERCTL_SLEEPING
);
452 lksleep(&iocom
->msg_ctl
, &iocom
->msglk
, 0, "msgwr", hz
);
453 atomic_clear_int(&iocom
->msg_ctl
,
454 KDMSG_CLUSTERCTL_SLEEPING
);
457 while ((msg
= TAILQ_FIRST(&iocom
->msgq
)) != NULL
) {
459 * Remove msg from the transmit queue and do
460 * persist and half-closed state handling.
462 TAILQ_REMOVE(&iocom
->msgq
, msg
, qentry
);
464 error
= kdmsg_state_msgtx(msg
);
465 if (error
== EALREADY
) {
476 * Dump the message to the pipe or socket.
478 * We have to clean up the message as if the transmit
479 * succeeded even if it failed.
481 lockmgr(&iocom
->msglk
, LK_RELEASE
);
482 error
= fp_write(iocom
->msg_fp
, &msg
->any
,
483 msg
->hdr_size
, &res
, UIO_SYSSPACE
);
484 if (error
|| res
!= msg
->hdr_size
) {
487 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
488 kdmsg_state_cleanuptx(msg
);
492 abytes
= DMSG_DOALIGN(msg
->aux_size
);
493 error
= fp_write(iocom
->msg_fp
,
494 msg
->aux_data
, abytes
,
496 if (error
|| res
!= abytes
) {
499 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
500 kdmsg_state_cleanuptx(msg
);
504 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
505 kdmsg_state_cleanuptx(msg
);
510 kdio_printf(iocom
, 1, "write thread terminating error=%d\n", error
);
514 * Shutdown the socket and set KILLTX for consistency in case the
515 * shutdown was not commanded. Signal the receive side to shutdown
516 * by setting KILLRX and waking it up.
518 fp_shutdown(iocom
->msg_fp
, SHUT_RDWR
);
519 atomic_set_int(&iocom
->msg_ctl
, KDMSG_CLUSTERCTL_KILLRX
|
520 KDMSG_CLUSTERCTL_KILLTX
);
521 wakeup(&iocom
->msg_ctl
);
524 * The transmit thread is responsible for final cleanups, wait
525 * for the receive side to terminate to prevent new received
526 * states from interfering with our cleanup.
528 * Do not set msgwr_td to NULL until we actually exit.
530 while (iocom
->msgrd_td
) {
531 wakeup(&iocom
->msg_ctl
);
532 lksleep(iocom
, &iocom
->msglk
, 0, "clstrkt", hz
);
536 * We can no longer receive new messages. We must drain the transmit
537 * message queue and simulate received messages to close any remaining
540 * Loop until all the states are gone and there are no messages
545 iocom
->flags
|= KDMSG_IOCOMF_EXITNOACC
;
547 while (TAILQ_FIRST(&iocom
->msgq
) ||
548 RB_ROOT(&iocom
->staterd_tree
) ||
549 RB_ROOT(&iocom
->statewr_tree
) ||
552 * Simulate failure for all sub-states of state0.
554 kdmsg_drain_msgq(iocom
);
555 kdmsg_simulate_failure(&iocom
->state0
, 0, DMSG_ERR_LOSTLINK
);
557 lksleep(iocom
, &iocom
->msglk
, 0, "clstrtk", hz
/ 2);
559 if ((int)(ticks
- save_ticks
) > hz
*2 && didwarn
== 0) {
561 kdio_printf(iocom
, 0,
562 "Warning, write thread on %p "
563 "still terminating\n",
566 if ((int)(ticks
- save_ticks
) > hz
*15 && didwarn
== 1) {
568 kdio_printf(iocom
, 0,
569 "Warning, write thread on %p "
570 "still terminating\n",
573 if ((int)(ticks
- save_ticks
) > hz
*60) {
574 kdio_printf(iocom
, 0,
575 "Can't terminate: msgq %p "
576 "rd_tree %p wr_tree %p\n",
577 TAILQ_FIRST(&iocom
->msgq
),
578 RB_ROOT(&iocom
->staterd_tree
),
579 RB_ROOT(&iocom
->statewr_tree
));
580 lksleep(iocom
, &iocom
->msglk
, 0, "clstrtk", hz
* 10);
585 * Exit handling is done by the write thread.
587 lockmgr(&iocom
->msglk
, LK_RELEASE
);
590 * The state trees had better be empty now
592 KKASSERT(RB_EMPTY(&iocom
->staterd_tree
));
593 KKASSERT(RB_EMPTY(&iocom
->statewr_tree
));
594 KKASSERT(iocom
->conn_state
== NULL
);
596 if (iocom
->exit_func
) {
598 * iocom is invalid after we call the exit function.
600 iocom
->msgwr_td
= NULL
;
601 iocom
->exit_func(iocom
);
604 * iocom can be ripped out from under us once msgwr_td is
605 * set to NULL. The wakeup is safe.
607 iocom
->msgwr_td
= NULL
;
614 * This cleans out the pending transmit message queue, adjusting any
615 * persistent states properly in the process.
617 * Called with iocom locked.
620 kdmsg_drain_msgq(kdmsg_iocom_t
*iocom
)
625 * Clean out our pending transmit queue, executing the
626 * appropriate state adjustments as if the messages were
629 while ((msg
= TAILQ_FIRST(&iocom
->msgq
)) != NULL
) {
630 TAILQ_REMOVE(&iocom
->msgq
, msg
, qentry
);
631 kdmsg_drain_msg(msg
);
636 * Drain one message by simulating transmission and also simulating a
640 kdmsg_drain_msg(kdmsg_msg_t
*msg
)
642 if (kdmsg_state_msgtx(msg
)) {
646 kdmsg_simulate_failure(msg
->state
,
647 0, DMSG_ERR_LOSTLINK
);
649 kdmsg_state_cleanuptx(msg
);
654 * Do all processing required to handle a freshly received message
655 * after its low level header has been validated.
657 * iocom is not locked.
661 kdmsg_msg_receive_handling(kdmsg_msg_t
*msg
)
663 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
667 * State machine tracking, state assignment for msg,
668 * returns error and discard status. Errors are fatal
669 * to the connection except for EALREADY which forces
670 * a discard without execution.
672 error
= kdmsg_state_msgrx(msg
);
673 if (msg
->state
->flags
& KDMSG_STATE_ABORTING
) {
674 kdio_printf(iocom
, 5,
675 "kdmsg_state_abort(b): state %p rxcmd=%08x "
676 "txcmd=%08x msgrx error %d\n",
677 msg
->state
, msg
->state
->rxcmd
,
678 msg
->state
->txcmd
, error
);
682 * Raw protocol or connection error
684 if (msg
->state
->flags
& KDMSG_STATE_ABORTING
)
685 kdio_printf(iocom
, 5,
686 "X1 state %p error %d\n",
689 if (error
== EALREADY
)
691 } else if (msg
->state
&& msg
->state
->func
) {
693 * Message related to state which already has a
694 * handling function installed for it.
696 if (msg
->state
->flags
& KDMSG_STATE_ABORTING
)
697 kdio_printf(iocom
, 5,
698 "X2 state %p func %p\n",
699 msg
->state
, msg
->state
->func
);
700 error
= msg
->state
->func(msg
->state
, msg
);
701 kdmsg_state_cleanuprx(msg
);
702 } else if (iocom
->flags
& KDMSG_IOCOMF_AUTOANY
) {
703 if (msg
->state
->flags
& KDMSG_STATE_ABORTING
)
704 kdio_printf(iocom
, 5,
705 "X3 state %p\n", msg
->state
);
706 error
= kdmsg_autorxmsg(msg
);
707 kdmsg_state_cleanuprx(msg
);
709 if (msg
->state
->flags
& KDMSG_STATE_ABORTING
)
710 kdio_printf(iocom
, 5,
711 "X4 state %p\n", msg
->state
);
712 error
= iocom
->rcvmsg(msg
);
713 kdmsg_state_cleanuprx(msg
);
719 * Process state tracking for a message after reception and dequeueing,
720 * prior to execution of the state callback. The state is updated and
721 * will be removed from the RBTREE if completely closed, but the state->parent
722 * and subq linkage is not cleaned up until after the callback (see
727 * NOTE: A message transaction can consist of several messages in either
730 * NOTE: The msgid is unique to the initiator, not necessarily unique for
731 * us or for any relay or for the return direction for that matter.
732 * That is, two sides sending a new message can use the same msgid
737 * ABORT sequences work by setting the ABORT flag along with normal message
738 * state. However, ABORTs can also be sent on half-closed messages, that is
739 * even if the command or reply side has already sent a DELETE, as long as
740 * the message has not been fully closed it can still send an ABORT+DELETE
741 * to terminate the half-closed message state.
743 * Since ABORT+DELETEs can race we silently discard ABORT's for message
744 * state which has already been fully closed. REPLY+ABORT+DELETEs can
745 * also race, and in this situation the other side might have already
746 * initiated a new unrelated command with the same message id. Since
747 * the abort has not set the CREATE flag the situation can be detected
748 * and the message will also be discarded.
750 * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
751 * The ABORT request is essentially integrated into the command instead
752 * of being sent later on. In this situation the command implementation
753 * detects that CREATE and ABORT are both set (vs ABORT alone) and can
754 * special-case non-blocking operation for the command.
756 * NOTE! Messages with ABORT set without CREATE or DELETE are considered
757 * to be mid-stream aborts for command/reply sequences. ABORTs on
758 * one-way messages are not supported.
760 * NOTE! If a command sequence does not support aborts the ABORT flag is
765 * One-off messages (no reply expected) are sent with neither CREATE nor DELETE
766 * set. One-off messages cannot be aborted and typically aren't processed
767 * by these routines. The REPLY bit can be used to distinguish whether a
768 * one-off message is a command or reply. For example, one-off replies
769 * will typically just contain status updates.
773 kdmsg_state_msgrx(kdmsg_msg_t
*msg
)
775 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
776 kdmsg_state_t
*state
;
777 kdmsg_state_t
*pstate
;
778 kdmsg_state_t sdummy
;
781 bzero(&sdummy
, sizeof(sdummy
)); /* avoid gcc warnings */
784 * Make sure a state structure is ready to go in case we need a new
785 * one. This is the only routine which uses freerd_state so no
786 * races are possible.
788 if ((state
= iocom
->freerd_state
) == NULL
) {
789 state
= kmalloc(sizeof(*state
), iocom
->mmsg
, M_WAITOK
| M_ZERO
);
790 state
->flags
= KDMSG_STATE_DYNAMIC
;
791 state
->iocom
= iocom
;
793 TAILQ_INIT(&state
->subq
);
794 iocom
->freerd_state
= state
;
796 state
= NULL
; /* safety */
799 * Lock RB tree and locate existing persistent state, if any.
801 * If received msg is a command state is on staterd_tree.
802 * If received msg is a reply state is on statewr_tree.
804 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
807 if (msg
->state
== &iocom
->state0
) {
808 sdummy
.msgid
= msg
->any
.head
.msgid
;
809 sdummy
.iocom
= iocom
;
810 if (msg
->any
.head
.cmd
& DMSGF_REVTRANS
) {
811 state
= RB_FIND(kdmsg_state_tree
, &iocom
->statewr_tree
,
814 state
= RB_FIND(kdmsg_state_tree
, &iocom
->staterd_tree
,
819 * Set message state unconditionally. If this is a CREATE
820 * message this state will become the parent state and new
821 * state will be allocated for the message state.
824 state
= &iocom
->state0
;
825 if (state
->flags
& KDMSG_STATE_INTERLOCK
) {
826 state
->flags
|= KDMSG_STATE_SIGNAL
;
827 lksleep(state
, &iocom
->msglk
, 0, "dmrace", hz
);
830 kdmsg_state_hold(state
);
831 kdmsg_state_drop(msg
->state
); /* iocom->state0 */
838 * Short-cut one-off or mid-stream messages.
840 if ((msg
->any
.head
.cmd
& (DMSGF_CREATE
| DMSGF_DELETE
|
841 DMSGF_ABORT
)) == 0) {
847 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
848 * inside the case statements.
850 switch(msg
->any
.head
.cmd
& (DMSGF_CREATE
|DMSGF_DELETE
|DMSGF_REPLY
)) {
852 case DMSGF_CREATE
| DMSGF_DELETE
:
854 * New persistent command received.
856 if (state
!= &iocom
->state0
) {
857 kdio_printf(iocom
, 1, "%s\n",
858 "duplicate transaction");
864 * Lookup the circuit. The circuit is an open transaction.
865 * The REVCIRC bit in the message tells us which side
866 * initiated the transaction representing the circuit.
868 if (msg
->any
.head
.circuit
) {
869 sdummy
.msgid
= msg
->any
.head
.circuit
;
871 if (msg
->any
.head
.cmd
& DMSGF_REVCIRC
) {
872 pstate
= RB_FIND(kdmsg_state_tree
,
873 &iocom
->statewr_tree
,
876 pstate
= RB_FIND(kdmsg_state_tree
,
877 &iocom
->staterd_tree
,
880 if (pstate
== NULL
) {
881 kdio_printf(iocom
, 1, "%s\n",
888 pstate
= &iocom
->state0
;
892 * Allocate new state.
894 * msg->state becomes the owner of the ref we inherit from
897 kdmsg_state_drop(state
);
898 state
= iocom
->freerd_state
;
899 iocom
->freerd_state
= NULL
;
901 msg
->state
= state
; /* inherits freerd ref */
902 state
->parent
= pstate
;
903 KKASSERT(state
->iocom
== iocom
);
904 state
->flags
|= KDMSG_STATE_RBINSERTED
|
905 KDMSG_STATE_SUBINSERTED
|
906 KDMSG_STATE_OPPOSITE
;
907 if (TAILQ_EMPTY(&pstate
->subq
))
908 kdmsg_state_hold(pstate
);/* states on pstate->subq */
909 kdmsg_state_hold(state
); /* state on pstate->subq */
910 kdmsg_state_hold(state
); /* state on rbtree */
911 state
->icmd
= msg
->any
.head
.cmd
& DMSGF_BASECMDMASK
;
912 state
->rxcmd
= msg
->any
.head
.cmd
& ~DMSGF_DELETE
;
913 state
->txcmd
= DMSGF_REPLY
;
914 state
->msgid
= msg
->any
.head
.msgid
;
915 state
->flags
&= ~KDMSG_STATE_NEW
;
916 RB_INSERT(kdmsg_state_tree
, &iocom
->staterd_tree
, state
);
917 TAILQ_INSERT_TAIL(&pstate
->subq
, state
, entry
);
922 * Persistent state is expected but might not exist if an
923 * ABORT+DELETE races the close.
925 if (state
== &iocom
->state0
) {
926 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
927 kdio_printf(iocom
, 1, "%s\n",
932 kdio_printf(iocom
, 1, "%s\n",
933 "msgrx: no state for DELETE");
940 * Handle another ABORT+DELETE case if the msgid has already
943 if ((state
->rxcmd
& DMSGF_CREATE
) == 0) {
944 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
945 kdio_printf(iocom
, 1, "%s\n",
946 "msgrx: state already B");
949 kdio_printf(iocom
, 1, "%s\n",
950 "msgrx: state reused for DELETE");
959 * Check for mid-stream ABORT command received, otherwise
962 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
963 if (state
== &iocom
->state0
||
964 (state
->rxcmd
& DMSGF_CREATE
) == 0) {
971 case DMSGF_REPLY
| DMSGF_CREATE
:
972 case DMSGF_REPLY
| DMSGF_CREATE
| DMSGF_DELETE
:
974 * When receiving a reply with CREATE set the original
975 * persistent state message should already exist.
977 if (state
== &iocom
->state0
) {
978 kdio_printf(iocom
, 1,
979 "msgrx: no state match for "
980 "REPLY cmd=%08x msgid=%016jx\n",
982 (intmax_t)msg
->any
.head
.msgid
);
986 state
->rxcmd
= msg
->any
.head
.cmd
& ~DMSGF_DELETE
;
989 case DMSGF_REPLY
| DMSGF_DELETE
:
991 * Received REPLY+ABORT+DELETE in case where msgid has
992 * already been fully closed, ignore the message.
994 if (state
== &iocom
->state0
) {
995 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
998 kdio_printf(iocom
, 1, "%s\n",
999 "msgrx: no state match "
1000 "for REPLY|DELETE");
1007 * Received REPLY+ABORT+DELETE in case where msgid has
1008 * already been reused for an unrelated message,
1009 * ignore the message.
1011 if ((state
->rxcmd
& DMSGF_CREATE
) == 0) {
1012 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1015 kdio_printf(iocom
, 1, "%s\n",
1016 "msgrx: state reused "
1017 "for REPLY|DELETE");
1026 * Check for mid-stream ABORT reply received to sent command.
1028 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1029 if (state
== &iocom
->state0
||
1030 (state
->rxcmd
& DMSGF_CREATE
) == 0) {
1040 * Calculate the easy-switch() transactional command. Represents
1041 * the outer-transaction command for any transaction-create or
1042 * transaction-delete, and the inner message command for any
1043 * non-transaction or inside-transaction command. tcmd will be
1044 * set to 0 if the message state is illegal.
1046 * The two can be told apart because outer-transaction commands
1047 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
1050 if (msg
->any
.head
.cmd
& (DMSGF_CREATE
| DMSGF_DELETE
)) {
1051 if (state
!= &iocom
->state0
) {
1052 msg
->tcmd
= (msg
->state
->icmd
& DMSGF_BASECMDMASK
) |
1053 (msg
->any
.head
.cmd
& (DMSGF_CREATE
|
1060 msg
->tcmd
= msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
;
1064 * Adjust the state for DELETE handling now, before making the
1065 * callback so we are atomic with other state updates.
1067 * Subq/parent linkages are cleaned up after the callback.
1068 * If an error occurred the message is ignored and state is not
1071 if ((state
= msg
->state
) == NULL
|| error
!= 0) {
1072 kdio_printf(iocom
, 1,
1073 "msgrx: state=%p error %d\n",
1075 } else if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
1076 KKASSERT((state
->rxcmd
& DMSGF_DELETE
) == 0);
1077 state
->rxcmd
|= DMSGF_DELETE
;
1078 if (state
->txcmd
& DMSGF_DELETE
) {
1079 KKASSERT(state
->flags
& KDMSG_STATE_RBINSERTED
);
1080 if (state
->rxcmd
& DMSGF_REPLY
) {
1081 KKASSERT(msg
->any
.head
.cmd
&
1083 RB_REMOVE(kdmsg_state_tree
,
1084 &iocom
->statewr_tree
, state
);
1086 KKASSERT((msg
->any
.head
.cmd
&
1088 RB_REMOVE(kdmsg_state_tree
,
1089 &iocom
->staterd_tree
, state
);
1091 state
->flags
&= ~KDMSG_STATE_RBINSERTED
;
1092 kdmsg_state_drop(state
); /* state on rbtree */
1095 lockmgr(&iocom
->msglk
, LK_RELEASE
);
1101 * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1102 * This routine must call iocom->rcvmsg() for anything not automatically
1106 kdmsg_autorxmsg(kdmsg_msg_t
*msg
)
1108 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
1114 * Main switch processes transaction create/delete sequences only.
1115 * Use icmd (DELETEs use DMSG_LNK_ERROR
1117 * NOTE: If processing in-transaction messages you generally want
1118 * an inner switch on msg->any.head.cmd.
1121 cmd
= (msg
->state
->icmd
& DMSGF_BASECMDMASK
) |
1122 (msg
->any
.head
.cmd
& (DMSGF_CREATE
|
1132 * Received ping, send reply
1134 rep
= kdmsg_msg_alloc(msg
->state
, DMSG_LNK_PING
| DMSGF_REPLY
,
1136 kdmsg_msg_write(rep
);
1138 case DMSG_LNK_PING
| DMSGF_REPLY
:
1139 /* ignore replies */
1141 case DMSG_LNK_CONN
| DMSGF_CREATE
:
1142 case DMSG_LNK_CONN
| DMSGF_CREATE
| DMSGF_DELETE
:
1144 * Received LNK_CONN transaction. Transmit response and
1145 * leave transaction open, which allows the other end to
1146 * start the SPAN protocol.
1148 * Handle shim after acknowledging the CONN.
1150 if ((msg
->any
.head
.cmd
& DMSGF_DELETE
) == 0) {
1151 if (iocom
->flags
& KDMSG_IOCOMF_AUTOCONN
) {
1152 kdmsg_msg_result(msg
, 0);
1153 if (iocom
->auto_callback
)
1154 iocom
->auto_callback(msg
);
1156 error
= iocom
->rcvmsg(msg
);
1161 case DMSG_LNK_CONN
| DMSGF_DELETE
:
1163 * This message is usually simulated after a link is lost
1164 * to clean up the transaction.
1166 if (iocom
->flags
& KDMSG_IOCOMF_AUTOCONN
) {
1167 if (iocom
->auto_callback
)
1168 iocom
->auto_callback(msg
);
1169 kdmsg_msg_reply(msg
, 0);
1171 error
= iocom
->rcvmsg(msg
);
1174 case DMSG_LNK_SPAN
| DMSGF_CREATE
:
1175 case DMSG_LNK_SPAN
| DMSGF_CREATE
| DMSGF_DELETE
:
1177 * Received LNK_SPAN transaction. We do not have to respond
1178 * (except on termination), but we must leave the transaction
1181 * Handle shim after acknowledging the SPAN.
1183 if (iocom
->flags
& KDMSG_IOCOMF_AUTORXSPAN
) {
1184 if ((msg
->any
.head
.cmd
& DMSGF_DELETE
) == 0) {
1185 if (iocom
->auto_callback
)
1186 iocom
->auto_callback(msg
);
1191 error
= iocom
->rcvmsg(msg
);
1195 case DMSG_LNK_SPAN
| DMSGF_DELETE
:
1197 * Process shims (auto_callback) before cleaning up the
1198 * circuit structure and closing the transactions. Device
1199 * driver should ensure that the circuit is not used after
1200 * the auto_callback() returns.
1202 * Handle shim before closing the SPAN transaction.
1204 if (iocom
->flags
& KDMSG_IOCOMF_AUTORXSPAN
) {
1205 if (iocom
->auto_callback
)
1206 iocom
->auto_callback(msg
);
1207 kdmsg_msg_reply(msg
, 0);
1209 error
= iocom
->rcvmsg(msg
);
1214 * Anything unhandled goes into rcvmsg.
1216 * NOTE: Replies to link-level messages initiated by our side
1217 * are handled by the state callback, they are NOT
1220 error
= iocom
->rcvmsg(msg
);
1227 * Post-receive-handling message and state cleanup. This routine is called
1228 * after the state function handling/callback to properly dispose of the
1229 * message and unlink the state's parent/subq linkage if the state is
1230 * completely closed.
1232 * msglk is not held.
1236 kdmsg_state_cleanuprx(kdmsg_msg_t
*msg
)
1238 kdmsg_state_t
*state
= msg
->state
;
1239 kdmsg_iocom_t
*iocom
= state
->iocom
;
1241 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
1242 if (state
!= &iocom
->state0
) {
1244 * When terminating a transaction (in either direction), all
1245 * sub-states are aborted.
1247 if ((msg
->any
.head
.cmd
& DMSGF_DELETE
) &&
1248 TAILQ_FIRST(&msg
->state
->subq
)) {
1249 kdio_printf(iocom
, 2,
1250 "simulate failure for substates of "
1251 "state %p cmd %08x/%08x\n",
1255 kdmsg_simulate_failure(msg
->state
,
1256 0, DMSG_ERR_LOSTLINK
);
1260 * Once the state is fully closed we can (try to) remove it
1261 * from the subq topology.
1263 if ((state
->flags
& KDMSG_STATE_SUBINSERTED
) &&
1264 (state
->rxcmd
& DMSGF_DELETE
) &&
1265 (state
->txcmd
& DMSGF_DELETE
)) {
1267 * Remove parent linkage if state is completely closed.
1269 kdmsg_subq_delete(state
);
1272 kdmsg_msg_free(msg
);
1274 lockmgr(&iocom
->msglk
, LK_RELEASE
);
1278 * Remove state from its parent's subq. This can wind up recursively
1279 * dropping the parent upward.
1281 * NOTE: Once we drop the parent, our pstate pointer may become invalid.
/*
 * kdmsg_subq_delete: detach `state` from its parent's subq list.
 *
 * When KDMSG_STATE_SUBINSERTED is set, the visible statements:
 *	- load the parent from state->parent,
 *	- clear the parent's scan cursor if it points at this state,
 *	- TAILQ_REMOVE the state from pstate->subq,
 *	- clear SUBINSERTED and state->parent,
 *	- drop a reference on the parent when its subq has become empty,
 *	- drop a reference on the state itself (both drops are tagged
 *	  "pstate->subq").
 *
 * NOTE(review): short lines (braces, else, storage class/return type) are
 * not visible in this view; confirm the exact block structure, and the
 * line between the parent load and the scan check, against the full file.
 */
1285 kdmsg_subq_delete(kdmsg_state_t
*state
)
1287 kdmsg_state_t
*pstate
;
1289 if (state
->flags
& KDMSG_STATE_SUBINSERTED
) {
1290 pstate
= state
->parent
;
/* Never leave the parent's scan cursor pointing at a removed state. */
1292 if (pstate
->scan
== state
)
1293 pstate
->scan
= NULL
;
1294 TAILQ_REMOVE(&pstate
->subq
, state
, entry
);
1295 state
->flags
&= ~KDMSG_STATE_SUBINSERTED
;
1296 state
->parent
= NULL
;
/* The parent is only dropped once its last child has been unlinked. */
1297 if (TAILQ_EMPTY(&pstate
->subq
)) {
1298 kdmsg_state_drop(pstate
);/* pstate->subq */
1300 pstate
= NULL
; /* safety */
1301 kdmsg_state_drop(state
); /* pstate->subq */
/* NOTE(review): presumably the not-SUBINSERTED branch -- confirm. */
1303 KKASSERT(state
->parent
== NULL
);
1308 * Simulate receiving a message which terminates an active transaction
1309 * state. Our simulated received message must set DELETE and may also
1310 * have to set CREATE. It must also ensure that all fields are set such
1311 * that the receive handling code can find the state (kdmsg_state_msgrx())
1312 * or an endless loop will ensue.
1314 * This is used when the other end of the link is dead so the device driver
1315 * gets a completed transaction for all pending states.
1317 * Called with iocom locked.
/*
 * kdmsg_simulate_failure: abort `state`, then recurse into the states on
 * its subq.
 *
 * state - state to abort.  A reference is held on entry and dropped on
 *	   exit (both tagged "aborting").
 * meto  - the recursive call below passes 1; kdmsg_state_cleanuprx()
 *	   passes 0.  NOTE(review): no line visible here reads it;
 *	   presumably it gates the kdmsg_state_abort() call -- confirm.
 * error - handed unchanged to the recursive calls.
 *
 * NOTE(review): short control-flow lines (e.g. the body of the ABORTING
 * test and of the scan comparison) are not visible in this view.
 */
1321 kdmsg_simulate_failure(kdmsg_state_t
*state
, int meto
, int error
)
1323 kdmsg_state_t
*substate
;
1325 kdmsg_state_hold(state
); /* aborting */
1328 * Abort parent state first. Parent will not actually disappear
1329 * until children are gone. Device drivers must handle the situation.
1330 * The advantage of this is that device drivers can flag the situation
1331 * as an interlock against new operations on dying states. And since
1332 * device operations are often asynchronous anyway, this sequence of
1333 * events works out better.
1336 kdmsg_state_abort(state
);
1339 * Recurse through any children.
1342 TAILQ_FOREACH(substate
, &state
->subq
, entry
) {
1343 if (substate
->flags
& KDMSG_STATE_ABORTING
)
/*
 * Record the child in state->scan before recursing.  kdmsg_subq_delete()
 * clears the parent's scan cursor when that child is unlinked, which the
 * comparison after the recursive call detects.
 */
1345 state
->scan
= substate
;
1346 kdmsg_simulate_failure(substate
, 1, error
);
1347 if (state
->scan
!= substate
)
1350 kdmsg_state_drop(state
); /* aborting */
/*
 * kdmsg_state_abort: flag `state` ABORTING, mark it DYING through
 * kdmsg_state_dying(), and, when no DELETE has been received on it yet,
 * fabricate a DMSG_LNK_ERROR message with error DMSG_ERR_LOSTLINK and run
 * it through kdmsg_msg_receive_handling().
 *
 * The iocom's msglk is released around the receive-handling call and then
 * re-acquired exclusively, so the function is presumably entered with
 * msglk held -- confirm against callers.
 *
 * NOTE(review): kdio_printf() is passed `iocom`, but no declaration of
 * that name is visible in this block; check how the macro uses its first
 * argument.  Short lines (return, braces, the local declaration of `msg`)
 * are not visible in this view.
 */
1355 kdmsg_state_abort(kdmsg_state_t
*state
)
1360 * Set ABORTING and DYING, return if already set. If the state was
1361 * just allocated we defer the abort operation until the related
1362 * message is processed.
1364 KKASSERT((state
->flags
& KDMSG_STATE_ABORTING
) == 0);
1365 if (state
->flags
& KDMSG_STATE_ABORTING
)
1367 state
->flags
|= KDMSG_STATE_ABORTING
;
1368 kdmsg_state_dying(state
);
1369 if (state
->flags
& KDMSG_STATE_NEW
) {
1370 kdio_printf(iocom
, 5,
1371 "kdmsg_state_abort(0): state %p rxcmd %08x "
1372 "txcmd %08x flags %08x - in NEW state\n",
1373 state
, state
->rxcmd
,
1374 state
->txcmd
, state
->flags
);
1379 * NOTE: The DELETE flag might already be set due to an early
1382 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1384 * NOTE: We are simulating a received message using our state
1385 * (vs a message generated by the other side using its state),
1386 * so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1388 kdio_printf(iocom
, 5,
1389 "kdmsg_state_abort(1): state %p rxcmd %08x txcmd %08x\n",
1390 state
, state
->rxcmd
, state
->txcmd
);
/* Only simulate the closing message if the rx side is still open. */
1391 if ((state
->rxcmd
& DMSGF_DELETE
) == 0) {
1392 msg
= kdmsg_msg_alloc(state
, DMSG_LNK_ERROR
, NULL
, NULL
);
/* The simulated message must also open the rx side if it never was. */
1393 if ((state
->rxcmd
& DMSGF_CREATE
) == 0)
1394 msg
->any
.head
.cmd
|= DMSGF_CREATE
;
/* Always DELETE; REPLY mirrors the direction recorded in rxcmd. */
1395 msg
->any
.head
.cmd
|= DMSGF_DELETE
|
1396 (state
->rxcmd
& DMSGF_REPLY
);
1397 msg
->any
.head
.cmd
^= (DMSGF_REVTRANS
| DMSGF_REVCIRC
);
1398 msg
->any
.head
.error
= DMSG_ERR_LOSTLINK
;
1399 kdio_printf(iocom
, 5,
1400 "kdmsg_state_abort(a): state %p msgcmd %08x\n",
1401 state
, msg
->any
.head
.cmd
);
1402 /* circuit not initialized */
/* Receive handling runs without msglk; the lock is retaken afterwards. */
1403 lockmgr(&state
->iocom
->msglk
, LK_RELEASE
);
1404 kdmsg_msg_receive_handling(msg
);
1405 lockmgr(&state
->iocom
->msglk
, LK_EXCLUSIVE
);
1408 kdio_printf(iocom
, 5,
1409 "kdmsg_state_abort(2): state %p rxcmd %08x txcmd %08x\n",
1410 state
, state
->rxcmd
, state
->txcmd
);
1414 * Recursively sets KDMSG_STATE_DYING on state and all sub-states, preventing
1415 * the transmission of any new messages on these states. This is done
1416 * atomically when parent state is terminating, whereas setting ABORTING is
1417 * not atomic and can leak races.
1421 kdmsg_state_dying(kdmsg_state_t
*state
)
1423 kdmsg_state_t
*scan
;
1425 if ((state
->flags
& KDMSG_STATE_DYING
) == 0) {
1426 state
->flags
|= KDMSG_STATE_DYING
;
1427 TAILQ_FOREACH(scan
, &state
->subq
, entry
)
1428 kdmsg_state_dying(scan
);
1433 * Process state tracking for a message prior to transmission.
1435 * Called with msglk held and the msg dequeued. Returns non-zero if
1436 * the message is bad and should be deleted by the caller.
1438 * One-off messages are usually with dummy state and msg->state may be NULL
1439 * in this situation.
1441 * New transactions (when CREATE is set) will insert the state.
1443 * May request that caller discard the message by setting *discardp to 1.
1444 * A NULL state may be returned in this case.
/*
 * kdmsg_state_msgtx: transmit-side state tracking for `msg`, driven by the
 * CREATE / DELETE / REPLY / ABORT bits of msg->any.head.cmd.
 *
 * NOTE(review): msg->state is dereferenced by the first declaration below,
 * so it cannot be NULL on entry, whatever older wording says.  This
 * signature has no `discardp` parameter.
 *
 * NOTE(review): short lines (error assignments, break/return, braces, the
 * declaration of `error`) are not visible in this view.  `state` is
 * compared against &iocom->state0 inside the switch, so it is presumably
 * reloaded from msg->state after the freewr_state setup -- confirm.
 */
1448 kdmsg_state_msgtx(kdmsg_msg_t
*msg
)
1450 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
1451 kdmsg_state_t
*state
;
1455 * Make sure a state structure is ready to go in case we need a new
1456 * one. This is the only routine which uses freewr_state so no
1457 * races are possible.
/* Spare state: zeroed allocation, flagged DYNAMIC, with an empty subq. */
1459 if ((state
= iocom
->freewr_state
) == NULL
) {
1460 state
= kmalloc(sizeof(*state
), iocom
->mmsg
, M_WAITOK
| M_ZERO
);
1461 state
->flags
= KDMSG_STATE_DYNAMIC
;
1462 state
->iocom
= iocom
;
1464 TAILQ_INIT(&state
->subq
);
1465 iocom
->freewr_state
= state
;
1469 * Lock RB tree. If persistent state is present it will have already
1470 * been assigned to msg.
1475 * Short-cut one-off or mid-stream messages (state may be NULL).
/* None of CREATE, DELETE or ABORT set. */
1477 if ((msg
->any
.head
.cmd
& (DMSGF_CREATE
| DMSGF_DELETE
|
1478 DMSGF_ABORT
)) == 0) {
1484 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1485 * inside the case statements.
1487 switch(msg
->any
.head
.cmd
& (DMSGF_CREATE
| DMSGF_DELETE
|
1490 case DMSGF_CREATE
| DMSGF_DELETE
:
1492 * Insert the new persistent message state and mark
1493 * half-closed if DELETE is set. Since this is a new
1494 * message it isn't possible to transition into the fully
1495 * closed state here.
1497 * XXX state must be assigned and inserted by
1498 * kdmsg_msg_write(). txcmd is assigned by us
/*
 * New transaction: record the base command in icmd, store txcmd with
 * DELETE masked off, preset rxcmd to REPLY, and clear the NEW flag.
 */
1501 KKASSERT(state
!= NULL
);
1502 state
->icmd
= msg
->any
.head
.cmd
& DMSGF_BASECMDMASK
;
1503 state
->txcmd
= msg
->any
.head
.cmd
& ~DMSGF_DELETE
;
1504 state
->rxcmd
= DMSGF_REPLY
;
1505 state
->flags
&= ~KDMSG_STATE_NEW
;
1510 * Sent ABORT+DELETE in case where msgid has already
1511 * been fully closed, ignore the message.
1513 if (state
== &iocom
->state0
) {
1514 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1517 kdio_printf(iocom
, 1,
1518 "msgtx: no state match "
1519 "for DELETE cmd=%08x msgid=%016jx\n",
1521 (intmax_t)msg
->any
.head
.msgid
);
1528 * Sent ABORT+DELETE in case where msgid has
1529 * already been reused for an unrelated message,
1530 * ignore the message.
1532 if ((state
->txcmd
& DMSGF_CREATE
) == 0) {
1533 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1536 kdio_printf(iocom
, 1, "%s\n",
1537 "msgtx: state reused "
1547 * Check for mid-stream ABORT command sent
1549 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1550 if (state
== &state
->iocom
->state0
||
1551 (state
->txcmd
& DMSGF_CREATE
) == 0) {
1558 case DMSGF_REPLY
| DMSGF_CREATE
:
1559 case DMSGF_REPLY
| DMSGF_CREATE
| DMSGF_DELETE
:
1561 * When transmitting a reply with CREATE set the original
1562 * persistent state message should already exist.
1564 if (state
== &state
->iocom
->state0
) {
1565 kdio_printf(iocom
, 1, "%s\n",
1566 "msgtx: no state match "
1567 "for REPLY | CREATE");
/* txcmd is stored with DELETE masked off, as in the CREATE case above. */
1571 state
->txcmd
= msg
->any
.head
.cmd
& ~DMSGF_DELETE
;
1574 case DMSGF_REPLY
| DMSGF_DELETE
:
1576 * When transmitting a reply with DELETE set the original
1577 * persistent state message should already exist.
1579 * This is very similar to the REPLY|CREATE|* case except
1580 * txcmd is already stored, so we just add the DELETE flag.
1582 * Sent REPLY+ABORT+DELETE in case where msgid has
1583 * already been fully closed, ignore the message.
1585 if (state
== &state
->iocom
->state0
) {
1586 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1589 kdio_printf(iocom
, 1, "%s\n",
1590 "msgtx: no state match "
1591 "for REPLY | DELETE");
1598 * Sent REPLY+ABORT+DELETE in case where msgid has already
1599 * been reused for an unrelated message, ignore the message.
1601 if ((state
->txcmd
& DMSGF_CREATE
) == 0) {
1602 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1605 kdio_printf(iocom
, 1, "%s\n",
1606 "msgtx: state reused "
1607 "for REPLY | DELETE");
1616 * Check for mid-stream ABORT reply sent.
1618 * One-off REPLY messages are allowed for e.g. status updates.
1620 if (msg
->any
.head
.cmd
& DMSGF_ABORT
) {
1621 if (state
== &state
->iocom
->state0
||
1622 (state
->txcmd
& DMSGF_CREATE
) == 0) {
1632 * Set interlock (XXX hack) in case the send side blocks and a
1633 * response is returned before kdmsg_state_cleanuptx() can be
/* INTERLOCK is only set when a state exists and no error was recorded. */
1636 if (state
&& error
== 0)
1637 state
->flags
|= KDMSG_STATE_INTERLOCK
;
1643 * Called with iocom locked.
/*
 * kdmsg_state_cleanuptx: transmit-side cleanup for `msg` and its state.
 *
 * Visible behavior:
 *	- clears the INTERLOCK and SIGNAL flags (logging when SIGNAL was
 *	  set) and takes a temporary hold on the state;
 *	- if the message carries DELETE, records DELETE in txcmd; if the rx
 *	  side has DELETE as well, removes the state from staterd_tree or
 *	  statewr_tree (selected by txcmd's REPLY bit), clears RBINSERTED,
 *	  unlinks it from its parent when it has no sub-states, and drops
 *	  the reference tagged "state on rbtree";
 *	- frees the message on every path;
 *	- runs a deferred kdmsg_state_abort() when the state is ABORTING or
 *	  DYING and no DELETE has been received;
 *	- drops the temporary hold.
 *
 * NOTE(review): msg->state is dereferenced by the `iocom` initializer
 * before the NULL test that follows it, so that test cannot protect
 * against a NULL state as written.
 *
 * NOTE(review): short lines (return, else, braces, any wakeup in the
 * SIGNAL branch) are not visible in this view.
 */
1647 kdmsg_state_cleanuptx(kdmsg_msg_t
*msg
)
1649 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
1650 kdmsg_state_t
*state
;
1652 if ((state
= msg
->state
) == NULL
) {
1653 kdmsg_msg_free(msg
);
1658 * Clear interlock (XXX hack) in case the send side blocks and a
1659 * response is returned in the other thread before
1660 * kdmsg_state_cleanuptx() can be run. We maintain our hold on
1661 * iocom->msglk so we can do this before completing our task.
1663 if (state
->flags
& KDMSG_STATE_SIGNAL
) {
1664 kdio_printf(iocom
, 1, "state %p interlock!\n", state
);
1667 state
->flags
&= ~(KDMSG_STATE_INTERLOCK
| KDMSG_STATE_SIGNAL
);
/* Keep the state alive across the frees/drops below; dropped at the end. */
1668 kdmsg_state_hold(state
);
1670 if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
/* Our direction must not already be closed. */
1671 KKASSERT((state
->txcmd
& DMSGF_DELETE
) == 0);
1672 state
->txcmd
|= DMSGF_DELETE
;
/* Both directions closed: take the state out of its RB tree. */
1673 if (state
->rxcmd
& DMSGF_DELETE
) {
1674 KKASSERT(state
->flags
& KDMSG_STATE_RBINSERTED
);
1675 if (state
->txcmd
& DMSGF_REPLY
) {
1676 KKASSERT(msg
->any
.head
.cmd
&
1678 RB_REMOVE(kdmsg_state_tree
,
1679 &iocom
->staterd_tree
, state
);
1681 KKASSERT((msg
->any
.head
.cmd
&
1683 RB_REMOVE(kdmsg_state_tree
,
1684 &iocom
->statewr_tree
, state
);
1686 state
->flags
&= ~KDMSG_STATE_RBINSERTED
;
1689 * The subq recursion is used for parent linking and
1690 * scanning the topology for aborts, we can only
1691 * remove leafs. The circuit is effectively dead now,
1692 * but topology won't be torn down until all of its
1693 * children have finished/aborted.
1695 * This is particularly important for end-point
1696 * devices which might need to access private data
1697 * in parent states. Out of order disconnects can
1698 * occur if an end-point device is processing a
1699 * message transaction asynchronously because abort
1700 * requests are basically synchronous and it probably
1701 * isn't convenient (or possible) for the end-point
1702 * to abort an asynchronous operation.
1704 if (TAILQ_EMPTY(&state
->subq
))
1705 kdmsg_subq_delete(state
);
1706 kdmsg_msg_free(msg
);
1707 kdmsg_state_drop(state
); /* state on rbtree */
1709 kdmsg_msg_free(msg
);
1712 kdmsg_msg_free(msg
);
1716 * Deferred abort after transmission.
1718 if ((state
->flags
& (KDMSG_STATE_ABORTING
| KDMSG_STATE_DYING
)) &&
1719 (state
->rxcmd
& DMSGF_DELETE
) == 0) {
1720 kdio_printf(iocom
, 5,
1721 "kdmsg_state_cleanuptx: state=%p "
1722 "executing deferred abort\n",
/* kdmsg_state_abort() asserts ABORTING is clear, so clear it first. */
1724 state
->flags
&= ~KDMSG_STATE_ABORTING
;
1725 kdmsg_state_abort(state
);
1727 kdmsg_state_drop(state
);
/*
 * _kdmsg_state_hold: take one reference on `state` by atomically adding 1
 * to state->refs.
 *
 * KDMSG_DEBUG_ARGS expands inside the parameter list; the kd_printf()
 * below reads `file` and `line`, which are presumably supplied by that
 * macro in debug builds -- confirm.  NOTE(review): any preprocessor guard
 * around the kd_printf() is not visible in this view.
 */
1732 _kdmsg_state_hold(kdmsg_state_t
*state KDMSG_DEBUG_ARGS
)
1734 atomic_add_int(&state
->refs
, 1);
1736 kd_printf(4, "state %p +%d\t%s:%d\n", state
, state
->refs
, file
, line
);
/*
 * _kdmsg_state_drop: release one reference on `state`.
 *
 * Asserts the count is positive, atomically adds -1 to state->refs, and
 * calls kdmsg_state_free() when the value fetched by that operation was 1,
 * i.e. when this was the last reference.
 *
 * NOTE(review): `file` and `line` are presumably supplied by
 * KDMSG_DEBUG_ARGS in debug builds, and any preprocessor guard around the
 * kd_printf() is not visible in this view -- confirm.
 */
1742 _kdmsg_state_drop(kdmsg_state_t
*state KDMSG_DEBUG_ARGS
)
1744 KKASSERT(state
->refs
> 0);
1746 kd_printf(4, "state %p -%d\t%s:%d\n", state
, state
->refs
, file
, line
);
1748 if (atomic_fetchadd_int(&state
->refs
, -1) == 1)
1749 kdmsg_state_free(state
);
1754 kdmsg_state_free(kdmsg_state_t
*state
)
1756 kdmsg_iocom_t
*iocom
= state
->iocom
;
1758 KKASSERT((state
->flags
& KDMSG_STATE_RBINSERTED
) == 0);
1759 KKASSERT((state
->flags
& KDMSG_STATE_SUBINSERTED
) == 0);
1760 KKASSERT(TAILQ_EMPTY(&state
->subq
));
1762 if (state
!= &state
->iocom
->state0
)
1763 kfree(state
, iocom
->mmsg
);
/*
 * kdmsg_msg_alloc: allocate and initialize a message for `state`.
 *
 * state - existing state; its iocom supplies the malloc type and lock.
 * cmd	 - command word.  (cmd & DMSGF_SIZE) * DMSG_ALIGN gives the header
 *	   size in bytes that is allocated after the kdmsg_msg prefix.
 * func	 - callback parameter.  NOTE(review): no visible line stores it;
 *	   the flags assignment for a new state continues onto lines that
 *	   are not visible here.
 * data	 - stored in the new state's any.any on the new-transaction path.
 *
 * When cmd has CREATE without REPLY, a new zeroed state is allocated with
 * msgid set to the state's own address, inserted into iocom->statewr_tree
 * and appended to the parent's subq under msglk; it inherits the parent's
 * DYING flag and receives three holds (subq, rbtree, msg->state).  On the
 * other visible path the existing state just gets one hold for msg->state.
 *
 * The header is then filled in: magic, cmd (with REVTRANS / REVCIRC added
 * when the state / parent state is flagged OPPOSITE), msgid from the
 * state and circuit from the parent state's msgid.
 *
 * NOTE(review): short lines are not visible in this view, including the
 * local declarations of `msg` and `hbytes`, the assignment that makes
 * `pstate` the parent on the new-transaction path (presumably the incoming
 * `state`), the else, the store of msg->state and the return -- confirm
 * against the full file.
 */
1767 kdmsg_msg_alloc(kdmsg_state_t
*state
, uint32_t cmd
,
1768 int (*func
)(kdmsg_state_t
*, kdmsg_msg_t
*), void *data
)
1770 kdmsg_iocom_t
*iocom
= state
->iocom
;
1771 kdmsg_state_t
*pstate
;
1775 KKASSERT(iocom
!= NULL
);
1776 hbytes
= (cmd
& DMSGF_SIZE
) * DMSG_ALIGN
;
1777 msg
= kmalloc(offsetof(struct kdmsg_msg
, any
) + hbytes
,
1778 iocom
->mmsg
, M_WAITOK
| M_ZERO
);
1779 msg
->hdr_size
= hbytes
;
1781 if ((cmd
& (DMSGF_CREATE
| DMSGF_REPLY
)) == DMSGF_CREATE
) {
1783 * New transaction, requires tracking state and a unique
1784 * msgid to be allocated.
1786 * It is possible to race a circuit failure, inherit the
1787 * parent's STATE_DYING flag to trigger an abort sequence
1788 * in the transmit path. By not inheriting ABORTING the
1789 * abort sequence can recurse.
1791 * NOTE: The transactions has not yet been initiated so we
1792 * cannot set DMSGF_CREATE/DELETE bits in txcmd or rxcmd.
1793 * We have to properly setup DMSGF_REPLY, however.
1796 state
= kmalloc(sizeof(*state
), iocom
->mmsg
, M_WAITOK
| M_ZERO
);
1797 TAILQ_INIT(&state
->subq
);
1798 state
->iocom
= iocom
;
1799 state
->parent
= pstate
;
1800 state
->flags
= KDMSG_STATE_DYNAMIC
|
1803 state
->any
.any
= data
;
/* The state's own address doubles as its msgid. */
1804 state
->msgid
= (uint64_t)(uintptr_t)state
;
1805 /*msg->any.head.msgid = state->msgid;XXX*/
1807 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
1808 if (RB_INSERT(kdmsg_state_tree
, &iocom
->statewr_tree
, state
))
1809 panic("duplicate msgid allocated");
/* The parent gains a "pstate->subq" hold when it gets its first child. */
1810 if (TAILQ_EMPTY(&pstate
->subq
))
1811 kdmsg_state_hold(pstate
);/* pstate->subq */
1812 TAILQ_INSERT_TAIL(&pstate
->subq
, state
, entry
);
1813 state
->flags
|= KDMSG_STATE_RBINSERTED
|
1814 KDMSG_STATE_SUBINSERTED
;
1815 state
->flags
|= pstate
->flags
& KDMSG_STATE_DYING
;
1816 kdmsg_state_hold(state
); /* pstate->subq */
1817 kdmsg_state_hold(state
); /* state on rbtree */
1818 kdmsg_state_hold(state
); /* msg->state */
1819 lockmgr(&iocom
->msglk
, LK_RELEASE
);
1821 pstate
= state
->parent
;
1822 KKASSERT(pstate
!= NULL
);
1823 kdmsg_state_hold(state
); /* msg->state */
1826 if (state
->flags
& KDMSG_STATE_OPPOSITE
)
1827 cmd
|= DMSGF_REVTRANS
;
1828 if (pstate
->flags
& KDMSG_STATE_OPPOSITE
)
1829 cmd
|= DMSGF_REVCIRC
;
1831 msg
->any
.head
.magic
= DMSG_HDR_MAGIC
;
1832 msg
->any
.head
.cmd
= cmd
;
1833 msg
->any
.head
.msgid
= state
->msgid
;
1834 msg
->any
.head
.circuit
= pstate
->msgid
;
1841 kdmsg_msg_free(kdmsg_msg_t
*msg
)
1843 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
1844 kdmsg_state_t
*state
;
1846 if ((msg
->flags
& KDMSG_FLAG_AUXALLOC
) &&
1847 msg
->aux_data
&& msg
->aux_size
) {
1848 kfree(msg
->aux_data
, iocom
->mmsg
);
1849 msg
->aux_data
= NULL
;
1850 msg
->flags
&= ~KDMSG_FLAG_AUXALLOC
;
1852 if ((state
= msg
->state
) != NULL
) {
1854 kdmsg_state_drop(state
); /* msg->state */
1856 msg
->aux_data
= NULL
;
1859 kfree(msg
, iocom
->mmsg
);
1863 kdmsg_detach_aux_data(kdmsg_msg_t
*msg
, kdmsg_data_t
*data
)
1865 if (msg
->flags
& KDMSG_FLAG_AUXALLOC
) {
1866 data
->aux_data
= msg
->aux_data
;
1867 data
->aux_size
= msg
->aux_size
;
1868 data
->iocom
= msg
->state
->iocom
;
1869 msg
->flags
&= ~KDMSG_FLAG_AUXALLOC
;
1871 data
->aux_data
= NULL
;
1873 data
->iocom
= msg
->state
->iocom
;
1878 kdmsg_free_aux_data(kdmsg_data_t
*data
)
1880 if (data
->aux_data
) {
1881 kfree(data
->aux_data
, data
->iocom
->mmsg
);
1882 data
->aux_data
= NULL
;
1887 * Indexed messages are stored in a red-black tree indexed by their
1888 * msgid. Only persistent messages are indexed.
1891 kdmsg_state_cmp(kdmsg_state_t
*state1
, kdmsg_state_t
*state2
)
1893 if (state1
->iocom
< state2
->iocom
)
1895 if (state1
->iocom
> state2
->iocom
)
1897 if (state1
->msgid
< state2
->msgid
)
1899 if (state1
->msgid
> state2
->msgid
)
1905 * Write a message. All requisite command flags have been set.
1907 * If msg->state is non-NULL the message is written to the existing
1908 * transaction. msgid will be set accordingly.
1910 * If msg->state is NULL and CREATE is set new state is allocated and
1911 * (func, data) is installed. A msgid is assigned.
1913 * If msg->state is NULL and CREATE is not set the message is assumed
1914 * to be a one-way message. The originator must assign the msgid
1915 * (or leave it 0, which is typical).
1917 * This function merely queues the message to the management thread, it
1918 * does not write to the message socket/pipe.
1921 kdmsg_msg_write(kdmsg_msg_t
*msg
)
1923 kdmsg_iocom_t
*iocom
= msg
->state
->iocom
;
1925 lockmgr(&iocom
->msglk
, LK_EXCLUSIVE
);
1926 kdmsg_msg_write_locked(iocom
, msg
);
1927 lockmgr(&iocom
->msglk
, LK_RELEASE
);
/*
 * kdmsg_msg_write_locked: finish a message's header and queue it on the
 * iocom for transmission.  kdmsg_msg_write() calls this with iocom->msglk
 * held exclusively.
 *
 * Visible behavior:
 *	- head.msgid is set from the state's msgid, or to 0 for a one-off
 *	  message;
 *	- a write on a state flagged DYING is logged and run through
 *	  kdmsg_state_msgtx() / kdmsg_state_cleanuptx() under a temporary
 *	  hold instead of being transmitted (see the existing note below);
 *	- head.salt gets the low 8 bits of iocom->msg_seq;
 *	- with aux data present, head.aux_bytes is the unaligned size and
 *	  head.aux_crc covers the DMSG_DOALIGN()ed length;
 *	- head.hdr_crc is zeroed and then computed over hdr_size bytes;
 *	- the message is drained when KDMSG_IOCOMF_EXITNOACC is set, or
 *	  appended to iocom->msgq;
 *	- a sleeper flagged by KDMSG_CLUSTERCTL_SLEEPING is woken up.
 *
 * NOTE(review): short lines (if/else selecting the msgid case, return
 * after the dying-circuit path, the else before the queue insert, braces,
 * preprocessor guards) are not visible in this view.  In particular the
 * second DYING test below directly follows the first; confirm whether it
 * is compiled out in the full file.
 */
1931 kdmsg_msg_write_locked(kdmsg_iocom_t
*iocom
, kdmsg_msg_t
*msg
)
1933 kdmsg_state_t
*state
;
1937 * Continuance or termination of existing transaction.
1938 * The transaction could have been initiated by either end.
1940 * (Function callback and aux data for the receive side can
1941 * be replaced or left alone).
1944 msg
->any
.head
.msgid
= state
->msgid
;
1947 * One-off message (always uses msgid 0 to distinguish
1948 * between a possibly lost in-transaction message due to
1949 * competing aborts and a real one-off message?)
1952 msg
->any
.head
.msgid
= 0;
1956 * For stateful messages, if the circuit is dead or dying we have
1957 * to abort the potentially newly-created state and discard the
1960 * - We must discard the message because the other end will not
1961 * be expecting any more messages over the dead or dying circuit
1962 * and might not be able to receive them.
1964 * - We abort the state by simulating a failure to generate a fake
1965 * incoming DELETE. This will trigger the state callback and allow
1966 * the device to clean things up and reply, closing the outgoing
1967 * direction and allowing the state to be freed.
1969 * This situation occurs quite often, particularly as SPANs stabilize.
1970 * End-points must do the right thing.
/* Writing on a direction that has already sent DELETE is a caller bug. */
1973 KKASSERT((state
->txcmd
& DMSGF_DELETE
) == 0);
1974 if (state
->flags
& KDMSG_STATE_DYING
) {
1976 if ((state
->flags
& KDMSG_STATE_DYING
) ||
1977 (state
->parent
->txcmd
& DMSGF_DELETE
) ||
1978 (state
->parent
->flags
& KDMSG_STATE_DYING
)) {
1980 kdio_printf(iocom
, 4,
1981 "kdmsg_msg_write: Write to dying circuit "
1983 "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1985 state
->parent
->rxcmd
,
1986 state
->parent
->txcmd
,
1987 state
->parent
->flags
);
/* Hold the state across msgtx/cleanuptx, which may drop references. */
1988 kdmsg_state_hold(state
);
1989 kdmsg_state_msgtx(msg
);
1990 kdmsg_state_cleanuptx(msg
);
1991 kdmsg_state_drop(state
);
1997 * Finish up the msg fields. Note that msg->aux_size and the
1998 * aux_bytes stored in the message header represent the unaligned
1999 * (actual) bytes of data, but the buffer is sized to an aligned
2000 * size and the CRC is generated over the aligned length.
2002 msg
->any
.head
.salt
= /* (random << 8) | */ (iocom
->msg_seq
& 255);
2005 if (msg
->aux_data
&& msg
->aux_size
) {
2006 uint32_t abytes
= DMSG_DOALIGN(msg
->aux_size
);
2008 msg
->any
.head
.aux_bytes
= msg
->aux_size
;
2009 msg
->any
.head
.aux_crc
= iscsi_crc32(msg
->aux_data
, abytes
);
/* hdr_crc is zeroed first so the CRC is computed over a zeroed field. */
2011 msg
->any
.head
.hdr_crc
= 0;
2012 msg
->any
.head
.hdr_crc
= iscsi_crc32(msg
->any
.buf
, msg
->hdr_size
);
2015 * If termination races new message senders we must drain the
2016 * message immediately instead of queue it.
2018 if (iocom
->flags
& KDMSG_IOCOMF_EXITNOACC
)
2019 kdmsg_drain_msg(msg
);
2021 TAILQ_INSERT_TAIL(&iocom
->msgq
, msg
, qentry
);
/* Clear the SLEEPING bit and wake whoever sleeps on &iocom->msg_ctl. */
2023 if (iocom
->msg_ctl
& KDMSG_CLUSTERCTL_SLEEPING
) {
2024 atomic_clear_int(&iocom
->msg_ctl
,
2025 KDMSG_CLUSTERCTL_SLEEPING
);
2026 wakeup(&iocom
->msg_ctl
);
2031 * Reply to a message and terminate our side of the transaction.
2033 * If msg->state is non-NULL we are replying to a one-way message.
2036 kdmsg_msg_reply(kdmsg_msg_t
*msg
, uint32_t error
)
2038 kdmsg_state_t
*state
= msg
->state
;
2043 * Reply with a simple error code and terminate the transaction.
2045 cmd
= DMSG_LNK_ERROR
;
2048 * Check if our direction has even been initiated yet, set CREATE.
2050 * Check what direction this is (command or reply direction). Note
2051 * that txcmd might not have been initiated yet.
2053 * If our direction has already been closed we just return without
2056 if (state
!= &state
->iocom
->state0
) {
2057 if (state
->txcmd
& DMSGF_DELETE
)
2059 if ((state
->txcmd
& DMSGF_CREATE
) == 0)
2060 cmd
|= DMSGF_CREATE
;
2061 if (state
->txcmd
& DMSGF_REPLY
)
2063 cmd
|= DMSGF_DELETE
;
2065 if ((msg
->any
.head
.cmd
& DMSGF_REPLY
) == 0)
2069 nmsg
= kdmsg_msg_alloc(state
, cmd
, NULL
, NULL
);
2070 nmsg
->any
.head
.error
= error
;
2071 kdmsg_msg_write(nmsg
);
2075 * Reply to a message and continue our side of the transaction.
2077 * If msg->state is non-NULL we are replying to a one-way message and this
2078 * function degenerates into the same as kdmsg_msg_reply().
2081 kdmsg_msg_result(kdmsg_msg_t
*msg
, uint32_t error
)
2083 kdmsg_state_t
*state
= msg
->state
;
2088 * Return a simple result code, do NOT terminate the transaction.
2090 cmd
= DMSG_LNK_ERROR
;
2093 * Check if our direction has even been initiated yet, set CREATE.
2095 * Check what direction this is (command or reply direction). Note
2096 * that txcmd might not have been initiated yet.
2098 * If our direction has already been closed we just return without
2101 if (state
!= &state
->iocom
->state0
) {
2102 if (state
->txcmd
& DMSGF_DELETE
)
2104 if ((state
->txcmd
& DMSGF_CREATE
) == 0)
2105 cmd
|= DMSGF_CREATE
;
2106 if (state
->txcmd
& DMSGF_REPLY
)
2108 /* continuing transaction, do not set MSGF_DELETE */
2110 if ((msg
->any
.head
.cmd
& DMSGF_REPLY
) == 0)
2114 nmsg
= kdmsg_msg_alloc(state
, cmd
, NULL
, NULL
);
2115 nmsg
->any
.head
.error
= error
;
2116 kdmsg_msg_write(nmsg
);
2120 * Reply to a message and terminate our side of the transaction.
2122 * If msg->state is non-NULL we are replying to a one-way message.
/*
 * kdmsg_state_reply: send a DMSG_LNK_ERROR carrying `error` on `state`
 * and terminate our side of the transaction (DMSGF_DELETE is added to the
 * command).
 *
 * CREATE is added when txcmd shows our direction has not been initiated.
 * The message is allocated with kdmsg_msg_alloc() and handed to
 * kdmsg_msg_write().
 *
 * NOTE(review): this variant takes a state rather than a message, so any
 * wording about msg->state does not apply to it.  Short lines (the local
 * declarations of `nmsg` and `cmd`, the statement following the DELETE
 * test -- presumably a return, per the existing comment -- the statement
 * following the REPLY test, and one line before the first test) are not
 * visible in this view.
 */
2125 kdmsg_state_reply(kdmsg_state_t
*state
, uint32_t error
)
2131 * Reply with a simple error code and terminate the transaction.
2133 cmd
= DMSG_LNK_ERROR
;
2136 * Check if our direction has even been initiated yet, set CREATE.
2138 * Check what direction this is (command or reply direction). Note
2139 * that txcmd might not have been initiated yet.
2141 * If our direction has already been closed we just return without
2145 if (state
->txcmd
& DMSGF_DELETE
)
2147 if ((state
->txcmd
& DMSGF_CREATE
) == 0)
2148 cmd
|= DMSGF_CREATE
;
2149 if (state
->txcmd
& DMSGF_REPLY
)
2151 cmd
|= DMSGF_DELETE
;
2153 nmsg
= kdmsg_msg_alloc(state
, cmd
, NULL
, NULL
);
2154 nmsg
->any
.head
.error
= error
;
2155 kdmsg_msg_write(nmsg
);
2159 * Reply to a message and continue our side of the transaction.
2161 * If msg->state is non-NULL we are replying to a one-way message and this
2162 * function degenerates into the same as kdmsg_msg_reply().
/*
 * kdmsg_state_result: send a DMSG_LNK_ERROR result carrying `error` on
 * `state` without terminating our side of the transaction (no visible
 * line adds DMSGF_DELETE to the command).
 *
 * CREATE is added when txcmd shows our direction has not been initiated.
 * The message is allocated with kdmsg_msg_alloc() and handed to
 * kdmsg_msg_write().
 *
 * NOTE(review): this variant takes a state rather than a message, so any
 * wording about msg->state does not apply to it.  Short lines (the local
 * declarations of `nmsg` and `cmd`, the statement following the DELETE
 * test -- presumably a return, per the existing comment -- the statement
 * following the REPLY test, and one line before the first test) are not
 * visible in this view.
 */
2165 kdmsg_state_result(kdmsg_state_t
*state
, uint32_t error
)
2171 * Return a simple result code, do NOT terminate the transaction.
2173 cmd
= DMSG_LNK_ERROR
;
2176 * Check if our direction has even been initiated yet, set CREATE.
2178 * Check what direction this is (command or reply direction). Note
2179 * that txcmd might not have been initiated yet.
2181 * If our direction has already been closed we just return without
2185 if (state
->txcmd
& DMSGF_DELETE
)
2187 if ((state
->txcmd
& DMSGF_CREATE
) == 0)
2188 cmd
|= DMSGF_CREATE
;
2189 if (state
->txcmd
& DMSGF_REPLY
)
2191 /* continuing transaction, do not set MSGF_DELETE */
2193 nmsg
= kdmsg_msg_alloc(state
, cmd
, NULL
, NULL
);
2194 nmsg
->any
.head
.error
= error
;
2195 kdmsg_msg_write(nmsg
);