dmsg - Stabilization work
[dragonfly.git] / sys / kern / kern_dmsg.c
blobac725138da47fafa530ca26fc28338b0f8e7fae8
1 /*-
2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 * TODO: txcmd CREATE state is deferred by tx msgq, need to calculate
36 * a streaming response. See subr_diskiocom()'s diskiodone().
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/priv.h>
52 #include <sys/thread.h>
53 #include <sys/globaldata.h>
54 #include <sys/limits.h>
56 #include <sys/dmsg.h>
58 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
60 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
61 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
62 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
63 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
64 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
65 static void kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error);
66 static void kdmsg_state_abort(kdmsg_state_t *state);
67 static void kdmsg_state_free(kdmsg_state_t *state);
69 #ifdef KDMSG_DEBUG
70 #define KDMSG_DEBUG_ARGS , const char *file, int line
71 #define kdmsg_state_ref(state) _kdmsg_state_ref(state, __FILE__, __LINE__)
72 #define kdmsg_state_drop(state) _kdmsg_state_drop(state, __FILE__, __LINE__)
73 #else
74 #define KDMSG_DEBUG_ARGS
75 #define kdmsg_state_ref(state) _kdmsg_state_ref(state)
76 #define kdmsg_state_drop(state) _kdmsg_state_drop(state)
77 #endif
78 static void _kdmsg_state_ref(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
79 static void _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS);
81 static void kdmsg_iocom_thread_rd(void *arg);
82 static void kdmsg_iocom_thread_wr(void *arg);
83 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
85 /*static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);*/
88 * Initialize the roll-up communications structure for a network
89 * messaging session. This function does not install the socket.
91 void
92 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
93 struct malloc_type *mmsg,
94 int (*rcvmsg)(kdmsg_msg_t *msg))
96 bzero(iocom, sizeof(*iocom));
97 iocom->handle = handle;
98 iocom->mmsg = mmsg;
99 iocom->rcvmsg = rcvmsg;
100 iocom->flags = flags;
101 lockinit(&iocom->msglk, "h2msg", 0, 0);
102 TAILQ_INIT(&iocom->msgq);
103 RB_INIT(&iocom->staterd_tree);
104 RB_INIT(&iocom->statewr_tree);
106 iocom->state0.iocom = iocom;
107 iocom->state0.parent = &iocom->state0;
108 TAILQ_INIT(&iocom->state0.subq);
112 * [Re]connect using the passed file pointer. The caller must ref the
113 * fp for us. We own that ref now.
115 void
116 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
117 const char *subsysname)
120 * Destroy the current connection
122 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
123 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
124 while (iocom->msgrd_td || iocom->msgwr_td) {
125 wakeup(&iocom->msg_ctl);
126 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
130 * Drop communications descriptor
132 if (iocom->msg_fp) {
133 fdrop(iocom->msg_fp);
134 iocom->msg_fp = NULL;
138 * Setup new communications descriptor
140 iocom->msg_ctl = 0;
141 iocom->msg_fp = fp;
142 iocom->msg_seq = 0;
143 iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
145 lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
146 NULL, 0, -1, "%s-msgrd", subsysname);
147 lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
148 NULL, 0, -1, "%s-msgwr", subsysname);
149 lockmgr(&iocom->msglk, LK_RELEASE);
153 * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
154 * this function to handle the state machine for LNK_CONN and LNK_SPAN.
156 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
157 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
159 void
160 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
161 void (*auto_callback)(kdmsg_msg_t *msg))
163 kdmsg_msg_t *msg;
165 iocom->auto_callback = auto_callback;
167 msg = kdmsg_msg_alloc(&iocom->state0,
168 DMSG_LNK_CONN | DMSGF_CREATE,
169 kdmsg_lnk_conn_reply, NULL);
170 iocom->auto_lnk_conn.head = msg->any.head;
171 msg->any.lnk_conn = iocom->auto_lnk_conn;
172 iocom->conn_state = msg->state;
173 kdmsg_state_ref(msg->state); /* iocom->conn_state */
174 kdmsg_msg_write(msg);
177 static
179 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
181 kdmsg_iocom_t *iocom = state->iocom;
182 kdmsg_msg_t *rmsg;
185 * Upon receipt of the LNK_CONN acknowledgement initiate an
186 * automatic SPAN if we were asked to. Used by e.g. xdisk, but
187 * not used by HAMMER2 which must manage more than one transmitted
188 * SPAN.
190 if ((msg->any.head.cmd & DMSGF_CREATE) &&
191 (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
192 rmsg = kdmsg_msg_alloc(&iocom->state0,
193 DMSG_LNK_SPAN | DMSGF_CREATE,
194 kdmsg_lnk_span_reply, NULL);
195 iocom->auto_lnk_span.head = rmsg->any.head;
196 rmsg->any.lnk_span = iocom->auto_lnk_span;
197 kdmsg_msg_write(rmsg);
201 * Process shim after the CONN is acknowledged and before the CONN
202 * transaction is deleted. For deletions this gives device drivers
203 * the ability to interlock new operations on the circuit before
204 * it becomes illegal and panics.
206 if (iocom->auto_callback)
207 iocom->auto_callback(msg);
209 if ((state->txcmd & DMSGF_DELETE) == 0 &&
210 (msg->any.head.cmd & DMSGF_DELETE)) {
212 * iocom->conn_state has a state ref, drop it when clearing.
214 if (iocom->conn_state)
215 kdmsg_state_drop(iocom->conn_state);
216 iocom->conn_state = NULL;
217 kdmsg_msg_reply(msg, 0);
220 return (0);
223 static
225 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
228 * Be sure to process shim before terminating the SPAN
229 * transaction. Gives device drivers the ability to
230 * interlock new operations on the circuit before it
231 * becomes illegal and panics.
233 if (state->iocom->auto_callback)
234 state->iocom->auto_callback(msg);
236 if ((state->txcmd & DMSGF_DELETE) == 0 &&
237 (msg->any.head.cmd & DMSGF_DELETE)) {
238 kdmsg_msg_reply(msg, 0);
240 return (0);
244 * Disconnect and clean up
246 void
247 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
249 kdmsg_state_t *state;
252 * Ask the cluster controller to go away
254 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
255 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
257 while (iocom->msgrd_td || iocom->msgwr_td) {
258 wakeup(&iocom->msg_ctl);
259 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
263 * Cleanup caches
265 if ((state = iocom->freerd_state) != NULL) {
266 iocom->freerd_state = NULL;
267 kdmsg_state_drop(state);
270 if ((state = iocom->freewr_state) != NULL) {
271 iocom->freewr_state = NULL;
272 kdmsg_state_drop(state);
276 * Drop communications descriptor
278 if (iocom->msg_fp) {
279 fdrop(iocom->msg_fp);
280 iocom->msg_fp = NULL;
282 lockmgr(&iocom->msglk, LK_RELEASE);
286 * Cluster controller thread. Perform messaging functions. We have one
287 * thread for the reader and one for the writer. The writer handles
288 * shutdown requests (which should break the reader thread).
290 static
291 void
292 kdmsg_iocom_thread_rd(void *arg)
294 kdmsg_iocom_t *iocom = arg;
295 dmsg_hdr_t hdr;
296 kdmsg_msg_t *msg = NULL;
297 size_t hbytes;
298 size_t abytes;
299 int error = 0;
301 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLRX) == 0) {
303 * Retrieve the message from the pipe or socket.
305 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
306 NULL, 1, UIO_SYSSPACE);
307 if (error)
308 break;
309 if (hdr.magic != DMSG_HDR_MAGIC) {
310 kprintf("kdmsg: bad magic: %04x\n", hdr.magic);
311 error = EINVAL;
312 break;
314 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
315 if (hbytes < sizeof(hdr) || hbytes > DMSG_HDR_MAX) {
316 kprintf("kdmsg: bad header size %zd\n", hbytes);
317 error = EINVAL;
318 break;
321 /* XXX messy: mask cmd to avoid allocating state */
322 msg = kdmsg_msg_alloc(&iocom->state0,
323 hdr.cmd & DMSGF_BASECMDMASK,
324 NULL, NULL);
325 msg->any.head = hdr;
326 msg->hdr_size = hbytes;
327 if (hbytes > sizeof(hdr)) {
328 error = fp_read(iocom->msg_fp, &msg->any.head + 1,
329 hbytes - sizeof(hdr),
330 NULL, 1, UIO_SYSSPACE);
331 if (error) {
332 kprintf("kdmsg: short msg received\n");
333 error = EINVAL;
334 break;
337 msg->aux_size = hdr.aux_bytes;
338 if (msg->aux_size > DMSG_AUX_MAX) {
339 kprintf("kdmsg: illegal msg payload size %zd\n",
340 msg->aux_size);
341 error = EINVAL;
342 break;
344 if (msg->aux_size) {
345 abytes = DMSG_DOALIGN(msg->aux_size);
346 msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
347 msg->flags |= KDMSG_FLAG_AUXALLOC;
348 error = fp_read(iocom->msg_fp, msg->aux_data,
349 abytes, NULL, 1, UIO_SYSSPACE);
350 if (error) {
351 kprintf("kdmsg: short msg payload received\n");
352 break;
356 error = kdmsg_msg_receive_handling(msg);
357 msg = NULL;
360 kprintf("kdmsg: read thread terminating error=%d\n", error);
362 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
363 if (msg)
364 kdmsg_msg_free(msg);
367 * Shutdown the socket and set KILLRX for consistency in case the
368 * shutdown was not commanded. Signal the transmit side to shutdown
369 * by setting KILLTX and waking it up.
371 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
372 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
373 KDMSG_CLUSTERCTL_KILLTX);
374 iocom->msgrd_td = NULL;
375 lockmgr(&iocom->msglk, LK_RELEASE);
376 wakeup(&iocom->msg_ctl);
379 * iocom can be ripped out at any time once the lock is
380 * released with msgrd_td set to NULL. The wakeup()s are safe but
381 * that is all.
383 wakeup(iocom);
384 lwkt_exit();
387 static
388 void
389 kdmsg_iocom_thread_wr(void *arg)
391 kdmsg_iocom_t *iocom = arg;
392 kdmsg_msg_t *msg;
393 ssize_t res;
394 size_t abytes;
395 int error = 0;
396 int save_ticks;
397 int didwarn;
400 * Transmit loop
402 msg = NULL;
403 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
405 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0 && error == 0) {
407 * Sleep if no messages pending. Interlock with flag while
408 * holding msglk.
410 if (TAILQ_EMPTY(&iocom->msgq)) {
411 atomic_set_int(&iocom->msg_ctl,
412 KDMSG_CLUSTERCTL_SLEEPING);
413 lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
414 atomic_clear_int(&iocom->msg_ctl,
415 KDMSG_CLUSTERCTL_SLEEPING);
418 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
420 * Remove msg from the transmit queue and do
421 * persist and half-closed state handling.
423 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
425 error = kdmsg_state_msgtx(msg);
426 if (error == EALREADY) {
427 error = 0;
428 kdmsg_msg_free(msg);
429 continue;
431 if (error) {
432 kdmsg_msg_free(msg);
433 break;
437 * Dump the message to the pipe or socket.
439 * We have to clean up the message as if the transmit
440 * succeeded even if it failed.
442 lockmgr(&iocom->msglk, LK_RELEASE);
443 error = fp_write(iocom->msg_fp, &msg->any,
444 msg->hdr_size, &res, UIO_SYSSPACE);
445 if (error || res != msg->hdr_size) {
446 if (error == 0)
447 error = EINVAL;
448 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
449 kdmsg_state_cleanuptx(msg);
450 break;
452 if (msg->aux_size) {
453 abytes = DMSG_DOALIGN(msg->aux_size);
454 error = fp_write(iocom->msg_fp,
455 msg->aux_data, abytes,
456 &res, UIO_SYSSPACE);
457 if (error || res != abytes) {
458 if (error == 0)
459 error = EINVAL;
460 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
461 kdmsg_state_cleanuptx(msg);
462 break;
465 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
466 kdmsg_state_cleanuptx(msg);
470 kprintf("kdmsg: write thread terminating error=%d\n", error);
473 * Shutdown the socket and set KILLTX for consistency in case the
474 * shutdown was not commanded. Signal the receive side to shutdown
475 * by setting KILLRX and waking it up.
477 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
478 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX |
479 KDMSG_CLUSTERCTL_KILLTX);
480 wakeup(&iocom->msg_ctl);
483 * The transmit thread is responsible for final cleanups, wait
484 * for the receive side to terminate to prevent new received
485 * states from interfering with our cleanup.
487 * Do not set msgwr_td to NULL until we actually exit.
489 while (iocom->msgrd_td) {
490 wakeup(&iocom->msg_ctl);
491 lksleep(iocom, &iocom->msglk, 0, "clstrkt", hz);
495 * We can no longer receive new messages. We must drain the transmit
496 * message queue and simulate received messages to close anay remaining
497 * states.
499 * Loop until all the states are gone and there are no messages
500 * pending transmit.
502 save_ticks = ticks;
503 didwarn = 0;
505 while (TAILQ_FIRST(&iocom->msgq) ||
506 RB_ROOT(&iocom->staterd_tree) ||
507 RB_ROOT(&iocom->statewr_tree)) {
511 kdmsg_drain_msgq(iocom);
512 kprintf("simulate failure for all substates of state0\n");
513 kdmsg_simulate_failure(&iocom->state0, 0, DMSG_ERR_LOSTLINK);
515 lksleep(iocom, &iocom->msglk, 0, "clstrtk", hz / 2);
517 if ((int)(ticks - save_ticks) > hz*2 && didwarn == 0) {
518 didwarn = 1;
519 kprintf("kdmsg: warning, write thread on %p still "
520 "terminating\n", iocom);
522 if ((int)(ticks - save_ticks) > hz*15 && didwarn == 1) {
523 didwarn = 2;
524 kprintf("kdmsg: warning, write thread on %p still "
525 "terminating\n", iocom);
527 if ((int)(ticks - save_ticks) > hz*60) {
528 kprintf("kdmsg: msgq %p rd_tree %p wr_tree %p\n",
529 TAILQ_FIRST(&iocom->msgq),
530 RB_ROOT(&iocom->staterd_tree),
531 RB_ROOT(&iocom->statewr_tree));
532 panic("kdmsg: write thread on %p could not terminate\n",
533 iocom);
538 * Exit handling is done by the write thread.
540 iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
541 lockmgr(&iocom->msglk, LK_RELEASE);
544 * The state trees had better be empty now
546 KKASSERT(RB_EMPTY(&iocom->staterd_tree));
547 KKASSERT(RB_EMPTY(&iocom->statewr_tree));
548 KKASSERT(iocom->conn_state == NULL);
550 if (iocom->exit_func) {
552 * iocom is invalid after we call the exit function.
554 iocom->msgwr_td = NULL;
555 iocom->exit_func(iocom);
556 } else {
558 * iocom can be ripped out from under us once msgwr_td is
559 * set to NULL. The wakeup is safe.
561 iocom->msgwr_td = NULL;
562 wakeup(iocom);
564 lwkt_exit();
568 * This cleans out the pending transmit message queue, adjusting any
569 * persistent states properly in the process.
571 * Called with iocom locked.
573 void
574 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
576 kdmsg_msg_t *msg;
579 * Clean out our pending transmit queue, executing the
580 * appropriate state adjustments. If this tries to open
581 * any new outgoing transactions we have to loop up and
582 * clean them out.
584 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
585 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
586 if (kdmsg_state_msgtx(msg))
587 kdmsg_msg_free(msg);
588 else
589 kdmsg_state_cleanuptx(msg);
594 * Do all processing required to handle a freshly received message
595 * after its low level header has been validated.
597 * iocom is not locked.
599 static
601 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
603 kdmsg_iocom_t *iocom = msg->state->iocom;
604 int error;
606 #if 0
608 * If sub-states exist and we are deleting (typically due to a
609 * disconnect), we may not receive deletes for any of the substates
610 * and must simulate associated failures.
612 if (msg->state &&
613 (msg->any.head.cmd & DMSGF_DELETE) &&
614 TAILQ_FIRST(&msg->state->subq)) {
615 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
616 kprintf("simulate failure for substates of cmd %08x/%08x\n",
617 msg->state->rxcmd, msg->state->txcmd);
618 kdmsg_simulate_failure(msg->state, 0, DMSG_ERR_LOSTLINK);
619 lockmgr(&iocom->msglk, LK_RELEASE);
621 #endif
624 * State machine tracking, state assignment for msg,
625 * returns error and discard status. Errors are fatal
626 * to the connection except for EALREADY which forces
627 * a discard without execution.
629 error = kdmsg_state_msgrx(msg);
630 if (error) {
632 * Raw protocol or connection error
634 kdmsg_msg_free(msg);
635 if (error == EALREADY)
636 error = 0;
637 } else if (msg->state && msg->state->func) {
639 * Message related to state which already has a
640 * handling function installed for it.
642 error = msg->state->func(msg->state, msg);
643 kdmsg_state_cleanuprx(msg);
644 } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
645 error = kdmsg_autorxmsg(msg);
646 kdmsg_state_cleanuprx(msg);
647 } else {
648 error = iocom->rcvmsg(msg);
649 kdmsg_state_cleanuprx(msg);
651 return error;
655 * Process state tracking for a message after reception, prior to
656 * execution.
 * Called without msglk held (msglk is acquired and released internally)
 * and with the msg dequeued.
660 * All messages are called with dummy state and return actual state.
661 * (One-off messages often just return the same dummy state).
 * May request that the caller discard the message without executing it by
 * returning EALREADY.  Any other non-zero return is a protocol error.
666 * --
668 * These routines handle persistent and command/reply message state via the
669 * CREATE and DELETE flags. The first message in a command or reply sequence
670 * sets CREATE, the last message in a command or reply sequence sets DELETE.
672 * There can be any number of intermediate messages belonging to the same
673 * sequence sent inbetween the CREATE message and the DELETE message,
674 * which set neither flag. This represents a streaming command or reply.
676 * Any command message received with CREATE set expects a reply sequence to
677 * be returned. Reply sequences work the same as command sequences except the
678 * REPLY bit is also sent. Both the command side and reply side can
679 * degenerate into a single message with both CREATE and DELETE set. Note
680 * that one side can be streaming and the other side not, or neither, or both.
682 * The msgid is unique for the initiator. That is, two sides sending a new
683 * message can use the same msgid without colliding.
685 * --
687 * ABORT sequences work by setting the ABORT flag along with normal message
688 * state. However, ABORTs can also be sent on half-closed messages, that is
689 * even if the command or reply side has already sent a DELETE, as long as
690 * the message has not been fully closed it can still send an ABORT+DELETE
691 * to terminate the half-closed message state.
693 * Since ABORT+DELETEs can race we silently discard ABORT's for message
694 * state which has already been fully closed. REPLY+ABORT+DELETEs can
695 * also race, and in this situation the other side might have already
696 * initiated a new unrelated command with the same message id. Since
697 * the abort has not set the CREATE flag the situation can be detected
698 * and the message will also be discarded.
700 * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
701 * The ABORT request is essentially integrated into the command instead
702 * of being sent later on. In this situation the command implementation
703 * detects that CREATE and ABORT are both set (vs ABORT alone) and can
704 * special-case non-blocking operation for the command.
706 * NOTE! Messages with ABORT set without CREATE or DELETE are considered
707 * to be mid-stream aborts for command/reply sequences. ABORTs on
708 * one-way messages are not supported.
710 * NOTE! If a command sequence does not support aborts the ABORT flag is
711 * simply ignored.
713 * --
715 * One-off messages (no reply expected) are sent with neither CREATE or DELETE
716 * set. One-off messages cannot be aborted and typically aren't processed
717 * by these routines. The REPLY bit can be used to distinguish whether a
718 * one-off message is a command or reply. For example, one-off replies
719 * will typically just contain status updates.
721 static
723 kdmsg_state_msgrx(kdmsg_msg_t *msg)
725 kdmsg_iocom_t *iocom = msg->state->iocom;
726 kdmsg_state_t *state;
727 kdmsg_state_t *pstate;
728 kdmsg_state_t sdummy;
729 int error;
732 * Make sure a state structure is ready to go in case we need a new
733 * one. This is the only routine which uses freerd_state so no
734 * races are possible.
736 if ((state = iocom->freerd_state) == NULL) {
737 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
738 state->flags = KDMSG_STATE_DYNAMIC;
739 state->iocom = iocom;
740 state->refs = 1;
741 TAILQ_INIT(&state->subq);
742 iocom->freerd_state = state;
744 state = NULL; /* safety */
747 * Lock RB tree and locate existing persistent state, if any.
749 * If received msg is a command state is on staterd_tree.
750 * If received msg is a reply state is on statewr_tree.
752 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
754 again:
755 if (msg->state == &iocom->state0) {
756 sdummy.msgid = msg->any.head.msgid;
757 sdummy.iocom = iocom;
758 if (msg->any.head.cmd & DMSGF_REVTRANS) {
759 state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree,
760 &sdummy);
761 } else {
762 state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree,
763 &sdummy);
767 * Set message state unconditionally. If this is a CREATE
768 * message this state will become the parent state and new
769 * state will be allocated for the message state.
771 if (state == NULL)
772 state = &iocom->state0;
773 if (state->flags & KDMSG_STATE_INTERLOCK) {
774 state->flags |= KDMSG_STATE_SIGNAL;
775 lksleep(state, &iocom->msglk, 0, "dmrace", hz);
776 goto again;
778 kdmsg_state_ref(state);
779 kdmsg_state_drop(msg->state); /* iocom->state0 */
780 msg->state = state;
781 } else {
782 state = msg->state;
786 * Short-cut one-off or mid-stream messages.
788 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
789 DMSGF_ABORT)) == 0) {
790 error = 0;
791 goto done;
795 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
796 * inside the case statements.
798 switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
799 case DMSGF_CREATE:
800 case DMSGF_CREATE | DMSGF_DELETE:
802 * New persistant command received.
804 if (state != &iocom->state0) {
805 kprintf("kdmsg_state_msgrx: duplicate transaction\n");
806 error = EINVAL;
807 break;
811 * Lookup the circuit. The circuit is an open transaction.
812 * the REVCIRC bit in the message tells us which side
813 * initiated the transaction representing the circuit.
815 if (msg->any.head.circuit) {
816 sdummy.msgid = msg->any.head.circuit;
818 if (msg->any.head.cmd & DMSGF_REVCIRC) {
819 pstate = RB_FIND(kdmsg_state_tree,
820 &iocom->statewr_tree,
821 &sdummy);
822 } else {
823 pstate = RB_FIND(kdmsg_state_tree,
824 &iocom->staterd_tree,
825 &sdummy);
827 if (pstate == NULL) {
828 kprintf("kdmsg_state_msgrx: "
829 "missing parent in stacked trans\n");
830 error = EINVAL;
831 break;
833 } else {
834 pstate = &iocom->state0;
838 * Allocate new state.
840 * msg->state becomes the owner of the ref we inherit from
841 * freerd_stae.
843 kdmsg_state_drop(state);
844 state = iocom->freerd_state;
845 iocom->freerd_state = NULL;
847 msg->state = state; /* inherits freerd ref */
848 state->parent = pstate;
849 KKASSERT(state->iocom == iocom);
850 state->flags |= KDMSG_STATE_RBINSERTED |
851 KDMSG_STATE_SUBINSERTED |
852 KDMSG_STATE_OPPOSITE;
853 kdmsg_state_ref(pstate); /* states on pstate->subq */
854 kdmsg_state_ref(state); /* state on pstate->subq */
855 kdmsg_state_ref(state); /* state on rbtree */
856 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
857 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
858 state->txcmd = DMSGF_REPLY;
859 state->msgid = msg->any.head.msgid;
860 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
861 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
862 error = 0;
863 break;
864 case DMSGF_DELETE:
866 * Persistent state is expected but might not exist if an
867 * ABORT+DELETE races the close.
869 if (state == &iocom->state0) {
870 if (msg->any.head.cmd & DMSGF_ABORT) {
871 error = EALREADY;
872 } else {
873 kprintf("kdmsg_state_msgrx: "
874 "no state for DELETE\n");
875 error = EINVAL;
877 break;
881 * Handle another ABORT+DELETE case if the msgid has already
882 * been reused.
884 if ((state->rxcmd & DMSGF_CREATE) == 0) {
885 if (msg->any.head.cmd & DMSGF_ABORT) {
886 error = EALREADY;
887 } else {
888 kprintf("kdmsg_state_msgrx: "
889 "state reused for DELETE\n");
890 error = EINVAL;
892 break;
894 error = 0;
895 break;
896 default:
898 * Check for mid-stream ABORT command received, otherwise
899 * allow.
901 if (msg->any.head.cmd & DMSGF_ABORT) {
902 if (state == &iocom->state0 ||
903 (state->rxcmd & DMSGF_CREATE) == 0) {
904 error = EALREADY;
905 break;
908 error = 0;
909 break;
910 case DMSGF_REPLY | DMSGF_CREATE:
911 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
913 * When receiving a reply with CREATE set the original
914 * persistent state message should already exist.
916 if (state == &iocom->state0) {
917 kprintf("kdmsg_state_msgrx: no state match for "
918 "REPLY cmd=%08x msgid=%016jx\n",
919 msg->any.head.cmd,
920 (intmax_t)msg->any.head.msgid);
921 error = EINVAL;
922 break;
924 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
925 error = 0;
926 break;
927 case DMSGF_REPLY | DMSGF_DELETE:
929 * Received REPLY+ABORT+DELETE in case where msgid has
930 * already been fully closed, ignore the message.
932 if (state == &iocom->state0) {
933 if (msg->any.head.cmd & DMSGF_ABORT) {
934 error = EALREADY;
935 } else {
936 kprintf("kdmsg_state_msgrx: no state match "
937 "for REPLY|DELETE\n");
938 error = EINVAL;
940 break;
944 * Received REPLY+ABORT+DELETE in case where msgid has
945 * already been reused for an unrelated message,
946 * ignore the message.
948 if ((state->rxcmd & DMSGF_CREATE) == 0) {
949 if (msg->any.head.cmd & DMSGF_ABORT) {
950 error = EALREADY;
951 } else {
952 kprintf("kdmsg_state_msgrx: state reused "
953 "for REPLY|DELETE\n");
954 error = EINVAL;
956 break;
958 error = 0;
959 break;
960 case DMSGF_REPLY:
962 * Check for mid-stream ABORT reply received to sent command.
964 if (msg->any.head.cmd & DMSGF_ABORT) {
965 if (state == &iocom->state0 ||
966 (state->rxcmd & DMSGF_CREATE) == 0) {
967 error = EALREADY;
968 break;
971 error = 0;
972 break;
976 * Calculate the easy-switch() transactional command. Represents
977 * the outer-transaction command for any transaction-create or
978 * transaction-delete, and the inner message command for any
979 * non-transaction or inside-transaction command. tcmd will be
980 * set to 0 if the message state is illegal.
982 * The two can be told apart because outer-transaction commands
983 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
985 done:
986 lockmgr(&iocom->msglk, LK_RELEASE);
988 if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
989 if (state != &iocom->state0) {
990 msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
991 (msg->any.head.cmd & (DMSGF_CREATE |
992 DMSGF_DELETE |
993 DMSGF_REPLY));
994 } else {
995 msg->tcmd = 0;
997 } else {
998 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
1000 return (error);
1004 * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
1005 * This routine must call iocom->rcvmsg() for anything not automatically
1006 * handled.
1008 static int
1009 kdmsg_autorxmsg(kdmsg_msg_t *msg)
1011 kdmsg_iocom_t *iocom = msg->state->iocom;
1012 int error = 0;
1013 uint32_t cmd;
1016 * Main switch processes transaction create/delete sequences only.
1017 * Use icmd (DELETEs use DMSG_LNK_ERROR
1019 * NOTE: If processing in-transaction messages you generally want
1020 * an inner switch on msg->any.head.cmd.
1022 if (msg->state) {
1023 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1024 (msg->any.head.cmd & (DMSGF_CREATE |
1025 DMSGF_DELETE |
1026 DMSGF_REPLY));
1027 } else {
1028 cmd = 0;
1031 switch(cmd) {
1032 case DMSG_LNK_CONN | DMSGF_CREATE:
1033 case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1035 * Received LNK_CONN transaction. Transmit response and
1036 * leave transaction open, which allows the other end to
1037 * start to the SPAN protocol.
1039 * Handle shim after acknowledging the CONN.
1041 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1042 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1043 kdmsg_msg_result(msg, 0);
1044 if (iocom->auto_callback)
1045 iocom->auto_callback(msg);
1046 } else {
1047 error = iocom->rcvmsg(msg);
1049 break;
1051 /* fall through */
1052 case DMSG_LNK_CONN | DMSGF_DELETE:
1054 * This message is usually simulated after a link is lost
1055 * to clean up the transaction.
1057 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1058 if (iocom->auto_callback)
1059 iocom->auto_callback(msg);
1060 kdmsg_msg_reply(msg, 0);
1061 } else {
1062 error = iocom->rcvmsg(msg);
1064 break;
1065 case DMSG_LNK_SPAN | DMSGF_CREATE:
1066 case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1068 * Received LNK_SPAN transaction. We do not have to respond
1069 * (except on termination), but we must leave the transaction
1070 * open.
1072 * Handle shim after acknowledging the SPAN.
1074 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1075 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1076 if (iocom->auto_callback)
1077 iocom->auto_callback(msg);
1078 break;
1080 /* fall through */
1081 } else {
1082 error = iocom->rcvmsg(msg);
1083 break;
1085 /* fall through */
1086 case DMSG_LNK_SPAN | DMSGF_DELETE:
1088 * Process shims (auto_callback) before cleaning up the
1089 * circuit structure and closing the transactions. Device
1090 * driver should ensure that the circuit is not used after
1091 * the auto_callback() returns.
1093 * Handle shim before closing the SPAN transaction.
1095 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1096 if (iocom->auto_callback)
1097 iocom->auto_callback(msg);
1098 kdmsg_msg_reply(msg, 0);
1099 } else {
1100 error = iocom->rcvmsg(msg);
1102 break;
1103 default:
1105 * Anything unhandled goes into rcvmsg.
1107 * NOTE: Replies to link-level messages initiated by our side
1108 * are handled by the state callback, they are NOT
1109 * handled here.
1111 error = iocom->rcvmsg(msg);
1112 break;
1114 return (error);
1118 * Post-receive-handling message and state cleanup. This routine is called
1119 * after the state function handling/callback to properly dispose of the
1120 * message and update or dispose of the state.
1122 static
1123 void
1124 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1126 kdmsg_iocom_t *iocom = msg->state->iocom;
1127 kdmsg_state_t *state;
1128 kdmsg_state_t *pstate;
1130 if ((state = msg->state) == NULL) {
1131 kdmsg_msg_free(msg);
1132 } else if (msg->any.head.cmd & DMSGF_DELETE) {
1133 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1134 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1135 state->rxcmd |= DMSGF_DELETE;
1136 if (state->txcmd & DMSGF_DELETE) {
1137 KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1138 if (state->rxcmd & DMSGF_REPLY) {
1139 KKASSERT(msg->any.head.cmd &
1140 DMSGF_REPLY);
1141 RB_REMOVE(kdmsg_state_tree,
1142 &iocom->statewr_tree, state);
1143 } else {
1144 KKASSERT((msg->any.head.cmd &
1145 DMSGF_REPLY) == 0);
1146 RB_REMOVE(kdmsg_state_tree,
1147 &iocom->staterd_tree, state);
1149 state->flags &= ~KDMSG_STATE_RBINSERTED;
1150 pstate = state->parent;
1151 if (state->flags & KDMSG_STATE_SUBINSERTED) {
1152 TAILQ_REMOVE(&pstate->subq, state, entry);
1153 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1154 kdmsg_state_drop(pstate); /* pstate->subq */
1155 kdmsg_state_drop(state); /* pstate->subq */
1156 state->parent = NULL;
1157 } else {
1158 KKASSERT(state->parent == NULL);
1160 kdmsg_msg_free(msg);
1161 kdmsg_state_drop(state); /* state on rbtree */
1162 lockmgr(&iocom->msglk, LK_RELEASE);
1163 } else {
1164 kdmsg_msg_free(msg);
1165 lockmgr(&iocom->msglk, LK_RELEASE);
1167 } else {
1168 kdmsg_msg_free(msg);
1173 * Simulate receiving a message which terminates an active transaction
1174 * state. Our simulated received message must set DELETE and may also
1175 * have to set CREATE. It must also ensure that all fields are set such
1176 * that the receive handling code can find the state (kdmsg_state_msgrx())
1177 * or an endless loop will ensue.
1179 * This is used when the other end of the link is dead so the device driver
1180 * gets a completed transaction for all pending states.
1182 * Called with iocom locked.
1184 static
1185 void
1186 kdmsg_simulate_failure(kdmsg_state_t *state, int meto, int error)
1188 kdmsg_state_t *substate;
1190 kdmsg_state_ref(state); /* aborting */
1191 while ((substate = TAILQ_FIRST(&state->subq)) != NULL) {
1192 kdmsg_simulate_failure(substate, 1, error);
1194 if (meto)
1195 kdmsg_state_abort(state);
1196 kdmsg_state_drop(state); /* aborting */
1199 static
1200 void
1201 kdmsg_state_abort(kdmsg_state_t *state)
1203 kdmsg_msg_t *msg;
1206 * Prevent recursive aborts which could otherwise occur if the
1207 * simulated message reception runs state->func which then turns
1208 * around and tries to reply to a broken circuit when then calls
1209 * the state abort code again.
1211 KKASSERT((state->flags & KDMSG_STATE_ABORTING) == 0);
1212 if (state->flags & KDMSG_STATE_ABORTING)
1213 return;
1214 state->flags |= KDMSG_STATE_ABORTING;
1217 * NOTE: Args to kdmsg_msg_alloc() to avoid dynamic state allocation.
1219 * NOTE: We are simulating a received message using our state
1220 * (vs a message generated by the other side using its state),
1221 * so we must invert DMSGF_REVTRANS and DMSGF_REVCIRC.
1223 if ((state->rxcmd & DMSGF_DELETE) == 0) {
1224 msg = kdmsg_msg_alloc(state, DMSG_LNK_ERROR, NULL, NULL);
1225 if ((state->rxcmd & DMSGF_CREATE) == 0)
1226 msg->any.head.cmd |= DMSGF_CREATE;
1227 msg->any.head.cmd |= DMSGF_DELETE |
1228 (state->rxcmd & DMSGF_REPLY);
1229 msg->any.head.cmd ^= (DMSGF_REVTRANS | DMSGF_REVCIRC);
1230 msg->any.head.error = DMSG_ERR_LOSTLINK;
1231 lockmgr(&state->iocom->msglk, LK_RELEASE);
1232 kdmsg_msg_receive_handling(msg);
1233 lockmgr(&state->iocom->msglk, LK_EXCLUSIVE);
1234 msg = NULL;
1238 * If the state still has a parent association we must remove it
1239 * now even if it is not fully closed or the simulation loop will
1240 * livelock.
1242 if (state->flags & KDMSG_STATE_SUBINSERTED) {
1243 KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1244 TAILQ_REMOVE(&state->parent->subq, state, entry);
1245 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1246 kdmsg_state_drop(state->parent); /* pstate->subq */
1247 kdmsg_state_drop(state); /* pstate->subq */
1248 state->parent = NULL;
1253 * Process state tracking for a message prior to transmission.
1255 * Called with msglk held and the msg dequeued. Returns non-zero if
1256 * the message is bad and should be deleted by the caller.
1258 * One-off messages are usually with dummy state and msg->state may be NULL
1259 * in this situation.
1261 * New transactions (when CREATE is set) will insert the state.
1263 * May request that caller discard the message by setting *discardp to 1.
1264 * A NULL state may be returned in this case.
1266 static
1268 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1270 kdmsg_iocom_t *iocom = msg->state->iocom;
1271 kdmsg_state_t *state;
1272 int error;
1275 * Make sure a state structure is ready to go in case we need a new
1276 * one. This is the only routine which uses freewr_state so no
1277 * races are possible.
1279 if ((state = iocom->freewr_state) == NULL) {
1280 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1281 state->flags = KDMSG_STATE_DYNAMIC;
1282 state->iocom = iocom;
1283 state->refs = 1;
1284 TAILQ_INIT(&state->subq);
1285 iocom->freewr_state = state;
1289 * Lock RB tree. If persistent state is present it will have already
1290 * been assigned to msg.
1292 state = msg->state;
1295 * Short-cut one-off or mid-stream messages (state may be NULL).
1297 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1298 DMSGF_ABORT)) == 0) {
1299 return(0);
1304 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1305 * inside the case statements.
1307 switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1308 DMSGF_REPLY)) {
1309 case DMSGF_CREATE:
1310 case DMSGF_CREATE | DMSGF_DELETE:
1312 * Insert the new persistent message state and mark
1313 * half-closed if DELETE is set. Since this is a new
1314 * message it isn't possible to transition into the fully
1315 * closed state here.
1317 * XXX state must be assigned and inserted by
1318 * kdmsg_msg_write(). txcmd is assigned by us
1319 * on-transmit.
1321 KKASSERT(state != NULL);
1322 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1323 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1324 state->rxcmd = DMSGF_REPLY;
1325 error = 0;
1326 break;
1327 case DMSGF_DELETE:
1329 * Sent ABORT+DELETE in case where msgid has already
1330 * been fully closed, ignore the message.
1332 if (state == &iocom->state0) {
1333 if (msg->any.head.cmd & DMSGF_ABORT) {
1334 error = EALREADY;
1335 } else {
1336 kprintf("kdmsg_state_msgtx: no state match "
1337 "for DELETE cmd=%08x msgid=%016jx\n",
1338 msg->any.head.cmd,
1339 (intmax_t)msg->any.head.msgid);
1340 error = EINVAL;
1342 break;
1346 * Sent ABORT+DELETE in case where msgid has
1347 * already been reused for an unrelated message,
1348 * ignore the message.
1350 if ((state->txcmd & DMSGF_CREATE) == 0) {
1351 if (msg->any.head.cmd & DMSGF_ABORT) {
1352 error = EALREADY;
1353 } else {
1354 kprintf("kdmsg_state_msgtx: state reused "
1355 "for DELETE\n");
1356 error = EINVAL;
1358 break;
1360 error = 0;
1361 break;
1362 default:
1364 * Check for mid-stream ABORT command sent
1366 if (msg->any.head.cmd & DMSGF_ABORT) {
1367 if (state == &state->iocom->state0 ||
1368 (state->txcmd & DMSGF_CREATE) == 0) {
1369 error = EALREADY;
1370 break;
1373 error = 0;
1374 break;
1375 case DMSGF_REPLY | DMSGF_CREATE:
1376 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1378 * When transmitting a reply with CREATE set the original
1379 * persistent state message should already exist.
1381 if (state == &state->iocom->state0) {
1382 kprintf("kdmsg_state_msgtx: no state match "
1383 "for REPLY | CREATE\n");
1384 error = EINVAL;
1385 break;
1387 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1388 error = 0;
1389 break;
1390 case DMSGF_REPLY | DMSGF_DELETE:
1392 * When transmitting a reply with DELETE set the original
1393 * persistent state message should already exist.
1395 * This is very similar to the REPLY|CREATE|* case except
1396 * txcmd is already stored, so we just add the DELETE flag.
1398 * Sent REPLY+ABORT+DELETE in case where msgid has
1399 * already been fully closed, ignore the message.
1401 if (state == &state->iocom->state0) {
1402 if (msg->any.head.cmd & DMSGF_ABORT) {
1403 error = EALREADY;
1404 } else {
1405 kprintf("kdmsg_state_msgtx: no state match "
1406 "for REPLY | DELETE\n");
1407 error = EINVAL;
1409 break;
1413 * Sent REPLY+ABORT+DELETE in case where msgid has already
1414 * been reused for an unrelated message, ignore the message.
1416 if ((state->txcmd & DMSGF_CREATE) == 0) {
1417 if (msg->any.head.cmd & DMSGF_ABORT) {
1418 error = EALREADY;
1419 } else {
1420 kprintf("kdmsg_state_msgtx: state reused "
1421 "for REPLY | DELETE\n");
1422 error = EINVAL;
1424 break;
1426 error = 0;
1427 break;
1428 case DMSGF_REPLY:
1430 * Check for mid-stream ABORT reply sent.
1432 * One-off REPLY messages are allowed for e.g. status updates.
1434 if (msg->any.head.cmd & DMSGF_ABORT) {
1435 if (state == &state->iocom->state0 ||
1436 (state->txcmd & DMSGF_CREATE) == 0) {
1437 error = EALREADY;
1438 break;
1441 error = 0;
1442 break;
1446 * Set interlock (XXX hack) in case the send side blocks and a
1447 * response is returned before kdmsg_state_cleanuptx() can be
1448 * run.
1450 if (state && error == 0)
1451 state->flags |= KDMSG_STATE_INTERLOCK;
1453 return (error);
1457 * Called with iocom locked.
1459 static
1460 void
1461 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1463 kdmsg_iocom_t *iocom = msg->state->iocom;
1464 kdmsg_state_t *state;
1465 kdmsg_state_t *pstate;
1467 if ((state = msg->state) == NULL) {
1468 kdmsg_msg_free(msg);
1469 return;
1473 * Clear interlock (XXX hack) in case the send side blocks and a
1474 * response is returned in the other thread before
1475 * kdmsg_state_cleanuptx() can be run. We maintain our hold on
1476 * iocom->msglk so we can do this before completing our task.
1478 if (state->flags & KDMSG_STATE_SIGNAL) {
1479 kprintf("kdmsg: state %p interlock!\n", state);
1480 wakeup(state);
1482 state->flags &= ~(KDMSG_STATE_INTERLOCK | KDMSG_STATE_SIGNAL);
1484 if (msg->any.head.cmd & DMSGF_DELETE) {
1485 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1486 state->txcmd |= DMSGF_DELETE;
1487 if (state->rxcmd & DMSGF_DELETE) {
1488 KKASSERT(state->flags & KDMSG_STATE_RBINSERTED);
1489 if (state->txcmd & DMSGF_REPLY) {
1490 KKASSERT(msg->any.head.cmd &
1491 DMSGF_REPLY);
1492 RB_REMOVE(kdmsg_state_tree,
1493 &iocom->staterd_tree, state);
1494 } else {
1495 KKASSERT((msg->any.head.cmd &
1496 DMSGF_REPLY) == 0);
1497 RB_REMOVE(kdmsg_state_tree,
1498 &iocom->statewr_tree, state);
1500 state->flags &= ~KDMSG_STATE_RBINSERTED;
1501 pstate = state->parent;
1502 if (state->flags & KDMSG_STATE_SUBINSERTED) {
1503 TAILQ_REMOVE(&pstate->subq, state, entry);
1504 state->flags &= ~KDMSG_STATE_SUBINSERTED;
1505 kdmsg_state_drop(pstate); /* pstate->subq */
1506 kdmsg_state_drop(state); /* pstate->subq */
1507 state->parent = NULL;
1508 } else {
1509 KKASSERT(state->parent == NULL);
1511 kdmsg_msg_free(msg);
1512 kdmsg_state_drop(state); /* state on rbtree */
1513 } else {
1514 kdmsg_msg_free(msg);
1516 } else {
1517 kdmsg_msg_free(msg);
1521 static
1522 void
1523 _kdmsg_state_ref(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1525 atomic_add_int(&state->refs, 1);
1526 #if KDMSG_DEBUG
1527 kprintf("state %p +%d\t%s:%d\n", state, state->refs, file, line);
1528 #endif
1531 static
1532 void
1533 _kdmsg_state_drop(kdmsg_state_t *state KDMSG_DEBUG_ARGS)
1535 KKASSERT(state->refs > 0);
1536 #if KDMSG_DEBUG
1537 kprintf("state %p -%d\t%s:%d\n", state, state->refs, file, line);
1538 #endif
1539 if (atomic_fetchadd_int(&state->refs, -1) == 1)
1540 kdmsg_state_free(state);
1543 static
1544 void
1545 kdmsg_state_free(kdmsg_state_t *state)
1547 kdmsg_iocom_t *iocom = state->iocom;
1549 KKASSERT((state->flags & KDMSG_STATE_RBINSERTED) == 0);
1550 KKASSERT((state->flags & KDMSG_STATE_SUBINSERTED) == 0);
1551 KKASSERT(TAILQ_EMPTY(&state->subq));
1553 if (state != &state->iocom->state0)
1554 kfree(state, iocom->mmsg);
1557 kdmsg_msg_t *
1558 kdmsg_msg_alloc(kdmsg_state_t *state, uint32_t cmd,
1559 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1561 kdmsg_iocom_t *iocom = state->iocom;
1562 kdmsg_state_t *pstate;
1563 kdmsg_msg_t *msg;
1564 size_t hbytes;
1566 KKASSERT(iocom != NULL);
1567 hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1568 msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1569 iocom->mmsg, M_WAITOK | M_ZERO);
1570 msg->hdr_size = hbytes;
1572 if ((cmd & (DMSGF_CREATE | DMSGF_REPLY)) == DMSGF_CREATE) {
1574 * New transaction, requires tracking state and a unique
1575 * msgid to be allocated.
1577 pstate = state;
1578 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1579 TAILQ_INIT(&state->subq);
1580 state->iocom = iocom;
1581 state->parent = pstate;
1582 state->flags = KDMSG_STATE_DYNAMIC;
1583 state->func = func;
1584 state->any.any = data;
1585 state->msgid = (uint64_t)(uintptr_t)state;
1586 /*msg->any.head.msgid = state->msgid;XXX*/
1588 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1589 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1590 panic("duplicate msgid allocated");
1591 TAILQ_INSERT_TAIL(&pstate->subq, state, entry);
1592 state->flags |= KDMSG_STATE_RBINSERTED |
1593 KDMSG_STATE_SUBINSERTED;
1594 kdmsg_state_ref(pstate); /* pstate->subq */
1595 kdmsg_state_ref(state); /* pstate->subq */
1596 kdmsg_state_ref(state); /* state on rbtree */
1597 lockmgr(&iocom->msglk, LK_RELEASE);
1598 } else {
1599 pstate = state->parent;
1600 KKASSERT(pstate != NULL);
1603 if (state->flags & KDMSG_STATE_OPPOSITE)
1604 cmd |= DMSGF_REVTRANS;
1605 if (pstate->flags & KDMSG_STATE_OPPOSITE)
1606 cmd |= DMSGF_REVCIRC;
1608 msg->any.head.magic = DMSG_HDR_MAGIC;
1609 msg->any.head.cmd = cmd;
1610 msg->any.head.msgid = state->msgid;
1611 msg->any.head.circuit = pstate->msgid;
1612 msg->state = state;
1613 kdmsg_state_ref(state); /* msg->state */
1615 return (msg);
1618 void
1619 kdmsg_msg_free(kdmsg_msg_t *msg)
1621 kdmsg_iocom_t *iocom = msg->state->iocom;
1622 kdmsg_state_t *state;
1624 if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1625 msg->aux_data && msg->aux_size) {
1626 kfree(msg->aux_data, iocom->mmsg);
1627 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1629 if ((state = msg->state) != NULL) {
1630 msg->state = NULL;
1631 kdmsg_state_drop(state); /* msg->state */
1633 msg->aux_data = NULL;
1634 msg->aux_size = 0;
1636 kfree(msg, iocom->mmsg);
1639 void
1640 kdmsg_detach_aux_data(kdmsg_msg_t *msg, kdmsg_data_t *data)
1642 if (msg->flags & KDMSG_FLAG_AUXALLOC) {
1643 data->aux_data = msg->aux_data;
1644 data->aux_size = msg->aux_size;
1645 data->iocom = msg->state->iocom;
1646 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1647 } else {
1648 data->aux_data = NULL;
1649 data->aux_size = 0;
1650 data->iocom = msg->state->iocom;
1654 void
1655 kdmsg_free_aux_data(kdmsg_data_t *data)
1657 if (data->aux_data)
1658 kfree(data->aux_data, data->iocom->mmsg);
1662 * Indexed messages are stored in a red-black tree indexed by their
1663 * msgid. Only persistent messages are indexed.
1666 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1668 if (state1->iocom < state2->iocom)
1669 return(-1);
1670 if (state1->iocom > state2->iocom)
1671 return(1);
1672 if (state1->msgid < state2->msgid)
1673 return(-1);
1674 if (state1->msgid > state2->msgid)
1675 return(1);
1676 return(0);
1680 * Write a message. All requisit command flags have been set.
1682 * If msg->state is non-NULL the message is written to the existing
1683 * transaction. msgid will be set accordingly.
1685 * If msg->state is NULL and CREATE is set new state is allocated and
1686 * (func, data) is installed. A msgid is assigned.
1688 * If msg->state is NULL and CREATE is not set the message is assumed
1689 * to be a one-way message. The originator must assign the msgid
1690 * (or leave it 0, which is typical.
1692 * This function merely queues the message to the management thread, it
1693 * does not write to the message socket/pipe.
1695 void
1696 kdmsg_msg_write(kdmsg_msg_t *msg)
1698 kdmsg_iocom_t *iocom = msg->state->iocom;
1699 kdmsg_state_t *state;
1701 if (msg->state) {
1703 * Continuance or termination of existing transaction.
1704 * The transaction could have been initiated by either end.
1706 * (Function callback and aux data for the receive side can
1707 * be replaced or left alone).
1709 state = msg->state;
1710 msg->any.head.msgid = state->msgid;
1711 } else {
1713 * One-off message (always uses msgid 0 to distinguish
1714 * between a possibly lost in-transaction message due to
1715 * competing aborts and a real one-off message?)
1717 state = NULL;
1718 msg->any.head.msgid = 0;
1721 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1724 * This flag is not set until after the tx thread has drained
1725 * the tx msgq and simulated responses. After that point the
1726 * txthread is dead and can no longer simulate responses.
1728 * Device drivers should never try to send a message once this
1729 * flag is set. They should have detected (through the state
1730 * closures) that the link is in trouble.
1732 if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1733 lockmgr(&iocom->msglk, LK_RELEASE);
1734 panic("kdmsg_msg_write: Attempt to write message to "
1735 "terminated iocom\n");
1739 * For stateful messages, if the circuit is dead we have to abort
1740 * the state and discard the message.
1742 * - We must discard the message because the other end will not
1743 * be expecting any more messages over the dead circuit and might
1744 * not be able to receive them.
1746 * - We abort the state by simulating a failure to generate a fake
1747 * incoming DELETE. This will trigger the state callback and allow
1748 * the device to clean things up and reply, closing the outgoing
1749 * direction and terminating the state.
1751 * - Because there are numerous races, it is possible that an abort
1752 * has already been initiated on this state.
1754 * - For now, don't bother checking to see if this is a CREATE
1755 * message, though we could probably add that as a restriction.
1756 * Any pre-existing state will probably have already had an abort
1757 * initiated on it.
1759 * This race occurs quite often, particularly as SPANs stabilize.
1760 * End-points must do the right thing.
1762 if (state) {
1763 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1764 if ((state->parent->txcmd & DMSGF_DELETE) ||
1765 (state->parent->flags & KDMSG_STATE_ABORTING)) {
1766 kprintf("kdmsg_msg_write: Write to dying circuit "
1767 "ptxcmd=%08x prxcmd=%08x flags=%08x\n",
1768 state->parent->rxcmd,
1769 state->parent->txcmd,
1770 state->parent->flags);
1771 kdmsg_state_ref(state);
1772 kdmsg_state_msgtx(msg);
1773 kdmsg_state_cleanuptx(msg);
1774 if ((state->flags & KDMSG_STATE_ABORTING) == 0) {
1775 kdmsg_simulate_failure(state, 1,
1776 DMSG_ERR_LOSTLINK);
1778 kdmsg_state_drop(state);
1779 lockmgr(&iocom->msglk, LK_RELEASE);
1780 return;
1785 * Finish up the msg fields. Note that msg->aux_size and the
1786 * aux_bytes stored in the message header represent the unaligned
1787 * (actual) bytes of data, but the buffer is sized to an aligned
1788 * size and the CRC is generated over the aligned length.
1790 msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
1791 ++iocom->msg_seq;
1793 if (msg->aux_data && msg->aux_size) {
1794 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
1796 msg->any.head.aux_bytes = msg->aux_size;
1797 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
1799 msg->any.head.hdr_crc = 0;
1800 msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
1802 TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
1804 if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
1805 atomic_clear_int(&iocom->msg_ctl,
1806 KDMSG_CLUSTERCTL_SLEEPING);
1807 wakeup(&iocom->msg_ctl);
1810 lockmgr(&iocom->msglk, LK_RELEASE);
1814 * Reply to a message and terminate our side of the transaction.
1816 * If msg->state is non-NULL we are replying to a one-way message.
1818 void
1819 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
1821 kdmsg_state_t *state = msg->state;
1822 kdmsg_msg_t *nmsg;
1823 uint32_t cmd;
1826 * Reply with a simple error code and terminate the transaction.
1828 cmd = DMSG_LNK_ERROR;
1831 * Check if our direction has even been initiated yet, set CREATE.
1833 * Check what direction this is (command or reply direction). Note
1834 * that txcmd might not have been initiated yet.
1836 * If our direction has already been closed we just return without
1837 * doing anything.
1839 if (state != &state->iocom->state0) {
1840 if (state->txcmd & DMSGF_DELETE)
1841 return;
1842 if ((state->txcmd & DMSGF_CREATE) == 0)
1843 cmd |= DMSGF_CREATE;
1844 if (state->txcmd & DMSGF_REPLY)
1845 cmd |= DMSGF_REPLY;
1846 cmd |= DMSGF_DELETE;
1847 } else {
1848 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1849 cmd |= DMSGF_REPLY;
1852 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1853 nmsg->any.head.error = error;
1854 kdmsg_msg_write(nmsg);
1858 * Reply to a message and continue our side of the transaction.
1860 * If msg->state is non-NULL we are replying to a one-way message and this
1861 * function degenerates into the same as kdmsg_msg_reply().
1863 void
1864 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
1866 kdmsg_state_t *state = msg->state;
1867 kdmsg_msg_t *nmsg;
1868 uint32_t cmd;
1871 * Return a simple result code, do NOT terminate the transaction.
1873 cmd = DMSG_LNK_ERROR;
1876 * Check if our direction has even been initiated yet, set CREATE.
1878 * Check what direction this is (command or reply direction). Note
1879 * that txcmd might not have been initiated yet.
1881 * If our direction has already been closed we just return without
1882 * doing anything.
1884 if (state != &state->iocom->state0) {
1885 if (state->txcmd & DMSGF_DELETE)
1886 return;
1887 if ((state->txcmd & DMSGF_CREATE) == 0)
1888 cmd |= DMSGF_CREATE;
1889 if (state->txcmd & DMSGF_REPLY)
1890 cmd |= DMSGF_REPLY;
1891 /* continuing transaction, do not set MSGF_DELETE */
1892 } else {
1893 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1894 cmd |= DMSGF_REPLY;
1897 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1898 nmsg->any.head.error = error;
1899 kdmsg_msg_write(nmsg);
1903 * Reply to a message and terminate our side of the transaction.
1905 * If msg->state is non-NULL we are replying to a one-way message.
1907 void
1908 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
1910 kdmsg_msg_t *nmsg;
1911 uint32_t cmd;
1914 * Reply with a simple error code and terminate the transaction.
1916 cmd = DMSG_LNK_ERROR;
1919 * Check if our direction has even been initiated yet, set CREATE.
1921 * Check what direction this is (command or reply direction). Note
1922 * that txcmd might not have been initiated yet.
1924 * If our direction has already been closed we just return without
1925 * doing anything.
1927 KKASSERT(state);
1928 if (state->txcmd & DMSGF_DELETE)
1929 return;
1930 if ((state->txcmd & DMSGF_CREATE) == 0)
1931 cmd |= DMSGF_CREATE;
1932 if (state->txcmd & DMSGF_REPLY)
1933 cmd |= DMSGF_REPLY;
1934 cmd |= DMSGF_DELETE;
1936 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1937 nmsg->any.head.error = error;
1938 kdmsg_msg_write(nmsg);
1942 * Reply to a message and continue our side of the transaction.
1944 * If msg->state is non-NULL we are replying to a one-way message and this
1945 * function degenerates into the same as kdmsg_msg_reply().
1947 void
1948 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
1950 kdmsg_msg_t *nmsg;
1951 uint32_t cmd;
1954 * Return a simple result code, do NOT terminate the transaction.
1956 cmd = DMSG_LNK_ERROR;
1959 * Check if our direction has even been initiated yet, set CREATE.
1961 * Check what direction this is (command or reply direction). Note
1962 * that txcmd might not have been initiated yet.
1964 * If our direction has already been closed we just return without
1965 * doing anything.
1967 KKASSERT(state);
1968 if (state->txcmd & DMSGF_DELETE)
1969 return;
1970 if ((state->txcmd & DMSGF_CREATE) == 0)
1971 cmd |= DMSGF_CREATE;
1972 if (state->txcmd & DMSGF_REPLY)
1973 cmd |= DMSGF_REPLY;
1974 /* continuing transaction, do not set MSGF_DELETE */
1976 nmsg = kdmsg_msg_alloc(state, cmd, NULL, NULL);
1977 nmsg->any.head.error = error;
1978 kdmsg_msg_write(nmsg);