hammer2 - more dmsg/separation work
[dragonfly.git] / sys / kern / kern_dmsg.c
blob4812f911055f48353aad24e3b071aa986b9547ad
1 /*-
2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
35 * TODO: txcmd CREATE state is deferred by txmsgq, need to calculate
36 * a streaming response. See subr_diskiocom()'s diskiodone().
38 #include <sys/param.h>
39 #include <sys/types.h>
40 #include <sys/kernel.h>
41 #include <sys/conf.h>
42 #include <sys/systm.h>
43 #include <sys/queue.h>
44 #include <sys/tree.h>
45 #include <sys/malloc.h>
46 #include <sys/mount.h>
47 #include <sys/socket.h>
48 #include <sys/vnode.h>
49 #include <sys/file.h>
50 #include <sys/proc.h>
51 #include <sys/priv.h>
52 #include <sys/thread.h>
53 #include <sys/globaldata.h>
54 #include <sys/limits.h>
56 #include <sys/dmsg.h>
58 RB_GENERATE(kdmsg_state_tree, kdmsg_state, rbnode, kdmsg_state_cmp);
59 RB_GENERATE(kdmsg_circuit_tree, kdmsg_circuit, rbnode, kdmsg_circuit_cmp);
61 static int kdmsg_msg_receive_handling(kdmsg_msg_t *msg);
62 static int kdmsg_circ_msgrx(kdmsg_msg_t *msg);
63 static int kdmsg_state_msgrx(kdmsg_msg_t *msg);
64 static int kdmsg_state_msgtx(kdmsg_msg_t *msg);
65 static void kdmsg_state_cleanuprx(kdmsg_msg_t *msg);
66 static void kdmsg_state_cleanuptx(kdmsg_msg_t *msg);
67 static void kdmsg_state_abort(kdmsg_state_t *state);
68 static void kdmsg_state_free(kdmsg_state_t *state);
70 static void kdmsg_iocom_thread_rd(void *arg);
71 static void kdmsg_iocom_thread_wr(void *arg);
72 static int kdmsg_autorxmsg(kdmsg_msg_t *msg);
73 static void kdmsg_autocirc(kdmsg_msg_t *msg);
74 static int kdmsg_autocirc_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
76 static struct lwkt_token kdmsg_token = LWKT_TOKEN_INITIALIZER(kdmsg_token);
78 void
79 kdmsg_circ_hold(kdmsg_circuit_t *circ)
81 atomic_add_int(&circ->refs, 1);
84 void
85 kdmsg_circ_drop(kdmsg_circuit_t *circ)
87 kdmsg_iocom_t *iocom;
89 if (atomic_fetchadd_int(&circ->refs, -1) == 1) {
90 KKASSERT(circ->span_state == NULL &&
91 circ->circ_state == NULL &&
92 circ->rcirc_state == NULL &&
93 circ->recorded == 0);
94 iocom = circ->iocom;
95 circ->iocom = NULL;
96 kfree(circ, iocom->mmsg);
102 * Initialize the roll-up communications structure for a network
103 * messaging session. This function does not install the socket.
105 void
106 kdmsg_iocom_init(kdmsg_iocom_t *iocom, void *handle, uint32_t flags,
107 struct malloc_type *mmsg,
108 int (*rcvmsg)(kdmsg_msg_t *msg))
110 bzero(iocom, sizeof(*iocom));
111 iocom->handle = handle;
112 iocom->mmsg = mmsg;
113 iocom->rcvmsg = rcvmsg;
114 iocom->flags = flags;
115 lockinit(&iocom->msglk, "h2msg", 0, 0);
116 TAILQ_INIT(&iocom->msgq);
117 RB_INIT(&iocom->circ_tree);
118 RB_INIT(&iocom->staterd_tree);
119 RB_INIT(&iocom->statewr_tree);
123 * [Re]connect using the passed file pointer. The caller must ref the
124 * fp for us. We own that ref now.
126 void
127 kdmsg_iocom_reconnect(kdmsg_iocom_t *iocom, struct file *fp,
128 const char *subsysname)
131 * Destroy the current connection
133 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
134 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
135 while (iocom->msgrd_td || iocom->msgwr_td) {
136 wakeup(&iocom->msg_ctl);
137 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
141 * Drop communications descriptor
143 if (iocom->msg_fp) {
144 fdrop(iocom->msg_fp);
145 iocom->msg_fp = NULL;
149 * Setup new communications descriptor
151 iocom->msg_ctl = 0;
152 iocom->msg_fp = fp;
153 iocom->msg_seq = 0;
154 iocom->flags &= ~KDMSG_IOCOMF_EXITNOACC;
156 lwkt_create(kdmsg_iocom_thread_rd, iocom, &iocom->msgrd_td,
157 NULL, 0, -1, "%s-msgrd", subsysname);
158 lwkt_create(kdmsg_iocom_thread_wr, iocom, &iocom->msgwr_td,
159 NULL, 0, -1, "%s-msgwr", subsysname);
160 lockmgr(&iocom->msglk, LK_RELEASE);
164 * Caller sets up iocom->auto_lnk_conn and iocom->auto_lnk_span, then calls
165 * this function to handle the state machine for LNK_CONN and LNK_SPAN.
167 * NOTE: Caller typically also sets the IOCOMF_AUTOCONN, IOCOMF_AUTORXSPAN,
168 * and IOCOMF_AUTORXCIRC in the kdmsg_iocom_init() call. Clients
169 * typically set IOCOMF_AUTOTXCIRC to automatically forge circuits
170 * for received SPANs.
172 static int kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
173 static int kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg);
175 void
176 kdmsg_iocom_autoinitiate(kdmsg_iocom_t *iocom,
177 void (*auto_callback)(kdmsg_msg_t *msg))
179 kdmsg_msg_t *msg;
181 iocom->auto_callback = auto_callback;
183 msg = kdmsg_msg_alloc(iocom, NULL,
184 DMSG_LNK_CONN | DMSGF_CREATE,
185 kdmsg_lnk_conn_reply, NULL);
186 iocom->auto_lnk_conn.head = msg->any.head;
187 msg->any.lnk_conn = iocom->auto_lnk_conn;
188 iocom->conn_state = msg->state;
189 kdmsg_msg_write(msg);
192 static
194 kdmsg_lnk_conn_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
196 kdmsg_iocom_t *iocom = state->iocom;
197 kdmsg_msg_t *rmsg;
200 * Upon receipt of the LNK_CONN acknowledgement initiate an
201 * automatic SPAN if we were asked to. Used by e.g. xdisk, but
202 * not used by HAMMER2 which must manage more than one transmitted
203 * SPAN.
205 if ((msg->any.head.cmd & DMSGF_CREATE) &&
206 (iocom->flags & KDMSG_IOCOMF_AUTOTXSPAN)) {
207 rmsg = kdmsg_msg_alloc(iocom, NULL,
208 DMSG_LNK_SPAN | DMSGF_CREATE,
209 kdmsg_lnk_span_reply, NULL);
210 iocom->auto_lnk_span.head = rmsg->any.head;
211 rmsg->any.lnk_span = iocom->auto_lnk_span;
212 kdmsg_msg_write(rmsg);
216 * Process shim after the CONN is acknowledged and before the CONN
217 * transaction is deleted. For deletions this gives device drivers
218 * the ability to interlock new operations on the circuit before
219 * it becomes illegal and panics.
221 if (iocom->auto_callback)
222 iocom->auto_callback(msg);
224 if ((state->txcmd & DMSGF_DELETE) == 0 &&
225 (msg->any.head.cmd & DMSGF_DELETE)) {
226 iocom->conn_state = NULL;
227 kdmsg_msg_reply(msg, 0);
230 return (0);
233 static
235 kdmsg_lnk_span_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
238 * Be sure to process shim before terminating the SPAN
239 * transaction. Gives device drivers the ability to
240 * interlock new operations on the circuit before it
241 * becomes illegal and panics.
243 if (state->iocom->auto_callback)
244 state->iocom->auto_callback(msg);
246 if ((state->txcmd & DMSGF_DELETE) == 0 &&
247 (msg->any.head.cmd & DMSGF_DELETE)) {
248 kdmsg_msg_reply(msg, 0);
250 return (0);
254 * Disconnect and clean up
256 void
257 kdmsg_iocom_uninit(kdmsg_iocom_t *iocom)
259 kdmsg_state_t *state;
262 * Ask the cluster controller to go away
264 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
265 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
267 while (iocom->msgrd_td || iocom->msgwr_td) {
268 wakeup(&iocom->msg_ctl);
269 lksleep(iocom, &iocom->msglk, 0, "clstrkl", hz);
273 * Cleanup caches
275 if ((state = iocom->freerd_state) != NULL) {
276 iocom->freerd_state = NULL;
277 kdmsg_state_free(state);
280 if ((state = iocom->freewr_state) != NULL) {
281 iocom->freewr_state = NULL;
282 kdmsg_state_free(state);
286 * Drop communications descriptor
288 if (iocom->msg_fp) {
289 fdrop(iocom->msg_fp);
290 iocom->msg_fp = NULL;
292 lockmgr(&iocom->msglk, LK_RELEASE);
296 * Cluster controller thread. Perform messaging functions. We have one
297 * thread for the reader and one for the writer. The writer handles
298 * shutdown requests (which should break the reader thread).
300 static
301 void
302 kdmsg_iocom_thread_rd(void *arg)
304 kdmsg_iocom_t *iocom = arg;
305 dmsg_hdr_t hdr;
306 kdmsg_msg_t *msg = NULL;
307 size_t hbytes;
308 size_t abytes;
309 int error = 0;
311 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0) {
313 * Retrieve the message from the pipe or socket.
315 error = fp_read(iocom->msg_fp, &hdr, sizeof(hdr),
316 NULL, 1, UIO_SYSSPACE);
317 if (error)
318 break;
319 if (hdr.magic != DMSG_HDR_MAGIC) {
320 kprintf("kdmsg: bad magic: %04x\n", hdr.magic);
321 error = EINVAL;
322 break;
324 hbytes = (hdr.cmd & DMSGF_SIZE) * DMSG_ALIGN;
325 if (hbytes < sizeof(hdr) || hbytes > DMSG_AUX_MAX) {
326 kprintf("kdmsg: bad header size %zd\n", hbytes);
327 error = EINVAL;
328 break;
330 /* XXX messy: mask cmd to avoid allocating state */
331 msg = kdmsg_msg_alloc(iocom, NULL,
332 hdr.cmd & DMSGF_BASECMDMASK,
333 NULL, NULL);
334 msg->any.head = hdr;
335 msg->hdr_size = hbytes;
336 if (hbytes > sizeof(hdr)) {
337 error = fp_read(iocom->msg_fp, &msg->any.head + 1,
338 hbytes - sizeof(hdr),
339 NULL, 1, UIO_SYSSPACE);
340 if (error) {
341 kprintf("kdmsg: short msg received\n");
342 error = EINVAL;
343 break;
346 msg->aux_size = hdr.aux_bytes;
347 if (msg->aux_size > DMSG_AUX_MAX) {
348 kprintf("kdmsg: illegal msg payload size %zd\n",
349 msg->aux_size);
350 error = EINVAL;
351 break;
353 if (msg->aux_size) {
354 abytes = DMSG_DOALIGN(msg->aux_size);
355 msg->aux_data = kmalloc(abytes, iocom->mmsg, M_WAITOK);
356 msg->flags |= KDMSG_FLAG_AUXALLOC;
357 error = fp_read(iocom->msg_fp, msg->aux_data,
358 abytes, NULL, 1, UIO_SYSSPACE);
359 if (error) {
360 kprintf("kdmsg: short msg payload received\n");
361 break;
365 (void)kdmsg_circ_msgrx(msg);
366 error = kdmsg_msg_receive_handling(msg);
367 msg = NULL;
370 if (error)
371 kprintf("kdmsg: read failed error %d\n", error);
373 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
374 if (msg)
375 kdmsg_msg_free(msg);
378 * Shutdown the socket before waiting for the transmit side.
380 * If we are dying due to e.g. a socket disconnect verses being
381 * killed explicity we have to set KILL in order to kick the tx
382 * side when it might not have any other work to do. KILL might
383 * already be set if we are in an unmount or reconnect.
385 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
387 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILL);
388 wakeup(&iocom->msg_ctl);
391 * Wait for the transmit side to drain remaining messages
392 * before cleaning up the rx state. The transmit side will
393 * set KILLTX and wait for the rx side to completely finish
394 * (set msgrd_td to NULL) before cleaning up any remaining
395 * tx states.
397 lockmgr(&iocom->msglk, LK_RELEASE);
398 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLRX);
399 wakeup(&iocom->msg_ctl);
400 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILLTX) == 0) {
401 wakeup(&iocom->msg_ctl);
402 tsleep(iocom, 0, "clstrkw", hz);
405 iocom->msgrd_td = NULL;
408 * iocom can be ripped out from under us at this point but
409 * wakeup() is safe.
411 wakeup(iocom);
412 lwkt_exit();
415 static
416 void
417 kdmsg_iocom_thread_wr(void *arg)
419 kdmsg_iocom_t *iocom = arg;
420 kdmsg_msg_t *msg;
421 kdmsg_state_t *state;
422 ssize_t res;
423 size_t abytes;
424 int error = 0;
425 int retries = 20;
428 * Transmit loop
430 msg = NULL;
431 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
433 while ((iocom->msg_ctl & KDMSG_CLUSTERCTL_KILL) == 0 && error == 0) {
435 * Sleep if no messages pending. Interlock with flag while
436 * holding msglk.
438 if (TAILQ_EMPTY(&iocom->msgq)) {
439 atomic_set_int(&iocom->msg_ctl,
440 KDMSG_CLUSTERCTL_SLEEPING);
441 lksleep(&iocom->msg_ctl, &iocom->msglk, 0, "msgwr", hz);
442 atomic_clear_int(&iocom->msg_ctl,
443 KDMSG_CLUSTERCTL_SLEEPING);
446 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
448 * Remove msg from the transmit queue and do
449 * persist and half-closed state handling.
451 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
452 lockmgr(&iocom->msglk, LK_RELEASE);
454 error = kdmsg_state_msgtx(msg);
455 if (error == EALREADY) {
456 error = 0;
457 kdmsg_msg_free(msg);
458 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
459 continue;
461 if (error) {
462 kdmsg_msg_free(msg);
463 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
464 break;
468 * Dump the message to the pipe or socket.
470 * We have to clean up the message as if the transmit
471 * succeeded even if it failed.
473 error = fp_write(iocom->msg_fp, &msg->any,
474 msg->hdr_size, &res, UIO_SYSSPACE);
475 if (error || res != msg->hdr_size) {
476 if (error == 0)
477 error = EINVAL;
478 kdmsg_state_cleanuptx(msg);
479 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
480 break;
482 if (msg->aux_size) {
483 abytes = DMSG_DOALIGN(msg->aux_size);
484 error = fp_write(iocom->msg_fp,
485 msg->aux_data, abytes,
486 &res, UIO_SYSSPACE);
487 if (error || res != abytes) {
488 if (error == 0)
489 error = EINVAL;
490 kdmsg_state_cleanuptx(msg);
491 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
492 break;
495 kdmsg_state_cleanuptx(msg);
496 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
501 * Cleanup messages pending transmission and release msgq lock.
503 if (error)
504 kprintf("kdmsg: write failed error %d\n", error);
505 kprintf("thread_wr: Terminating iocom\n");
508 * Shutdown the socket. This will cause the rx thread to get an
509 * EOF and ensure that both threads get to a termination state.
511 fp_shutdown(iocom->msg_fp, SHUT_RDWR);
514 * Set KILLTX (which the rx side waits for), then wait for the RX
515 * side to completely finish before we clean out any remaining
516 * command states.
518 lockmgr(&iocom->msglk, LK_RELEASE);
519 atomic_set_int(&iocom->msg_ctl, KDMSG_CLUSTERCTL_KILLTX);
520 wakeup(&iocom->msg_ctl);
521 while (iocom->msgrd_td) {
522 wakeup(&iocom->msg_ctl);
523 tsleep(iocom, 0, "clstrkw", hz);
525 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
528 * Simulate received MSGF_DELETE's for any remaining states.
529 * (For remote masters).
531 * Drain the message queue to handle any device initiated writes
532 * due to state callbacks.
534 cleanuprd:
535 kdmsg_drain_msgq(iocom);
536 RB_FOREACH(state, kdmsg_state_tree, &iocom->staterd_tree) {
537 if ((state->rxcmd & DMSGF_DELETE) == 0) {
538 lockmgr(&iocom->msglk, LK_RELEASE);
539 kdmsg_state_abort(state);
540 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
541 goto cleanuprd;
546 * Simulate received MSGF_DELETE's for any remaining states.
547 * (For local masters).
549 cleanupwr:
550 kdmsg_drain_msgq(iocom);
551 RB_FOREACH(state, kdmsg_state_tree, &iocom->statewr_tree) {
552 if ((state->rxcmd & DMSGF_DELETE) == 0) {
553 lockmgr(&iocom->msglk, LK_RELEASE);
554 kdmsg_state_abort(state);
555 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
556 goto cleanupwr;
561 * Retry until all work is done
563 if (--retries == 0)
564 panic("kdmsg: comm thread shutdown couldn't drain");
565 if (TAILQ_FIRST(&iocom->msgq) ||
566 RB_ROOT(&iocom->staterd_tree) ||
567 RB_ROOT(&iocom->statewr_tree)) {
568 goto cleanuprd;
570 iocom->flags |= KDMSG_IOCOMF_EXITNOACC;
572 lockmgr(&iocom->msglk, LK_RELEASE);
575 * The state trees had better be empty now
577 KKASSERT(RB_EMPTY(&iocom->staterd_tree));
578 KKASSERT(RB_EMPTY(&iocom->statewr_tree));
579 KKASSERT(iocom->conn_state == NULL);
581 if (iocom->exit_func) {
583 * iocom is invalid after we call the exit function.
585 iocom->msgwr_td = NULL;
586 iocom->exit_func(iocom);
587 } else {
589 * iocom can be ripped out from under us once msgwr_td is
590 * set to NULL. The wakeup is safe.
592 iocom->msgwr_td = NULL;
593 wakeup(iocom);
595 lwkt_exit();
599 * This cleans out the pending transmit message queue, adjusting any
600 * persistent states properly in the process.
602 * Caller must hold pmp->iocom.msglk
604 void
605 kdmsg_drain_msgq(kdmsg_iocom_t *iocom)
607 kdmsg_msg_t *msg;
610 * Clean out our pending transmit queue, executing the
611 * appropriate state adjustments. If this tries to open
612 * any new outgoing transactions we have to loop up and
613 * clean them out.
615 while ((msg = TAILQ_FIRST(&iocom->msgq)) != NULL) {
616 TAILQ_REMOVE(&iocom->msgq, msg, qentry);
617 lockmgr(&iocom->msglk, LK_RELEASE);
618 if (kdmsg_state_msgtx(msg))
619 kdmsg_msg_free(msg);
620 else
621 kdmsg_state_cleanuptx(msg);
622 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
627 * Do all processing required to handle a freshly received message
628 * after its low level header has been validated.
630 static
632 kdmsg_msg_receive_handling(kdmsg_msg_t *msg)
634 kdmsg_iocom_t *iocom = msg->iocom;
635 int error;
638 * State machine tracking, state assignment for msg,
639 * returns error and discard status. Errors are fatal
640 * to the connection except for EALREADY which forces
641 * a discard without execution.
643 error = kdmsg_state_msgrx(msg);
644 if (error) {
646 * Raw protocol or connection error
648 kdmsg_msg_free(msg);
649 if (error == EALREADY)
650 error = 0;
651 } else if (msg->state && msg->state->func) {
653 * Message related to state which already has a
654 * handling function installed for it.
656 error = msg->state->func(msg->state, msg);
657 kdmsg_state_cleanuprx(msg);
658 } else if (iocom->flags & KDMSG_IOCOMF_AUTOANY) {
659 error = kdmsg_autorxmsg(msg);
660 kdmsg_state_cleanuprx(msg);
661 } else {
662 error = iocom->rcvmsg(msg);
663 kdmsg_state_cleanuprx(msg);
665 return error;
669 * Process circuit tracking (NEEDS WORK)
671 static
673 kdmsg_circ_msgrx(kdmsg_msg_t *msg)
675 kdmsg_circuit_t dummy;
676 kdmsg_circuit_t *circ;
677 int error = 0;
679 if (msg->any.head.circuit) {
680 dummy.msgid = msg->any.head.circuit;
681 lwkt_gettoken(&kdmsg_token);
682 circ = RB_FIND(kdmsg_circuit_tree, &msg->iocom->circ_tree,
683 &dummy);
684 if (circ) {
685 msg->circ = circ;
686 kdmsg_circ_hold(circ);
688 if (circ == NULL) {
689 kprintf("KDMSG_CIRC_MSGRX CMD %08x: IOCOM %p "
690 "Bad circuit %016jx\n",
691 msg->any.head.cmd,
692 msg->iocom,
693 (intmax_t)msg->any.head.circuit);
694 kprintf("KDMSG_CIRC_MSGRX: Avail circuits: ");
695 RB_FOREACH(circ, kdmsg_circuit_tree,
696 &msg->iocom->circ_tree) {
697 kprintf(" %016jx", (intmax_t)circ->msgid);
699 kprintf("\n");
700 error = EINVAL;
702 lwkt_reltoken(&kdmsg_token);
704 return (error);
708 * Process state tracking for a message after reception, prior to
709 * execution.
711 * Called without msglk held; this routine acquires and releases msglk itself.
713 * All messages are called with dummy state and return actual state.
714 * (One-off messages often just return the same dummy state).
716 * May request that the caller discard the message by returning EALREADY.
717 * msg->state is not used in this case and is allowed to be NULL.
719 * --
721 * These routines handle persistent and command/reply message state via the
722 * CREATE and DELETE flags. The first message in a command or reply sequence
723 * sets CREATE, the last message in a command or reply sequence sets DELETE.
725 * There can be any number of intermediate messages belonging to the same
726 * sequence sent inbetween the CREATE message and the DELETE message,
727 * which set neither flag. This represents a streaming command or reply.
729 * Any command message received with CREATE set expects a reply sequence to
730 * be returned. Reply sequences work the same as command sequences except the
731 * REPLY bit is also sent. Both the command side and reply side can
732 * degenerate into a single message with both CREATE and DELETE set. Note
733 * that one side can be streaming and the other side not, or neither, or both.
735 * The msgid is unique for the initiator. That is, two sides sending a new
736 * message can use the same msgid without colliding.
738 * --
740 * ABORT sequences work by setting the ABORT flag along with normal message
741 * state. However, ABORTs can also be sent on half-closed messages, that is
742 * even if the command or reply side has already sent a DELETE, as long as
743 * the message has not been fully closed it can still send an ABORT+DELETE
744 * to terminate the half-closed message state.
746 * Since ABORT+DELETEs can race we silently discard ABORT's for message
747 * state which has already been fully closed. REPLY+ABORT+DELETEs can
748 * also race, and in this situation the other side might have already
749 * initiated a new unrelated command with the same message id. Since
750 * the abort has not set the CREATE flag the situation can be detected
751 * and the message will also be discarded.
753 * Non-blocking requests can be initiated with ABORT+CREATE[+DELETE].
754 * The ABORT request is essentially integrated into the command instead
755 * of being sent later on. In this situation the command implementation
756 * detects that CREATE and ABORT are both set (vs ABORT alone) and can
757 * special-case non-blocking operation for the command.
759 * NOTE! Messages with ABORT set without CREATE or DELETE are considered
760 * to be mid-stream aborts for command/reply sequences. ABORTs on
761 * one-way messages are not supported.
763 * NOTE! If a command sequence does not support aborts the ABORT flag is
764 * simply ignored.
766 * --
768 * One-off messages (no reply expected) are sent with neither CREATE or DELETE
769 * set. One-off messages cannot be aborted and typically aren't processed
770 * by these routines. The REPLY bit can be used to distinguish whether a
771 * one-off message is a command or reply. For example, one-off replies
772 * will typically just contain status updates.
774 static
776 kdmsg_state_msgrx(kdmsg_msg_t *msg)
778 kdmsg_iocom_t *iocom = msg->iocom;
779 kdmsg_state_t *state;
780 int error;
783 * Make sure a state structure is ready to go in case we need a new
784 * one. This is the only routine which uses freerd_state so no
785 * races are possible.
787 if ((state = iocom->freerd_state) == NULL) {
788 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
789 state->flags = KDMSG_STATE_DYNAMIC;
790 iocom->freerd_state = state;
794 * Lock RB tree and locate existing persistent state, if any.
796 * If received msg is a command state is on staterd_tree.
797 * If received msg is a reply state is on statewr_tree.
799 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
801 state->msgid = msg->any.head.msgid;
802 state->circ = msg->circ;
803 state->iocom = iocom;
804 if (msg->any.head.cmd & DMSGF_REPLY)
805 state = RB_FIND(kdmsg_state_tree, &iocom->statewr_tree, state);
806 else
807 state = RB_FIND(kdmsg_state_tree, &iocom->staterd_tree, state);
808 msg->state = state;
811 * Short-cut one-off or mid-stream messages (state may be NULL).
813 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
814 DMSGF_ABORT)) == 0) {
815 goto done;
819 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
820 * inside the case statements.
822 switch(msg->any.head.cmd & (DMSGF_CREATE|DMSGF_DELETE|DMSGF_REPLY)) {
823 case DMSGF_CREATE:
824 case DMSGF_CREATE | DMSGF_DELETE:
826 * New persistant command received.
828 if (state) {
829 kprintf("kdmsg_state_msgrx: duplicate transaction\n");
830 error = EINVAL;
831 break;
833 state = iocom->freerd_state;
834 iocom->freerd_state = NULL;
835 msg->state = state;
836 state->msg = msg;
837 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
838 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
839 state->txcmd = DMSGF_REPLY;
840 state->msgid = msg->any.head.msgid;
841 if ((state->circ = msg->circ) != NULL)
842 kdmsg_circ_hold(state->circ);
843 RB_INSERT(kdmsg_state_tree, &iocom->staterd_tree, state);
844 state->flags |= KDMSG_STATE_INSERTED;
845 error = 0;
846 break;
847 case DMSGF_DELETE:
849 * Persistent state is expected but might not exist if an
850 * ABORT+DELETE races the close.
852 if (state == NULL) {
853 if (msg->any.head.cmd & DMSGF_ABORT) {
854 error = EALREADY;
855 } else {
856 kprintf("kdmsg_state_msgrx: "
857 "no state for DELETE\n");
858 error = EINVAL;
860 break;
864 * Handle another ABORT+DELETE case if the msgid has already
865 * been reused.
867 if ((state->rxcmd & DMSGF_CREATE) == 0) {
868 if (msg->any.head.cmd & DMSGF_ABORT) {
869 error = EALREADY;
870 } else {
871 kprintf("kdmsg_state_msgrx: "
872 "state reused for DELETE\n");
873 error = EINVAL;
875 break;
877 error = 0;
878 break;
879 default:
881 * Check for mid-stream ABORT command received, otherwise
882 * allow.
884 if (msg->any.head.cmd & DMSGF_ABORT) {
885 if (state == NULL ||
886 (state->rxcmd & DMSGF_CREATE) == 0) {
887 error = EALREADY;
888 break;
891 error = 0;
892 break;
893 case DMSGF_REPLY | DMSGF_CREATE:
894 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
896 * When receiving a reply with CREATE set the original
897 * persistent state message should already exist.
899 if (state == NULL) {
900 kprintf("kdmsg_state_msgrx: no state match for "
901 "REPLY cmd=%08x msgid=%016jx\n",
902 msg->any.head.cmd,
903 (intmax_t)msg->any.head.msgid);
904 error = EINVAL;
905 break;
907 state->rxcmd = msg->any.head.cmd & ~DMSGF_DELETE;
908 error = 0;
909 break;
910 case DMSGF_REPLY | DMSGF_DELETE:
912 * Received REPLY+ABORT+DELETE in case where msgid has
913 * already been fully closed, ignore the message.
915 if (state == NULL) {
916 if (msg->any.head.cmd & DMSGF_ABORT) {
917 error = EALREADY;
918 } else {
919 kprintf("kdmsg_state_msgrx: no state match "
920 "for REPLY|DELETE\n");
921 error = EINVAL;
923 break;
927 * Received REPLY+ABORT+DELETE in case where msgid has
928 * already been reused for an unrelated message,
929 * ignore the message.
931 if ((state->rxcmd & DMSGF_CREATE) == 0) {
932 if (msg->any.head.cmd & DMSGF_ABORT) {
933 error = EALREADY;
934 } else {
935 kprintf("kdmsg_state_msgrx: state reused "
936 "for REPLY|DELETE\n");
937 error = EINVAL;
939 break;
941 error = 0;
942 break;
943 case DMSGF_REPLY:
945 * Check for mid-stream ABORT reply received to sent command.
947 if (msg->any.head.cmd & DMSGF_ABORT) {
948 if (state == NULL ||
949 (state->rxcmd & DMSGF_CREATE) == 0) {
950 error = EALREADY;
951 break;
954 error = 0;
955 break;
959 * Calculate the easy-switch() transactional command. Represents
960 * the outer-transaction command for any transaction-create or
961 * transaction-delete, and the inner message command for any
962 * non-transaction or inside-transaction command. tcmd will be
963 * set to 0 for any messaging error condition.
965 * The two can be told apart because outer-transaction commands
966 * always have a DMSGF_CREATE and/or DMSGF_DELETE flag.
968 done:
969 lockmgr(&iocom->msglk, LK_RELEASE);
971 if (msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE)) {
972 if (state) {
973 msg->tcmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
974 (msg->any.head.cmd & (DMSGF_CREATE |
975 DMSGF_DELETE |
976 DMSGF_REPLY));
977 } else {
978 msg->tcmd = 0;
980 } else {
981 msg->tcmd = msg->any.head.cmd & DMSGF_CMDSWMASK;
983 return (error);
987 * Called instead of iocom->rcvmsg() if any of the AUTO flags are set.
988 * This routine must call iocom->rcvmsg() for anything not automatically
989 * handled.
991 static int
992 kdmsg_autorxmsg(kdmsg_msg_t *msg)
994 kdmsg_iocom_t *iocom = msg->iocom;
995 kdmsg_circuit_t *circ;
996 int error = 0;
997 uint32_t cmd;
1000 * Main switch processes transaction create/delete sequences only.
1001 * Use icmd (DELETEs use DMSG_LNK_ERROR
1003 * NOTE: If processing in-transaction messages you generally want
1004 * an inner switch on msg->any.head.cmd.
1006 if (msg->state) {
1007 cmd = (msg->state->icmd & DMSGF_BASECMDMASK) |
1008 (msg->any.head.cmd & (DMSGF_CREATE |
1009 DMSGF_DELETE |
1010 DMSGF_REPLY));
1011 } else {
1012 cmd = 0;
1015 switch(cmd) {
1016 case DMSG_LNK_CONN | DMSGF_CREATE:
1017 case DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_DELETE:
1019 * Received LNK_CONN transaction. Transmit response and
1020 * leave transaction open, which allows the other end to
1021 * start to the SPAN protocol.
1023 * Handle shim after acknowledging the CONN.
1025 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1026 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1027 kdmsg_msg_result(msg, 0);
1028 if (iocom->auto_callback)
1029 iocom->auto_callback(msg);
1030 } else {
1031 error = iocom->rcvmsg(msg);
1033 break;
1035 /* fall through */
1036 case DMSG_LNK_CONN | DMSGF_DELETE:
1038 * This message is usually simulated after a link is lost
1039 * to clean up the transaction.
1041 if (iocom->flags & KDMSG_IOCOMF_AUTOCONN) {
1042 if (iocom->auto_callback)
1043 iocom->auto_callback(msg);
1044 kdmsg_msg_reply(msg, 0);
1045 } else {
1046 error = iocom->rcvmsg(msg);
1048 break;
1049 case DMSG_LNK_SPAN | DMSGF_CREATE:
1050 case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
1052 * Received LNK_SPAN transaction. We do not have to respond
1053 * but we must leave the transaction open.
1055 * If AUTOTXCIRC is set automatically initiate a virtual
1056 * circuit to the received span. This will attach a
1057 * kdmsg_circuit to the SPAN state. The circuit is lost
1058 * when the span is lost.
1060 * Handle shim after acknowledging the SPAN.
1062 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1063 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1064 if (iocom->flags & KDMSG_IOCOMF_AUTOTXCIRC)
1065 kdmsg_autocirc(msg);
1066 if (iocom->auto_callback)
1067 iocom->auto_callback(msg);
1068 break;
1070 /* fall through */
1071 } else {
1072 error = iocom->rcvmsg(msg);
1073 break;
1075 /* fall through */
1076 case DMSG_LNK_SPAN | DMSGF_DELETE:
1078 * Process shims (auto_callback) before cleaning up the
1079 * circuit structure and closing the transactions. Device
1080 * driver should ensure that the circuit is not used after
1081 * the auto_callback() returns.
1083 * Handle shim before closing the SPAN transaction.
1085 if (iocom->flags & KDMSG_IOCOMF_AUTORXSPAN) {
1086 if (iocom->auto_callback)
1087 iocom->auto_callback(msg);
1088 if (iocom->flags & KDMSG_IOCOMF_AUTOTXCIRC)
1089 kdmsg_autocirc(msg);
1090 kdmsg_msg_reply(msg, 0);
1091 } else {
1092 error = iocom->rcvmsg(msg);
1094 break;
1095 case DMSG_LNK_CIRC | DMSGF_CREATE:
1096 case DMSG_LNK_CIRC | DMSGF_CREATE | DMSGF_DELETE:
1098 * Received LNK_CIRC transaction. We must respond and should
1099 * leave the transaction open, allowing the circuit. The
1100 * remote can start issuing commands to us over the circuit
1101 * even before we respond.
1103 if (iocom->flags & KDMSG_IOCOMF_AUTORXCIRC) {
1104 if ((msg->any.head.cmd & DMSGF_DELETE) == 0) {
1105 circ = kmalloc(sizeof(*circ), iocom->mmsg,
1106 M_WAITOK | M_ZERO);
1107 lwkt_gettoken(&kdmsg_token);
1108 msg->state->any.circ = circ;
1109 circ->iocom = iocom;
1110 circ->rcirc_state = msg->state;
1111 kdmsg_circ_hold(circ); /* for rcirc_state */
1112 circ->weight = 0;
1113 circ->msgid = circ->rcirc_state->msgid;
1114 /* XXX no span link for received circuits */
1115 kdmsg_circ_hold(circ); /* for circ_state */
1117 if (RB_INSERT(kdmsg_circuit_tree,
1118 &iocom->circ_tree, circ)) {
1119 panic("duplicate circuitid allocated");
1121 lwkt_reltoken(&kdmsg_token);
1122 kdmsg_msg_result(msg, 0);
1125 * Handle shim after adding the circuit and
1126 * after acknowledging the CIRC.
1128 if (iocom->auto_callback)
1129 iocom->auto_callback(msg);
1130 break;
1132 /* fall through */
1133 } else {
1134 error = iocom->rcvmsg(msg);
1135 break;
1137 /* fall through */
1138 case DMSG_LNK_CIRC | DMSGF_DELETE:
1139 if (iocom->flags & KDMSG_IOCOMF_AUTORXCIRC) {
1140 circ = msg->state->any.circ;
1141 if (circ == NULL)
1142 break;
1145 * Handle shim before terminating the circuit.
1147 #if 0
1148 kprintf("KDMSG VC: RECEIVE CIRC DELETE "
1149 "IOCOM %p MSGID %016jx\n",
1150 msg->iocom, circ->msgid);
1151 #endif
1152 if (iocom->auto_callback)
1153 iocom->auto_callback(msg);
1155 KKASSERT(circ->rcirc_state == msg->state);
1156 lwkt_gettoken(&kdmsg_token);
1157 circ->rcirc_state = NULL;
1158 msg->state->any.circ = NULL;
1159 RB_REMOVE(kdmsg_circuit_tree, &iocom->circ_tree, circ);
1160 lwkt_reltoken(&kdmsg_token);
1161 kdmsg_circ_drop(circ); /* for rcirc_state */
1162 kdmsg_msg_reply(msg, 0);
1163 } else {
1164 error = iocom->rcvmsg(msg);
1166 break;
1167 default:
1169 * Anything unhandled goes into rcvmsg.
1171 * NOTE: Replies to link-level messages initiated by our side
1172 * are handled by the state callback, they are NOT
1173 * handled here.
1175 error = iocom->rcvmsg(msg);
1176 break;
1178 return (error);
1182 * Handle automatic forging of virtual circuits based on received SPANs.
1183 * (AUTOTXCIRC). Note that other code handles tracking received circuit
1184 * transactions (AUTORXCIRC).
1186 * We can ignore non-transactions here. Use trans->icmd to test the
1187 * transactional command (once past the CREATE the individual message
1188 * commands are not usually the icmd).
1190 * XXX locks
1192 static
1193 void
1194 kdmsg_autocirc(kdmsg_msg_t *msg)
1196 kdmsg_iocom_t *iocom = msg->iocom;
1197 kdmsg_circuit_t *circ;
1198 kdmsg_msg_t *xmsg; /* CIRC */
1200 if (msg->state == NULL)
1201 return;
1204 * Gaining the SPAN, automatically forge a circuit to the target.
1206 * NOTE!! The shim is not executed until we receive an acknowlegement
1207 * to our forged LNK_CIRC (see kdmsg_autocirc_reply()).
1209 if (msg->state->icmd == DMSG_LNK_SPAN &&
1210 (msg->any.head.cmd & DMSGF_CREATE)) {
1211 circ = kmalloc(sizeof(*circ), iocom->mmsg, M_WAITOK | M_ZERO);
1212 lwkt_gettoken(&kdmsg_token);
1213 msg->state->any.circ = circ;
1214 circ->iocom = iocom;
1215 circ->span_state = msg->state;
1216 kdmsg_circ_hold(circ); /* for span_state */
1217 xmsg = kdmsg_msg_alloc(iocom, NULL,
1218 DMSG_LNK_CIRC | DMSGF_CREATE,
1219 kdmsg_autocirc_reply, circ);
1220 circ->circ_state = xmsg->state;
1221 circ->weight = msg->any.lnk_span.dist;
1222 circ->msgid = circ->circ_state->msgid;
1223 kdmsg_circ_hold(circ); /* for circ_state */
1224 #if 0
1225 kprintf("KDMSG VC: CREATE SPAN->CIRC IOCOM %p MSGID %016jx\n",
1226 msg->iocom, circ->msgid);
1227 #endif
1229 if (RB_INSERT(kdmsg_circuit_tree, &iocom->circ_tree, circ))
1230 panic("duplicate circuitid allocated");
1231 lwkt_reltoken(&kdmsg_token);
1233 xmsg->any.lnk_circ.target = msg->any.head.msgid;
1234 kdmsg_msg_write(xmsg);
1238 * Losing the SPAN
1240 * NOTE: When losing a SPAN, any circuits using the span should be
1241 * deleted by the remote end first. XXX might not be ordered
1242 * on actual loss of connection.
1244 if (msg->state->icmd == DMSG_LNK_SPAN &&
1245 (msg->any.head.cmd & DMSGF_DELETE) &&
1246 msg->state->any.circ) {
1247 circ = msg->state->any.circ;
1248 lwkt_gettoken(&kdmsg_token);
1249 circ->span_state = NULL;
1250 msg->state->any.circ = NULL;
1251 RB_REMOVE(kdmsg_circuit_tree, &iocom->circ_tree, circ);
1252 #if 0
1253 kprintf("KDMSG VC: DELETE SPAN->CIRC IOCOM %p MSGID %016jx\n",
1254 msg->iocom, (intmax_t)circ->msgid);
1255 #endif
1256 kdmsg_circ_drop(circ); /* for span_state */
1257 lwkt_reltoken(&kdmsg_token);
1261 static
1263 kdmsg_autocirc_reply(kdmsg_state_t *state, kdmsg_msg_t *msg)
1265 kdmsg_iocom_t *iocom = state->iocom;
1266 kdmsg_circuit_t *circ = state->any.circ;
1269 * Call shim after receiving an acknowlegement to our forged
1270 * circuit and before processing a received termination.
1272 if (iocom->auto_callback)
1273 iocom->auto_callback(msg);
1276 * If the remote is terminating the VC we terminate our side
1278 if ((state->txcmd & DMSGF_DELETE) == 0 &&
1279 (msg->any.head.cmd & DMSGF_DELETE)) {
1280 #if 0
1281 kprintf("KDMSG VC: DELETE CIRC FROM REMOTE\n");
1282 #endif
1283 lwkt_gettoken(&kdmsg_token);
1284 circ->circ_state = NULL;
1285 state->any.circ = NULL;
1286 kdmsg_circ_drop(circ); /* for circ_state */
1287 lwkt_reltoken(&kdmsg_token);
1288 kdmsg_msg_reply(msg, 0);
1290 return (0);
1294 * Post-receive-handling message and state cleanup. This routine is called
1295 * after the state function handling/callback to properly dispose of the
1296 * message and update or dispose of the state.
1298 static
1299 void
1300 kdmsg_state_cleanuprx(kdmsg_msg_t *msg)
1302 kdmsg_iocom_t *iocom = msg->iocom;
1303 kdmsg_state_t *state;
1305 if ((state = msg->state) == NULL) {
1306 kdmsg_msg_free(msg);
1307 } else if (msg->any.head.cmd & DMSGF_DELETE) {
1308 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1309 KKASSERT((state->rxcmd & DMSGF_DELETE) == 0);
1310 state->rxcmd |= DMSGF_DELETE;
1311 if (state->txcmd & DMSGF_DELETE) {
1312 KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1313 if (state->rxcmd & DMSGF_REPLY) {
1314 KKASSERT(msg->any.head.cmd &
1315 DMSGF_REPLY);
1316 RB_REMOVE(kdmsg_state_tree,
1317 &iocom->statewr_tree, state);
1318 } else {
1319 KKASSERT((msg->any.head.cmd &
1320 DMSGF_REPLY) == 0);
1321 RB_REMOVE(kdmsg_state_tree,
1322 &iocom->staterd_tree, state);
1324 state->flags &= ~KDMSG_STATE_INSERTED;
1325 if (msg != state->msg)
1326 kdmsg_msg_free(msg);
1327 lockmgr(&iocom->msglk, LK_RELEASE);
1328 kdmsg_state_free(state);
1329 } else {
1330 if (msg != state->msg)
1331 kdmsg_msg_free(msg);
1332 lockmgr(&iocom->msglk, LK_RELEASE);
1334 } else if (msg != state->msg) {
1335 kdmsg_msg_free(msg);
1340 * Simulate receiving a message which terminates an active transaction
1341 * state. Our simulated received message must set DELETE and may also
1342 * have to set CREATE. It must also ensure that all fields are set such
1343 * that the receive handling code can find the state (kdmsg_state_msgrx())
1344 * or an endless loop will ensue.
1346 * This is used when the other end of the link or virtual circuit is dead
1347 * so the device driver gets a completed transaction for all pending states.
1349 static
1350 void
1351 kdmsg_state_abort(kdmsg_state_t *state)
1353 kdmsg_iocom_t *iocom = state->iocom;
1354 kdmsg_msg_t *msg;
1357 * Prevent recursive aborts which could otherwise occur if the
1358 * simulated message reception runs state->func which then turns
1359 * around and tries to reply to a broken circuit when then calls
1360 * the state abort code again.
1362 if (state->flags & KDMSG_STATE_ABORTING)
1363 return;
1364 state->flags |= KDMSG_STATE_ABORTING;
1367 * Simulatem essage reception
1369 msg = kdmsg_msg_alloc(iocom, state->circ,
1370 DMSG_LNK_ERROR,
1371 NULL, NULL);
1372 if ((state->rxcmd & DMSGF_CREATE) == 0)
1373 msg->any.head.cmd |= DMSGF_CREATE;
1374 msg->any.head.cmd |= DMSGF_DELETE | (state->rxcmd & DMSGF_REPLY);
1375 msg->any.head.error = DMSG_ERR_LOSTLINK;
1376 msg->any.head.msgid = state->msgid;
1377 msg->state = state;
1378 kdmsg_msg_receive_handling(msg);
1382 * Process state tracking for a message prior to transmission.
1384 * Called with msglk held and the msg dequeued. Returns non-zero if
1385 * the message is bad and should be deleted by the caller.
1387 * One-off messages are usually with dummy state and msg->state may be NULL
1388 * in this situation.
1390 * New transactions (when CREATE is set) will insert the state.
1392 * May request that caller discard the message by setting *discardp to 1.
1393 * A NULL state may be returned in this case.
1395 static
1397 kdmsg_state_msgtx(kdmsg_msg_t *msg)
1399 kdmsg_iocom_t *iocom = msg->iocom;
1400 kdmsg_state_t *state;
1401 int error;
1404 * Make sure a state structure is ready to go in case we need a new
1405 * one. This is the only routine which uses freewr_state so no
1406 * races are possible.
1408 if ((state = iocom->freewr_state) == NULL) {
1409 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1410 state->flags = KDMSG_STATE_DYNAMIC;
1411 state->iocom = iocom;
1412 iocom->freewr_state = state;
1416 * Lock RB tree. If persistent state is present it will have already
1417 * been assigned to msg.
1419 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1420 state = msg->state;
1423 * Short-cut one-off or mid-stream messages (state may be NULL).
1425 if ((msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1426 DMSGF_ABORT)) == 0) {
1427 lockmgr(&iocom->msglk, LK_RELEASE);
1428 return(0);
1433 * Switch on CREATE, DELETE, REPLY, and also handle ABORT from
1434 * inside the case statements.
1436 switch(msg->any.head.cmd & (DMSGF_CREATE | DMSGF_DELETE |
1437 DMSGF_REPLY)) {
1438 case DMSGF_CREATE:
1439 case DMSGF_CREATE | DMSGF_DELETE:
1441 * Insert the new persistent message state and mark
1442 * half-closed if DELETE is set. Since this is a new
1443 * message it isn't possible to transition into the fully
1444 * closed state here.
1446 * XXX state must be assigned and inserted by
1447 * kdmsg_msg_write(). txcmd is assigned by us
1448 * on-transmit.
1450 KKASSERT(state != NULL);
1451 state->icmd = msg->any.head.cmd & DMSGF_BASECMDMASK;
1452 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1453 state->rxcmd = DMSGF_REPLY;
1454 error = 0;
1455 break;
1456 case DMSGF_DELETE:
1458 * Sent ABORT+DELETE in case where msgid has already
1459 * been fully closed, ignore the message.
1461 if (state == NULL) {
1462 if (msg->any.head.cmd & DMSGF_ABORT) {
1463 error = EALREADY;
1464 } else {
1465 kprintf("kdmsg_state_msgtx: no state match "
1466 "for DELETE cmd=%08x msgid=%016jx\n",
1467 msg->any.head.cmd,
1468 (intmax_t)msg->any.head.msgid);
1469 error = EINVAL;
1471 break;
1475 * Sent ABORT+DELETE in case where msgid has
1476 * already been reused for an unrelated message,
1477 * ignore the message.
1479 if ((state->txcmd & DMSGF_CREATE) == 0) {
1480 if (msg->any.head.cmd & DMSGF_ABORT) {
1481 error = EALREADY;
1482 } else {
1483 kprintf("kdmsg_state_msgtx: state reused "
1484 "for DELETE\n");
1485 error = EINVAL;
1487 break;
1489 error = 0;
1490 break;
1491 default:
1493 * Check for mid-stream ABORT command sent
1495 if (msg->any.head.cmd & DMSGF_ABORT) {
1496 if (state == NULL ||
1497 (state->txcmd & DMSGF_CREATE) == 0) {
1498 error = EALREADY;
1499 break;
1502 error = 0;
1503 break;
1504 case DMSGF_REPLY | DMSGF_CREATE:
1505 case DMSGF_REPLY | DMSGF_CREATE | DMSGF_DELETE:
1507 * When transmitting a reply with CREATE set the original
1508 * persistent state message should already exist.
1510 if (state == NULL) {
1511 kprintf("kdmsg_state_msgtx: no state match "
1512 "for REPLY | CREATE\n");
1513 error = EINVAL;
1514 break;
1516 state->txcmd = msg->any.head.cmd & ~DMSGF_DELETE;
1517 error = 0;
1518 break;
1519 case DMSGF_REPLY | DMSGF_DELETE:
1521 * When transmitting a reply with DELETE set the original
1522 * persistent state message should already exist.
1524 * This is very similar to the REPLY|CREATE|* case except
1525 * txcmd is already stored, so we just add the DELETE flag.
1527 * Sent REPLY+ABORT+DELETE in case where msgid has
1528 * already been fully closed, ignore the message.
1530 if (state == NULL) {
1531 if (msg->any.head.cmd & DMSGF_ABORT) {
1532 error = EALREADY;
1533 } else {
1534 kprintf("kdmsg_state_msgtx: no state match "
1535 "for REPLY | DELETE\n");
1536 error = EINVAL;
1538 break;
1542 * Sent REPLY+ABORT+DELETE in case where msgid has already
1543 * been reused for an unrelated message, ignore the message.
1545 if ((state->txcmd & DMSGF_CREATE) == 0) {
1546 if (msg->any.head.cmd & DMSGF_ABORT) {
1547 error = EALREADY;
1548 } else {
1549 kprintf("kdmsg_state_msgtx: state reused "
1550 "for REPLY | DELETE\n");
1551 error = EINVAL;
1553 break;
1555 error = 0;
1556 break;
1557 case DMSGF_REPLY:
1559 * Check for mid-stream ABORT reply sent.
1561 * One-off REPLY messages are allowed for e.g. status updates.
1563 if (msg->any.head.cmd & DMSGF_ABORT) {
1564 if (state == NULL ||
1565 (state->txcmd & DMSGF_CREATE) == 0) {
1566 error = EALREADY;
1567 break;
1570 error = 0;
1571 break;
1573 lockmgr(&iocom->msglk, LK_RELEASE);
1574 return (error);
1577 static
1578 void
1579 kdmsg_state_cleanuptx(kdmsg_msg_t *msg)
1581 kdmsg_iocom_t *iocom = msg->iocom;
1582 kdmsg_state_t *state;
1584 if ((state = msg->state) == NULL) {
1585 kdmsg_msg_free(msg);
1586 } else if (msg->any.head.cmd & DMSGF_DELETE) {
1587 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1588 KKASSERT((state->txcmd & DMSGF_DELETE) == 0);
1589 state->txcmd |= DMSGF_DELETE;
1590 if (state->rxcmd & DMSGF_DELETE) {
1591 KKASSERT(state->flags & KDMSG_STATE_INSERTED);
1592 if (state->txcmd & DMSGF_REPLY) {
1593 KKASSERT(msg->any.head.cmd &
1594 DMSGF_REPLY);
1595 RB_REMOVE(kdmsg_state_tree,
1596 &iocom->staterd_tree, state);
1597 } else {
1598 KKASSERT((msg->any.head.cmd &
1599 DMSGF_REPLY) == 0);
1600 RB_REMOVE(kdmsg_state_tree,
1601 &iocom->statewr_tree, state);
1603 state->flags &= ~KDMSG_STATE_INSERTED;
1604 if (msg != state->msg)
1605 kdmsg_msg_free(msg);
1606 lockmgr(&iocom->msglk, LK_RELEASE);
1607 kdmsg_state_free(state);
1608 } else {
1609 if (msg != state->msg)
1610 kdmsg_msg_free(msg);
1611 lockmgr(&iocom->msglk, LK_RELEASE);
1613 } else if (msg != state->msg) {
1614 kdmsg_msg_free(msg);
1618 static
1619 void
1620 kdmsg_state_free(kdmsg_state_t *state)
1622 kdmsg_iocom_t *iocom = state->iocom;
1623 kdmsg_msg_t *msg;
1625 KKASSERT((state->flags & KDMSG_STATE_INSERTED) == 0);
1626 msg = state->msg;
1627 state->msg = NULL;
1628 kfree(state, iocom->mmsg);
1629 if (msg) {
1630 msg->state = NULL;
1631 kdmsg_msg_free(msg);
1635 kdmsg_msg_t *
1636 kdmsg_msg_alloc(kdmsg_iocom_t *iocom, kdmsg_circuit_t *circ, uint32_t cmd,
1637 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1639 kdmsg_msg_t *msg;
1640 kdmsg_state_t *state;
1641 size_t hbytes;
1643 KKASSERT(iocom != NULL);
1644 hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1645 msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1646 iocom->mmsg, M_WAITOK | M_ZERO);
1647 msg->hdr_size = hbytes;
1648 msg->iocom = iocom;
1649 msg->any.head.magic = DMSG_HDR_MAGIC;
1650 msg->any.head.cmd = cmd;
1651 if (circ) {
1652 kdmsg_circ_hold(circ);
1653 msg->circ = circ;
1654 msg->any.head.circuit = circ->msgid;
1657 if (cmd & DMSGF_CREATE) {
1659 * New transaction, requires tracking state and a unique
1660 * msgid to be allocated.
1662 KKASSERT(msg->state == NULL);
1663 state = kmalloc(sizeof(*state), iocom->mmsg, M_WAITOK | M_ZERO);
1664 state->flags = KDMSG_STATE_DYNAMIC;
1665 state->func = func;
1666 state->any.any = data;
1667 state->msg = msg;
1668 state->msgid = (uint64_t)(uintptr_t)state;
1669 state->circ = circ;
1670 state->iocom = iocom;
1671 msg->state = state;
1672 if (circ)
1673 kdmsg_circ_hold(circ);
1674 /*msg->any.head.msgid = state->msgid;XXX*/
1676 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1677 if (RB_INSERT(kdmsg_state_tree, &iocom->statewr_tree, state))
1678 panic("duplicate msgid allocated");
1679 state->flags |= KDMSG_STATE_INSERTED;
1680 msg->any.head.msgid = state->msgid;
1681 lockmgr(&iocom->msglk, LK_RELEASE);
1683 return (msg);
1686 kdmsg_msg_t *
1687 kdmsg_msg_alloc_state(kdmsg_state_t *state, uint32_t cmd,
1688 int (*func)(kdmsg_state_t *, kdmsg_msg_t *), void *data)
1690 kdmsg_iocom_t *iocom = state->iocom;
1691 kdmsg_msg_t *msg;
1692 size_t hbytes;
1694 KKASSERT(iocom != NULL);
1695 hbytes = (cmd & DMSGF_SIZE) * DMSG_ALIGN;
1696 msg = kmalloc(offsetof(struct kdmsg_msg, any) + hbytes,
1697 iocom->mmsg, M_WAITOK | M_ZERO);
1698 msg->hdr_size = hbytes;
1699 msg->iocom = iocom;
1700 msg->any.head.magic = DMSG_HDR_MAGIC;
1701 msg->any.head.cmd = cmd;
1702 msg->state = state;
1703 if (state->circ) {
1704 kdmsg_circ_hold(state->circ);
1705 msg->circ = state->circ;
1706 msg->any.head.circuit = state->circ->msgid;
1708 return(msg);
1711 void
1712 kdmsg_msg_free(kdmsg_msg_t *msg)
1714 kdmsg_iocom_t *iocom = msg->iocom;
1716 if ((msg->flags & KDMSG_FLAG_AUXALLOC) &&
1717 msg->aux_data && msg->aux_size) {
1718 kfree(msg->aux_data, iocom->mmsg);
1719 msg->flags &= ~KDMSG_FLAG_AUXALLOC;
1721 if (msg->circ) {
1722 kdmsg_circ_drop(msg->circ);
1723 msg->circ = NULL;
1725 if (msg->state) {
1726 if (msg->state->msg == msg)
1727 msg->state->msg = NULL;
1728 msg->state = NULL;
1730 msg->aux_data = NULL;
1731 msg->aux_size = 0;
1732 msg->iocom = NULL;
1733 kfree(msg, iocom->mmsg);
1737 * Circuits are tracked in a red-black tree by their circuit id (msgid).
1740 kdmsg_circuit_cmp(kdmsg_circuit_t *circ1, kdmsg_circuit_t *circ2)
1742 if (circ1->msgid < circ2->msgid)
1743 return(-1);
1744 if (circ1->msgid > circ2->msgid)
1745 return(1);
1746 return (0);
1750 * Indexed messages are stored in a red-black tree indexed by their
1751 * msgid. Only persistent messages are indexed.
1754 kdmsg_state_cmp(kdmsg_state_t *state1, kdmsg_state_t *state2)
1756 if (state1->iocom < state2->iocom)
1757 return(-1);
1758 if (state1->iocom > state2->iocom)
1759 return(1);
1760 if (state1->circ < state2->circ)
1761 return(-1);
1762 if (state1->circ > state2->circ)
1763 return(1);
1764 if (state1->msgid < state2->msgid)
1765 return(-1);
1766 if (state1->msgid > state2->msgid)
1767 return(1);
1768 return(0);
1772 * Write a message. All requisit command flags have been set.
1774 * If msg->state is non-NULL the message is written to the existing
1775 * transaction. msgid will be set accordingly.
1777 * If msg->state is NULL and CREATE is set new state is allocated and
1778 * (func, data) is installed. A msgid is assigned.
1780 * If msg->state is NULL and CREATE is not set the message is assumed
1781 * to be a one-way message. The originator must assign the msgid
1782 * (or leave it 0, which is typical.
1784 * This function merely queues the message to the management thread, it
1785 * does not write to the message socket/pipe.
1787 void
1788 kdmsg_msg_write(kdmsg_msg_t *msg)
1790 kdmsg_iocom_t *iocom = msg->iocom;
1791 kdmsg_state_t *state;
1793 if (msg->state) {
1795 * Continuance or termination of existing transaction.
1796 * The transaction could have been initiated by either end.
1798 * (Function callback and aux data for the receive side can
1799 * be replaced or left alone).
1801 state = msg->state;
1802 msg->any.head.msgid = state->msgid;
1803 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1804 } else {
1806 * One-off message (always uses msgid 0 to distinguish
1807 * between a possibly lost in-transaction message due to
1808 * competing aborts and a real one-off message?)
1810 state = NULL;
1811 msg->any.head.msgid = 0;
1812 lockmgr(&iocom->msglk, LK_EXCLUSIVE);
1816 * With AUTORXCIRC and AUTOTXCIRC it is possible for the circuit to
1817 * get ripped out in the rxthread while some other thread is
1818 * holding a ref on it inbetween allocating and sending a dmsg.
1820 if (msg->circ && msg->circ->rcirc_state == NULL &&
1821 (msg->circ->span_state == NULL || msg->circ->circ_state == NULL)) {
1822 kprintf("kdmsg_msg_write: Attempt to write message to "
1823 "terminated circuit: msg %08x\n", msg->any.head.cmd);
1824 lockmgr(&iocom->msglk, LK_RELEASE);
1825 if (kdmsg_state_msgtx(msg)) {
1826 if (state == NULL || msg != state->msg)
1827 kdmsg_msg_free(msg);
1828 } else if ((msg->state->rxcmd & DMSGF_DELETE) == 0) {
1829 /* XXX SMP races simulating a response here */
1830 kdmsg_state_t *state = msg->state;
1831 kdmsg_state_cleanuptx(msg);
1832 kdmsg_state_abort(state);
1833 } else {
1834 kdmsg_state_cleanuptx(msg);
1836 return;
1840 * This flag is not set until after the tx thread has drained
1841 * the txmsgq and simulated responses. After that point the
1842 * txthread is dead and can no longer simulate responses.
1844 * Device drivers should never try to send a message once this
1845 * flag is set. They should have detected (through the state
1846 * closures) that the link is in trouble.
1848 if (iocom->flags & KDMSG_IOCOMF_EXITNOACC) {
1849 lockmgr(&iocom->msglk, LK_RELEASE);
1850 panic("kdmsg_msg_write: Attempt to write message to "
1851 "terminated iocom\n");
1855 * Finish up the msg fields. Note that msg->aux_size and the
1856 * aux_bytes stored in the message header represent the unaligned
1857 * (actual) bytes of data, but the buffer is sized to an aligned
1858 * size and the CRC is generated over the aligned length.
1860 msg->any.head.salt = /* (random << 8) | */ (iocom->msg_seq & 255);
1861 ++iocom->msg_seq;
1863 if (msg->aux_data && msg->aux_size) {
1864 uint32_t abytes = DMSG_DOALIGN(msg->aux_size);
1866 msg->any.head.aux_bytes = msg->aux_size;
1867 msg->any.head.aux_crc = iscsi_crc32(msg->aux_data, abytes);
1869 msg->any.head.hdr_crc = 0;
1870 msg->any.head.hdr_crc = iscsi_crc32(msg->any.buf, msg->hdr_size);
1872 TAILQ_INSERT_TAIL(&iocom->msgq, msg, qentry);
1874 if (iocom->msg_ctl & KDMSG_CLUSTERCTL_SLEEPING) {
1875 atomic_clear_int(&iocom->msg_ctl,
1876 KDMSG_CLUSTERCTL_SLEEPING);
1877 wakeup(&iocom->msg_ctl);
1880 lockmgr(&iocom->msglk, LK_RELEASE);
1884 * Reply to a message and terminate our side of the transaction.
1886 * If msg->state is non-NULL we are replying to a one-way message.
1888 void
1889 kdmsg_msg_reply(kdmsg_msg_t *msg, uint32_t error)
1891 kdmsg_state_t *state = msg->state;
1892 kdmsg_msg_t *nmsg;
1893 uint32_t cmd;
1896 * Reply with a simple error code and terminate the transaction.
1898 cmd = DMSG_LNK_ERROR;
1901 * Check if our direction has even been initiated yet, set CREATE.
1903 * Check what direction this is (command or reply direction). Note
1904 * that txcmd might not have been initiated yet.
1906 * If our direction has already been closed we just return without
1907 * doing anything.
1909 if (state) {
1910 if (state->txcmd & DMSGF_DELETE)
1911 return;
1912 if ((state->txcmd & DMSGF_CREATE) == 0)
1913 cmd |= DMSGF_CREATE;
1914 if (state->txcmd & DMSGF_REPLY)
1915 cmd |= DMSGF_REPLY;
1916 cmd |= DMSGF_DELETE;
1917 } else {
1918 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1919 cmd |= DMSGF_REPLY;
1922 /* XXX messy mask cmd to avoid allocating state */
1923 nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1924 nmsg->any.head.error = error;
1925 kdmsg_msg_write(nmsg);
1929 * Reply to a message and continue our side of the transaction.
1931 * If msg->state is non-NULL we are replying to a one-way message and this
1932 * function degenerates into the same as kdmsg_msg_reply().
1934 void
1935 kdmsg_msg_result(kdmsg_msg_t *msg, uint32_t error)
1937 kdmsg_state_t *state = msg->state;
1938 kdmsg_msg_t *nmsg;
1939 uint32_t cmd;
1942 * Return a simple result code, do NOT terminate the transaction.
1944 cmd = DMSG_LNK_ERROR;
1947 * Check if our direction has even been initiated yet, set CREATE.
1949 * Check what direction this is (command or reply direction). Note
1950 * that txcmd might not have been initiated yet.
1952 * If our direction has already been closed we just return without
1953 * doing anything.
1955 if (state) {
1956 if (state->txcmd & DMSGF_DELETE)
1957 return;
1958 if ((state->txcmd & DMSGF_CREATE) == 0)
1959 cmd |= DMSGF_CREATE;
1960 if (state->txcmd & DMSGF_REPLY)
1961 cmd |= DMSGF_REPLY;
1962 /* continuing transaction, do not set MSGF_DELETE */
1963 } else {
1964 if ((msg->any.head.cmd & DMSGF_REPLY) == 0)
1965 cmd |= DMSGF_REPLY;
1968 /* XXX messy mask cmd to avoid allocating state */
1969 nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
1970 nmsg->any.head.error = error;
1971 kdmsg_msg_write(nmsg);
1975 * Reply to a message and terminate our side of the transaction.
1977 * If msg->state is non-NULL we are replying to a one-way message.
1979 void
1980 kdmsg_state_reply(kdmsg_state_t *state, uint32_t error)
1982 kdmsg_msg_t *nmsg;
1983 uint32_t cmd;
1986 * Reply with a simple error code and terminate the transaction.
1988 cmd = DMSG_LNK_ERROR;
1991 * Check if our direction has even been initiated yet, set CREATE.
1993 * Check what direction this is (command or reply direction). Note
1994 * that txcmd might not have been initiated yet.
1996 * If our direction has already been closed we just return without
1997 * doing anything.
1999 KKASSERT(state);
2000 if (state->txcmd & DMSGF_DELETE)
2001 return;
2002 if ((state->txcmd & DMSGF_CREATE) == 0)
2003 cmd |= DMSGF_CREATE;
2004 if (state->txcmd & DMSGF_REPLY)
2005 cmd |= DMSGF_REPLY;
2006 cmd |= DMSGF_DELETE;
2008 /* XXX messy mask cmd to avoid allocating state */
2009 nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
2010 nmsg->any.head.error = error;
2011 kdmsg_msg_write(nmsg);
2015 * Reply to a message and continue our side of the transaction.
2017 * If msg->state is non-NULL we are replying to a one-way message and this
2018 * function degenerates into the same as kdmsg_msg_reply().
2020 void
2021 kdmsg_state_result(kdmsg_state_t *state, uint32_t error)
2023 kdmsg_msg_t *nmsg;
2024 uint32_t cmd;
2027 * Return a simple result code, do NOT terminate the transaction.
2029 cmd = DMSG_LNK_ERROR;
2032 * Check if our direction has even been initiated yet, set CREATE.
2034 * Check what direction this is (command or reply direction). Note
2035 * that txcmd might not have been initiated yet.
2037 * If our direction has already been closed we just return without
2038 * doing anything.
2040 KKASSERT(state);
2041 if (state->txcmd & DMSGF_DELETE)
2042 return;
2043 if ((state->txcmd & DMSGF_CREATE) == 0)
2044 cmd |= DMSGF_CREATE;
2045 if (state->txcmd & DMSGF_REPLY)
2046 cmd |= DMSGF_REPLY;
2047 /* continuing transaction, do not set MSGF_DELETE */
2049 /* XXX messy mask cmd to avoid allocating state */
2050 nmsg = kdmsg_msg_alloc_state(state, cmd, NULL, NULL);
2051 nmsg->any.head.error = error;
2052 kdmsg_msg_write(nmsg);