/*
 * Copyright (c) 2012-2014 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This module allows disk devices to be created and associated with a
 * communications pipe or socket.  You open the device and issue an
 * ioctl() to install a new disk along with its communications descriptor.
 *
 * All further communication occurs via the descriptor using the DMSG
 * LNK_CONN, LNK_SPAN, and BLOCK protocols.  The descriptor can be a
 * direct connection to a remote machine's disk (in-kernel), to a remote
 * cluster controller, to the local cluster controller, etc.
 *
 * /dev/xdisk is the control device; issue ioctl()s to create the /dev/xa%d
 * devices.  These devices look like raw disks to the system.
 */
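/*
 * A minimal userland sketch of the attach flow described above.  The
 * XDISKIOCATTACH command name and the exact xdisk_attach_ioctl fields
 * are assumptions here; the authoritative definitions live in
 * <sys/xdiskioctl.h>:
 *
 *	int ctl = open("/dev/xdisk", O_RDWR);
 *	struct xdisk_attach_ioctl xaioc;
 *
 *	bzero(&xaioc, sizeof(xaioc));
 *	xaioc.fd = dmsg_fd;		// connected DMSG stream descriptor
 *	if (ioctl(ctl, XDISKIOCATTACH, &xaioc) < 0)
 *		err(1, "xdisk attach");
 *	// /dev/xa%d appears once a PEER_BLOCK LNK_SPAN is received
 */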
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/devicestat.h>
#include <sys/disk.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/tree.h>
#include <sys/kern_syscall.h>

#include <sys/dmsg.h>
#include <sys/xdiskioctl.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
RB_HEAD(xa_softc_tree, xa_softc);
RB_PROTOTYPE(xa_softc_tree, xa_softc, rbnode, xa_softc_cmp);
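/*
 * Attached disks are indexed by the serial number (pfs_label) reported
 * in their LNK_SPAN, via the red-black tree declared above; see
 * xa_softc_cmp() for the comparator.
 */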
static int xa_active;
SYSCTL_INT(_debug, OID_AUTO, xa_active, CTLFLAG_RW, &xa_active, 0,
	   "Number of active xdisk IOs");
static uint64_t xa_last;
SYSCTL_ULONG(_debug, OID_AUTO, xa_last, CTLFLAG_RW, &xa_last, 0,
	   "Offset of last xdisk IO");
static int xa_debug = 1;
SYSCTL_INT(_debug, OID_AUTO, xa_debug, CTLFLAG_RW, &xa_debug, 0,
	   "xdisk debug level");
struct xa_tag {
	TAILQ_ENTRY(xa_tag) entry;
	struct xa_softc	*sc;
	dmsg_blk_error_t status;
	kdmsg_state_t	*state;
	struct bio	*bio;
	int		waiting;
	int		async;
	int		done;
};

typedef struct xa_tag	xa_tag_t;
struct xa_softc {
	struct kdmsg_state_list spanq;
	RB_ENTRY(xa_softc) rbnode;
	cdev_t		dev;
	struct devstat	stats;
	struct disk_info info;
	struct disk	disk;
	uint64_t	keyid;
	int		unit;
	int		opencnt;
	int		spancnt;
	int		serializing;
	int		last_error;
	int		terminating;
	char		peer_label[64];	/* from LNK_SPAN host/dev */
	char		pfs_label[64];	/* from LNK_SPAN serno */
	xa_tag_t	*open_tag;
	TAILQ_HEAD(, bio) bioq;		/* pending BIOs */
	TAILQ_HEAD(, xa_tag) tag_freeq;	/* available I/O tags */
	TAILQ_HEAD(, xa_tag) tag_pendq;	/* running I/O tags */
	struct lock	lk;
};

typedef struct xa_softc	xa_softc_t;
struct xa_iocom {
	TAILQ_ENTRY(xa_iocom) entry;
	kdmsg_iocom_t	iocom;
	xa_softc_t	dummysc;
};

typedef struct xa_iocom	xa_iocom_t;
static int xa_softc_cmp(xa_softc_t *sc1, xa_softc_t *sc2);
RB_GENERATE(xa_softc_tree, xa_softc, rbnode, xa_softc_cmp);
static struct xa_softc_tree xa_device_tree;

#define MAXTAGS		64	/* no real limit */
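/*
 * Each xa_softc preallocates MAXTAGS xa_tag structures on creation.  A
 * tag is the per-I/O transaction context, so at most MAXTAGS block I/Os
 * can be in flight per disk; excess BIOs are parked on sc->bioq until a
 * tag frees up.
 */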
static int xdisk_attach(struct xdisk_attach_ioctl *xaioc);
static int xdisk_detach(struct xdisk_attach_ioctl *xaioc);
static void xaio_exit(kdmsg_iocom_t *iocom);
static int xaio_rcvdmsg(kdmsg_msg_t *msg);

static void xa_terminate_check(struct xa_softc *sc);

static xa_tag_t *xa_setup_cmd(xa_softc_t *sc, struct bio *bio);
static void xa_start(xa_tag_t *tag, kdmsg_msg_t *msg, int async);
static void xa_done(xa_tag_t *tag, int wasbio);
static void xa_release(xa_tag_t *tag, int wasbio);
static uint32_t xa_wait(xa_tag_t *tag);
static int xa_sync_completion(kdmsg_state_t *state, kdmsg_msg_t *msg);
static int xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg);
static void xa_restart_deferred(xa_softc_t *sc);
#define xa_printf(level, ctl, ...)	\
	if (xa_debug >= (level)) kprintf("xdisk: " ctl, __VA_ARGS__)
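/*
 * NOTE: xa_printf() expands to a bare if() statement.  Call it only as
 *	 a full statement; an else clause following an unbraced call
 *	 would bind to the macro's if().  Example of safe use:
 *
 *		if (error) {
 *			xa_printf(1, "error %d\n", error);
 *		}
 */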
MALLOC_DEFINE(M_XDISK, "Networked disk client", "Network Disks");
/*
 * Control device, issue ioctls to create xa devices.
 */
static d_open_t xdisk_open;
static d_close_t xdisk_close;
static d_ioctl_t xdisk_ioctl;

static struct dev_ops xdisk_ops = {
	{ "xdisk", 0, D_MPSAFE | D_TRACKCLOSE },
	.d_open =	xdisk_open,
	.d_close =	xdisk_close,
	.d_ioctl =	xdisk_ioctl
};
static d_open_t xa_open;
static d_close_t xa_close;
static d_ioctl_t xa_ioctl;
static d_strategy_t xa_strategy;
static d_psize_t xa_size;

static struct dev_ops xa_ops = {
	{ "xa", 0, D_DISK | D_CANFREE | D_MPSAFE | D_TRACKCLOSE },
	.d_open =	xa_open,
	.d_close =	xa_close,
	.d_ioctl =	xa_ioctl,
	.d_read =	physread,
	.d_write =	physwrite,
	.d_strategy =	xa_strategy,
	.d_psize =	xa_size
};
static int xdisk_opencount;
static cdev_t xdisk_dev;
struct lock xdisk_lk;
static TAILQ_HEAD(, xa_iocom) xaiocomq;
/*
 * Module initialization
 */
static int
xdisk_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		TAILQ_INIT(&xaiocomq);
		RB_INIT(&xa_device_tree);
		lockinit(&xdisk_lk, "xdisk", 0, 0);
		xdisk_dev = make_dev(&xdisk_ops, 0,
				     UID_ROOT, GID_WHEEL, 0600, "xdisk");
		break;
	case MOD_UNLOAD:
	case MOD_SHUTDOWN:
		if (!RB_EMPTY(&xa_device_tree))
			return (EBUSY);
		if (xdisk_opencount || TAILQ_FIRST(&xaiocomq))
			return (EBUSY);
		if (xdisk_dev) {
			destroy_dev(xdisk_dev);
			xdisk_dev = NULL;
		}
		dev_ops_remove_all(&xdisk_ops);
		dev_ops_remove_all(&xa_ops);
		break;
	default:
		break;
	}
	return 0;
}

DEV_MODULE(xdisk, xdisk_modevent, 0);
static int
xa_softc_cmp(xa_softc_t *sc1, xa_softc_t *sc2)
{
	return (strcmp(sc1->pfs_label, sc2->pfs_label));
}
static int
xdisk_open(struct dev_open_args *ap)
{
	lockmgr(&xdisk_lk, LK_EXCLUSIVE);
	++xdisk_opencount;
	lockmgr(&xdisk_lk, LK_RELEASE);

	return(0);
}
static int
xdisk_close(struct dev_close_args *ap)
{
	lockmgr(&xdisk_lk, LK_EXCLUSIVE);
	--xdisk_opencount;
	lockmgr(&xdisk_lk, LK_RELEASE);

	return(0);
}
static int
xdisk_ioctl(struct dev_ioctl_args *ap)
{
	int error;

	switch(ap->a_cmd) {
	case XDISKIOCATTACH:
		error = xdisk_attach((void *)ap->a_data);
		break;
	case XDISKIOCDETACH:
		error = xdisk_detach((void *)ap->a_data);
		break;
	default:
		error = ENOTTY;
		break;
	}
	return error;
}
/************************************************************************
 *			     DMSG INTERFACE				*
 ************************************************************************/
static int
xdisk_attach(struct xdisk_attach_ioctl *xaioc)
{
	xa_iocom_t *xaio;
	struct file *fp;

	/*
	 * Normalize ioctl params
	 */
	fp = holdfp(curproc->p_fd, xaioc->fd, -1);
	if (fp == NULL)
		return EINVAL;
	xa_printf(1, "xdisk_attach fp=%p\n", fp);

	/*
	 * See if the serial number is already present.  If we are
	 * racing a termination the disk subsystem may still have
	 * duplicate entries not yet removed so we wait a bit and
	 * retry.
	 */
	lockmgr(&xdisk_lk, LK_EXCLUSIVE);

	xaio = kmalloc(sizeof(*xaio), M_XDISK, M_WAITOK | M_ZERO);
	kdmsg_iocom_init(&xaio->iocom, xaio,
			 KDMSG_IOCOMF_AUTOCONN,
			 M_XDISK, xaio_rcvdmsg);
	xaio->iocom.exit_func = xaio_exit;

	kdmsg_iocom_reconnect(&xaio->iocom, fp, "xdisk");

	/*
	 * Setup our LNK_CONN advertisement for autoinitiate.
	 *
	 * Our filter is setup to only accept PEER_BLOCK advertisements.
	 * XXX no peer_id filter.
	 *
	 * We need a unique pfs_fsid to avoid confusion.
	 */
	xaio->iocom.auto_lnk_conn.peer_type = DMSG_PEER_CLIENT;
	xaio->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
	xaio->iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_BLOCK;
	ksnprintf(xaio->iocom.auto_lnk_conn.peer_label,
		  sizeof(xaio->iocom.auto_lnk_conn.peer_label),
		  "%s/xdisk", hostname);
	/* kern_uuidgen(&xaio->iocom.auto_lnk_conn.pfs_fsid, 1); */

	/*
	 * Setup our LNK_SPAN advertisement for autoinitiate
	 */
	TAILQ_INSERT_TAIL(&xaiocomq, xaio, entry);
	kdmsg_iocom_autoinitiate(&xaio->iocom, NULL);

	lockmgr(&xdisk_lk, LK_RELEASE);

	return 0;
}
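/*
 * Note on the autoconnect setup above: KDMSG_IOCOMF_AUTOCONN lets the
 * iocom core initiate the LNK_CONN transaction itself, and peer_mask is
 * a bit-per-peer-type filter, so (1LLU << DMSG_PEER_BLOCK) restricts
 * this iocom to block-device LNK_SPAN advertisements only.
 */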
static int
xdisk_detach(struct xdisk_attach_ioctl *xaioc)
{
	/*
	 * Detach is driven by the iocom disconnect path (xaio_exit());
	 * an explicit ioctl-initiated detach is not implemented.
	 */
	return EOPNOTSUPP;
}
/*
 * Called from iocom core transmit thread upon disconnect.
 */
static void
xaio_exit(kdmsg_iocom_t *iocom)
{
	xa_iocom_t *xaio = iocom->handle;

	lockmgr(&xdisk_lk, LK_EXCLUSIVE);
	xa_printf(1, "%s", "xdisk_detach [xaio_exit()]\n");
	TAILQ_REMOVE(&xaiocomq, xaio, entry);
	lockmgr(&xdisk_lk, LK_RELEASE);

	kdmsg_iocom_uninit(&xaio->iocom);

	kfree(xaio, M_XDISK);
}
/*
 * Called from iocom core to handle messages that the iocom core does not
 * handle itself and for which a state function callback has not yet been
 * established.
 *
 * We primarily care about LNK_SPAN transactions here.
 */
static int
xaio_rcvdmsg(kdmsg_msg_t *msg)
{
	kdmsg_state_t	*state = msg->state;
	xa_iocom_t	*xaio = state->iocom->handle;
	xa_softc_t	*sc;
	xa_softc_t	*sctmp;
	xa_tag_t	*tag;
	cdev_t		dev;
	int		unit;
	int		n;

	xa_printf(3,
	    "xdisk - rcvmsg state=%p rx=%08x tx=%08x msgcmd=%08x\n",
	    state, state->rxcmd, state->txcmd,
	    msg->any.head.cmd);
	lockmgr(&xdisk_lk, LK_EXCLUSIVE);

	switch(msg->tcmd) {
	case DMSG_LNK_SPAN | DMSGF_CREATE | DMSGF_DELETE:
		/*
		 * A LNK_SPAN transaction which is opened and closed
		 * degenerately is not useful to us, just ignore it.
		 */
		kdmsg_msg_reply(msg, 0);
		break;
	case DMSG_LNK_SPAN | DMSGF_CREATE:
		/*
		 * Manage the tracking node for the remote LNK_SPAN.
		 *
		 * Return a streaming result, leaving the transaction open
		 * in both directions to allow sub-transactions.
		 */
		bcopy(msg->any.lnk_span.peer_label, xaio->dummysc.peer_label,
		      sizeof(xaio->dummysc.peer_label));
		xaio->dummysc.peer_label[
		    sizeof(xaio->dummysc.peer_label) - 1] = 0;

		bcopy(msg->any.lnk_span.pfs_label, xaio->dummysc.pfs_label,
		      sizeof(xaio->dummysc.pfs_label));
		xaio->dummysc.pfs_label[
		    sizeof(xaio->dummysc.pfs_label) - 1] = 0;

		xa_printf(3, "LINK_SPAN state %p create for %s\n",
			  msg->state, msg->any.lnk_span.pfs_label);

		sc = RB_FIND(xa_softc_tree, &xa_device_tree, &xaio->dummysc);
		if (sc == NULL) {
			sc = kmalloc(sizeof(*sc), M_XDISK, M_WAITOK | M_ZERO);
			bcopy(msg->any.lnk_span.peer_label, sc->peer_label,
			      sizeof(sc->peer_label));
			sc->peer_label[sizeof(sc->peer_label) - 1] = 0;
			bcopy(msg->any.lnk_span.pfs_label, sc->pfs_label,
			      sizeof(sc->pfs_label));
			sc->pfs_label[sizeof(sc->pfs_label) - 1] = 0;

			/* XXX FIXME O(N^2) */
			unit = -1;
			do {
				++unit;
				RB_FOREACH(sctmp, xa_softc_tree,
					   &xa_device_tree) {
					if (sctmp->unit == unit)
						break;
				}
			} while (sctmp);

			sc->unit = unit;
			sc->serializing = 1;
			sc->spancnt = 1;
			lockinit(&sc->lk, "xalk", 0, 0);
			TAILQ_INIT(&sc->spanq);
			TAILQ_INIT(&sc->bioq);
			TAILQ_INIT(&sc->tag_freeq);
			TAILQ_INIT(&sc->tag_pendq);

			lockmgr(&sc->lk, LK_EXCLUSIVE);
			RB_INSERT(xa_softc_tree, &xa_device_tree, sc);
			TAILQ_INSERT_TAIL(&sc->spanq, msg->state, user_entry);
			msg->state->any.xa_sc = sc;

			for (n = 0; n < MAXTAGS; ++n) {
				tag = kmalloc(sizeof(*tag),
					      M_XDISK, M_WAITOK|M_ZERO);
				tag->sc = sc;
				TAILQ_INSERT_TAIL(&sc->tag_freeq, tag, entry);
			}

			if (sc->dev == NULL) {
				dev = disk_create(unit, &sc->disk, &xa_ops);
				dev->si_drv1 = sc;
				sc->dev = dev;
				devstat_add_entry(&sc->stats, "xa", unit,
						  DEV_BSIZE,
						  DEVSTAT_NO_ORDERED_TAGS,
						  DEVSTAT_TYPE_DIRECT |
						  DEVSTAT_TYPE_IF_OTHER,
						  DEVSTAT_PRIORITY_OTHER);
			}

			sc->info.d_media_blksize =
				msg->any.lnk_span.media.block.blksize;
			if (sc->info.d_media_blksize <= 0)
				sc->info.d_media_blksize = 1;
			sc->info.d_media_blocks =
				msg->any.lnk_span.media.block.bytes /
				sc->info.d_media_blksize;
			sc->info.d_dsflags = DSO_MBRQUIET | DSO_RAWPSIZE;
			sc->info.d_secpertrack = 32;
			sc->info.d_nheads = 64;
			sc->info.d_secpercyl = sc->info.d_secpertrack *
					       sc->info.d_nheads;
			sc->info.d_ncylinders = 0;
			if (sc->pfs_label[0])
				sc->info.d_serialno = sc->pfs_label;
			/*
			 * WARNING! disk_setdiskinfo() must be asynchronous
			 *	    because we are in the rxmsg thread.  If
			 *	    it is synchronous and issues more disk
			 *	    I/Os, we will deadlock.
			 */
			disk_setdiskinfo(&sc->disk, &sc->info);
			xa_restart_deferred(sc);	/* eats serializing */
			lockmgr(&sc->lk, LK_RELEASE);
		} else {
			++sc->spancnt;
			lockmgr(&sc->lk, LK_EXCLUSIVE);
			TAILQ_INSERT_TAIL(&sc->spanq, msg->state, user_entry);
			msg->state->any.xa_sc = sc;
			if (sc->serializing == 0 && sc->open_tag == NULL) {
				sc->serializing = 1;
				xa_restart_deferred(sc); /* eats serializing */
			}
			lockmgr(&sc->lk, LK_RELEASE);
			if (sc->dev && sc->dev->si_disk) {
				xa_printf(1, "reprobe disk: %s\n",
					  sc->pfs_label);
				disk_msg_send(DISK_DISK_REPROBE,
					      sc->dev->si_disk, NULL);
			}
		}
		xa_printf(2, "sc %p spancnt %d\n", sc, sc->spancnt);
		kdmsg_msg_result(msg, 0);
		break;
	case DMSG_LNK_SPAN | DMSGF_DELETE:
		/*
		 * Manage the tracking node for the remote LNK_SPAN.
		 *
		 * Return a final result, closing our end of the transaction.
		 */
		sc = msg->state->any.xa_sc;
		xa_printf(3, "LINK_SPAN state %p delete for %s (sc=%p)\n",
			  msg->state, (sc ? sc->pfs_label : "(null)"), sc);
		lockmgr(&sc->lk, LK_EXCLUSIVE);
		msg->state->any.xa_sc = NULL;
		TAILQ_REMOVE(&sc->spanq, msg->state, user_entry);
		--sc->spancnt;

		xa_printf(2, "sc %p spancnt %d\n", sc, sc->spancnt);

		/*
		 * Spans can come and go as the graph stabilizes, so if
		 * we lose a span along with sc->open_tag we may be able
		 * to restart the I/Os on a different span.
		 */
		if (sc->spancnt &&
		    sc->serializing == 0 && sc->open_tag == NULL) {
			sc->serializing = 1;
			xa_restart_deferred(sc);
		}
		lockmgr(&sc->lk, LK_RELEASE);
		kdmsg_msg_reply(msg, 0);

		if (sc->spancnt == 0)
			xa_terminate_check(sc);
		break;
	case DMSG_LNK_SPAN | DMSGF_DELETE | DMSGF_REPLY:
		/*
		 * Ignore unimplemented streaming replies on our LNK_SPAN
		 * transaction.
		 */
		xa_printf(3, "LINK_SPAN state %p delete+reply\n",
			  msg->state);
		break;
	case DMSG_LNK_SPAN | DMSGF_REPLY:
		/*
		 * Ignore unimplemented streaming replies on our LNK_SPAN
		 * transaction.
		 */
		xa_printf(3, "LINK_SPAN state %p reply\n",
			  msg->state);
		break;
	case DMSG_DBG_SHELL:
		/*
		 * Execute shell command (not supported atm).
		 *
		 * This is a one-way packet but if not (e.g. if part of
		 * a streaming transaction), we will have already closed
		 * our end.
		 */
		kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
		break;
	case DMSG_DBG_SHELL | DMSGF_REPLY:
		/*
		 * Receive one or more replies to a shell command
		 * that we sent.  Just dump it to the console.
		 *
		 * This is a one-way packet but if not (e.g. if
		 * part of a streaming transaction), we will have
		 * already closed our end.
		 */
		if (msg->aux_data) {
			msg->aux_data[msg->aux_size - 1] = 0;
			xa_printf(0, "DEBUGMSG: %s\n", msg->aux_data);
		}
		break;
	default:
		/*
		 * Unsupported one-way message, streaming message, or
		 * transaction.
		 *
		 * Terminate any unsupported transactions with an error
		 * and ignore any unsupported streaming messages.
		 *
		 * NOTE: This case also includes DMSG_LNK_ERROR messages
		 *	 which might be one-way, replying to those would
		 *	 cause an infinite ping-pong.
		 */
		if (msg->any.head.cmd & DMSGF_CREATE)
			kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
		break;
	}
	lockmgr(&xdisk_lk, LK_RELEASE);

	return 0;
}
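/*
 * Summary of the LNK_SPAN hotplug path handled above: the first CREATE
 * for a given serial number instantiates the /dev/xa%d disk, additional
 * CREATEs merely register alternate paths (spans) and trigger a disk
 * reprobe, and the DELETE that drops the last span allows
 * xa_terminate_check() below to reap the softc.
 */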
/*
 * Determine if we can destroy the xa_softc.
 *
 * Called with xdisk_lk held.
 */
static void
xa_terminate_check(struct xa_softc *sc)
{
	xa_tag_t *tag;

	/*
	 * Determine if we can destroy the softc.
	 */
	xa_printf(1, "Terminate check xa%d (%d,%d,%d) sc=%p ",
		  sc->unit,
		  sc->opencnt, sc->serializing, sc->spancnt,
		  sc);

	if (sc->opencnt || sc->serializing || sc->spancnt ||
	    TAILQ_FIRST(&sc->bioq) || TAILQ_FIRST(&sc->tag_pendq)) {
		xa_printf(1, "%s", "(leave intact)\n");
		return;
	}

	/*
	 * Remove from device tree, a race with a new incoming span
	 * will create a new softc and disk.
	 */
	RB_REMOVE(xa_softc_tree, &xa_device_tree, sc);
	sc->terminating = 1;

	/*
	 * Device has to go first to prevent device ops races.
	 */
	if (sc->dev) {
		disk_destroy(&sc->disk);
		devstat_remove_entry(&sc->stats);
		sc->dev->si_drv1 = NULL;
		sc->dev = NULL;
	}

	xa_printf(1, "%s", "(remove from tree)\n");

	KKASSERT(sc->opencnt == 0);
	KKASSERT(TAILQ_EMPTY(&sc->tag_pendq));

	while ((tag = TAILQ_FIRST(&sc->tag_freeq)) != NULL) {
		TAILQ_REMOVE(&sc->tag_freeq, tag, entry);
		kfree(tag, M_XDISK);
	}

	kfree(sc, M_XDISK);
}
/************************************************************************
 *			  XA DEVICE INTERFACE				*
 ************************************************************************/
static int
xa_open(struct dev_open_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	xa_softc_t *sc;
	int error;

	dev->si_bsize_phys = 512;
	dev->si_bsize_best = 32768;

	/*
	 * Interlock open with opencnt, wait for attachment operations
	 * to finish.
	 */
	lockmgr(&xdisk_lk, LK_EXCLUSIVE);
again:
	sc = dev->si_drv1;
	if (sc == NULL) {
		lockmgr(&xdisk_lk, LK_RELEASE);
		return ENXIO;	/* raced destruction */
	}
	if (sc->serializing) {
		tsleep(sc, 0, "xarace", hz / 10);
		goto again;
	}
	if (sc->terminating) {
		lockmgr(&xdisk_lk, LK_RELEASE);
		return ENXIO;	/* raced destruction */
	}
	sc->serializing = 1;

	/*
	 * Serialize initial open
	 */
	if (sc->opencnt++ > 0) {
		sc->serializing = 0;
		wakeup(sc);
		lockmgr(&xdisk_lk, LK_RELEASE);
		return(0);
	}

	/*
	 * Issue BLK_OPEN if necessary.  ENXIO is returned if we have trouble.
	 */
	if (sc->open_tag == NULL) {
		lockmgr(&sc->lk, LK_EXCLUSIVE);
		xa_restart_deferred(sc);	/* eats serializing */
		lockmgr(&sc->lk, LK_RELEASE);
	} else {
		sc->serializing = 0;
		wakeup(sc);
	}
	lockmgr(&xdisk_lk, LK_RELEASE);

	/*
	 * Wait for completion of the BLK_OPEN
	 */
	lockmgr(&xdisk_lk, LK_EXCLUSIVE);
	while (sc->serializing)
		lksleep(sc, &xdisk_lk, 0, "xaopen", hz);

	error = sc->last_error;
	if (error) {
		KKASSERT(sc->opencnt > 0);
		--sc->opencnt;
		xa_terminate_check(sc);
		sc = NULL;	/* sc may be invalid now */
	}
	lockmgr(&xdisk_lk, LK_RELEASE);

	return (error);
}
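/*
 * Open/close interplay in brief: the first open (opencnt 0->1) owns the
 * serializing token and sleeps until xa_sync_completion() records the
 * BLK_OPEN result in sc->last_error; subsequent opens only bump opencnt.
 */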
static int
xa_close(struct dev_close_args *ap)
{
	cdev_t dev = ap->a_head.a_dev;
	xa_softc_t *sc;
	xa_tag_t *tag;

	lockmgr(&xdisk_lk, LK_EXCLUSIVE);
	sc = dev->si_drv1;
	if (sc == NULL) {
		lockmgr(&xdisk_lk, LK_RELEASE);
		return ENXIO;	/* raced destruction */
	}
	if (sc->terminating) {
		lockmgr(&xdisk_lk, LK_RELEASE);
		return ENXIO;	/* raced destruction */
	}
	lockmgr(&sc->lk, LK_EXCLUSIVE);

	/*
	 * NOTE: Clearing open_tag allows a concurrent open to re-open
	 *	 the device and prevents autonomous completion of the tag.
	 */
	if (sc->opencnt == 1 && sc->open_tag) {
		tag = sc->open_tag;
		sc->open_tag = NULL;
		lockmgr(&sc->lk, LK_RELEASE);
		kdmsg_state_reply(tag->state, 0);	/* close our side */
		xa_wait(tag);				/* wait on remote */
	} else {
		lockmgr(&sc->lk, LK_RELEASE);
	}
	KKASSERT(sc->opencnt > 0);
	--sc->opencnt;
	xa_terminate_check(sc);
	lockmgr(&xdisk_lk, LK_RELEASE);

	return(0);
}
static int
xa_strategy(struct dev_strategy_args *ap)
{
	xa_softc_t *sc = ap->a_head.a_dev->si_drv1;
	xa_tag_t *tag;
	struct bio *bio = ap->a_bio;

	devstat_start_transaction(&sc->stats);
	atomic_add_int(&xa_active, 1);
	xa_last = bio->bio_offset;

	/*
	 * If no tags are available NULL is returned and the bio is
	 * placed on sc->bioq.
	 */
	lockmgr(&sc->lk, LK_EXCLUSIVE);
	tag = xa_setup_cmd(sc, bio);
	if (tag)
		xa_start(tag, NULL, 1);
	lockmgr(&sc->lk, LK_RELEASE);

	return(0);
}
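/*
 * Note that xa_strategy() never blocks waiting for a tag: with all
 * MAXTAGS tags busy the BIO simply sits on sc->bioq and is picked up
 * later by xa_release() or xa_sync_completion() when a tag frees up.
 */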
static int
xa_ioctl(struct dev_ioctl_args *ap)
{
	return(ENOTTY);
}
static int
xa_size(struct dev_psize_args *ap)
{
	xa_softc_t *sc;

	if ((sc = ap->a_head.a_dev->si_drv1) == NULL)
		return (ENXIO);
	ap->a_result = sc->info.d_media_blocks;

	return (0);
}
/************************************************************************
 *		    XA BLOCK PROTOCOL STATE MACHINE			*
 ************************************************************************
 *
 * Implement tag/msg setup and related functions.
 * Called with sc->lk held.
 */
static xa_tag_t *
xa_setup_cmd(xa_softc_t *sc, struct bio *bio)
{
	xa_tag_t *tag;

	/*
	 * Only get a tag if we have a valid virtual circuit to the server.
	 */
	if ((tag = TAILQ_FIRST(&sc->tag_freeq)) != NULL) {
		TAILQ_REMOVE(&sc->tag_freeq, tag, entry);
		tag->bio = bio;
		TAILQ_INSERT_TAIL(&sc->tag_pendq, tag, entry);
	}

	/*
	 * If we can't dispatch now and this is a bio, queue it for later.
	 */
	if (tag == NULL && bio) {
		TAILQ_INSERT_TAIL(&sc->bioq, bio, bio_act);
	}

	return (tag);
}
/*
 * Called with sc->lk held
 */
static void
xa_start(xa_tag_t *tag, kdmsg_msg_t *msg, int async)
{
	xa_softc_t *sc = tag->sc;

	tag->done = 0;
	tag->async = async;
	tag->status.head.error = DMSG_ERR_IO;	/* fallback error */

	if (msg == NULL) {
		struct bio *bio = tag->bio;
		struct buf *bp = bio->bio_buf;
		kdmsg_state_t *trans;

		if (sc->opencnt == 0 || sc->open_tag == NULL) {
			TAILQ_FOREACH(trans, &sc->spanq, user_entry) {
				if ((trans->rxcmd & DMSGF_DELETE) == 0)
					break;
			}
		} else {
			trans = sc->open_tag->state;
		}
		if (trans == NULL)
			goto skip;

		switch(bp->b_cmd) {
		case BUF_CMD_READ:
			msg = kdmsg_msg_alloc(trans,
					      DMSG_BLK_READ |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_read.keyid = sc->keyid;
			msg->any.blk_read.offset = bio->bio_offset;
			msg->any.blk_read.bytes = bp->b_bcount;
			break;
		case BUF_CMD_WRITE:
			msg = kdmsg_msg_alloc(trans,
					      DMSG_BLK_WRITE |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_write.keyid = sc->keyid;
			msg->any.blk_write.offset = bio->bio_offset;
			msg->any.blk_write.bytes = bp->b_bcount;
			msg->aux_data = bp->b_data;
			msg->aux_size = bp->b_bcount;
			break;
		case BUF_CMD_FLUSH:
			msg = kdmsg_msg_alloc(trans,
					      DMSG_BLK_FLUSH |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_flush.keyid = sc->keyid;
			msg->any.blk_flush.offset = bio->bio_offset;
			msg->any.blk_flush.bytes = bp->b_bcount;
			break;
		case BUF_CMD_FREEBLKS:
			msg = kdmsg_msg_alloc(trans,
					      DMSG_BLK_FREEBLKS |
					      DMSGF_CREATE | DMSGF_DELETE,
					      xa_bio_completion, tag);
			msg->any.blk_freeblks.keyid = sc->keyid;
			msg->any.blk_freeblks.offset = bio->bio_offset;
			msg->any.blk_freeblks.bytes = bp->b_bcount;
			break;
		default:
			bp->b_flags |= B_ERROR;
			bp->b_error = EIO;
			devstat_end_transaction_buf(&sc->stats, bp);
			atomic_add_int(&xa_active, -1);
			biodone(bio);
			tag->bio = NULL;
			break;
		}
	}

	/*
	 * If no msg was allocated we likely could not find a good span.
	 */
skip:
	if (msg) {
		/*
		 * Message was passed in or constructed.
		 */
		tag->state = msg->state;
		lockmgr(&sc->lk, LK_RELEASE);
		kdmsg_msg_write(msg);
		lockmgr(&sc->lk, LK_EXCLUSIVE);
	} else if (tag->bio &&
		   (tag->bio->bio_buf->b_flags & B_FAILONDIS) == 0) {
		/*
		 * No spans available but BIO is not allowed to fail
		 * on connectivity problems.  Requeue the BIO.
		 */
		TAILQ_INSERT_TAIL(&sc->bioq, tag->bio, bio_act);
		tag->bio = NULL;
		lockmgr(&sc->lk, LK_RELEASE);
		xa_done(tag, 1);
		lockmgr(&sc->lk, LK_EXCLUSIVE);
	} else {
		/*
		 * No spans available, bio is allowed to fail.
		 */
		lockmgr(&sc->lk, LK_RELEASE);
		tag->status.head.error = DMSG_ERR_IO;
		xa_done(tag, 1);
		lockmgr(&sc->lk, LK_EXCLUSIVE);
	}
}
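/*
 * Note the lock juggling above: sc->lk is dropped across
 * kdmsg_msg_write() and xa_done(), apparently so that completion
 * callbacks which themselves take sc->lk cannot deadlock if the
 * message completes synchronously.
 */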
static uint32_t
xa_wait(xa_tag_t *tag)
{
	xa_softc_t *sc = tag->sc;
	uint32_t error;

	lockmgr(&sc->lk, LK_EXCLUSIVE);
	tag->waiting = 1;
	while (tag->done == 0)
		lksleep(tag, &sc->lk, 0, "xawait", 0);
	lockmgr(&sc->lk, LK_RELEASE);

	error = tag->status.head.error;
	tag->waiting = 0;
	xa_release(tag, 0);

	return error;
}
static void
xa_done(xa_tag_t *tag, int wasbio)
{
	KKASSERT(tag->bio == NULL);

	tag->state = NULL;
	tag->done = 1;
	if (tag->waiting)
		wakeup(tag);
	if (tag->async)
		xa_release(tag, wasbio);
}
/*
 * Release a tag.  If everything looks ok and there are pending BIOs
 * (due to all tags in-use), we can use the tag to start the next BIO.
 * Do not try to restart if the connection is currently failed.
 */
static void
xa_release(xa_tag_t *tag, int wasbio)
{
	xa_softc_t *sc = tag->sc;
	struct bio *bio;

	if ((bio = tag->bio) != NULL) {
		struct buf *bp = bio->bio_buf;

		bp->b_error = ENXIO;
		bp->b_flags |= B_ERROR;
		devstat_end_transaction_buf(&sc->stats, bp);
		atomic_add_int(&xa_active, -1);
		biodone(bio);
		tag->bio = NULL;
	}

	lockmgr(&sc->lk, LK_EXCLUSIVE);

	if (wasbio && sc->open_tag &&
	    (bio = TAILQ_FIRST(&sc->bioq)) != NULL) {
		TAILQ_REMOVE(&sc->bioq, bio, bio_act);
		tag->bio = bio;
		xa_start(tag, NULL, 1);
	} else {
		TAILQ_REMOVE(&sc->tag_pendq, tag, entry);
		TAILQ_INSERT_TAIL(&sc->tag_freeq, tag, entry);
	}
	lockmgr(&sc->lk, LK_RELEASE);
}
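/*
 * The tag recycling above means a completed BIO's tag is handed straight
 * to the next queued BIO (wasbio=1 with open_tag intact) instead of
 * bouncing through tag_freeq, keeping the pipeline full under load.
 */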
/*
 * Handle messages under the BLKOPEN transaction.
 */
static int
xa_sync_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
{
	xa_tag_t *tag = state->any.any;
	xa_softc_t *sc;
	struct bio *bio;

	/*
	 * If the tag has been cleaned out we already closed our side
	 * of the transaction and we are waiting for the other side to
	 * close.
	 */
	xa_printf(1, "xa_sync_completion: tag %p msg %08x state %p\n",
		  tag, msg->any.head.cmd, msg->state);

	if (tag == NULL) {
		if (msg->any.head.cmd & DMSGF_CREATE)
			kdmsg_state_reply(state, DMSG_ERR_LOSTLINK);
		return 0;
	}
	sc = tag->sc;
	lockmgr(&sc->lk, LK_EXCLUSIVE);

	/*
	 * Handle initial response to our open and restart any deferred
	 * BIOs on success.
	 *
	 * NOTE: DELETE may also be set.
	 */
	if (msg->any.head.cmd & DMSGF_CREATE) {
		switch(msg->any.head.cmd & DMSGF_CMDSWMASK) {
		case DMSG_LNK_ERROR | DMSGF_REPLY:
			bzero(&tag->status, sizeof(tag->status));
			tag->status.head = msg->any.head;
			break;
		case DMSG_BLK_ERROR | DMSGF_REPLY:
			tag->status = msg->any.blk_error;
			break;
		}
		sc->last_error = tag->status.head.error;
		xa_printf(1, "blk_open completion status %d\n",
			  sc->last_error);
		if (sc->last_error == 0) {
			while ((bio = TAILQ_FIRST(&sc->bioq)) != NULL) {
				tag = xa_setup_cmd(sc, NULL);
				if (tag == NULL)
					break;
				TAILQ_REMOVE(&sc->bioq, bio, bio_act);
				tag->bio = bio;
				xa_start(tag, NULL, 1);
			}
		}
		sc->serializing = 0;
		wakeup(sc);
	}

	/*
	 * Handle unexpected termination (or lost comm channel) from other
	 * side.  Autonomous completion only if open_tag matches,
	 * otherwise another thread is probably waiting on the tag.
	 *
	 * (see xa_close() for other interactions)
	 */
	if (msg->any.head.cmd & DMSGF_DELETE) {
		kdmsg_state_reply(tag->state, 0);
		if (sc->open_tag == tag) {
			sc->open_tag = NULL;
			xa_done(tag, 0);
		}
	}
	lockmgr(&sc->lk, LK_RELEASE);

	return (0);
}
static int
xa_bio_completion(kdmsg_state_t *state, kdmsg_msg_t *msg)
{
	xa_tag_t *tag = state->any.any;
	xa_softc_t *sc = tag->sc;
	struct bio *bio;
	struct buf *bp;

	/*
	 * Get the bio from the tag.  If no bio is present we just do
	 * 'done' handling.
	 */
	if ((bio = tag->bio) == NULL)
		goto handle_done;
	bp = bio->bio_buf;

	/*
	 * Process return status
	 */
	switch(msg->any.head.cmd & DMSGF_CMDSWMASK) {
	case DMSG_LNK_ERROR | DMSGF_REPLY:
		bzero(&tag->status, sizeof(tag->status));
		tag->status.head = msg->any.head;
		if (tag->status.head.error)
			tag->status.resid = bp->b_bcount;
		else
			tag->status.resid = 0;
		break;
	case DMSG_BLK_ERROR | DMSGF_REPLY:
		tag->status = msg->any.blk_error;
		break;
	}

	/*
	 * If the device is open stall the bio on DMSG errors.  If an
	 * actual I/O error occurred on the remote device, DMSG_ERR_IO
	 * will be returned.
	 */
	if (tag->status.head.error &&
	    (msg->any.head.cmd & DMSGF_DELETE) && sc->opencnt) {
		if (tag->status.head.error != DMSG_ERR_IO)
			goto handle_repend;
	}

	/*
	 * Process bio completion
	 *
	 * For reads any returned data is zero-extended if necessary, so
	 * the server can short-cut any all-zeros reads if it desires.
	 */
	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		if (msg->aux_data && msg->aux_size) {
			if (msg->aux_size < bp->b_bcount) {
				bcopy(msg->aux_data, bp->b_data,
				      msg->aux_size);
				bzero(bp->b_data + msg->aux_size,
				      bp->b_bcount - msg->aux_size);
			} else {
				bcopy(msg->aux_data, bp->b_data,
				      bp->b_bcount);
			}
		} else {
			bzero(bp->b_data, bp->b_bcount);
		}
		/* fall through */
	case BUF_CMD_WRITE:
	case BUF_CMD_FLUSH:
	case BUF_CMD_FREEBLKS:
	default:
		if (tag->status.resid > bp->b_bcount)
			tag->status.resid = bp->b_bcount;
		bp->b_resid = tag->status.resid;
		if (tag->status.head.error != 0) {
			bp->b_error = EIO;
			bp->b_flags |= B_ERROR;
		} else {
			bp->b_resid = 0;
		}
		devstat_end_transaction_buf(&sc->stats, bp);
		atomic_add_int(&xa_active, -1);
		biodone(bio);
		tag->bio = NULL;
		break;
	}

	/*
	 * Handle completion of the transaction.  If the bioq is not empty
	 * we can initiate another bio on the same tag.
	 *
	 * NOTE: Most of our transactions will be single-message
	 *	 CREATE+DELETEs, so we won't have to terminate the
	 *	 transaction separately, here.  But just in case they
	 *	 aren't be sure to terminate the transaction.
	 */
handle_done:
	if (msg->any.head.cmd & DMSGF_DELETE) {
		xa_done(tag, 1);
		if ((state->txcmd & DMSGF_DELETE) == 0)
			kdmsg_msg_reply(msg, 0);
	}
	return (0);

	/*
	 * Handle the case where the transaction failed due to a
	 * connectivity issue.  The tag is put away with wasbio=0
	 * and we put the BIO back onto the bioq for a later restart.
	 *
	 * probe I/Os (where the device is not open) will be failed
	 * instead of requeued.
	 */
handle_repend:
	tag->bio = NULL;
	if (bio->bio_buf->b_flags & B_FAILONDIS) {
		xa_printf(1, "xa_strategy: lost link, fail probe bp %p\n",
			  bio->bio_buf);
		bio->bio_buf->b_error = ENXIO;
		bio->bio_buf->b_flags |= B_ERROR;
		biodone(bio);
		bio = NULL;
	} else {
		xa_printf(1, "xa_strategy: lost link, requeue bp %p\n",
			  bio->bio_buf);
	}
	xa_done(tag, 0);
	if ((state->txcmd & DMSGF_DELETE) == 0)
		kdmsg_msg_reply(msg, 0);

	if (bio) {
		lockmgr(&sc->lk, LK_EXCLUSIVE);
		TAILQ_INSERT_TAIL(&sc->bioq, bio, bio_act);
		lockmgr(&sc->lk, LK_RELEASE);
	}
	return (0);
}
/*
 * Restart as much deferred I/O as we can.  The serializer is set and we
 * eat it (clear it) when done.
 *
 * Called with sc->lk held
 */
static void
xa_restart_deferred(xa_softc_t *sc)
{
	kdmsg_state_t *span;
	kdmsg_msg_t *msg;
	xa_tag_t *tag;
	int error;

	KKASSERT(sc->serializing);

	/*
	 * Determine if a restart is needed.
	 */
	if (sc->opencnt == 0) {
		/*
		 * Device is not open, nothing to do, eat serializing.
		 */
		sc->serializing = 0;
		wakeup(sc);
	} else if (sc->open_tag == NULL) {
		/*
		 * BLK_OPEN required before we can restart any BIOs.
		 * Select the best LNK_SPAN to issue the BLK_OPEN under.
		 *
		 * serializing interlocks waiting open()s.
		 */
		error = 0;
		TAILQ_FOREACH(span, &sc->spanq, user_entry) {
			if ((span->rxcmd & DMSGF_DELETE) == 0)
				break;
		}
		if (span == NULL)
			error = ENXIO;
		if (error == 0) {
			tag = xa_setup_cmd(sc, NULL);
			if (tag == NULL)
				error = ENXIO;
		}
		if (error == 0) {
			sc->open_tag = tag;
			msg = kdmsg_msg_alloc(span,
					      DMSG_BLK_OPEN |
					      DMSGF_CREATE,
					      xa_sync_completion, tag);
			msg->any.blk_open.modes = DMSG_BLKOPEN_RD;
			xa_printf(1,
				  "BLK_OPEN tag %p state %p "
				  "span-state %p\n",
				  tag, msg->state, span);
			xa_start(tag, msg, 0);
		}
		if (error) {
			sc->serializing = 0;
			wakeup(sc);
		}
		/* else leave serializing set until BLK_OPEN response */
	} else {
		sc->serializing = 0;
		wakeup(sc);
	}
}
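/*
 * sc->serializing acts as a hand-off token throughout this file: the
 * code path that sets it must either clear it (with a wakeup) or, as
 * the "eats serializing" comments note, pass responsibility on to a
 * completion path such as xa_sync_completion().
 */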