2 * Copyright (c) 2012 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
38 #include <sys/sysctl.h>
41 #include <sys/disklabel.h>
42 #include <sys/disklabel32.h>
43 #include <sys/disklabel64.h>
44 #include <sys/diskslice.h>
45 #include <sys/diskmbr.h>
47 #include <sys/malloc.h>
48 #include <sys/device.h>
49 #include <sys/devfs.h>
50 #include <sys/thread.h>
51 #include <sys/queue.h>
58 #include <sys/msgport2.h>
59 #include <sys/thread2.h>
72 static MALLOC_DEFINE(M_DMSG_DISK
, "dmsg_disk", "disk dmsg");
74 static int blk_active
;
75 SYSCTL_INT(_debug
, OID_AUTO
, blk_active
, CTLFLAG_RW
, &blk_active
, 0,
76 "Number of active iocom IOs");
78 static int disk_iocom_reconnect(struct disk
*dp
, struct file
*fp
);
79 static int disk_rcvdmsg(kdmsg_msg_t
*msg
);
81 static void disk_blk_open(struct disk
*dp
, kdmsg_msg_t
*msg
);
82 static void disk_blk_read(struct disk
*dp
, kdmsg_msg_t
*msg
);
83 static void disk_blk_write(struct disk
*dp
, kdmsg_msg_t
*msg
);
84 static void disk_blk_flush(struct disk
*dp
, kdmsg_msg_t
*msg
);
85 static void disk_blk_freeblks(struct disk
*dp
, kdmsg_msg_t
*msg
);
86 static void diskiodone(struct bio
*bio
);
89 disk_iocom_init(struct disk
*dp
)
91 kdmsg_iocom_init(&dp
->d_iocom
, dp
,
92 KDMSG_IOCOMF_AUTOCONN
|
93 KDMSG_IOCOMF_AUTORXSPAN
|
94 KDMSG_IOCOMF_AUTOTXSPAN
,
95 M_DMSG_DISK
, disk_rcvdmsg
);
99 disk_iocom_update(struct disk
*dp
)
104 disk_iocom_uninit(struct disk
*dp
)
106 kdmsg_iocom_uninit(&dp
->d_iocom
);
110 disk_iocom_ioctl(struct disk
*dp
, u_long cmd
, void *data
)
113 struct disk_ioc_recluster
*recl
;
119 fp
= holdfp(curthread
, recl
->fd
, -1);
121 error
= disk_iocom_reconnect(dp
, fp
);
135 disk_iocom_reconnect(struct disk
*dp
, struct file
*fp
)
139 ksnprintf(devname
, sizeof(devname
), "%s%d",
140 dev_dname(dp
->d_rawdev
), dkunit(dp
->d_rawdev
));
142 kdmsg_iocom_reconnect(&dp
->d_iocom
, fp
, devname
);
144 dp
->d_iocom
.auto_lnk_conn
.proto_version
= DMSG_SPAN_PROTO_1
;
145 dp
->d_iocom
.auto_lnk_conn
.peer_type
= DMSG_PEER_BLOCK
;
146 dp
->d_iocom
.auto_lnk_conn
.peer_mask
= 1LLU << DMSG_PEER_BLOCK
;
147 dp
->d_iocom
.auto_lnk_conn
.peer_mask
= (uint64_t)-1;
149 if (dp
->d_info
.d_serialno
) {
150 ksnprintf(dp
->d_iocom
.auto_lnk_conn
.peer_label
,
151 sizeof(dp
->d_iocom
.auto_lnk_conn
.peer_label
),
152 "%s/%s", hostname
, dp
->d_info
.d_serialno
);
154 ksnprintf(dp
->d_iocom
.auto_lnk_conn
.peer_label
,
155 sizeof(dp
->d_iocom
.auto_lnk_conn
.peer_label
),
156 "%s/%s", hostname
, devname
);
159 ksnprintf(dp
->d_iocom
.auto_lnk_conn
.peer_label
,
160 sizeof(dp
->d_iocom
.auto_lnk_conn
.peer_label
),
161 "%s/%s", hostname
, devname
);
163 dp
->d_iocom
.auto_lnk_span
.proto_version
= DMSG_SPAN_PROTO_1
;
164 dp
->d_iocom
.auto_lnk_span
.peer_type
= DMSG_PEER_BLOCK
;
165 dp
->d_iocom
.auto_lnk_span
.media
.block
.bytes
=
166 dp
->d_info
.d_media_size
;
167 dp
->d_iocom
.auto_lnk_span
.media
.block
.blksize
=
168 dp
->d_info
.d_media_blksize
;
169 ksnprintf(dp
->d_iocom
.auto_lnk_span
.peer_label
,
170 sizeof(dp
->d_iocom
.auto_lnk_span
.peer_label
),
171 "%s", dp
->d_iocom
.auto_lnk_conn
.peer_label
);
172 if (dp
->d_info
.d_serialno
) {
173 ksnprintf(dp
->d_iocom
.auto_lnk_span
.pfs_label
,
174 sizeof(dp
->d_iocom
.auto_lnk_span
.pfs_label
),
175 "%s", dp
->d_info
.d_serialno
);
178 * If no serial number is available generate a dummy serial
179 * number from the host and device name and pray. This will
180 * allow e.g. /dev/vn* to look meaningful on a remote machine.
182 ksnprintf(dp
->d_iocom
.auto_lnk_span
.pfs_label
,
183 sizeof(dp
->d_iocom
.auto_lnk_span
.pfs_label
),
184 "%s.%s", hostname
, devname
);
187 kdmsg_iocom_autoinitiate(&dp
->d_iocom
, NULL
);
193 disk_rcvdmsg(kdmsg_msg_t
*msg
)
195 struct disk
*dp
= msg
->state
->iocom
->handle
;
198 * Handle debug messages (these might not be in transactions)
200 switch(msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) {
203 * Execute shell command (not currently supported)
205 kdmsg_msg_reply(msg
, DMSG_ERR_NOSUPP
);
207 case DMSG_DBG_SHELL
| DMSGF_REPLY
:
209 msg
->aux_data
[msg
->aux_size
- 1] = 0;
210 kprintf("diskiocom: DEBUGMSG: %s\n", msg
->aux_data
);
216 * All remaining messages must be in a transaction.
218 * NOTE! We currently don't care if the transaction is just
219 * the span transaction (for disk probes) or if it is the
220 * BLK_OPEN transaction.
222 * NOTE! We are switching on the first message's command. The
223 * actual message command within the transaction may be
224 * different (if streaming within a transaction).
226 if (msg
->state
== &msg
->state
->iocom
->state0
) {
227 kdmsg_msg_reply(msg
, DMSG_ERR_NOSUPP
);
231 switch(msg
->state
->rxcmd
& DMSGF_CMDSWMASK
) {
233 disk_blk_open(dp
, msg
);
237 * Not normally reached, but left in for completeness
239 disk_blk_read(dp
, msg
);
242 disk_blk_write(dp
, msg
);
245 disk_blk_flush(dp
, msg
);
247 case DMSG_BLK_FREEBLKS
:
248 disk_blk_freeblks(dp
, msg
);
251 if ((msg
->any
.head
.cmd
& DMSGF_REPLY
) == 0) {
252 if (msg
->any
.head
.cmd
& DMSGF_DELETE
)
253 kdmsg_msg_reply(msg
, DMSG_ERR_NOSUPP
);
255 kdmsg_msg_result(msg
, DMSG_ERR_NOSUPP
);
264 disk_blk_open(struct disk
*dp
, kdmsg_msg_t
*msg
)
266 struct dios_open
*openst
;
267 int error
= DMSG_ERR_NOSUPP
;
270 openst
= msg
->state
->any
.any
;
271 if ((msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) == DMSG_BLK_OPEN
) {
272 if (openst
== NULL
) {
273 openst
= kmalloc(sizeof(*openst
), M_DEVBUF
,
275 msg
->state
->any
.any
= openst
;
278 if (msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_RD
)
280 if (msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_WR
)
282 error
= dev_dopen(dp
->d_rawdev
, fflags
, S_IFCHR
, proc0
.p_ucred
, NULL
);
286 if (msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_RD
)
288 if (msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_WR
)
293 if ((msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) == DMSG_BLK_CLOSE
&&
296 if ((msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_RD
) &&
300 if ((msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_WR
) &&
304 error
= dev_dclose(dp
->d_rawdev
, fflags
, S_IFCHR
, NULL
);
308 if (msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_RD
)
310 if (msg
->any
.blk_open
.modes
& DMSG_BLKOPEN_WR
)
315 if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
317 while (openst
->openrd
&& openst
->openwr
) {
320 dev_dclose(dp
->d_rawdev
, FREAD
|FWRITE
, S_IFCHR
, NULL
);
322 while (openst
->openrd
) {
324 dev_dclose(dp
->d_rawdev
, FREAD
, S_IFCHR
, NULL
);
326 while (openst
->openwr
) {
328 dev_dclose(dp
->d_rawdev
, FWRITE
, S_IFCHR
, NULL
);
330 kfree(openst
, M_DEVBUF
);
331 msg
->state
->any
.any
= NULL
;
333 kdmsg_msg_reply(msg
, error
);
335 kdmsg_msg_result(msg
, error
);
341 disk_blk_read(struct disk
*dp
, kdmsg_msg_t
*msg
)
343 struct dios_io
*iost
;
346 int error
= DMSG_ERR_NOSUPP
;
350 * Only DMSG_BLK_READ commands imply read ops.
352 iost
= msg
->state
->any
.any
;
353 if ((msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) == DMSG_BLK_READ
) {
354 if (msg
->any
.blk_read
.bytes
< DEV_BSIZE
||
355 msg
->any
.blk_read
.bytes
> MAXPHYS
) {
356 error
= DMSG_ERR_PARAM
;
360 iost
= kmalloc(sizeof(*iost
), M_DEVBUF
,
362 msg
->state
->any
.any
= iost
;
365 bp
= getpbuf_mem(NULL
);
366 KKASSERT(msg
->any
.blk_read
.bytes
<= bp
->b_bufsize
);
368 bp
->b_cmd
= BUF_CMD_READ
;
369 bp
->b_bcount
= msg
->any
.blk_read
.bytes
;
370 bp
->b_resid
= bp
->b_bcount
;
371 bio
->bio_offset
= msg
->any
.blk_read
.offset
;
372 bio
->bio_caller_info1
.ptr
= msg
->state
;
373 bio
->bio_done
= diskiodone
;
375 /* kdmsg_state_hold(msg->state); */
376 atomic_add_int(&blk_active
, 1);
377 atomic_add_int(&iost
->count
, 1);
378 if (msg
->any
.head
.cmd
& DMSGF_DELETE
)
381 dev_dstrategy(dp
->d_rawdev
, bio
);
385 if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
386 if (iost
&& iost
->count
== 0) {
387 kfree(iost
, M_DEVBUF
);
388 msg
->state
->any
.any
= NULL
;
390 kdmsg_msg_reply(msg
, error
);
392 kdmsg_msg_result(msg
, error
);
399 disk_blk_write(struct disk
*dp
, kdmsg_msg_t
*msg
)
401 struct dios_io
*iost
;
404 int error
= DMSG_ERR_NOSUPP
;
408 * Only DMSG_BLK_WRITE commands imply write ops.
410 iost
= msg
->state
->any
.any
;
411 if ((msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) == DMSG_BLK_WRITE
) {
412 if (msg
->any
.blk_write
.bytes
< DEV_BSIZE
||
413 msg
->any
.blk_write
.bytes
> MAXPHYS
) {
414 error
= DMSG_ERR_PARAM
;
418 iost
= kmalloc(sizeof(*iost
), M_DEVBUF
,
420 msg
->state
->any
.any
= iost
;
424 * Issue WRITE. Short data implies zeros. Try to optimize
425 * the buffer cache buffer for the case where we can just
426 * use the message's data pointer.
429 if (msg
->aux_size
>= msg
->any
.blk_write
.bytes
)
432 bp
= getpbuf_mem(NULL
);
433 KKASSERT(msg
->any
.blk_write
.bytes
<= bp
->b_bufsize
);
435 bp
->b_cmd
= BUF_CMD_WRITE
;
436 bp
->b_bcount
= msg
->any
.blk_write
.bytes
;
437 bp
->b_resid
= bp
->b_bcount
;
438 if (msg
->aux_size
>= msg
->any
.blk_write
.bytes
) {
439 bp
->b_data
= msg
->aux_data
;
440 kdmsg_detach_aux_data(msg
, &iost
->data
);
442 bcopy(msg
->aux_data
, bp
->b_data
, msg
->aux_size
);
443 bzero(bp
->b_data
+ msg
->aux_size
,
444 msg
->any
.blk_write
.bytes
- msg
->aux_size
);
445 bzero(&iost
->data
, sizeof(iost
->data
));
447 bio
->bio_offset
= msg
->any
.blk_write
.offset
;
448 bio
->bio_caller_info1
.ptr
= msg
->state
;
449 bio
->bio_done
= diskiodone
;
451 /* kdmsg_state_hold(msg->state); */
452 atomic_add_int(&blk_active
, 1);
453 atomic_add_int(&iost
->count
, 1);
454 if (msg
->any
.head
.cmd
& DMSGF_DELETE
)
457 dev_dstrategy(dp
->d_rawdev
, bio
);
461 if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
462 if (iost
&& iost
->count
== 0) {
463 kfree(iost
, M_DEVBUF
);
464 msg
->state
->any
.any
= NULL
;
466 kdmsg_msg_reply(msg
, error
);
468 kdmsg_msg_result(msg
, error
);
475 disk_blk_flush(struct disk
*dp
, kdmsg_msg_t
*msg
)
477 struct dios_io
*iost
;
480 int error
= DMSG_ERR_NOSUPP
;
484 * Only DMSG_BLK_FLUSH commands imply flush ops.
486 iost
= msg
->state
->any
.any
;
487 if ((msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) == DMSG_BLK_FLUSH
) {
489 iost
= kmalloc(sizeof(*iost
), M_DEVBUF
,
491 msg
->state
->any
.any
= iost
;
496 bp
->b_cmd
= BUF_CMD_FLUSH
;
497 bp
->b_bcount
= msg
->any
.blk_flush
.bytes
;
499 bio
->bio_offset
= msg
->any
.blk_flush
.offset
;
500 bio
->bio_caller_info1
.ptr
= msg
->state
;
501 bio
->bio_done
= diskiodone
;
503 /* kdmsg_state_hold(msg->state); */
504 atomic_add_int(&blk_active
, 1);
505 atomic_add_int(&iost
->count
, 1);
506 if (msg
->any
.head
.cmd
& DMSGF_DELETE
)
509 dev_dstrategy(dp
->d_rawdev
, bio
);
512 if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
513 if (iost
&& iost
->count
== 0) {
514 kfree(iost
, M_DEVBUF
);
515 msg
->state
->any
.any
= NULL
;
517 kdmsg_msg_reply(msg
, error
);
519 kdmsg_msg_result(msg
, error
);
526 disk_blk_freeblks(struct disk
*dp
, kdmsg_msg_t
*msg
)
528 struct dios_io
*iost
;
531 int error
= DMSG_ERR_NOSUPP
;
535 * Only DMSG_BLK_FREEBLKS commands imply freeblks ops.
537 iost
= msg
->state
->any
.any
;
538 if ((msg
->any
.head
.cmd
& DMSGF_CMDSWMASK
) == DMSG_BLK_FREEBLKS
) {
540 iost
= kmalloc(sizeof(*iost
), M_DEVBUF
,
542 msg
->state
->any
.any
= iost
;
547 bp
->b_cmd
= BUF_CMD_FREEBLKS
;
548 bp
->b_bcount
= msg
->any
.blk_freeblks
.bytes
;
550 bio
->bio_offset
= msg
->any
.blk_freeblks
.offset
;
551 bio
->bio_caller_info1
.ptr
= msg
->state
;
552 bio
->bio_done
= diskiodone
;
554 /* kdmsg_state_hold(msg->state); */
555 atomic_add_int(&blk_active
, 1);
556 atomic_add_int(&iost
->count
, 1);
557 if (msg
->any
.head
.cmd
& DMSGF_DELETE
)
560 dev_dstrategy(dp
->d_rawdev
, bio
);
563 if (msg
->any
.head
.cmd
& DMSGF_DELETE
) {
564 if (iost
&& iost
->count
== 0) {
565 kfree(iost
, M_DEVBUF
);
566 msg
->state
->any
.any
= NULL
;
568 kdmsg_msg_reply(msg
, error
);
570 kdmsg_msg_result(msg
, error
);
577 diskiodone(struct bio
*bio
)
579 struct buf
*bp
= bio
->bio_buf
;
580 kdmsg_state_t
*state
= bio
->bio_caller_info1
.ptr
;
582 struct dios_io
*iost
= state
->any
.any
;
589 cmd
= DMSG_LNK_ERROR
;
595 cmd
= DMSG_LNK_ERROR
;
597 bytes
= bp
->b_bcount
;
600 if (bp
->b_flags
& B_ERROR
) {
606 kdmsg_free_aux_data(&iost
->data
);
609 case BUF_CMD_FREEBLKS
:
610 if (bp
->b_flags
& B_ERROR
)
616 panic("diskiodone: Unknown bio cmd = %d\n",
617 bio
->bio_buf
->b_cmd
);
618 error
= 0; /* avoid compiler warning */
619 break; /* NOT REACHED */
623 * Convert error to DMSG_ERR_* code.
629 * Convert LNK_ERROR to BLK_ERROR if non-zero resid. READS will
630 * have already converted cmd to BLK_ERROR and set up data to return.
632 if (resid
&& cmd
== DMSG_LNK_ERROR
)
633 cmd
= DMSG_BLK_ERROR
;
634 /* XXX txcmd is delayed so this won't work for streaming */
635 if ((state
->txcmd
& DMSGF_CREATE
) == 0) /* assume serialized */
638 if (atomic_fetchadd_int(&iost
->count
, -1) == 1)
641 atomic_add_int(&iost
->count
, -1);
643 atomic_add_int(&blk_active
, -1);
647 * Allocate a basic or extended reply. Be careful not to populate
648 * extended header fields unless we allocated an extended reply.
650 rmsg
= kdmsg_msg_alloc(state
, cmd
, NULL
, 0);
652 rmsg
->aux_data
= kmalloc(bytes
, state
->iocom
->mmsg
, M_INTWAIT
);
653 rmsg
->aux_size
= bytes
;
654 rmsg
->flags
|= KDMSG_FLAG_AUXALLOC
;
655 bcopy(data
, rmsg
->aux_data
, bytes
);
657 rmsg
->any
.blk_error
.head
.error
= error
;
658 if ((cmd
& DMSGF_BASECMDMASK
) == DMSG_BLK_ERROR
)
659 rmsg
->any
.blk_error
.resid
= resid
;
660 bio
->bio_caller_info1
.ptr
= NULL
;
661 /* kdmsg_state_drop(state); */
662 kdmsg_msg_write(rmsg
);
663 if (bp
->b_flags
& B_PAGING
) {
666 bp
->b_flags
|= B_INVAL
| B_AGE
;