/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 * Copyright 2020 Joyent, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Copyright 2020 Joyent, Inc.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#ifndef __FreeBSD__
#include <sys/limits.h>
#include <sys/uio.h>
#include <sys/dkio.h>
#endif

#include <assert.h>
#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <sysexits.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "config.h"
#include "debug.h"
#include "mevent.h"
#include "pci_emul.h"
#include "block_if.h"

#define	BLOCKIF_SIG	0xb109b109

#ifdef __FreeBSD__
#define	BLOCKIF_NUMTHR	8
#else
/* Enlarge to keep pace with the virtio-block ring size */
#define	BLOCKIF_NUMTHR	16
#endif
#define	BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
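
/*
 * Request elements are drawn from a fixed pool of BLOCKIF_MAXREQ entries
 * per context and cycle through free, pending, and busy queues as a pool
 * of BLOCKIF_NUMTHR worker threads services them.  A request whose start
 * offset coincides with the end of an already queued or in-flight request
 * is marked BST_BLOCK and is not dispatched until that earlier request
 * completes (see blockif_enqueue() and blockif_complete()), preserving
 * ordering for back-to-back sequential I/O.
 */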

enum blockop {
	BOP_READ,
	BOP_WRITE,
#ifndef __FreeBSD__
	BOP_WRITE_SYNC,
#endif
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_BLOCK,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req	*be_req;
	enum blockop		be_op;
	enum blockstat		be_status;
	pthread_t		be_tid;
	off_t			be_block;
};

#ifndef __FreeBSD__
enum blockif_wce {
	WCE_NONE = 0,
	WCE_IOCTL,
	WCE_FCNTL
};
#endif

struct blockif_ctxt {
	unsigned int		bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_isgeom;
	int			bc_candelete;
#ifndef __FreeBSD__
	enum blockif_wce	bc_wce;
#endif
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	int			bc_closing;
	pthread_t		bc_btid[BLOCKIF_NUMTHR];
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	blockif_resize_cb	*bc_resize_cb;
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
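
/*
 * Cancellation support: a lock-free singly-linked stack of waiters.  A
 * thread cancelling an in-flight request pushes one of these onto
 * blockif_bse_head and signals the worker with SIGCONT; the signal
 * handler walks the list and wakes every waiter (see blockif_cancel()).
 */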
struct blockif_sig_elem {
	pthread_mutex_t		bse_mtx;
	pthread_cond_t		bse_cond;
	int			bse_pending;
	struct blockif_sig_elem	*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;
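
/*
 * Take a free request element, record the operation and the offset just
 * past the end of the transfer in be_block, and place it on the pending
 * queue.  Returns nonzero if the element is immediately runnable
 * (BST_PEND) and a worker should be woken, zero if it was blocked
 * (BST_BLOCK) behind an earlier request ending at its start offset.
 */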
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	struct blockif_elem *be, *tbe;
	off_t off;
	int i;

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);
	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_req = breq;
	be->be_op = op;
	switch (op) {
	case BOP_READ:
	case BOP_WRITE:
#ifndef __FreeBSD__
	case BOP_WRITE_SYNC:
#endif
	case BOP_DELETE:
		off = breq->br_offset;
		for (i = 0; i < breq->br_iovcnt; i++)
			off += breq->br_iov[i].iov_len;
		break;
	default:
		off = OFF_MAX;
	}
	be->be_block = off;
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_block == breq->br_offset)
			break;
	}
	if (tbe == NULL) {
		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
			if (tbe->be_block == breq->br_offset)
				break;
		}
	}
	if (tbe == NULL)
		be->be_status = BST_PEND;
	else
		be->be_status = BST_BLOCK;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
	return (be->be_status == BST_PEND);
}

static int
blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_status == BST_PEND)
			break;
		assert(be->be_status == BST_BLOCK);
	}
	if (be == NULL)
		return (0);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = t;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
	*bep = be;
	return (1);
}

static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_elem *tbe;

	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	else
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
		if (tbe->be_req->br_offset == be->be_block)
			tbe->be_status = BST_PEND;
	}
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
}

static int
blockif_flush_bc(struct blockif_ctxt *bc)
{
#ifdef __FreeBSD__
	if (bc->bc_ischr) {
		if (ioctl(bc->bc_fd, DIOCGFLUSH))
			return (errno);
	} else if (fsync(bc->bc_fd))
		return (errno);
#else
	/*
	 * This fsync() should be adequate to flush the cache of a file
	 * or device.  In VFS, the VOP_SYNC operation is converted to
	 * the appropriate ioctl in both sdev (for real devices) and
	 * zfs (for zvols).
	 */
	if (fsync(bc->bc_fd))
		return (errno);
#endif

	return (0);
}
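
/*
 * Execute one request on a worker thread.  "buf" is the worker's bounce
 * buffer, allocated in blockif_thr() only for GEOM-backed devices;
 * multi-iov requests on those devices are staged through it in MAXPHYS
 * chunks, apparently so that transfer sizes satisfy the sector-size
 * multiple requirement noted in blockif_open().
 */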
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
{
#ifdef __FreeBSD__
	struct spacectl_range range;
#endif
	struct blockif_req *br;
#ifdef __FreeBSD__
	off_t arg[2];
#endif
	ssize_t n;
	size_t clen, len, off, boff, voff;
	int i, err;

	br = be->be_req;
	assert(br->br_resid >= 0);

	if (br->br_iovcnt <= 1)
		buf = NULL;
	err = 0;
	switch (be->be_op) {
	case BOP_READ:
		if (buf == NULL) {
			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			len = (size_t)n;
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
				    buf + boff, clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);
			off += len;
			br->br_resid -= len;
		}
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly) {
			err = EROFS;
			break;
		}
		if (buf == NULL) {
			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			    br->br_offset)) < 0)
				err = errno;
			else
				br->br_resid -= n;
			break;
		}
		i = 0;
		off = voff = 0;
		while (br->br_resid > 0) {
			len = MIN(br->br_resid, MAXPHYS);
			boff = 0;
			do {
				clen = MIN(len - boff, br->br_iov[i].iov_len -
				    voff);
				memcpy(buf + boff,
				    (uint8_t *)br->br_iov[i].iov_base + voff,
				    clen);
				if (clen < br->br_iov[i].iov_len - voff)
					voff += clen;
				else {
					i++;
					voff = 0;
				}
				boff += clen;
			} while (boff < len);

			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
			if (n < 0) {
				err = errno;
				break;
			}
			off += n;
			br->br_resid -= n;
		}
		break;
	case BOP_FLUSH:
		err = blockif_flush_bc(bc);
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
#ifdef __FreeBSD__
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_resid;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			range.r_offset = br->br_offset;
			range.r_len = br->br_resid;

			while (range.r_len > 0) {
				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
				    &range, 0, &range) != 0) {
					err = errno;
					break;
				}
			}
			if (err == 0)
				br->br_resid = 0;
		}
#else
		else if (bc->bc_ischr) {
			dkioc_free_list_t dfl = {
				.dfl_num_exts = 1,
				.dfl_offset = 0,
				.dfl_flags = 0,
				.dfl_exts = {
					{
						.dfle_start = br->br_offset,
						.dfle_length = br->br_resid
					}
				}
			};

			if (ioctl(bc->bc_fd, DKIOCFREE, &dfl))
				err = errno;
			else
				br->br_resid = 0;
		} else {
			struct flock fl = {
				.l_whence = 0,
				.l_type = F_WRLCK,
				.l_start = br->br_offset,
				.l_len = br->br_resid
			};

			if (fcntl(bc->bc_fd, F_FREESP, &fl))
				err = errno;
			else
				br->br_resid = 0;
		}
#endif
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

static inline bool
blockif_empty(const struct blockif_ctxt *bc)
{
	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
}

static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;
	pthread_t t;
	uint8_t *buf;

	bc = arg;
	if (bc->bc_isgeom)
		buf = malloc(MAXPHYS);
	else
		buf = NULL;
	t = pthread_self();

	pthread_mutex_lock(&bc->bc_mtx);
	for (;;) {
		while (blockif_dequeue(bc, t, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be, buf);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		/* Check ctxt status here to see if exit requested */
		if (bc->bc_closing)
			break;

		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	if (buf)
		free(buf);
	pthread_exit(NULL);
	return (NULL);
}

#ifdef __FreeBSD__
static void
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
    void *arg __unused)
#else
static void
blockif_sigcont_handler(int signal __unused)
#endif
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)bse,
		    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

static void
blockif_init(void)
{
#ifdef __FreeBSD__
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
#else
	(void) sigset(SIGCONT, blockif_sigcont_handler);
#endif
}
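
/*
 * Convert a legacy option string of the form "<path>[,<options>]" into
 * config nodes: the leading component becomes "path" and the remainder
 * is handed to pci_parse_legacy_config().  For example (hypothetical
 * invocation), "disk.img,nocache,sectorsize=512" sets path=disk.img
 * along with the nocache and sectorsize options.
 */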
int
blockif_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *cp, *path;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "path", opts);
		return (0);
	}
	path = strndup(opts, cp - opts);
	set_config_value_node(nvl, "path", path);
	free(path);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

struct blockif_ctxt *
blockif_open(nvlist_t *nvl, const char *ident)
{
	char tname[MAXCOMLEN + 1];
#ifdef __FreeBSD__
	char name[MAXPATHLEN];
#endif
	const char *path, *pssval, *ssval;
	char *cp;
	struct blockif_ctxt *bc;
	struct stat sbuf;
#ifdef __FreeBSD__
	struct diocgattr_arg arg;
#else
	enum blockif_wce wce = WCE_NONE;
#endif
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int ro, candelete, geom, ssopt, pssopt;
	int nodelete;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
#endif

	pthread_once(&blockif_once, blockif_init);

	fd = -1;
	extra = 0;
	ssopt = 0;
#ifndef __FreeBSD__
	pssopt = 0;
#endif
	ro = 0;
	nodelete = 0;

	if (get_config_bool_node_default(nvl, "nocache", false))
		extra |= O_DIRECT;
	if (get_config_bool_node_default(nvl, "nodelete", false))
		nodelete = 1;
	if (get_config_bool_node_default(nvl, "sync", false) ||
	    get_config_bool_node_default(nvl, "direct", false))
		extra |= O_SYNC;
	if (get_config_bool_node_default(nvl, "ro", false))
		ro = 1;
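
	/*
	 * The emulated sector size may be given as "<logical>" or as
	 * "<logical>/<physical>", e.g. sectorsize=512/4096.
	 */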
	ssval = get_config_value_node(nvl, "sectorsize");
	if (ssval != NULL) {
		ssopt = strtol(ssval, &cp, 10);
		if (cp == ssval) {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
		if (*cp == '\0') {
			pssopt = ssopt;
		} else if (*cp == '/') {
			pssval = cp + 1;
			pssopt = strtol(pssval, &cp, 10);
			if (cp == pssval || *cp != '\0') {
				EPRINTLN("Invalid sector size \"%s\"", ssval);
				goto err;
			}
		} else {
			EPRINTLN("Invalid sector size \"%s\"", ssval);
			goto err;
		}
	}

	path = get_config_value_node(nvl, "path");
	if (path == NULL) {
		EPRINTLN("Missing \"path\" for block device.");
		goto err;
	}

	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* Retry a failed r/w open with a r/o open */
		fd = open(path, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		warn("Could not open backing file: %s", path);
		goto err;
	}

	if (fstat(fd, &sbuf) < 0) {
		warn("Could not stat backing file %s", path);
		goto err;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
	if (ro)
		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);

	if (caph_rights_limit(fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = geom = 0;
#ifdef __FreeBSD__
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			goto err;
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
			geom = 1;
	} else {
		psectsz = sbuf.st_blksize;
		/* Avoid fallback implementation */
		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
	}
#else
	psectsz = sbuf.st_blksize;
	if (S_ISCHR(sbuf.st_mode)) {
		struct dk_minfo_ext dkmext;
		int wce_val;

		/* Look for a more accurate physical block/media size */
		if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) {
			psectsz = dkmext.dki_pbsize;
			size = dkmext.dki_lbsize * dkmext.dki_capacity;
		}
		/* See if a configurable write cache is present and working */
		if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) {
			/*
			 * If WCE is already active, disable it until the
			 * specific device driver calls for its return.  If it
			 * is not active, toggle it on and off to verify that
			 * such actions are possible.
			 */
			if (wce_val != 0) {
				wce_val = 0;
				/*
				 * Inability to disable the cache is a threat
				 * to data durability.
				 */
				assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0);
				wce = WCE_IOCTL;
			} else {
				int r1, r2;

				wce_val = 1;
				r1 = ioctl(fd, DKIOCSETWCE, &wce_val);
				wce_val = 0;
				r2 = ioctl(fd, DKIOCSETWCE, &wce_val);

				if (r1 == 0 && r2 == 0) {
					wce = WCE_IOCTL;
				} else {
					/*
					 * If the cache toggle was not
					 * successful, ensure that the cache
					 * was not left enabled.
					 */
					assert(r1 != 0);
				}
			}
		}

		if (nodelete == 0 && ioctl(fd, DKIOC_CANFREE, &candelete))
			candelete = 0;

	} else {
		int flags;

		if ((flags = fcntl(fd, F_GETFL)) >= 0) {
			flags |= O_DSYNC;
			if (fcntl(fd, F_SETFL, flags) != -1) {
				wce = WCE_FCNTL;
			}
		}

		/*
		 * We don't have a way to discover if a file supports the
		 * FREESP fcntl cmd (other than trying it).  However,
		 * zfs, ufs, tmpfs, and udfs all support the FREESP fcntl cmd.
		 * NFSv3 and NFSv4 also forward the FREESP request
		 * to the server, so we always enable it for file based
		 * volumes.  Anyone trying to run volumes on an unsupported
		 * configuration is on their own, and should be prepared
		 * for the requests to fail.
		 */
		if (nodelete == 0)
			candelete = 1;
	}
#endif

#ifndef WITHOUT_CAPSICUM
	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	if (ssopt != 0) {
		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
		    ssopt > pssopt) {
			EPRINTLN("Invalid sector size %d/%d",
			    ssopt, pssopt);
			goto err;
		}

		/*
		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
		 * size be a multiple of the device's sector size.
		 *
		 * Validate that the emulated sector size complies with this
		 * requirement.
		 */
		if (S_ISCHR(sbuf.st_mode)) {
			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
				EPRINTLN("Sector size %d incompatible "
				    "with underlying device sector size %d",
				    ssopt, sectsz);
				goto err;
			}
		}

		sectsz = ssopt;
		psectsz = pssopt;
		psectoff = 0;
	}

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		perror("calloc");
		goto err;
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_isgeom = geom;
	bc->bc_candelete = candelete;
#ifndef __FreeBSD__
	bc->bc_wce = wce;
#endif
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
		pthread_set_name_np(bc->bc_btid[i], tname);
	}

	return (bc);
err:
	if (fd >= 0)
		close(fd);
	return (NULL);
}

static void
blockif_resized(int fd, enum ev_type type __unused, void *arg)
{
	struct blockif_ctxt *bc;
	struct stat sb;
	off_t mediasize;

	if (fstat(fd, &sb) != 0)
		return;

#ifdef __FreeBSD__
	if (S_ISCHR(sb.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
			EPRINTLN("blockif_resized: get mediasize failed: %s",
			    strerror(errno));
			return;
		}
	} else
		mediasize = sb.st_size;
#else
	mediasize = sb.st_size;
	if (S_ISCHR(sb.st_mode)) {
		struct dk_minfo dkm;

		if (ioctl(fd, DKIOCGMEDIAINFO, &dkm) == 0)
			mediasize = dkm.dki_lbsize * dkm.dki_capacity;
	}
#endif

	bc = arg;
	pthread_mutex_lock(&bc->bc_mtx);
	if (mediasize != bc->bc_size) {
		bc->bc_size = mediasize;
		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
	}
	pthread_mutex_unlock(&bc->bc_mtx);
}
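
/*
 * Register a callback to be invoked when the backing store changes size.
 * An EVF_VNODE/EVFF_ATTRIB mevent on the backing fd triggers
 * blockif_resized(), which compares the current media size against
 * bc_size.  Only one callback may be registered per context; a second
 * registration fails with EBUSY.
 */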
int
blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
    void *cb_arg)
{
	struct stat sb;
	int err;

	if (cb == NULL)
		return (EINVAL);

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_resize_cb != NULL) {
		err = EBUSY;
		goto out;
	}

	assert(bc->bc_closing == 0);

	if (fstat(bc->bc_fd, &sb) != 0) {
		err = errno;
		goto out;
	}

	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
	    EVFF_ATTRIB, blockif_resized, bc);
	if (bc->bc_resize_event == NULL) {
		err = ENXIO;
		goto out;
	}

	bc->bc_resize_cb = cb;
	bc->bc_resize_cb_arg = cb_arg;
out:
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
    enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		if (blockif_enqueue(bc, breq, op))
			pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}
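
/*
 * Attempt to cancel "breq".  A request still on the pending queue is
 * simply completed here; one already claimed by a worker is interrupted
 * by pushing a blockif_sig_elem onto the global list and delivering
 * SIGCONT to the worker until the element leaves BST_BUSY.  EBUSY is
 * returned in that case since the completion callback may already have
 * run.
 */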
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		blockif_complete(bc, be);
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
		    (uintptr_t)old_head,
		    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int i;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	pthread_mutex_lock(&bc->bc_mtx);
	bc->bc_closing = 1;
	if (bc->bc_resize_event != NULL)
		mevent_disable(bc->bc_resize_event);
	pthread_mutex_unlock(&bc->bc_mtx);
	pthread_cond_broadcast(&bc->bc_cond);
	for (i = 0; i < BLOCKIF_NUMTHR; i++)
		pthread_join(bc->bc_btid[i], &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535L * 16 * 255)
		sectors = 65535L * 16 * 255;

	if (sectors >= 65536L * 16 * 63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{
	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}

#ifndef __FreeBSD__
int
blockif_set_wce(struct blockif_ctxt *bc, int wc_enable)
{
	int res = 0, flags;
	int clean_val = (wc_enable != 0) ? 1 : 0;

	(void) pthread_mutex_lock(&bc->bc_mtx);
	switch (bc->bc_wce) {
	case WCE_IOCTL:
		res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val);
		break;
	case WCE_FCNTL:
		if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) {
			if (wc_enable == 0) {
				flags |= O_DSYNC;
			} else {
				flags &= ~O_DSYNC;
			}
			if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) {
				res = -1;
			}
		} else {
			res = -1;
		}
		break;
	default:
		break;
	}

	/*
	 * After a successful disable of the write cache, ensure that any
	 * lingering data in the cache is synced out.
	 */
	if (res == 0 && wc_enable == 0) {
		res = fsync(bc->bc_fd);
	}
	(void) pthread_mutex_unlock(&bc->bc_mtx);

	return (res);
}
#endif /* __FreeBSD__ */