/*
 * Sharing QEMU block devices via vhost-user protocol
 *
 * Parts of the code based on nbd/server.c.
 *
 * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later.  See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/block.h"
#include "subprojects/libvhost-user/libvhost-user.h" /* only for the type definitions */
#include "standard-headers/linux/virtio_blk.h"
#include "qemu/vhost-user-server.h"
#include "vhost-user-blk-server.h"
#include "qapi/error.h"
#include "qom/object_interfaces.h"
#include "sysemu/block-backend.h"
#include "util/block-helpers.h"
/*
 * Sector units are 512 bytes regardless of the
 * virtio_blk_config->blk_size value.
 */
#define VIRTIO_BLK_SECTOR_BITS 9
#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS)
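/*
 * Illustrative note (not in the original source): with blk_size = 4096,
 * a request for sector 8 maps to byte offset
 * 8 << VIRTIO_BLK_SECTOR_BITS == 4096, i.e. the start of the second
 * logical block.
 */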
enum {
    VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
    VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768,
    VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768,
};
struct virtio_blk_inhdr {
    unsigned char status;
};
typedef struct VuBlkReq {
    VuVirtqElement elem;
    int64_t sector_num;
    size_t size;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr out;
    VuServer *server;
    struct VuVirtq *vq;
} VuBlkReq;
/* vhost user block device */
typedef struct {
    BlockExport export;
    VuServer vu_server;
    uint32_t blk_size;
    QIOChannelSocket *sioc;
    struct virtio_blk_config blkcfg;
    bool writable;
} VuBlkExport;
static void vu_blk_req_complete(VuBlkReq *req)
{
    VuDev *vu_dev = &req->server->vu_dev;

    /* IO size with 1 extra status byte */
    vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
    vu_queue_notify(vu_dev, req->vq);

    free(req);
}
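/*
 * Range-check a request against the device: the size must be a whole
 * number of 512-byte sectors no larger than BDRV_REQUEST_MAX_SECTORS,
 * the start must be aligned to the export's logical block size, and the
 * range must not run past the end of the image.
 */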
static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector,
                                 size_t size)
{
    uint64_t nb_sectors;
    uint64_t total_sectors;

    if (size % VIRTIO_BLK_SECTOR_SIZE) {
        return false;
    }

    nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;

    QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return false;
    }
    if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) {
        return false;
    }
    blk_get_geometry(vexp->export.blk, &total_sectors);
    if (sector > total_sectors || nb_sectors > total_sectors - sector) {
        return false;
    }
    return true;
}
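/*
 * Handle a VIRTIO_BLK_T_DISCARD or VIRTIO_BLK_T_WRITE_ZEROES request.
 * The payload is a single struct virtio_blk_discard_write_zeroes
 * descriptor: a little-endian 64-bit sector, 32-bit num_sectors and
 * 32-bit flags.  Returns a VIRTIO_BLK_S_* status byte.
 */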
static int coroutine_fn
vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov,
                            uint32_t iovcnt, uint32_t type)
{
    BlockBackend *blk = vexp->export.blk;
    struct virtio_blk_discard_write_zeroes desc;
    ssize_t size;
    uint64_t sector;
    uint32_t num_sectors;
    uint32_t max_sectors;
    uint32_t flags;
    int bytes;

    /* Only one desc is currently supported */
    if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
        return VIRTIO_BLK_S_UNSUPP;
    }

    size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
    if (unlikely(size != sizeof(desc))) {
        error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
        return VIRTIO_BLK_S_IOERR;
    }

    sector = le64_to_cpu(desc.sector);
    num_sectors = le32_to_cpu(desc.num_sectors);
    flags = le32_to_cpu(desc.flags);
    max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
                  VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS :
                  VHOST_USER_BLK_MAX_DISCARD_SECTORS;

    /* This check ensures that 'bytes' fits in an int */
    if (unlikely(num_sectors > max_sectors)) {
        return VIRTIO_BLK_S_IOERR;
    }

    bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;

    if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) {
        return VIRTIO_BLK_S_IOERR;
    }

    /*
     * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
     * and write zeroes commands if any unknown flag is set.
     */
    if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
        return VIRTIO_BLK_S_UNSUPP;
    }

    if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
        int blk_flags = 0;

        if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
            blk_flags |= BDRV_REQ_MAY_UNMAP;
        }

        if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
                                 bytes, blk_flags) == 0) {
            return VIRTIO_BLK_S_OK;
        }
    } else if (type == VIRTIO_BLK_T_DISCARD) {
        /*
         * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
         * discard commands if the unmap flag is set.
         */
        if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
            return VIRTIO_BLK_S_UNSUPP;
        }

        if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
                            bytes) == 0) {
            return VIRTIO_BLK_S_OK;
        }
    }

    return VIRTIO_BLK_S_IOERR;
}
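/*
 * Coroutine entry point for one virtio-blk request.  The guest lays a
 * request out as [outhdr][data buffers...][inhdr]; the outhdr selects
 * the command and the final status byte is written into the inhdr.
 */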
static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
{
    VuBlkReq *req = opaque;
    VuServer *server = req->server;
    VuVirtqElement *elem = &req->elem;
    uint32_t type;

    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
    BlockBackend *blk = vexp->export.blk;

    struct iovec *in_iov = elem->in_sg;
    struct iovec *out_iov = elem->out_sg;
    unsigned in_num = elem->in_num;
    unsigned out_num = elem->out_num;

    /* refer to hw/block/virtio_blk.c */
    if (elem->out_num < 1 || elem->in_num < 1) {
        error_report("virtio-blk request missing headers");
        goto err;
    }

    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
                            sizeof(req->out)) != sizeof(req->out))) {
        error_report("virtio-blk request outhdr too short");
        goto err;
    }

    iov_discard_front(&out_iov, &out_num, sizeof(req->out));

    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio-blk request inhdr too short");
        goto err;
    }

    /* We always touch the last byte, so just see how big in_iov is. */
    req->in = (void *)in_iov[in_num - 1].iov_base
              + in_iov[in_num - 1].iov_len
              - sizeof(struct virtio_blk_inhdr);
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    type = le32_to_cpu(req->out.type);
    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_IN:
    case VIRTIO_BLK_T_OUT: {
        QEMUIOVector qiov;
        int64_t offset;
        ssize_t ret = 0;
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = le64_to_cpu(req->out.sector);

        if (is_write && !vexp->writable) {
            req->in->status = VIRTIO_BLK_S_IOERR;
            break;
        }

        if (is_write) {
            qemu_iovec_init_external(&qiov, out_iov, out_num);
        } else {
            qemu_iovec_init_external(&qiov, in_iov, in_num);
        }

        if (unlikely(!vu_blk_sect_range_ok(vexp,
                                           req->sector_num,
                                           qiov.size))) {
            req->in->status = VIRTIO_BLK_S_IOERR;
            break;
        }

        offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS;

        if (is_write) {
            ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
        } else {
            ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
        }
        if (ret >= 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        if (blk_co_flush(blk) == 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        break;
    case VIRTIO_BLK_T_GET_ID: {
        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
                          VIRTIO_BLK_ID_BYTES);
        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
        req->in->status = VIRTIO_BLK_S_OK;
        req->size = elem->in_sg[0].iov_len;
        break;
    }
    case VIRTIO_BLK_T_DISCARD:
    case VIRTIO_BLK_T_WRITE_ZEROES: {
        if (!vexp->writable) {
            req->in->status = VIRTIO_BLK_S_IOERR;
            break;
        }

        req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num,
                                                      type);
        break;
    }
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        break;
    }

    vu_blk_req_complete(req);
    return;

err:
    free(req);
}
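/*
 * Virtqueue kick handler: drain the queue and spawn one coroutine per
 * request, so the coroutine-based block-layer calls above do not stall
 * processing of later requests.
 */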
static void vu_blk_process_vq(VuDev *vu_dev, int idx)
{
    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
    VuVirtq *vq = vu_get_queue(vu_dev, idx);

    while (1) {
        VuBlkReq *req;

        req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq));
        if (!req) {
            break;
        }

        req->server = server;
        req->vq = vq;

        Coroutine *co =
            qemu_coroutine_create(vu_blk_virtio_process_req, req);
        qemu_coroutine_enter(co);
    }
}
static void vu_blk_queue_set_started(VuDev *vu_dev, int idx, bool started)
{
    VuVirtq *vq;

    assert(vu_dev);

    vq = vu_get_queue(vu_dev, idx);
    vu_set_queue_handler(vu_dev, vq, started ? vu_blk_process_vq : NULL);
}
static uint64_t vu_blk_get_features(VuDev *dev)
{
    uint64_t features;
    VuServer *server = container_of(dev, VuServer, vu_dev);
    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
               1ull << VIRTIO_BLK_F_SEG_MAX |
               1ull << VIRTIO_BLK_F_TOPOLOGY |
               1ull << VIRTIO_BLK_F_BLK_SIZE |
               1ull << VIRTIO_BLK_F_FLUSH |
               1ull << VIRTIO_BLK_F_DISCARD |
               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
               1ull << VIRTIO_BLK_F_CONFIG_WCE |
               1ull << VIRTIO_BLK_F_MQ |
               1ull << VIRTIO_F_VERSION_1 |
               1ull << VIRTIO_RING_F_INDIRECT_DESC |
               1ull << VIRTIO_RING_F_EVENT_IDX |
               1ull << VHOST_USER_F_PROTOCOL_FEATURES;

    if (!vexp->writable) {
        features |= 1ull << VIRTIO_BLK_F_RO;
    }

    return features;
}
static uint64_t vu_blk_get_protocol_features(VuDev *dev)
{
    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG;
}
static int
vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
{
    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);

    if (len > sizeof(struct virtio_blk_config)) {
        return -1;
    }

    memcpy(config, &vexp->blkcfg, len);
    return 0;
}
static int
vu_blk_set_config(VuDev *vu_dev, const uint8_t *data,
                  uint32_t offset, uint32_t size, uint32_t flags)
{
    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
    uint8_t wce;

    /* don't support live migration */
    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
        return -EINVAL;
    }

    /* only the writeback-cache enable byte may be changed */
    if (offset != offsetof(struct virtio_blk_config, wce) ||
        size != 1) {
        return -EINVAL;
    }

    wce = *data;
    vexp->blkcfg.wce = wce;
    blk_set_enable_write_cache(vexp->export.blk, wce);
    return 0;
}
/*
 * When the client disconnects, it sends a VHOST_USER_NONE request
 * and vu_process_message would simply call exit, which causes the VM
 * to exit abruptly.
 * To avoid this issue, process the VHOST_USER_NONE request ahead
 * of vu_process_message.
 */
static int vu_blk_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
{
    if (vmsg->request == VHOST_USER_NONE) {
        dev->panic(dev, "disconnect");
        return true;
    }
    return false;
}
static const VuDevIface vu_blk_iface = {
    .get_features          = vu_blk_get_features,
    .queue_set_started     = vu_blk_queue_set_started,
    .get_protocol_features = vu_blk_get_protocol_features,
    .get_config            = vu_blk_get_config,
    .set_config            = vu_blk_set_config,
    .process_msg           = vu_blk_process_msg,
};
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    VuBlkExport *vexp = opaque;

    vexp->export.ctx = ctx;
    vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
}
static void blk_aio_detach(void *opaque)
{
    VuBlkExport *vexp = opaque;

    vhost_user_server_detach_aio_context(&vexp->vu_server);
    vexp->export.ctx = NULL;
}
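/*
 * Fill in the virtio-blk config space that vu_blk_get_config() serves to
 * clients.  All fields are little-endian; capacity is in 512-byte sectors
 * regardless of blk_size.
 */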
static void
vu_blk_initialize_config(BlockDriverState *bs,
                         struct virtio_blk_config *config,
                         uint32_t blk_size,
                         uint16_t num_queues)
{
    config->capacity =
        cpu_to_le64(bdrv_getlength(bs) >> VIRTIO_BLK_SECTOR_BITS);
    config->blk_size = cpu_to_le32(blk_size);
    config->size_max = cpu_to_le32(0);
    config->seg_max = cpu_to_le32(128 - 2);
    config->min_io_size = cpu_to_le16(1);
    config->opt_io_size = cpu_to_le32(1);
    config->num_queues = cpu_to_le16(num_queues);
    config->max_discard_sectors =
        cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS);
    config->max_discard_seg = cpu_to_le32(1);
    config->discard_sector_alignment =
        cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS);
    config->max_write_zeroes_sectors
        = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS);
    config->max_write_zeroes_seg = cpu_to_le32(1);
}
static void vu_blk_exp_request_shutdown(BlockExport *exp)
{
    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);

    vhost_user_server_stop(&vexp->vu_server);
}
static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
                             Error **errp)
{
    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
    BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk;
    Error *local_err = NULL;
    uint64_t logical_block_size;
    uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;

    vexp->writable = opts->writable;
    vexp->blkcfg.wce = 0;

    if (vu_opts->has_logical_block_size) {
        logical_block_size = vu_opts->logical_block_size;
    } else {
        logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
    }
    check_block_size(exp->id, "logical-block-size", logical_block_size,
                     &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    vexp->blk_size = logical_block_size;
    blk_set_guest_block_size(exp->blk, logical_block_size);

    if (vu_opts->has_num_queues) {
        num_queues = vu_opts->num_queues;
    }
    if (num_queues == 0) {
        error_setg(errp, "num-queues must be greater than 0");
        return -EINVAL;
    }

    vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
                             logical_block_size, num_queues);

    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                 vexp);

    if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx,
                                 num_queues, &vu_blk_iface, errp)) {
        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                        blk_aio_detach, vexp);
        return -EADDRNOTAVAIL;
    }

    return 0;
}
static void vu_blk_exp_delete(BlockExport *exp)
{
    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);

    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                    vexp);
}
const BlockExportDriver blk_exp_vhost_user_blk = {
    .type               = BLOCK_EXPORT_TYPE_VHOST_USER_BLK,
    .instance_size      = sizeof(VuBlkExport),
    .create             = vu_blk_exp_create,
    .delete             = vu_blk_exp_delete,
    .request_shutdown   = vu_blk_exp_request_shutdown,
};
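/*
 * Illustrative usage sketch (not part of this file): once a block node
 * exists, a QMP client can create this export with something like the
 * following; "vub0", "disk0" and the socket path are example values.
 *
 *   { "execute": "block-export-add",
 *     "arguments": { "type": "vhost-user-blk", "id": "vub0",
 *                    "node-name": "disk0", "writable": true,
 *                    "addr": { "type": "unix",
 *                              "path": "/tmp/vhost-user-blk.sock" } } }
 */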