/*
 * Sharing QEMU block devices via the vhost-user protocol
 *
 * Parts of the code based on nbd/server.c.
 *
 * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
 * Copyright (c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "block/block.h"
#include "subprojects/libvhost-user/libvhost-user.h" /* only for the type definitions */
#include "standard-headers/linux/virtio_blk.h"
#include "qemu/vhost-user-server.h"
#include "vhost-user-blk-server.h"
#include "qapi/error.h"
#include "qom/object_interfaces.h"
#include "sysemu/block-backend.h"
#include "util/block-helpers.h"
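
/*
 * This file backs the "vhost-user-blk" block export type. As an
 * illustration only (exact flags depend on the QEMU version), such an
 * export is typically created via qemu-storage-daemon:
 *
 *   qemu-storage-daemon \
 *       --blockdev driver=file,node-name=disk0,filename=disk.img \
 *       --export type=vhost-user-blk,id=exp0,node-name=disk0,\
 *         addr.type=unix,addr.path=/tmp/vhost-user-blk.sock
 */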
/*
 * Sector units are 512 bytes regardless of the
 * virtio_blk_config->blk_size value.
 */
#define VIRTIO_BLK_SECTOR_BITS 9
#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS)
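
/* For example, a 4 KiB request covers 4096 >> 9 = 8 such sectors. */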
enum {
    VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
    VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768,
    VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768,
};
struct virtio_blk_inhdr {
    unsigned char status;
};

typedef struct VuBlkReq {
    VuVirtqElement elem;
    int64_t sector_num;
    size_t size;
    struct virtio_blk_inhdr *in;
    struct virtio_blk_outhdr out;
    VuServer *server;
    struct VuVirtq *vq;
} VuBlkReq;
/* vhost user block device */
typedef struct {
    BlockExport export;
    VuServer vu_server;
    uint32_t blk_size;
    QIOChannelSocket *sioc;
    struct virtio_blk_config blkcfg;
    bool writable;
} VuBlkExport;
static void vu_blk_req_complete(VuBlkReq *req)
{
    VuDev *vu_dev = &req->server->vu_dev;

    /* IO size with 1 extra status byte */
    vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
    vu_queue_notify(vu_dev, req->vq);

    free(req);
}
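
/*
 * Note: req->size is expected to hold the payload bytes written into the
 * request's device-writable ("in") buffers; the extra byte pushed above
 * is the trailing virtio_blk_inhdr status byte.
 */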
static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector,
                                 size_t size)
{
    uint64_t nb_sectors;
    uint64_t total_sectors;

    if (size % VIRTIO_BLK_SECTOR_SIZE) {
        return false;
    }

    nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;

    QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
    if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
        return false;
    }
    if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) {
        return false;
    }
    blk_get_geometry(vexp->export.blk, &total_sectors);
    if (sector > total_sectors || nb_sectors > total_sectors - sector) {
        return false;
    }
    return true;
}
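
/*
 * Worked example (illustrative): with blk_size = 4096, a 4096-byte
 * request at sector 8 passes all checks (8 << 9 = 4096 is 4096-aligned),
 * while the same request at sector 1 fails the alignment check
 * (1 << 9 = 512 is not a multiple of 4096).
 */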
static int coroutine_fn
vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov,
                            uint32_t iovcnt, uint32_t type)
{
    BlockBackend *blk = vexp->export.blk;
    struct virtio_blk_discard_write_zeroes desc;
    ssize_t size;
    uint64_t sector;
    uint32_t num_sectors;
    uint32_t max_sectors;
    uint32_t flags;
    int bytes;

    /* Only one desc is currently supported */
    if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
        return VIRTIO_BLK_S_UNSUPP;
    }

    size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
    if (unlikely(size != sizeof(desc))) {
        error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
        return VIRTIO_BLK_S_IOERR;
    }

    sector = le64_to_cpu(desc.sector);
    num_sectors = le32_to_cpu(desc.num_sectors);
    flags = le32_to_cpu(desc.flags);
    max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
                  VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS :
                  VHOST_USER_BLK_MAX_DISCARD_SECTORS;

    /* This check ensures that 'bytes' fits in an int */
    if (unlikely(num_sectors > max_sectors)) {
        return VIRTIO_BLK_S_IOERR;
    }

    bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;

    if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) {
        return VIRTIO_BLK_S_IOERR;
    }

    /*
     * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
     * and write zeroes commands if any unknown flag is set.
     */
    if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
        return VIRTIO_BLK_S_UNSUPP;
    }

    if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
        int blk_flags = 0;

        if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
            blk_flags |= BDRV_REQ_MAY_UNMAP;
        }

        if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
                                 bytes, blk_flags) == 0) {
            return VIRTIO_BLK_S_OK;
        }
    } else if (type == VIRTIO_BLK_T_DISCARD) {
        /*
         * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
         * discard commands if the unmap flag is set.
         */
        if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
            return VIRTIO_BLK_S_UNSUPP;
        }

        if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
                            bytes) == 0) {
            return VIRTIO_BLK_S_OK;
        }
    }

    return VIRTIO_BLK_S_IOERR;
}
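
/*
 * Size cap, for reference: with a 32768-sector limit and 512-byte
 * sectors, a single discard or write-zeroes request covers at most
 * 32768 << 9 = 16 MiB, which comfortably fits the int 'bytes' above.
 */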
/* Called with server refcount increased, must decrease before returning */
static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
{
    VuBlkReq *req = opaque;
    VuServer *server = req->server;
    VuVirtqElement *elem = &req->elem;
    uint32_t type;

    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
    BlockBackend *blk = vexp->export.blk;

    struct iovec *in_iov = elem->in_sg;
    struct iovec *out_iov = elem->out_sg;
    unsigned in_num = elem->in_num;
    unsigned out_num = elem->out_num;

    /* refer to hw/block/virtio_blk.c */
    if (elem->out_num < 1 || elem->in_num < 1) {
        error_report("virtio-blk request missing headers");
        goto err;
    }

    if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
                            sizeof(req->out)) != sizeof(req->out))) {
        error_report("virtio-blk request outhdr too short");
        goto err;
    }

    iov_discard_front(&out_iov, &out_num, sizeof(req->out));

    if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
        error_report("virtio-blk request inhdr too short");
        goto err;
    }

    /* We always touch the last byte, so just see how big in_iov is. */
    req->in = (void *)in_iov[in_num - 1].iov_base
              + in_iov[in_num - 1].iov_len
              - sizeof(struct virtio_blk_inhdr);
    iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));

    type = le32_to_cpu(req->out.type);
    switch (type & ~VIRTIO_BLK_T_BARRIER) {
    case VIRTIO_BLK_T_IN:
    case VIRTIO_BLK_T_OUT: {
        QEMUIOVector qiov;
        int64_t offset;
        ssize_t ret = 0;
        bool is_write = type & VIRTIO_BLK_T_OUT;
        req->sector_num = le64_to_cpu(req->out.sector);

        if (is_write && !vexp->writable) {
            req->in->status = VIRTIO_BLK_S_IOERR;
            break;
        }

        if (is_write) {
            qemu_iovec_init_external(&qiov, out_iov, out_num);
        } else {
            qemu_iovec_init_external(&qiov, in_iov, in_num);
        }

        if (unlikely(!vu_blk_sect_range_ok(vexp, req->sector_num,
                                           qiov.size))) {
            req->in->status = VIRTIO_BLK_S_IOERR;
            break;
        }

        offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS;

        if (is_write) {
            ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
        } else {
            ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
        }
        if (ret >= 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        break;
    }
    case VIRTIO_BLK_T_FLUSH:
        if (blk_co_flush(blk) == 0) {
            req->in->status = VIRTIO_BLK_S_OK;
        } else {
            req->in->status = VIRTIO_BLK_S_IOERR;
        }
        break;
    case VIRTIO_BLK_T_GET_ID: {
        size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
                          VIRTIO_BLK_ID_BYTES);
        snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
        req->in->status = VIRTIO_BLK_S_OK;
        req->size = elem->in_sg[0].iov_len;
        break;
    }
    case VIRTIO_BLK_T_DISCARD:
    case VIRTIO_BLK_T_WRITE_ZEROES: {
        if (!vexp->writable) {
            req->in->status = VIRTIO_BLK_S_IOERR;
            break;
        }

        req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num,
                                                      type);
        break;
    }
    default:
        req->in->status = VIRTIO_BLK_S_UNSUPP;
        break;
    }

    vu_blk_req_complete(req);
    vhost_user_server_unref(server);
    return;

err:
    free(req);
    vhost_user_server_unref(server);
}
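
/*
 * Each popped queue element is handled in its own coroutine so the
 * blk_co_*() calls above can yield while I/O is in flight; the server
 * reference taken in vu_blk_process_vq() keeps the export alive until
 * the request completes.
 */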
static void vu_blk_process_vq(VuDev *vu_dev, int idx)
{
    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
    VuVirtq *vq = vu_get_queue(vu_dev, idx);

    while (1) {
        VuBlkReq *req;

        req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq));
        if (!req) {
            break;
        }

        req->server = server;
        req->vq = vq;

        Coroutine *co =
            qemu_coroutine_create(vu_blk_virtio_process_req, req);

        vhost_user_server_ref(server);
        qemu_coroutine_enter(co);
    }
}
static void vu_blk_queue_set_started(VuDev *vu_dev, int idx, bool started)
{
    VuVirtq *vq;

    vq = vu_get_queue(vu_dev, idx);
    vu_set_queue_handler(vu_dev, vq, started ? vu_blk_process_vq : NULL);
}
static uint64_t vu_blk_get_features(VuDev *dev)
{
    uint64_t features;
    VuServer *server = container_of(dev, VuServer, vu_dev);
    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
    features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
               1ull << VIRTIO_BLK_F_SEG_MAX |
               1ull << VIRTIO_BLK_F_TOPOLOGY |
               1ull << VIRTIO_BLK_F_BLK_SIZE |
               1ull << VIRTIO_BLK_F_FLUSH |
               1ull << VIRTIO_BLK_F_DISCARD |
               1ull << VIRTIO_BLK_F_WRITE_ZEROES |
               1ull << VIRTIO_BLK_F_CONFIG_WCE |
               1ull << VIRTIO_BLK_F_MQ |
               1ull << VIRTIO_F_VERSION_1 |
               1ull << VIRTIO_RING_F_INDIRECT_DESC |
               1ull << VIRTIO_RING_F_EVENT_IDX |
               1ull << VHOST_USER_F_PROTOCOL_FEATURES;

    if (!vexp->writable) {
        features |= 1ull << VIRTIO_BLK_F_RO;
    }

    return features;
}
static uint64_t vu_blk_get_protocol_features(VuDev *dev)
{
    return 1ull << VHOST_USER_PROTOCOL_F_CONFIG;
}
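
/*
 * VHOST_USER_PROTOCOL_F_CONFIG advertises support for the vhost-user
 * GET_CONFIG/SET_CONFIG messages, which vu_blk_get_config() and
 * vu_blk_set_config() below implement for the virtio-blk config space.
 */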
static int
vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
{
    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);

    if (len > sizeof(struct virtio_blk_config)) {
        return -1;
    }

    memcpy(config, &vexp->blkcfg, len);
    return 0;
}
static int
vu_blk_set_config(VuDev *vu_dev, const uint8_t *data,
                  uint32_t offset, uint32_t size, uint32_t flags)
{
    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
    VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
    uint8_t wce;

    /* don't support live migration */
    if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
        return -EINVAL;
    }

    /* Only the write-cache-enable byte of the config space is writable */
    if (offset != offsetof(struct virtio_blk_config, wce) ||
        size != 1) {
        return -EINVAL;
    }

    wce = *data;
    vexp->blkcfg.wce = wce;
    blk_set_enable_write_cache(vexp->export.blk, wce);
    return 0;
}
/*
 * When the client disconnects, it sends a VHOST_USER_NONE request,
 * and vu_process_message() would simply call exit(), causing the VM
 * to exit abruptly. To avoid this, handle the VHOST_USER_NONE request
 * ahead of vu_process_message().
 */
static int vu_blk_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
{
    if (vmsg->request == VHOST_USER_NONE) {
        dev->panic(dev, "disconnect");
        return true;
    }
    return false;
}
static const VuDevIface vu_blk_iface = {
    .get_features          = vu_blk_get_features,
    .queue_set_started     = vu_blk_queue_set_started,
    .get_protocol_features = vu_blk_get_protocol_features,
    .get_config            = vu_blk_get_config,
    .set_config            = vu_blk_set_config,
    .process_msg           = vu_blk_process_msg,
};
static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    VuBlkExport *vexp = opaque;

    vexp->export.ctx = ctx;
    vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
}
static void blk_aio_detach(void *opaque)
{
    VuBlkExport *vexp = opaque;

    vhost_user_server_detach_aio_context(&vexp->vu_server);
    vexp->export.ctx = NULL;
}
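
/*
 * These notifiers follow the BlockBackend when its AioContext changes
 * (e.g. when the block device is moved to or from an IOThread), so
 * virtqueue processing always runs in the same event loop as the
 * underlying block device.
 */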
static void
vu_blk_initialize_config(BlockDriverState *bs,
                         struct virtio_blk_config *config,
                         uint32_t blk_size,
                         uint16_t num_queues)
{
    config->capacity =
        cpu_to_le64(bdrv_getlength(bs) >> VIRTIO_BLK_SECTOR_BITS);
    config->blk_size = cpu_to_le32(blk_size);
    config->size_max = cpu_to_le32(0);
    config->seg_max = cpu_to_le32(128 - 2);
    config->min_io_size = cpu_to_le16(1);
    config->opt_io_size = cpu_to_le32(1);
    config->num_queues = cpu_to_le16(num_queues);
    config->max_discard_sectors =
        cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS);
    config->max_discard_seg = cpu_to_le32(1);
    config->discard_sector_alignment =
        cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS);
    config->max_write_zeroes_sectors =
        cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS);
    config->max_write_zeroes_seg = cpu_to_le32(1);
}
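
/*
 * Example values (illustrative): a 10 GiB image with the default
 * 512-byte logical block size yields capacity = 10737418240 >> 9 =
 * 20971520 sectors and discard_sector_alignment = 512 >> 9 = 1.
 */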
static void vu_blk_exp_request_shutdown(BlockExport *exp)
{
    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);

    vhost_user_server_stop(&vexp->vu_server);
}
static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
                             Error **errp)
{
    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
    BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk;
    Error *local_err = NULL;
    uint64_t logical_block_size;
    uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;

    vexp->writable = opts->writable;
    vexp->blkcfg.wce = 0;

    if (vu_opts->has_logical_block_size) {
        logical_block_size = vu_opts->logical_block_size;
    } else {
        logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
    }
    check_block_size(exp->id, "logical-block-size", logical_block_size,
                     &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return -EINVAL;
    }
    vexp->blk_size = logical_block_size;
    blk_set_guest_block_size(exp->blk, logical_block_size);

    if (vu_opts->has_num_queues) {
        num_queues = vu_opts->num_queues;
    }
    if (num_queues == 0) {
        error_setg(errp, "num-queues must be greater than 0");
        return -EINVAL;
    }

    vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
                             logical_block_size, num_queues);

    blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                 vexp);

    if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx,
                                 num_queues, &vu_blk_iface, errp)) {
        blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
                                        blk_aio_detach, vexp);
        return -EADDRNOTAVAIL;
    }

    return 0;
}
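
/*
 * For illustration only (exact field names depend on the QEMU version),
 * an export of this type can also be created over QMP roughly as:
 *
 *   { "execute": "block-export-add",
 *     "arguments": { "type": "vhost-user-blk", "id": "exp0",
 *                    "node-name": "disk0",
 *                    "addr": { "type": "unix",
 *                              "path": "/tmp/vhost-user-blk.sock" } } }
 */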
static void vu_blk_exp_delete(BlockExport *exp)
{
    VuBlkExport *vexp = container_of(exp, VuBlkExport, export);

    blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
                                    vexp);
}
const BlockExportDriver blk_exp_vhost_user_blk = {
    .type               = BLOCK_EXPORT_TYPE_VHOST_USER_BLK,
    .instance_size      = sizeof(VuBlkExport),
    .create             = vu_blk_exp_create,
    .delete             = vu_blk_exp_delete,
    .request_shutdown   = vu_blk_exp_request_shutdown,
};