/*
 * Export QEMU block device via VDUSE
 *
 * Copyright (C) 2022 Bytedance Inc. and/or its affiliates. All rights reserved.
 *
 * Author:
 *   Xie Yongji <xieyongji@bytedance.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 */
13 #include "qemu/osdep.h"
14 #include <sys/eventfd.h>
16 #include "qapi/error.h"
17 #include "block/export.h"
18 #include "qemu/error-report.h"
19 #include "util/block-helpers.h"
20 #include "subprojects/libvduse/libvduse.h"
21 #include "virtio-blk-handler.h"
23 #include "standard-headers/linux/virtio_blk.h"
25 #define VDUSE_DEFAULT_NUM_QUEUE 1
26 #define VDUSE_DEFAULT_QUEUE_SIZE 256
28 typedef struct VduseBlkExport
{
30 VirtioBlkHandler handler
;
34 unsigned int inflight
; /* atomic */
38 typedef struct VduseBlkReq
{
39 VduseVirtqElement elem
;
43 static void vduse_blk_inflight_inc(VduseBlkExport
*vblk_exp
)
45 if (qatomic_fetch_inc(&vblk_exp
->inflight
) == 0) {
46 /* Prevent export from being deleted */
47 blk_exp_ref(&vblk_exp
->export
);
51 static void vduse_blk_inflight_dec(VduseBlkExport
*vblk_exp
)
53 if (qatomic_fetch_dec(&vblk_exp
->inflight
) == 1) {
54 /* Wake AIO_WAIT_WHILE() */
57 /* Now the export can be deleted */
58 blk_exp_unref(&vblk_exp
->export
);
62 static void vduse_blk_req_complete(VduseBlkReq
*req
, size_t in_len
)
64 vduse_queue_push(req
->vq
, &req
->elem
, in_len
);
65 vduse_queue_notify(req
->vq
);
70 static void coroutine_fn
vduse_blk_virtio_process_req(void *opaque
)
72 VduseBlkReq
*req
= opaque
;
73 VduseVirtq
*vq
= req
->vq
;
74 VduseDev
*dev
= vduse_queue_get_dev(vq
);
75 VduseBlkExport
*vblk_exp
= vduse_dev_get_priv(dev
);
76 VirtioBlkHandler
*handler
= &vblk_exp
->handler
;
77 VduseVirtqElement
*elem
= &req
->elem
;
78 struct iovec
*in_iov
= elem
->in_sg
;
79 struct iovec
*out_iov
= elem
->out_sg
;
80 unsigned in_num
= elem
->in_num
;
81 unsigned out_num
= elem
->out_num
;
84 in_len
= virtio_blk_process_req(handler
, in_iov
,
85 out_iov
, in_num
, out_num
);
91 vduse_blk_req_complete(req
, in_len
);
92 vduse_blk_inflight_dec(vblk_exp
);
95 static void vduse_blk_vq_handler(VduseDev
*dev
, VduseVirtq
*vq
)
97 VduseBlkExport
*vblk_exp
= vduse_dev_get_priv(dev
);
102 req
= vduse_queue_pop(vq
, sizeof(VduseBlkReq
));
109 qemu_coroutine_create(vduse_blk_virtio_process_req
, req
);
111 vduse_blk_inflight_inc(vblk_exp
);
112 qemu_coroutine_enter(co
);
116 static void on_vduse_vq_kick(void *opaque
)
118 VduseVirtq
*vq
= opaque
;
119 VduseDev
*dev
= vduse_queue_get_dev(vq
);
120 int fd
= vduse_queue_get_fd(vq
);
123 if (eventfd_read(fd
, &kick_data
) == -1) {
124 error_report("failed to read data from eventfd");
128 vduse_blk_vq_handler(dev
, vq
);
131 static void vduse_blk_enable_queue(VduseDev
*dev
, VduseVirtq
*vq
)
133 VduseBlkExport
*vblk_exp
= vduse_dev_get_priv(dev
);
135 if (!vblk_exp
->vqs_started
) {
136 return; /* vduse_blk_drained_end() will start vqs later */
139 aio_set_fd_handler(vblk_exp
->export
.ctx
, vduse_queue_get_fd(vq
),
140 on_vduse_vq_kick
, NULL
, NULL
, NULL
, vq
);
141 /* Make sure we don't miss any kick after reconnecting */
142 eventfd_write(vduse_queue_get_fd(vq
), 1);
145 static void vduse_blk_disable_queue(VduseDev
*dev
, VduseVirtq
*vq
)
147 VduseBlkExport
*vblk_exp
= vduse_dev_get_priv(dev
);
148 int fd
= vduse_queue_get_fd(vq
);
154 aio_set_fd_handler(vblk_exp
->export
.ctx
, fd
,
155 NULL
, NULL
, NULL
, NULL
, NULL
);
158 static const VduseOps vduse_blk_ops
= {
159 .enable_queue
= vduse_blk_enable_queue
,
160 .disable_queue
= vduse_blk_disable_queue
,
163 static void on_vduse_dev_kick(void *opaque
)
165 VduseDev
*dev
= opaque
;
167 vduse_dev_handler(dev
);
170 static void vduse_blk_attach_ctx(VduseBlkExport
*vblk_exp
, AioContext
*ctx
)
172 aio_set_fd_handler(vblk_exp
->export
.ctx
, vduse_dev_get_fd(vblk_exp
->dev
),
173 on_vduse_dev_kick
, NULL
, NULL
, NULL
,
176 /* Virtqueues are handled by vduse_blk_drained_end() */
179 static void vduse_blk_detach_ctx(VduseBlkExport
*vblk_exp
)
181 aio_set_fd_handler(vblk_exp
->export
.ctx
, vduse_dev_get_fd(vblk_exp
->dev
),
182 NULL
, NULL
, NULL
, NULL
, NULL
);
184 /* Virtqueues are handled by vduse_blk_drained_begin() */
188 static void blk_aio_attached(AioContext
*ctx
, void *opaque
)
190 VduseBlkExport
*vblk_exp
= opaque
;
192 vblk_exp
->export
.ctx
= ctx
;
193 vduse_blk_attach_ctx(vblk_exp
, ctx
);
196 static void blk_aio_detach(void *opaque
)
198 VduseBlkExport
*vblk_exp
= opaque
;
200 vduse_blk_detach_ctx(vblk_exp
);
201 vblk_exp
->export
.ctx
= NULL
;
204 static void vduse_blk_resize(void *opaque
)
206 BlockExport
*exp
= opaque
;
207 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
208 struct virtio_blk_config config
;
211 cpu_to_le64(blk_getlength(exp
->blk
) >> VIRTIO_BLK_SECTOR_BITS
);
212 vduse_dev_update_config(vblk_exp
->dev
, sizeof(config
.capacity
),
213 offsetof(struct virtio_blk_config
, capacity
),
214 (char *)&config
.capacity
);
217 static void vduse_blk_stop_virtqueues(VduseBlkExport
*vblk_exp
)
219 for (uint16_t i
= 0; i
< vblk_exp
->num_queues
; i
++) {
220 VduseVirtq
*vq
= vduse_dev_get_queue(vblk_exp
->dev
, i
);
221 vduse_blk_disable_queue(vblk_exp
->dev
, vq
);
224 vblk_exp
->vqs_started
= false;
227 static void vduse_blk_start_virtqueues(VduseBlkExport
*vblk_exp
)
229 vblk_exp
->vqs_started
= true;
231 for (uint16_t i
= 0; i
< vblk_exp
->num_queues
; i
++) {
232 VduseVirtq
*vq
= vduse_dev_get_queue(vblk_exp
->dev
, i
);
233 vduse_blk_enable_queue(vblk_exp
->dev
, vq
);
237 static void vduse_blk_drained_begin(void *opaque
)
239 BlockExport
*exp
= opaque
;
240 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
242 vduse_blk_stop_virtqueues(vblk_exp
);
245 static void vduse_blk_drained_end(void *opaque
)
247 BlockExport
*exp
= opaque
;
248 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
250 vduse_blk_start_virtqueues(vblk_exp
);
253 static bool vduse_blk_drained_poll(void *opaque
)
255 BlockExport
*exp
= opaque
;
256 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
258 return qatomic_read(&vblk_exp
->inflight
) > 0;
261 static const BlockDevOps vduse_block_ops
= {
262 .resize_cb
= vduse_blk_resize
,
263 .drained_begin
= vduse_blk_drained_begin
,
264 .drained_end
= vduse_blk_drained_end
,
265 .drained_poll
= vduse_blk_drained_poll
,
268 static int vduse_blk_exp_create(BlockExport
*exp
, BlockExportOptions
*opts
,
271 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
272 BlockExportOptionsVduseBlk
*vblk_opts
= &opts
->u
.vduse_blk
;
273 uint64_t logical_block_size
= VIRTIO_BLK_SECTOR_SIZE
;
274 uint16_t num_queues
= VDUSE_DEFAULT_NUM_QUEUE
;
275 uint16_t queue_size
= VDUSE_DEFAULT_QUEUE_SIZE
;
276 Error
*local_err
= NULL
;
277 struct virtio_blk_config config
= { 0 };
281 if (vblk_opts
->has_num_queues
) {
282 num_queues
= vblk_opts
->num_queues
;
283 if (num_queues
== 0) {
284 error_setg(errp
, "num-queues must be greater than 0");
289 if (vblk_opts
->has_queue_size
) {
290 queue_size
= vblk_opts
->queue_size
;
291 if (queue_size
<= 2 || !is_power_of_2(queue_size
) ||
292 queue_size
> VIRTQUEUE_MAX_SIZE
) {
293 error_setg(errp
, "queue-size is invalid");
298 if (vblk_opts
->has_logical_block_size
) {
299 logical_block_size
= vblk_opts
->logical_block_size
;
300 check_block_size(exp
->id
, "logical-block-size", logical_block_size
,
303 error_propagate(errp
, local_err
);
307 vblk_exp
->num_queues
= num_queues
;
308 vblk_exp
->handler
.blk
= exp
->blk
;
309 vblk_exp
->handler
.serial
= g_strdup(vblk_opts
->serial
?: "");
310 vblk_exp
->handler
.logical_block_size
= logical_block_size
;
311 vblk_exp
->handler
.writable
= opts
->writable
;
312 vblk_exp
->vqs_started
= true;
315 cpu_to_le64(blk_getlength(exp
->blk
) >> VIRTIO_BLK_SECTOR_BITS
);
316 config
.seg_max
= cpu_to_le32(queue_size
- 2);
317 config
.min_io_size
= cpu_to_le16(1);
318 config
.opt_io_size
= cpu_to_le32(1);
319 config
.num_queues
= cpu_to_le16(num_queues
);
320 config
.blk_size
= cpu_to_le32(logical_block_size
);
321 config
.max_discard_sectors
= cpu_to_le32(VIRTIO_BLK_MAX_DISCARD_SECTORS
);
322 config
.max_discard_seg
= cpu_to_le32(1);
323 config
.discard_sector_alignment
=
324 cpu_to_le32(logical_block_size
>> VIRTIO_BLK_SECTOR_BITS
);
325 config
.max_write_zeroes_sectors
=
326 cpu_to_le32(VIRTIO_BLK_MAX_WRITE_ZEROES_SECTORS
);
327 config
.max_write_zeroes_seg
= cpu_to_le32(1);
329 features
= vduse_get_virtio_features() |
330 (1ULL << VIRTIO_BLK_F_SEG_MAX
) |
331 (1ULL << VIRTIO_BLK_F_TOPOLOGY
) |
332 (1ULL << VIRTIO_BLK_F_BLK_SIZE
) |
333 (1ULL << VIRTIO_BLK_F_FLUSH
) |
334 (1ULL << VIRTIO_BLK_F_DISCARD
) |
335 (1ULL << VIRTIO_BLK_F_WRITE_ZEROES
);
337 if (num_queues
> 1) {
338 features
|= 1ULL << VIRTIO_BLK_F_MQ
;
340 if (!opts
->writable
) {
341 features
|= 1ULL << VIRTIO_BLK_F_RO
;
344 vblk_exp
->dev
= vduse_dev_create(vblk_opts
->name
, VIRTIO_ID_BLOCK
, 0,
345 features
, num_queues
,
346 sizeof(struct virtio_blk_config
),
347 (char *)&config
, &vduse_blk_ops
,
349 if (!vblk_exp
->dev
) {
350 error_setg(errp
, "failed to create vduse device");
355 vblk_exp
->recon_file
= g_strdup_printf("%s/vduse-blk-%s",
356 g_get_tmp_dir(), vblk_opts
->name
);
357 if (vduse_set_reconnect_log_file(vblk_exp
->dev
, vblk_exp
->recon_file
)) {
358 error_setg(errp
, "failed to set reconnect log file");
363 for (i
= 0; i
< num_queues
; i
++) {
364 vduse_dev_setup_queue(vblk_exp
->dev
, i
, queue_size
);
367 aio_set_fd_handler(exp
->ctx
, vduse_dev_get_fd(vblk_exp
->dev
),
368 on_vduse_dev_kick
, NULL
, NULL
, NULL
, vblk_exp
->dev
);
370 blk_add_aio_context_notifier(exp
->blk
, blk_aio_attached
, blk_aio_detach
,
372 blk_set_dev_ops(exp
->blk
, &vduse_block_ops
, exp
);
375 * We handle draining ourselves using an in-flight counter and by disabling
376 * virtqueue fd handlers. Do not queue BlockBackend requests, they need to
377 * complete so the in-flight counter reaches zero.
379 blk_set_disable_request_queuing(exp
->blk
, true);
383 vduse_dev_destroy(vblk_exp
->dev
);
384 g_free(vblk_exp
->recon_file
);
386 g_free(vblk_exp
->handler
.serial
);
390 static void vduse_blk_exp_delete(BlockExport
*exp
)
392 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
395 assert(qatomic_read(&vblk_exp
->inflight
) == 0);
397 vduse_blk_detach_ctx(vblk_exp
);
398 blk_remove_aio_context_notifier(exp
->blk
, blk_aio_attached
, blk_aio_detach
,
400 ret
= vduse_dev_destroy(vblk_exp
->dev
);
402 unlink(vblk_exp
->recon_file
);
404 g_free(vblk_exp
->recon_file
);
405 g_free(vblk_exp
->handler
.serial
);
408 /* Called with exp->ctx acquired */
409 static void vduse_blk_exp_request_shutdown(BlockExport
*exp
)
411 VduseBlkExport
*vblk_exp
= container_of(exp
, VduseBlkExport
, export
);
413 vduse_blk_stop_virtqueues(vblk_exp
);
416 const BlockExportDriver blk_exp_vduse_blk
= {
417 .type
= BLOCK_EXPORT_TYPE_VDUSE_BLK
,
418 .instance_size
= sizeof(VduseBlkExport
),
419 .create
= vduse_blk_exp_create
,
420 .delete = vduse_blk_exp_delete
,
421 .request_shutdown
= vduse_blk_exp_request_shutdown
,