/*
 * Copyright(c) 2017-2018 Intel Corporation.
 * Copyright(c) 2020 Red Hat, Inc.
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "clients.h"
#include "hw/virtio/virtio-net.h"
#include "net/vhost_net.h"
#include "net/vhost-vdpa.h"
#include "hw/virtio/vhost-vdpa.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/memalign.h"
#include "qemu/option.h"
#include "qapi/error.h"
#include <linux/vhost.h>
#include <sys/ioctl.h>
#include "standard-headers/linux/virtio_net.h"
#include "monitor/monitor.h"
#include "hw/virtio/vhost.h"
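
/*
 * vhost-vdpa exposes a vDPA device (a device whose datapath complies with the
 * virtio specification) through a /dev/vhost-vdpa-* character device. This
 * file implements the QEMU netdev backend that drives such a device; the
 * shadow virtqueue (SVQ) code below additionally intercepts the control
 * virtqueue so that the device model can be kept in sync.
 */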
/* TODO: need to add the multiqueue support here */
typedef struct VhostVDPAState {
    NetClientState nc;
    struct vhost_vdpa vhost_vdpa;
    VHostNetState *vhost_net;

    /* Control commands shadow buffers */
    void *cvq_cmd_out_buffer, *cvq_cmd_in_buffer;
} VhostVDPAState;
const int vdpa_feature_bits[] = {
    VIRTIO_F_NOTIFY_ON_EMPTY,
    VIRTIO_RING_F_INDIRECT_DESC,
    VIRTIO_RING_F_EVENT_IDX,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_CTRL_RX_EXTRA,
    VIRTIO_NET_F_CTRL_VLAN,
    VIRTIO_NET_F_GUEST_ANNOUNCE,
    VIRTIO_NET_F_CTRL_MAC_ADDR,
    VIRTIO_F_IOMMU_PLATFORM,
    VIRTIO_NET_F_HASH_REPORT,
    VHOST_INVALID_FEATURE_BIT
};
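
/*
 * SVQ must understand every feature it forwards between the guest and the
 * device rings, so when x-svq is enabled only the allowlist below (plus the
 * transport feature range, validated in net_init_vhost_vdpa()) is accepted.
 */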
/** Supported device specific feature bits with SVQ */
static const uint64_t vdpa_svq_device_features =
    BIT_ULL(VIRTIO_NET_F_CSUM) |
    BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |
    BIT_ULL(VIRTIO_NET_F_MTU) |
    BIT_ULL(VIRTIO_NET_F_MAC) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_GUEST_ECN) |
    BIT_ULL(VIRTIO_NET_F_GUEST_UFO) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO4) |
    BIT_ULL(VIRTIO_NET_F_HOST_TSO6) |
    BIT_ULL(VIRTIO_NET_F_HOST_ECN) |
    BIT_ULL(VIRTIO_NET_F_HOST_UFO) |
    BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) |
    BIT_ULL(VIRTIO_NET_F_STATUS) |
    BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |
    BIT_ULL(VIRTIO_F_ANY_LAYOUT) |
    BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) |
    BIT_ULL(VIRTIO_NET_F_RSC_EXT) |
    BIT_ULL(VIRTIO_NET_F_STANDBY);
VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return s->vhost_net;
}
static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
{
    uint32_t device_id;
    int ret;
    struct vhost_dev *hdev;

    hdev = (struct vhost_dev *)&net->dev;
    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
    if (device_id != VIRTIO_ID_NET) {
        return -ENOTSUP;
    }
    return ret;
}
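
/*
 * Note: vhost_vdpa_add() below calls vhost_vdpa_net_check_device_id() right
 * after vhost_net_init(), so a vhost-vdpa character device that is not a
 * network device (vDPA also covers e.g. block devices) is rejected early.
 */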
static int vhost_vdpa_add(NetClientState *ncs, void *be,
                          int queue_pair_index, int nvqs)
{
    VhostNetOptions options;
    struct vhost_net *net = NULL;
    VhostVDPAState *s;
    int ret;

    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, ncs);
    options.net_backend = ncs;
    options.opaque = be;
    options.busyloop_timeout = 0;
    options.nvqs = nvqs;

    net = vhost_net_init(&options);
    if (!net) {
        error_report("failed to init vhost_net for queue");
        return -1;
    }
    s->vhost_net = net;

    ret = vhost_vdpa_net_check_device_id(net);
    if (ret) {
        vhost_net_cleanup(net);
        g_free(net);
        s->vhost_net = NULL;
        return ret;
    }
    return 0;
}
static void vhost_vdpa_cleanup(NetClientState *nc)
{
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    struct vhost_dev *dev = &s->vhost_net->dev;

    qemu_vfree(s->cvq_cmd_out_buffer);
    qemu_vfree(s->cvq_cmd_in_buffer);
    if (dev->vq_index + dev->nvqs == dev->vq_index_end) {
        g_clear_pointer(&s->vhost_vdpa.iova_tree, vhost_iova_tree_delete);
    }
    if (s->vhost_net) {
        vhost_net_cleanup(s->vhost_net);
        g_free(s->vhost_net);
        s->vhost_net = NULL;
    }
    if (s->vhost_vdpa.device_fd >= 0) {
        qemu_close(s->vhost_vdpa.device_fd);
        s->vhost_vdpa.device_fd = -1;
    }
}
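
/*
 * The vDPA datapath is virtio-native, so the virtio-net header is always
 * available; report it unconditionally.
 */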
static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    return true;
}
static bool vhost_vdpa_has_ufo(NetClientState *nc)
{
    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    VhostVDPAState *s = DO_UPCAST(VhostVDPAState, nc, nc);
    uint64_t features = 0;

    features |= (1ULL << VIRTIO_NET_F_HOST_UFO);
    features = vhost_net_get_features(s->vhost_net, features);
    return !!(features & (1ULL << VIRTIO_NET_F_HOST_UFO));
}
static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
                                       Error **errp)
{
    const char *driver = object_class_get_name(oc);

    if (!g_str_has_prefix(driver, "virtio-net-")) {
        error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
        return false;
    }

    return true;
}
/** Dummy receive in case qemu falls back to userland tap networking */
static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
                                  size_t size)
{
    return size;
}
static NetClientInfo net_vhost_vdpa_info = {
    .type = NET_CLIENT_DRIVER_VHOST_VDPA,
    .size = sizeof(VhostVDPAState),
    .receive = vhost_vdpa_receive,
    .cleanup = vhost_vdpa_cleanup,
    .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
    .has_ufo = vhost_vdpa_has_ufo,
    .check_peer_type = vhost_vdpa_check_peer_type,
};
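
/*
 * The routines below implement the shadowed control virtqueue (CVQ): guest
 * control commands are copied into QEMU-owned bounce buffers, mapped into the
 * device's IOVA space, forwarded through the shadow virtqueue, and the result
 * is replayed against the device model so that it stays in sync.
 */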
static void vhost_vdpa_cvq_unmap_buf(struct vhost_vdpa *v, void *addr)
{
    VhostIOVATree *tree = v->iova_tree;
    DMAMap needle = {
        /*
         * No need to specify size or to look for more translations since
         * this contiguous chunk was allocated by us.
         */
        .translated_addr = (hwaddr)(uintptr_t)addr,
    };
    const DMAMap *map = vhost_iova_tree_find_iova(tree, &needle);
    int r;

    if (unlikely(!map)) {
        error_report("Cannot locate expected map");
        return;
    }

    r = vhost_vdpa_dma_unmap(v, map->iova, map->size + 1);
    if (unlikely(r != 0)) {
        error_report("Device cannot unmap: %s(%d)", g_strerror(r), r);
    }

    vhost_iova_tree_remove(tree, map);
}
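
/*
 * Worked example, assuming the usual definitions (struct virtio_net_ctrl_hdr
 * is 2 bytes, struct virtio_net_ctrl_mac is 4 bytes, MAC_TABLE_ENTRIES is 64
 * and ETH_ALEN is 6): 2 + 2 * 4 + 64 * 6 = 394 bytes, which
 * vhost_vdpa_net_cvq_cmd_page_len() then rounds up to one host page.
 */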
static size_t vhost_vdpa_net_cvq_cmd_len(void)
{
    /*
     * MAC_TABLE_SET is the ctrl command that produces the longest out buffer.
     * The in buffer is always 1 byte, so it should fit here.
     */
    return sizeof(struct virtio_net_ctrl_hdr) +
           2 * sizeof(struct virtio_net_ctrl_mac) +
           MAC_TABLE_ENTRIES * ETH_ALEN;
}
static size_t vhost_vdpa_net_cvq_cmd_page_len(void)
{
    return ROUND_UP(vhost_vdpa_net_cvq_cmd_len(), qemu_real_host_page_size());
}
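
/*
 * Note on sizes: DMAMap.size is treated as an inclusive length here (hence
 * the "- 1" when building the map below and the matching "+ 1" in
 * vhost_vdpa_cvq_unmap_buf() when tearing the DMA mapping down).
 */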
/** Copy and map a guest buffer. */
static bool vhost_vdpa_cvq_map_buf(struct vhost_vdpa *v,
                                   const struct iovec *out_data,
                                   size_t out_num, size_t data_len, void *buf,
                                   size_t *written, bool write)
{
    DMAMap map = {};
    int r;

    if (unlikely(!data_len)) {
        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid length of %s buffer\n",
                      __func__, write ? "in" : "out");
        return false;
    }

    *written = iov_to_buf(out_data, out_num, 0, buf, data_len);
    map.translated_addr = (hwaddr)(uintptr_t)buf;
    map.size = vhost_vdpa_net_cvq_cmd_page_len() - 1;
    map.perm = write ? IOMMU_RW : IOMMU_RO;
    r = vhost_iova_tree_map_alloc(v->iova_tree, &map);
    if (unlikely(r != IOVA_OK)) {
        error_report("Cannot map injected element");
        return false;
    }

    r = vhost_vdpa_dma_map(v, map.iova, vhost_vdpa_net_cvq_cmd_page_len(), buf,
                           !write);
    if (unlikely(r < 0)) {
        vhost_iova_tree_remove(v->iova_tree, &map);
        return false;
    }

    return true;
}
/**
 * Copy the guest element into a dedicated buffer suitable to be sent to NIC
 *
 * @iov: [0] is the out buffer, [1] is the in one
 */
static bool vhost_vdpa_net_cvq_map_elem(VhostVDPAState *s,
                                        VirtQueueElement *elem,
                                        struct iovec *iov)
{
    size_t in_copied;
    bool ok;

    iov[0].iov_base = s->cvq_cmd_out_buffer;
    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, elem->out_sg, elem->out_num,
                                vhost_vdpa_net_cvq_cmd_len(), iov[0].iov_base,
                                &iov[0].iov_len, false);
    if (unlikely(!ok)) {
        return false;
    }

    iov[1].iov_base = s->cvq_cmd_in_buffer;
    ok = vhost_vdpa_cvq_map_buf(&s->vhost_vdpa, NULL, 0,
                                sizeof(virtio_net_ctrl_ack), iov[1].iov_base,
                                &in_copied, true);
    if (unlikely(!ok)) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, s->cvq_cmd_out_buffer);
        return false;
    }

    iov[1].iov_len = sizeof(virtio_net_ctrl_ack);
    return true;
}
/**
 * Do not forward commands not supported by SVQ. Otherwise, the device could
 * accept it and qemu would not know how to update the device model.
 */
static bool vhost_vdpa_net_cvq_validate_cmd(const struct iovec *out,
                                            size_t out_num)
{
    struct virtio_net_ctrl_hdr ctrl;
    size_t n;

    n = iov_to_buf(out, out_num, 0, &ctrl, sizeof(ctrl));
    if (unlikely(n < sizeof(ctrl))) {
        qemu_log_mask(LOG_GUEST_ERROR,
                      "%s: invalid length of out buffer %zu\n", __func__, n);
        return false;
    }

    switch (ctrl.class) {
    case VIRTIO_NET_CTRL_MAC:
        switch (ctrl.cmd) {
        case VIRTIO_NET_CTRL_MAC_ADDR_SET:
            return true;
        default:
            qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid mac cmd %u\n",
                          __func__, ctrl.cmd);
        }
        break;
    default:
        qemu_log_mask(LOG_GUEST_ERROR, "%s: invalid control class %u\n",
                      __func__, ctrl.class);
    }

    return false;
}
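
/*
 * CVQ flow, step by step: map the guest element into the bounce buffers,
 * validate the command, forward it through the shadow virtqueue, poll for
 * the device's answer, replay a successful command against QEMU's device
 * model, and finally push the status to the guest and unmap the buffers.
 */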
/**
 * Validate and copy control virtqueue commands.
 *
 * Following QEMU guidelines, we offer a copy of the buffers to the device to
 * prevent TOCTOU bugs.
 */
static int vhost_vdpa_net_handle_ctrl_avail(VhostShadowVirtqueue *svq,
                                            VirtQueueElement *elem,
                                            void *opaque)
{
    VhostVDPAState *s = opaque;
    size_t in_len, dev_written;
    virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
    /* out and in buffers sent to the device */
    struct iovec dev_buffers[2] = {
        { .iov_base = s->cvq_cmd_out_buffer },
        { .iov_base = s->cvq_cmd_in_buffer },
    };
    /* in buffer used for device model */
    const struct iovec in = {
        .iov_base = &status,
        .iov_len = sizeof(status),
    };
    int r = -EINVAL;
    bool ok;

    ok = vhost_vdpa_net_cvq_map_elem(s, elem, dev_buffers);
    if (unlikely(!ok)) {
        goto out;
    }

    ok = vhost_vdpa_net_cvq_validate_cmd(&dev_buffers[0], 1);
    if (unlikely(!ok)) {
        goto out;
    }

    r = vhost_svq_add(svq, &dev_buffers[0], 1, &dev_buffers[1], 1, elem);
    if (unlikely(r != 0)) {
        if (unlikely(r == -ENOSPC)) {
            qemu_log_mask(LOG_GUEST_ERROR, "%s: No space on device queue\n",
                          __func__);
        }
        goto out;
    }

    /*
     * We can poll here since we've had BQL from the time we sent the
     * descriptor. Also, we need to take the answer before SVQ pulls by
     * itself, when BQL is released.
     */
    dev_written = vhost_svq_poll(svq);
    if (unlikely(dev_written < sizeof(status))) {
        error_report("Insufficient written data (%zu)", dev_written);
        goto out;
    }

    memcpy(&status, dev_buffers[1].iov_base, sizeof(status));
    if (status != VIRTIO_NET_OK) {
        goto out;
    }

    status = VIRTIO_NET_ERR;
    virtio_net_handle_ctrl_iov(svq->vdev, &in, 1, dev_buffers, 1);
    if (status != VIRTIO_NET_OK) {
        error_report("Bad CVQ processing in model");
    }

out:
    in_len = iov_from_buf(elem->in_sg, elem->in_num, 0, &status,
                          sizeof(status));
    if (unlikely(in_len < sizeof(status))) {
        error_report("Bad device CVQ written length");
    }
    vhost_svq_push_elem(svq, elem, MIN(in_len, sizeof(status)));
    g_free(elem);
    if (dev_buffers[0].iov_base) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[0].iov_base);
    }
    if (dev_buffers[1].iov_base) {
        vhost_vdpa_cvq_unmap_buf(&s->vhost_vdpa, dev_buffers[1].iov_base);
    }
    return r;
}
static const VhostShadowVirtqueueOps vhost_vdpa_net_svq_ops = {
    .avail_handler = vhost_vdpa_net_handle_ctrl_avail,
};
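
/*
 * Each datapath queue pair gets its own NetClientState; the control
 * virtqueue, when present, gets a separate control-only client. Only the
 * control client allocates the CVQ bounce buffers and installs the shadow
 * virtqueue ops.
 */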
static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
                                           const char *device,
                                           const char *name,
                                           int vdpa_device_fd,
                                           int queue_pair_index,
                                           int nvqs,
                                           bool is_datapath,
                                           bool svq,
                                           VhostIOVATree *iova_tree)
{
    NetClientState *nc = NULL;
    VhostVDPAState *s;
    int ret = 0;

    if (is_datapath) {
        nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
                                 name);
    } else {
        nc = qemu_new_net_control_client(&net_vhost_vdpa_info, peer,
                                         device, name);
    }
    snprintf(nc->info_str, sizeof(nc->info_str), TYPE_VHOST_VDPA);
    s = DO_UPCAST(VhostVDPAState, nc, nc);

    s->vhost_vdpa.device_fd = vdpa_device_fd;
    s->vhost_vdpa.index = queue_pair_index;
    s->vhost_vdpa.shadow_vqs_enabled = svq;
    s->vhost_vdpa.iova_tree = iova_tree;
    if (!is_datapath) {
        s->cvq_cmd_out_buffer = qemu_memalign(qemu_real_host_page_size(),
                                            vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->cvq_cmd_out_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());
        s->cvq_cmd_in_buffer = qemu_memalign(qemu_real_host_page_size(),
                                            vhost_vdpa_net_cvq_cmd_page_len());
        memset(s->cvq_cmd_in_buffer, 0, vhost_vdpa_net_cvq_cmd_page_len());

        s->vhost_vdpa.shadow_vq_ops = &vhost_vdpa_net_svq_ops;
        s->vhost_vdpa.shadow_vq_ops_opaque = s;
        error_setg(&s->vhost_vdpa.migration_blocker,
                   "Migration disabled: vhost-vdpa uses CVQ.");
    }
    ret = vhost_vdpa_add(nc, (void *)&s->vhost_vdpa, queue_pair_index, nvqs);
    if (ret) {
        qemu_del_net_client(nc);
        return NULL;
    }
    return nc;
}
static int vhost_vdpa_get_iova_range(int fd,
                                     struct vhost_vdpa_iova_range *iova_range)
{
    int ret = ioctl(fd, VHOST_VDPA_GET_IOVA_RANGE, iova_range);

    return ret < 0 ? -errno : 0;
}
static int vhost_vdpa_get_features(int fd, uint64_t *features, Error **errp)
{
    int ret = ioctl(fd, VHOST_GET_FEATURES, features);

    if (unlikely(ret < 0)) {
        error_setg_errno(errp, errno,
                         "Failed to query features from vhost-vDPA device");
    }
    return ret;
}
static int vhost_vdpa_get_max_queue_pairs(int fd, uint64_t features,
                                          int *has_cvq, Error **errp)
{
    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
    g_autofree struct vhost_vdpa_config *config = NULL;
    __virtio16 *max_queue_pairs;
    int ret;

    if (features & (1 << VIRTIO_NET_F_CTRL_VQ)) {
        *has_cvq = 1;
    } else {
        *has_cvq = 0;
    }

    if (features & (1 << VIRTIO_NET_F_MQ)) {
        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
        config->len = sizeof(*max_queue_pairs);

        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
        if (ret) {
            error_setg(errp, "Failed to get config from vhost-vDPA device");
            return -ret;
        }

        max_queue_pairs = (__virtio16 *)&config->buf;

        return lduw_le_p(max_queue_pairs);
    }

    return 1;
}
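
/*
 * Backend entry point for -netdev vhost-vdpa. A typical invocation (the
 * device node path is just an example):
 *
 *   -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vdpa0 \
 *   -device virtio-net-pci,netdev=vdpa0
 *
 * One NetClientState is created per queue pair (two virtqueues each), plus a
 * trailing control-only client when the device offers VIRTIO_NET_F_CTRL_VQ.
 */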
int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
                        NetClientState *peer, Error **errp)
{
    const NetdevVhostVDPAOptions *opts;
    uint64_t features;
    int vdpa_device_fd;
    g_autofree NetClientState **ncs = NULL;
    g_autoptr(VhostIOVATree) iova_tree = NULL;
    NetClientState *nc;
    int queue_pairs, r, i = 0, has_cvq = 0;

    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
    opts = &netdev->u.vhost_vdpa;
    if (!opts->vhostdev) {
        error_setg(errp, "vdpa character device not specified with vhostdev");
        return -1;
    }

    vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
    if (vdpa_device_fd == -1) {
        return -errno;
    }

    r = vhost_vdpa_get_features(vdpa_device_fd, &features, errp);
    if (unlikely(r < 0)) {
        goto err;
    }

    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd, features,
                                                 &has_cvq, errp);
    if (queue_pairs < 0) {
        qemu_close(vdpa_device_fd);
        return queue_pairs;
    }

    if (opts->x_svq) {
        struct vhost_vdpa_iova_range iova_range;

        uint64_t invalid_dev_features =
            features & ~vdpa_svq_device_features &
            /* Transport are all accepted at this point */
            ~MAKE_64BIT_MASK(VIRTIO_TRANSPORT_F_START,
                             VIRTIO_TRANSPORT_F_END - VIRTIO_TRANSPORT_F_START);

        if (invalid_dev_features) {
            error_setg(errp, "vdpa svq does not work with features 0x%" PRIx64,
                       invalid_dev_features);
            goto err_svq;
        }

        vhost_vdpa_get_iova_range(vdpa_device_fd, &iova_range);
        iova_tree = vhost_iova_tree_new(iova_range.first, iova_range.last);
    }

    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);

    for (i = 0; i < queue_pairs; i++) {
        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                     vdpa_device_fd, i, 2, true, opts->x_svq,
                                     iova_tree);
        if (!ncs[i]) {
            goto err;
        }
    }

    if (has_cvq) {
        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
                                 vdpa_device_fd, i, 1, false,
                                 opts->x_svq, iova_tree);
        if (!nc) {
            goto err;
        }
    }

    /* iova_tree ownership belongs to last NetClientState */
    g_steal_pointer(&iova_tree);
    return 0;

err:
    for (i--; i >= 0; i--) {
        qemu_del_net_client(ncs[i]);
    }

err_svq:
    qemu_close(vdpa_device_fd);
    return -1;
}