2 * iommufd container backend
4 * Copyright (C) 2023 Intel Corporation.
5 * Copyright Red Hat, Inc. 2023
7 * Authors: Yi Liu <yi.l.liu@intel.com>
8 * Eric Auger <eric.auger@redhat.com>
10 * SPDX-License-Identifier: GPL-2.0-or-later
13 #include "qemu/osdep.h"
14 #include <sys/ioctl.h>
15 #include <linux/vfio.h>
16 #include <linux/iommufd.h>
18 #include "hw/vfio/vfio-common.h"
19 #include "qemu/error-report.h"
21 #include "qapi/error.h"
22 #include "sysemu/iommufd.h"
23 #include "hw/qdev-core.h"
24 #include "sysemu/reset.h"
25 #include "qemu/cutils.h"
26 #include "qemu/chardev_open.h"
/*
 * DMA-map callback of the iommufd container (installed as vioc->dma_map
 * in vfio_iommu_iommufd_class_init()).
 *
 * Recovers the enclosing VFIOIOMMUFDContainer from @bcontainer and forwards
 * @iova, @size, @vaddr and @readonly to iommufd_backend_map_dma() on the
 * container's backend. The helper's return value is returned unchanged.
 *
 * NOTE(review): this listing appears to omit some lines of the definition
 * (the braces and possibly one argument of the call) - confirm the full
 * argument list against the complete source.
 */
29 static int iommufd_cdev_map(const VFIOContainerBase
*bcontainer
, hwaddr iova
,
30 ram_addr_t size
, void *vaddr
, bool readonly
)
32 const VFIOIOMMUFDContainer
*container
=
33 container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
35 return iommufd_backend_map_dma(container
->be
,
37 iova
, size
, vaddr
, readonly
);
/*
 * DMA-unmap callback of the iommufd container (installed as vioc->dma_unmap
 * in vfio_iommu_iommufd_class_init()).
 *
 * Recovers the enclosing VFIOIOMMUFDContainer from @bcontainer and calls
 * iommufd_backend_unmap_dma() with the container's backend, its ioas_id,
 * @iova and @size. The helper's return value is returned unchanged.
 *
 * NOTE(review): the parameter list shown here ends with a trailing comma,
 * so at least one further parameter is not visible in this listing.
 */
40 static int iommufd_cdev_unmap(const VFIOContainerBase
*bcontainer
,
41 hwaddr iova
, ram_addr_t size
,
44 const VFIOIOMMUFDContainer
*container
=
45 container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
47 /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
48 return iommufd_backend_unmap_dma(container
->be
,
49 container
->ioas_id
, iova
, size
);
/*
 * Hand the device fd to vfio_kvm_device_add_fd().
 *
 * Returns true when that helper returns 0 and false otherwise; @errp is
 * passed straight through to the helper.
 */
52 static bool iommufd_cdev_kvm_device_add(VFIODevice
*vbasedev
, Error
**errp
)
54 return !vfio_kvm_device_add_fd(vbasedev
->fd
, errp
);
/*
 * Counterpart of iommufd_cdev_kvm_device_add(): calls
 * vfio_kvm_device_del_fd() on the device fd.
 *
 * A non-zero result is not propagated to the caller (the function returns
 * void); the error object is reported with error_report_err() instead.
 *
 * NOTE(review): the declaration of the local `err` is not visible in this
 * listing.
 */
57 static void iommufd_cdev_kvm_device_del(VFIODevice
*vbasedev
)
61 if (vfio_kvm_device_del_fd(vbasedev
->fd
, &err
)) {
62 error_report_err(err
);
/*
 * Connect @vbasedev to its iommufd backend and bind the device fd to it.
 *
 * Steps visible here, in order:
 *   1. iommufd_backend_connect() on vbasedev->iommufd;
 *   2. iommufd_cdev_kvm_device_add() on the device;
 *   3. the VFIO_DEVICE_BIND_IOMMUFD ioctl on the device fd, with
 *      bind.iommufd set to the backend's fd.
 * After the ioctl, bind.out_devid is stored in vbasedev->devid and a trace
 * event is emitted. The tail of the function calls
 * iommufd_cdev_kvm_device_del() and iommufd_backend_disconnect(), i.e. it
 * undoes steps 2 and 1.
 *
 * NOTE(review): the return statements, the remaining initialisers of `bind`
 * and all but one of the goto/label lines are not visible in this listing;
 * presumably true is returned on success and false after the unwinding -
 * confirm against the complete source.
 */
66 static bool iommufd_cdev_connect_and_bind(VFIODevice
*vbasedev
, Error
**errp
)
68 IOMMUFDBackend
*iommufd
= vbasedev
->iommufd
;
69 struct vfio_device_bind_iommufd bind
= {
70 .argsz
= sizeof(bind
),
74 if (!iommufd_backend_connect(iommufd
, errp
)) {
79 * Add device to kvm-vfio to be prepared for the tracking
80 * in KVM. Especially for some emulated devices, it requires
81 * to have kvm information in the device open.
83 if (!iommufd_cdev_kvm_device_add(vbasedev
, errp
)) {
84 goto err_kvm_device_add
;
87 /* Bind device to iommufd */
88 bind
.iommufd
= iommufd
->fd
;
89 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_BIND_IOMMUFD
, &bind
)) {
90 error_setg_errno(errp
, errno
, "error bind device fd=%d to iommufd=%d",
91 vbasedev
->fd
, bind
.iommufd
);
95 vbasedev
->devid
= bind
.out_devid
;
96 trace_iommufd_cdev_connect_and_bind(bind
.iommufd
, vbasedev
->name
,
97 vbasedev
->fd
, vbasedev
->devid
);
100 iommufd_cdev_kvm_device_del(vbasedev
);
102 iommufd_backend_disconnect(iommufd
);
/*
 * Teardown counterpart of iommufd_cdev_connect_and_bind(): calls
 * iommufd_cdev_kvm_device_del() on the device, then
 * iommufd_backend_disconnect() on its iommufd backend.
 *
 * No explicit unbind is issued here; see the in-body comment.
 */
106 static void iommufd_cdev_unbind_and_disconnect(VFIODevice
*vbasedev
)
108 /* Unbind is automatically conducted when device fd is closed */
109 iommufd_cdev_kvm_device_del(vbasedev
);
110 iommufd_backend_disconnect(vbasedev
->iommufd
);
/*
 * Open the VFIO character device that corresponds to @sysfs_path.
 *
 * Steps visible here:
 *   - build "<sysfs_path>/vfio-dev" and scan it with readdir() for an entry
 *     whose name starts with "vfio";
 *   - read "<that dir>/<entry>/dev" with g_file_get_contents() and parse it
 *     as "major:minor";
 *   - build a dev_t with makedev() and open "/dev/vfio/devices/<entry>"
 *     through open_cdev().
 * Each failing step sets @errp with its own message, and error_prepend()
 * adds VFIO_MSG_PREFIX (formatted with the vfio-dev path) in front of it.
 *
 * `ret` starts as -ENOTTY and is overwritten with the open_cdev() result.
 * The string buffers are declared g_autofree.
 *
 * NOTE(review): the directory open/close calls, several local declarations,
 * the jumps to the error path and the final return are not visible in this
 * listing; presumably `ret` is what gets returned - confirm.
 */
113 static int iommufd_cdev_getfd(const char *sysfs_path
, Error
**errp
)
116 long int ret
= -ENOTTY
;
117 g_autofree
char *path
= NULL
;
118 g_autofree
char *vfio_dev_path
= NULL
;
119 g_autofree
char *vfio_path
= NULL
;
122 g_autofree gchar
*contents
= NULL
;
127 path
= g_strdup_printf("%s/vfio-dev", sysfs_path
);
130 error_setg_errno(errp
, errno
, "couldn't open directory %s", path
);
/* Look for the "vfio*" entry; its "dev" file holds the char device number */
134 while ((dent
= readdir(dir
))) {
135 if (!strncmp(dent
->d_name
, "vfio", 4)) {
136 vfio_dev_path
= g_strdup_printf("%s/%s/dev", path
, dent
->d_name
);
141 if (!vfio_dev_path
) {
142 error_setg(errp
, "failed to find vfio-dev/vfioX/dev");
146 if (!g_file_get_contents(vfio_dev_path
, &contents
, &length
, NULL
)) {
147 error_setg(errp
, "failed to load \"%s\"", vfio_dev_path
);
151 if (sscanf(contents
, "%d:%d", &major
, &minor
) != 2) {
152 error_setg(errp
, "failed to get major:minor for \"%s\"", vfio_dev_path
);
155 vfio_devt
= makedev(major
, minor
);
157 vfio_path
= g_strdup_printf("/dev/vfio/devices/%s", dent
->d_name
);
158 ret
= open_cdev(vfio_path
, vfio_devt
);
160 error_setg(errp
, "Failed to open %s", vfio_path
);
163 trace_iommufd_cdev_getfd(vfio_path
, ret
);
169 error_prepend(errp
, VFIO_MSG_PREFIX
, path
);
/*
 * Attach @vbasedev to the IOAS or hardware page table identified by @id.
 *
 * Issues the VFIO_DEVICE_ATTACH_IOMMUFD_PT ioctl on the device fd with
 * `attach_data`. If the ioctl fails, @errp is set from errno with a message
 * naming the iommufd fd, the device name, the device fd and @id. A trace
 * event carrying the iommufd fd and device name follows on the other path.
 *
 * NOTE(review): the end of the parameter list, the remaining initialisers
 * of `attach_data`, the tail of the trace call and the return statements
 * are not visible in this listing.
 */
175 static bool iommufd_cdev_attach_ioas_hwpt(VFIODevice
*vbasedev
, uint32_t id
,
178 int iommufd
= vbasedev
->iommufd
->fd
;
179 struct vfio_device_attach_iommufd_pt attach_data
= {
180 .argsz
= sizeof(attach_data
),
185 /* Attach device to an IOAS or hwpt within iommufd */
186 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_ATTACH_IOMMUFD_PT
, &attach_data
)) {
187 error_setg_errno(errp
, errno
,
188 "[iommufd=%d] error attach %s (%d) to id=%d",
189 iommufd
, vbasedev
->name
, vbasedev
->fd
, id
);
193 trace_iommufd_cdev_attach_ioas_hwpt(iommufd
, vbasedev
->name
,
/*
 * Detach @vbasedev from whatever IOAS/hwpt it is currently attached to.
 *
 * Issues the VFIO_DEVICE_DETACH_IOMMUFD_PT ioctl on the device fd with
 * `detach_data`. If the ioctl fails, @errp is set from errno with
 * "detach <name> failed". A trace event carrying the iommufd fd and device
 * name follows on the other path.
 *
 * NOTE(review): the remaining initialisers of `detach_data` and the return
 * statements are not visible in this listing.
 */
198 static bool iommufd_cdev_detach_ioas_hwpt(VFIODevice
*vbasedev
, Error
**errp
)
200 int iommufd
= vbasedev
->iommufd
->fd
;
201 struct vfio_device_detach_iommufd_pt detach_data
= {
202 .argsz
= sizeof(detach_data
),
206 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_DETACH_IOMMUFD_PT
, &detach_data
)) {
207 error_setg_errno(errp
, errno
, "detach %s failed", vbasedev
->name
);
211 trace_iommufd_cdev_detach_ioas_hwpt(iommufd
, vbasedev
->name
);
/*
 * Attach @vbasedev to @container by attaching it to the container's IOAS:
 * a thin wrapper that returns iommufd_cdev_attach_ioas_hwpt() called with
 * container->ioas_id.
 *
 * NOTE(review): the line declaring the `errp` parameter is not visible in
 * this listing.
 */
215 static bool iommufd_cdev_attach_container(VFIODevice
*vbasedev
,
216 VFIOIOMMUFDContainer
*container
,
219 return iommufd_cdev_attach_ioas_hwpt(vbasedev
, container
->ioas_id
, errp
);
/*
 * Detach @vbasedev from its container via iommufd_cdev_detach_ioas_hwpt().
 *
 * A failure is not propagated (the function returns void); the error is
 * reported with error_report_err(). @container is not referenced by the
 * lines visible here.
 *
 * NOTE(review): the declaration of the local `err` is not visible in this
 * listing.
 */
222 static void iommufd_cdev_detach_container(VFIODevice
*vbasedev
,
223 VFIOIOMMUFDContainer
*container
)
227 if (!iommufd_cdev_detach_ioas_hwpt(vbasedev
, &err
)) {
228 error_report_err(err
);
/*
 * Tear down @container: unregister its memory listener, release its IOAS id
 * through iommufd_backend_free_id() and drop a reference with
 * object_unref().
 *
 * The teardown is preceded by a check that bcontainer->device_list is not
 * empty.
 * NOTE(review): the body of that check is not visible in this listing -
 * presumably an early return while devices are still attached; confirm.
 */
232 static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer
*container
)
234 VFIOContainerBase
*bcontainer
= &container
->bcontainer
;
236 if (!QLIST_EMPTY(&bcontainer
->device_list
)) {
239 memory_listener_unregister(&bcontainer
->listener
);
240 iommufd_backend_free_id(container
->be
, container
->ioas_id
);
241 object_unref(container
);
/*
 * Wrapper that forwards @state to ram_block_uncoordinated_discard_disable()
 * and returns its result. Callers in this file pass true while attaching a
 * device and false on detach and on the attach error path.
 */
244 static int iommufd_cdev_ram_block_discard_disable(bool state
)
247 * We support coordinated discarding of RAM via the RamDiscardManager.
249 return ram_block_uncoordinated_discard_disable(state
);
/*
 * Query the usable IOVA ranges of IOAS @ioas_id and record them in the
 * container.
 *
 * The IOMMU_IOAS_IOVA_RANGES ioctl is issued twice on the backend fd:
 *   - first with only the fixed-size header, where an EMSGSIZE failure is
 *     tolerated (any other failure takes the error path);
 *   - then, after growing the buffer by num_iovas range entries and pointing
 *     allowed_iovas at the space right after the header, a second time.
 * Every returned range is turned into a newly allocated Range and inserted
 * into bcontainer->iova_ranges with range_list_insert(); bcontainer->pgsizes
 * is set from info->out_iova_alignment. On failure @errp is set to
 * "Cannot get IOVA ranges" with errno.
 *
 * `info` is g_autofree, so the buffer is released on every exit.
 *
 * NOTE(review): the jumps to the error path and the return statements are
 * not visible in this listing.
 */
252 static bool iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer
*container
,
253 uint32_t ioas_id
, Error
**errp
)
255 VFIOContainerBase
*bcontainer
= &container
->bcontainer
;
256 g_autofree
struct iommu_ioas_iova_ranges
*info
= NULL
;
257 struct iommu_iova_range
*iova_ranges
;
258 int sz
, fd
= container
->be
->fd
;
260 info
= g_malloc0(sizeof(*info
));
261 info
->size
= sizeof(*info
);
262 info
->ioas_id
= ioas_id
;
/* First pass: header only; EMSGSIZE is accepted as a non-fatal outcome */
264 if (ioctl(fd
, IOMMU_IOAS_IOVA_RANGES
, info
) && errno
!= EMSGSIZE
) {
/* Second pass: make room for num_iovas entries right behind the header */
268 sz
= info
->num_iovas
* sizeof(struct iommu_iova_range
);
269 info
= g_realloc(info
, sizeof(*info
) + sz
);
270 info
->allowed_iovas
= (uintptr_t)(info
+ 1);
272 if (ioctl(fd
, IOMMU_IOAS_IOVA_RANGES
, info
)) {
276 iova_ranges
= (struct iommu_iova_range
*)(uintptr_t)info
->allowed_iovas
;
278 for (int i
= 0; i
< info
->num_iovas
; i
++) {
279 Range
*range
= g_new(Range
, 1);
281 range_set_bounds(range
, iova_ranges
[i
].start
, iova_ranges
[i
].last
);
282 bcontainer
->iova_ranges
=
283 range_list_insert(bcontainer
->iova_ranges
, range
);
285 bcontainer
->pgsizes
= info
->out_iova_alignment
;
290 error_setg_errno(errp
, errno
, "Cannot get IOVA ranges");
/*
 * Attach-device callback of the iommufd container (installed as
 * vioc->attach_device in vfio_iommu_iommufd_class_init()).
 *
 * Flow visible in this listing:
 *   1. If vbasedev->fd is negative, obtain a device fd from
 *      iommufd_cdev_getfd(vbasedev->sysfsdev) and store it; otherwise use
 *      the fd already set.
 *   2. iommufd_cdev_connect_and_bind().
 *   3. Look up the VFIOAddressSpace for @as and walk its containers. Only
 *      containers of the iommufd class that share the device's backend are
 *      tried with iommufd_cdev_attach_container(); a failed attempt is only
 *      traced. After a successful attach, uncoordinated RAM discard is
 *      disabled and control jumps to found_container.
 *   4. Otherwise allocate a new IOAS, create a TYPE_VFIO_IOMMU_IOMMUFD
 *      object for it, insert it into the address space, attach the device,
 *      disable uncoordinated RAM discard, query the IOVA ranges (a failure
 *      there is only a warning, and pgsizes is then set from
 *      qemu_real_host_page_size()), and register the memory listener.
 *   5. Fetch VFIO_DEVICE_GET_INFO, register the container with
 *      vfio_cpr_register_container(), copy num_irqs/num_regions/flags into
 *      @vbasedev and link the device into the container's device_list and
 *      the global vfio_device_list.
 * The tail of the function unwinds in reverse: re-enable discard, detach
 * from the container, destroy the container, put the address space, then
 * unbind and disconnect.
 *
 * @name is not referenced by the lines visible here.
 *
 * NOTE(review): many lines of this definition (local declarations, several
 * conditions, return statements and most labels, including
 * found_container) are not visible in this listing, so the exact pairing of
 * failures and unwind labels must be confirmed against the complete source.
 */
294 static bool iommufd_cdev_attach(const char *name
, VFIODevice
*vbasedev
,
295 AddressSpace
*as
, Error
**errp
)
297 VFIOContainerBase
*bcontainer
;
298 VFIOIOMMUFDContainer
*container
;
299 VFIOAddressSpace
*space
;
300 struct vfio_device_info dev_info
= { .argsz
= sizeof(dev_info
) };
304 const VFIOIOMMUClass
*iommufd_vioc
=
305 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD
));
/* Open the device cdev only when the caller did not supply an fd */
307 if (vbasedev
->fd
< 0) {
308 devfd
= iommufd_cdev_getfd(vbasedev
->sysfsdev
, errp
);
312 vbasedev
->fd
= devfd
;
314 devfd
= vbasedev
->fd
;
317 if (!iommufd_cdev_connect_and_bind(vbasedev
, errp
)) {
318 goto err_connect_bind
;
321 space
= vfio_get_address_space(as
);
323 /* try to attach to an existing container in this space */
324 QLIST_FOREACH(bcontainer
, &space
->containers
, next
) {
325 container
= container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
326 if (VFIO_IOMMU_GET_CLASS(bcontainer
) != iommufd_vioc
||
327 vbasedev
->iommufd
!= container
->be
) {
330 if (!iommufd_cdev_attach_container(vbasedev
, container
, &err
)) {
331 const char *msg
= error_get_pretty(err
);
333 trace_iommufd_cdev_fail_attach_existing_container(msg
);
337 ret
= iommufd_cdev_ram_block_discard_disable(true);
340 "Cannot set discarding of RAM broken (%d)", ret
);
341 goto err_discard_disable
;
343 goto found_container
;
347 /* Need to allocate a new dedicated container */
348 if (!iommufd_backend_alloc_ioas(vbasedev
->iommufd
, &ioas_id
, errp
)) {
352 trace_iommufd_cdev_alloc_ioas(vbasedev
->iommufd
->fd
, ioas_id
);
354 container
= VFIO_IOMMU_IOMMUFD(object_new(TYPE_VFIO_IOMMU_IOMMUFD
));
355 container
->be
= vbasedev
->iommufd
;
356 container
->ioas_id
= ioas_id
;
358 bcontainer
= &container
->bcontainer
;
359 vfio_address_space_insert(space
, bcontainer
);
361 if (!iommufd_cdev_attach_container(vbasedev
, container
, errp
)) {
362 goto err_attach_container
;
365 ret
= iommufd_cdev_ram_block_discard_disable(true);
367 goto err_discard_disable
;
/* A failed IOVA-range query is downgraded to a warning with a fallback */
370 if (!iommufd_cdev_get_info_iova_range(container
, ioas_id
, &err
)) {
371 error_append_hint(&err
,
372 "Fallback to default 64bit IOVA range and 4K page size\n");
373 warn_report_err(err
);
375 bcontainer
->pgsizes
= qemu_real_host_page_size();
378 bcontainer
->listener
= vfio_memory_listener
;
379 memory_listener_register(&bcontainer
->listener
, bcontainer
->space
->as
);
381 if (bcontainer
->error
) {
382 error_propagate_prepend(errp
, bcontainer
->error
,
383 "memory listener initialization failed: ");
384 goto err_listener_register
;
387 bcontainer
->initialized
= true;
390 ret
= ioctl(devfd
, VFIO_DEVICE_GET_INFO
, &dev_info
);
392 error_setg_errno(errp
, errno
, "error getting device info");
393 goto err_listener_register
;
396 if (!vfio_cpr_register_container(bcontainer
, errp
)) {
397 goto err_listener_register
;
401 * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level
402 * for discarding incompatibility check as well?
404 if (vbasedev
->ram_block_discard_allowed
) {
405 iommufd_cdev_ram_block_discard_disable(false);
/* Publish the device info and link the device into both device lists */
409 vbasedev
->num_irqs
= dev_info
.num_irqs
;
410 vbasedev
->num_regions
= dev_info
.num_regions
;
411 vbasedev
->flags
= dev_info
.flags
;
412 vbasedev
->reset_works
= !!(dev_info
.flags
& VFIO_DEVICE_FLAGS_RESET
);
413 vbasedev
->bcontainer
= bcontainer
;
414 QLIST_INSERT_HEAD(&bcontainer
->device_list
, vbasedev
, container_next
);
415 QLIST_INSERT_HEAD(&vfio_device_list
, vbasedev
, global_next
);
417 trace_iommufd_cdev_device_info(vbasedev
->name
, devfd
, vbasedev
->num_irqs
,
418 vbasedev
->num_regions
, vbasedev
->flags
);
/* Error unwinding: the calls below undo the setup steps in reverse order */
421 err_listener_register
:
422 iommufd_cdev_ram_block_discard_disable(false);
424 iommufd_cdev_detach_container(vbasedev
, container
);
425 err_attach_container
:
426 iommufd_cdev_container_destroy(container
);
428 vfio_put_address_space(space
);
429 iommufd_cdev_unbind_and_disconnect(vbasedev
);
/*
 * Detach-device callback of the iommufd container (installed as
 * vioc->detach_device in vfio_iommu_iommufd_class_init()).
 *
 * Visible order of operations: unlink @vbasedev from the global and
 * per-container device lists and clear vbasedev->bcontainer; re-enable
 * uncoordinated RAM discard when the device did not have
 * ram_block_discard_allowed set (the opposite case was already handled at
 * attach time); unregister the container from CPR; detach the device from
 * the container; destroy the container; put the address space; finally
 * unbind and disconnect from the backend.
 *
 * `bcontainer`, `space` and `container` are captured before
 * vbasedev->bcontainer is cleared, so later steps keep using them.
 *
 * NOTE(review): the last argument of the container_of() initialiser is not
 * visible in this listing.
 */
435 static void iommufd_cdev_detach(VFIODevice
*vbasedev
)
437 VFIOContainerBase
*bcontainer
= vbasedev
->bcontainer
;
438 VFIOAddressSpace
*space
= bcontainer
->space
;
439 VFIOIOMMUFDContainer
*container
= container_of(bcontainer
,
440 VFIOIOMMUFDContainer
,
442 QLIST_REMOVE(vbasedev
, global_next
);
443 QLIST_REMOVE(vbasedev
, container_next
);
444 vbasedev
->bcontainer
= NULL
;
446 if (!vbasedev
->ram_block_discard_allowed
) {
447 iommufd_cdev_ram_block_discard_disable(false);
450 vfio_cpr_unregister_container(bcontainer
);
451 iommufd_cdev_detach_container(vbasedev
, container
);
452 iommufd_cdev_container_destroy(container
);
453 vfio_put_address_space(space
);
455 iommufd_cdev_unbind_and_disconnect(vbasedev
);
/*
 * Search the global vfio_device_list for the device whose devid equals
 * @devid and return it.
 *
 * Each entry is first tested against the iommufd container class
 * (VFIO_IOMMU_GET_CLASS() of its bcontainer compared with the class looked
 * up by name).
 * NOTE(review): the body of that test and the not-found return are not
 * visible in this listing - presumably non-iommufd devices are skipped and
 * NULL is returned when nothing matches; confirm.
 */
459 static VFIODevice
*iommufd_cdev_pci_find_by_devid(__u32 devid
)
461 VFIODevice
*vbasedev_iter
;
462 const VFIOIOMMUClass
*iommufd_vioc
=
463 VFIO_IOMMU_CLASS(object_class_by_name(TYPE_VFIO_IOMMU_IOMMUFD
));
465 QLIST_FOREACH(vbasedev_iter
, &vfio_device_list
, global_next
) {
466 if (VFIO_IOMMU_GET_CLASS(vbasedev_iter
->bcontainer
) != iommufd_vioc
) {
469 if (devid
== vbasedev_iter
->devid
) {
470 return vbasedev_iter
;
/*
 * Map a hot-reset dependent-device record to the VFIOPCIDevice it refers
 * to.
 *
 * Two filters are visible:
 *   - @dep_dev's devid equals @reset_dev's devid or VFIO_PCI_DEVID_OWNED;
 *   - the device found by iommufd_cdev_pci_find_by_devid() is missing, not
 *     realized, or not of type VFIO_DEVICE_TYPE_PCI.
 * When neither applies, the enclosing VFIOPCIDevice of the found VFIODevice
 * is returned via container_of().
 *
 * NOTE(review): the bodies of both filters are not visible in this listing
 * - presumably they return NULL (the hot-reset caller's handling of that is
 * not visible either); confirm.
 */
476 static VFIOPCIDevice
*
477 iommufd_cdev_dep_get_realized_vpdev(struct vfio_pci_dependent_device
*dep_dev
,
478 VFIODevice
*reset_dev
)
480 VFIODevice
*vbasedev_tmp
;
482 if (dep_dev
->devid
== reset_dev
->devid
||
483 dep_dev
->devid
== VFIO_PCI_DEVID_OWNED
) {
487 vbasedev_tmp
= iommufd_cdev_pci_find_by_devid(dep_dev
->devid
);
488 if (!vbasedev_tmp
|| !vbasedev_tmp
->dev
->realized
||
489 vbasedev_tmp
->type
!= VFIO_DEVICE_TYPE_PCI
) {
493 return container_of(vbasedev_tmp
, VFIOPCIDevice
, vbasedev
);
/*
 * PCI hot-reset callback of the iommufd container (installed as
 * vioc->pci_hot_reset in vfio_iommu_iommufd_class_init()).
 *
 * Flow visible in this listing:
 *   1. Trace, run vfio_pci_pre_reset() on the device and clear its
 *      needs_reset flag.
 *   2. Fetch the hot-reset info; it must carry
 *      VFIO_PCI_HOT_RESET_FLAG_DEV_ID (asserted).
 *   3. If the info lacks VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED and the device
 *      has no PM reset, report every dependent device whose devid is
 *      VFIO_PCI_DEVID_NOT_OWNED.
 *   4. Walk the dependent devices: trace each one, assert it is owned,
 *      resolve it with iommufd_cdev_dep_get_realized_vpdev(), then run
 *      vfio_pci_pre_reset() on it and clear its needs_reset flag.
 *   5. Allocate a zeroed struct vfio_pci_hot_reset with argsz set to the
 *      bare struct size (see the in-body comment) and issue
 *      VFIO_DEVICE_PCI_HOT_RESET on the device fd; the outcome is traced.
 *   6. Walk the dependent devices again and run vfio_pci_post_reset() on the
 *      resolved ones, then on the device itself.
 *
 * NOTE(review): several local declarations (`ret`, `i`, `tmp`, `multi`),
 * the handling of unresolved dependent devices, the body of the
 * `!single && !multi` check, the labels, the cleanup of `info`/`reset` and
 * the return statement are not visible in this listing; confirm them
 * against the complete source.
 */
496 static int iommufd_cdev_pci_hot_reset(VFIODevice
*vbasedev
, bool single
)
498 VFIOPCIDevice
*vdev
= container_of(vbasedev
, VFIOPCIDevice
, vbasedev
);
499 struct vfio_pci_hot_reset_info
*info
= NULL
;
500 struct vfio_pci_dependent_device
*devices
;
501 struct vfio_pci_hot_reset
*reset
;
505 trace_vfio_pci_hot_reset(vdev
->vbasedev
.name
, single
? "one" : "multi");
508 vfio_pci_pre_reset(vdev
);
510 vdev
->vbasedev
.needs_reset
= false;
512 ret
= vfio_pci_get_pci_hot_reset_info(vdev
, &info
);
518 assert(info
->flags
& VFIO_PCI_HOT_RESET_FLAG_DEV_ID
);
520 devices
= &info
->devices
[0];
522 if (!(info
->flags
& VFIO_PCI_HOT_RESET_FLAG_DEV_ID_OWNED
)) {
523 if (!vdev
->has_pm_reset
) {
524 for (i
= 0; i
< info
->count
; i
++) {
525 if (devices
[i
].devid
== VFIO_PCI_DEVID_NOT_OWNED
) {
526 error_report("vfio: Cannot reset device %s, "
527 "depends on device %04x:%02x:%02x.%x "
528 "which is not owned.",
529 vdev
->vbasedev
.name
, devices
[i
].segment
,
530 devices
[i
].bus
, PCI_SLOT(devices
[i
].devfn
),
531 PCI_FUNC(devices
[i
].devfn
));
539 trace_vfio_pci_hot_reset_has_dep_devices(vdev
->vbasedev
.name
);
541 for (i
= 0; i
< info
->count
; i
++) {
544 trace_iommufd_cdev_pci_hot_reset_dep_devices(devices
[i
].segment
,
546 PCI_SLOT(devices
[i
].devfn
),
547 PCI_FUNC(devices
[i
].devfn
),
551 * If a VFIO cdev device is resettable, all the dependent devices
552 * are either bound to same iommufd or within same iommu_groups as
553 * one of the iommufd bound devices.
555 assert(devices
[i
].devid
!= VFIO_PCI_DEVID_NOT_OWNED
);
557 tmp
= iommufd_cdev_dep_get_realized_vpdev(&devices
[i
], &vdev
->vbasedev
);
566 vfio_pci_pre_reset(tmp
);
567 tmp
->vbasedev
.needs_reset
= false;
571 if (!single
&& !multi
) {
576 /* Use zero length array for hot reset with iommufd backend */
577 reset
= g_malloc0(sizeof(*reset
));
578 reset
->argsz
= sizeof(*reset
);
581 ret
= ioctl(vdev
->vbasedev
.fd
, VFIO_DEVICE_PCI_HOT_RESET
, reset
);
587 trace_vfio_pci_hot_reset_result(vdev
->vbasedev
.name
,
588 ret
? strerror(errno
) : "Success");
590 /* Re-enable INTx on affected devices */
591 for (i
= 0; i
< info
->count
; i
++) {
594 tmp
= iommufd_cdev_dep_get_realized_vpdev(&devices
[i
], &vdev
->vbasedev
);
598 vfio_pci_post_reset(tmp
);
602 vfio_pci_post_reset(vdev
);
/*
 * Class initialiser for the iommufd VFIO container type (referenced as
 * .class_init of the TYPE_VFIO_IOMMU_IOMMUFD entry in types[]).
 *
 * Sets the host IOMMU device type name and installs this file's
 * implementations of the dma_map, dma_unmap, attach_device, detach_device
 * and pci_hot_reset callbacks. @data is not referenced by the lines visible
 * here.
 */
609 static void vfio_iommu_iommufd_class_init(ObjectClass
*klass
, void *data
)
611 VFIOIOMMUClass
*vioc
= VFIO_IOMMU_CLASS(klass
);
613 vioc
->hiod_typename
= TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO
;
615 vioc
->dma_map
= iommufd_cdev_map
;
616 vioc
->dma_unmap
= iommufd_cdev_unmap
;
617 vioc
->attach_device
= iommufd_cdev_attach
;
618 vioc
->detach_device
= iommufd_cdev_detach
;
619 vioc
->pci_hot_reset
= iommufd_cdev_pci_hot_reset
;
/*
 * Realize hook of the iommufd-backed host IOMMU device (installed as
 * hiodc->realize in hiod_iommufd_vfio_class_init()).
 *
 * @opaque is treated as the VFIODevice: it is stored in hiod->agent, its
 * hardware info is queried with iommufd_backend_get_device_info() using the
 * device's iommufd backend and devid, hiod->name becomes a copy of the
 * device name and caps->aw_bits is taken from vfio_device_get_aw_bits().
 *
 * NOTE(review): the end of the parameter list, the declaration of `data`
 * (only a struct iommu_hw_info_vtd member line is visible), any use of
 * `type` and the return statements are not visible in this listing.
 */
622 static bool hiod_iommufd_vfio_realize(HostIOMMUDevice
*hiod
, void *opaque
,
625 VFIODevice
*vdev
= opaque
;
626 HostIOMMUDeviceCaps
*caps
= &hiod
->caps
;
627 enum iommu_hw_info_type type
;
629 struct iommu_hw_info_vtd vtd
;
632 hiod
->agent
= opaque
;
634 if (!iommufd_backend_get_device_info(vdev
->iommufd
, vdev
->devid
,
635 &type
, &data
, sizeof(data
), errp
)) {
639 hiod
->name
= g_strdup(vdev
->name
);
641 caps
->aw_bits
= vfio_device_get_aw_bits(vdev
);
/*
 * get_iova_ranges hook of the iommufd-backed host IOMMU device (installed
 * as hiodc->get_iova_ranges in hiod_iommufd_vfio_class_init()).
 *
 * Reads the VFIODevice stored in hiod->agent and, when it has a container,
 * assigns `l` a g_list_copy() of that container's iova_ranges list.
 * @errp is not referenced by the lines visible here.
 *
 * NOTE(review): the return-type line, the declaration of `l` and the return
 * statement are not visible in this listing - presumably `l` is returned;
 * confirm.
 */
647 hiod_iommufd_vfio_get_iova_ranges(HostIOMMUDevice
*hiod
, Error
**errp
)
649 VFIODevice
*vdev
= hiod
->agent
;
654 if (vdev
->bcontainer
) {
655 l
= g_list_copy(vdev
->bcontainer
->iova_ranges
);
/*
 * Class initialiser for the iommufd-backed host IOMMU device type
 * (referenced as .class_init of the TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO
 * entry in types[]): installs the realize and get_iova_ranges hooks defined
 * in this file. @data is not referenced by the lines visible here.
 */
661 static void hiod_iommufd_vfio_class_init(ObjectClass
*oc
, void *data
)
663 HostIOMMUDeviceClass
*hiodc
= HOST_IOMMU_DEVICE_CLASS(oc
);
665 hiodc
->realize
= hiod_iommufd_vfio_realize
;
666 hiodc
->get_iova_ranges
= hiod_iommufd_vfio_get_iova_ranges
;
/*
 * TypeInfo table for this file. Two entries are visible:
 *   - TYPE_VFIO_IOMMU_IOMMUFD, child of TYPE_VFIO_IOMMU, instances sized as
 *     VFIOIOMMUFDContainer, initialised by vfio_iommu_iommufd_class_init();
 *   - TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO, child of
 *     TYPE_HOST_IOMMU_DEVICE_IOMMUFD, initialised by
 *     hiod_iommufd_vfio_class_init().
 *
 * NOTE(review): the braces separating the entries and whatever registers
 * this table are not visible in this listing.
 */
669 static const TypeInfo types
[] = {
671 .name
= TYPE_VFIO_IOMMU_IOMMUFD
,
672 .parent
= TYPE_VFIO_IOMMU
,
673 .instance_size
= sizeof(VFIOIOMMUFDContainer
),
674 .class_init
= vfio_iommu_iommufd_class_init
,
676 .name
= TYPE_HOST_IOMMU_DEVICE_IOMMUFD_VFIO
,
677 .parent
= TYPE_HOST_IOMMU_DEVICE_IOMMUFD
,
678 .class_init
= hiod_iommufd_vfio_class_init
,