/*
 * iommufd container backend
 *
 * Copyright (C) 2023 Intel Corporation.
 * Copyright Red Hat, Inc. 2023
 *
 * Authors: Yi Liu <yi.l.liu@intel.com>
 *          Eric Auger <eric.auger@redhat.com>
 *
 * SPDX-License-Identifier: GPL-2.0-or-later
 */
#include "qemu/osdep.h"
#include <sys/ioctl.h>
#include <linux/vfio.h>
#include <linux/iommufd.h>

#include "hw/vfio/vfio-common.h"
#include "qemu/error-report.h"
#include "trace.h"
#include "qapi/error.h"
#include "sysemu/iommufd.h"
#include "hw/qdev-core.h"
#include "sysemu/reset.h"
#include "qemu/cutils.h"
#include "qemu/chardev_open.h"
28 static int iommufd_cdev_map(VFIOContainerBase
*bcontainer
, hwaddr iova
,
29 ram_addr_t size
, void *vaddr
, bool readonly
)
31 VFIOIOMMUFDContainer
*container
=
32 container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
34 return iommufd_backend_map_dma(container
->be
,
36 iova
, size
, vaddr
, readonly
);
39 static int iommufd_cdev_unmap(VFIOContainerBase
*bcontainer
,
40 hwaddr iova
, ram_addr_t size
,
43 VFIOIOMMUFDContainer
*container
=
44 container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
46 /* TODO: Handle dma_unmap_bitmap with iotlb args (migration) */
47 return iommufd_backend_unmap_dma(container
->be
,
48 container
->ioas_id
, iova
, size
);
51 static int iommufd_cdev_kvm_device_add(VFIODevice
*vbasedev
, Error
**errp
)
53 return vfio_kvm_device_add_fd(vbasedev
->fd
, errp
);
56 static void iommufd_cdev_kvm_device_del(VFIODevice
*vbasedev
)
60 if (vfio_kvm_device_del_fd(vbasedev
->fd
, &err
)) {
61 error_report_err(err
);
65 static int iommufd_cdev_connect_and_bind(VFIODevice
*vbasedev
, Error
**errp
)
67 IOMMUFDBackend
*iommufd
= vbasedev
->iommufd
;
68 struct vfio_device_bind_iommufd bind
= {
69 .argsz
= sizeof(bind
),
74 ret
= iommufd_backend_connect(iommufd
, errp
);
80 * Add device to kvm-vfio to be prepared for the tracking
81 * in KVM. Especially for some emulated devices, it requires
82 * to have kvm information in the device open.
84 ret
= iommufd_cdev_kvm_device_add(vbasedev
, errp
);
86 goto err_kvm_device_add
;
89 /* Bind device to iommufd */
90 bind
.iommufd
= iommufd
->fd
;
91 ret
= ioctl(vbasedev
->fd
, VFIO_DEVICE_BIND_IOMMUFD
, &bind
);
93 error_setg_errno(errp
, errno
, "error bind device fd=%d to iommufd=%d",
94 vbasedev
->fd
, bind
.iommufd
);
98 vbasedev
->devid
= bind
.out_devid
;
99 trace_iommufd_cdev_connect_and_bind(bind
.iommufd
, vbasedev
->name
,
100 vbasedev
->fd
, vbasedev
->devid
);
103 iommufd_cdev_kvm_device_del(vbasedev
);
105 iommufd_backend_disconnect(iommufd
);
109 static void iommufd_cdev_unbind_and_disconnect(VFIODevice
*vbasedev
)
111 /* Unbind is automatically conducted when device fd is closed */
112 iommufd_cdev_kvm_device_del(vbasedev
);
113 iommufd_backend_disconnect(vbasedev
->iommufd
);
116 static int iommufd_cdev_getfd(const char *sysfs_path
, Error
**errp
)
118 long int ret
= -ENOTTY
;
119 char *path
, *vfio_dev_path
= NULL
, *vfio_path
= NULL
;
128 path
= g_strdup_printf("%s/vfio-dev", sysfs_path
);
129 if (stat(path
, &st
) < 0) {
130 error_setg_errno(errp
, errno
, "no such host device");
136 error_setg_errno(errp
, errno
, "couldn't open directory %s", path
);
140 while ((dent
= readdir(dir
))) {
141 if (!strncmp(dent
->d_name
, "vfio", 4)) {
142 vfio_dev_path
= g_strdup_printf("%s/%s/dev", path
, dent
->d_name
);
147 if (!vfio_dev_path
) {
148 error_setg(errp
, "failed to find vfio-dev/vfioX/dev");
152 if (!g_file_get_contents(vfio_dev_path
, &contents
, &length
, NULL
)) {
153 error_setg(errp
, "failed to load \"%s\"", vfio_dev_path
);
154 goto out_free_dev_path
;
157 if (sscanf(contents
, "%d:%d", &major
, &minor
) != 2) {
158 error_setg(errp
, "failed to get major:minor for \"%s\"", vfio_dev_path
);
159 goto out_free_dev_path
;
162 vfio_devt
= makedev(major
, minor
);
164 vfio_path
= g_strdup_printf("/dev/vfio/devices/%s", dent
->d_name
);
165 ret
= open_cdev(vfio_path
, vfio_devt
);
167 error_setg(errp
, "Failed to open %s", vfio_path
);
170 trace_iommufd_cdev_getfd(vfio_path
, ret
);
174 g_free(vfio_dev_path
);
179 error_prepend(errp
, VFIO_MSG_PREFIX
, path
);
186 static int iommufd_cdev_attach_ioas_hwpt(VFIODevice
*vbasedev
, uint32_t id
,
189 int ret
, iommufd
= vbasedev
->iommufd
->fd
;
190 struct vfio_device_attach_iommufd_pt attach_data
= {
191 .argsz
= sizeof(attach_data
),
196 /* Attach device to an IOAS or hwpt within iommufd */
197 ret
= ioctl(vbasedev
->fd
, VFIO_DEVICE_ATTACH_IOMMUFD_PT
, &attach_data
);
199 error_setg_errno(errp
, errno
,
200 "[iommufd=%d] error attach %s (%d) to id=%d",
201 iommufd
, vbasedev
->name
, vbasedev
->fd
, id
);
203 trace_iommufd_cdev_attach_ioas_hwpt(iommufd
, vbasedev
->name
,
209 static int iommufd_cdev_detach_ioas_hwpt(VFIODevice
*vbasedev
, Error
**errp
)
211 int ret
, iommufd
= vbasedev
->iommufd
->fd
;
212 struct vfio_device_detach_iommufd_pt detach_data
= {
213 .argsz
= sizeof(detach_data
),
217 ret
= ioctl(vbasedev
->fd
, VFIO_DEVICE_DETACH_IOMMUFD_PT
, &detach_data
);
219 error_setg_errno(errp
, errno
, "detach %s failed", vbasedev
->name
);
221 trace_iommufd_cdev_detach_ioas_hwpt(iommufd
, vbasedev
->name
);
226 static int iommufd_cdev_attach_container(VFIODevice
*vbasedev
,
227 VFIOIOMMUFDContainer
*container
,
230 return iommufd_cdev_attach_ioas_hwpt(vbasedev
, container
->ioas_id
, errp
);
233 static void iommufd_cdev_detach_container(VFIODevice
*vbasedev
,
234 VFIOIOMMUFDContainer
*container
)
238 if (iommufd_cdev_detach_ioas_hwpt(vbasedev
, &err
)) {
239 error_report_err(err
);
243 static void iommufd_cdev_container_destroy(VFIOIOMMUFDContainer
*container
)
245 VFIOContainerBase
*bcontainer
= &container
->bcontainer
;
247 if (!QLIST_EMPTY(&bcontainer
->device_list
)) {
250 memory_listener_unregister(&bcontainer
->listener
);
251 vfio_container_destroy(bcontainer
);
252 iommufd_backend_free_id(container
->be
, container
->ioas_id
);
256 static int iommufd_cdev_ram_block_discard_disable(bool state
)
259 * We support coordinated discarding of RAM via the RamDiscardManager.
261 return ram_block_uncoordinated_discard_disable(state
);
264 static int iommufd_cdev_get_info_iova_range(VFIOIOMMUFDContainer
*container
,
265 uint32_t ioas_id
, Error
**errp
)
267 VFIOContainerBase
*bcontainer
= &container
->bcontainer
;
268 struct iommu_ioas_iova_ranges
*info
;
269 struct iommu_iova_range
*iova_ranges
;
270 int ret
, sz
, fd
= container
->be
->fd
;
272 info
= g_malloc0(sizeof(*info
));
273 info
->size
= sizeof(*info
);
274 info
->ioas_id
= ioas_id
;
276 ret
= ioctl(fd
, IOMMU_IOAS_IOVA_RANGES
, info
);
277 if (ret
&& errno
!= EMSGSIZE
) {
281 sz
= info
->num_iovas
* sizeof(struct iommu_iova_range
);
282 info
= g_realloc(info
, sizeof(*info
) + sz
);
283 info
->allowed_iovas
= (uintptr_t)(info
+ 1);
285 ret
= ioctl(fd
, IOMMU_IOAS_IOVA_RANGES
, info
);
290 iova_ranges
= (struct iommu_iova_range
*)(uintptr_t)info
->allowed_iovas
;
292 for (int i
= 0; i
< info
->num_iovas
; i
++) {
293 Range
*range
= g_new(Range
, 1);
295 range_set_bounds(range
, iova_ranges
[i
].start
, iova_ranges
[i
].last
);
296 bcontainer
->iova_ranges
=
297 range_list_insert(bcontainer
->iova_ranges
, range
);
299 bcontainer
->pgsizes
= info
->out_iova_alignment
;
307 error_setg_errno(errp
, errno
, "Cannot get IOVA ranges");
311 static int iommufd_cdev_attach(const char *name
, VFIODevice
*vbasedev
,
312 AddressSpace
*as
, Error
**errp
)
314 VFIOContainerBase
*bcontainer
;
315 VFIOIOMMUFDContainer
*container
;
316 VFIOAddressSpace
*space
;
317 struct vfio_device_info dev_info
= { .argsz
= sizeof(dev_info
) };
322 devfd
= iommufd_cdev_getfd(vbasedev
->sysfsdev
, errp
);
326 vbasedev
->fd
= devfd
;
328 ret
= iommufd_cdev_connect_and_bind(vbasedev
, errp
);
330 goto err_connect_bind
;
333 space
= vfio_get_address_space(as
);
335 /* try to attach to an existing container in this space */
336 QLIST_FOREACH(bcontainer
, &space
->containers
, next
) {
337 container
= container_of(bcontainer
, VFIOIOMMUFDContainer
, bcontainer
);
338 if (bcontainer
->ops
!= &vfio_iommufd_ops
||
339 vbasedev
->iommufd
!= container
->be
) {
342 if (iommufd_cdev_attach_container(vbasedev
, container
, &err
)) {
343 const char *msg
= error_get_pretty(err
);
345 trace_iommufd_cdev_fail_attach_existing_container(msg
);
349 ret
= iommufd_cdev_ram_block_discard_disable(true);
352 "Cannot set discarding of RAM broken (%d)", ret
);
353 goto err_discard_disable
;
355 goto found_container
;
359 /* Need to allocate a new dedicated container */
360 ret
= iommufd_backend_alloc_ioas(vbasedev
->iommufd
, &ioas_id
, errp
);
365 trace_iommufd_cdev_alloc_ioas(vbasedev
->iommufd
->fd
, ioas_id
);
367 container
= g_malloc0(sizeof(*container
));
368 container
->be
= vbasedev
->iommufd
;
369 container
->ioas_id
= ioas_id
;
371 bcontainer
= &container
->bcontainer
;
372 vfio_container_init(bcontainer
, space
, &vfio_iommufd_ops
);
373 QLIST_INSERT_HEAD(&space
->containers
, bcontainer
, next
);
375 ret
= iommufd_cdev_attach_container(vbasedev
, container
, errp
);
377 goto err_attach_container
;
380 ret
= iommufd_cdev_ram_block_discard_disable(true);
382 goto err_discard_disable
;
385 ret
= iommufd_cdev_get_info_iova_range(container
, ioas_id
, &err
);
387 error_append_hint(&err
,
388 "Fallback to default 64bit IOVA range and 4K page size\n");
389 warn_report_err(err
);
391 bcontainer
->pgsizes
= qemu_real_host_page_size();
394 bcontainer
->listener
= vfio_memory_listener
;
395 memory_listener_register(&bcontainer
->listener
, bcontainer
->space
->as
);
397 if (bcontainer
->error
) {
399 error_propagate_prepend(errp
, bcontainer
->error
,
400 "memory listener initialization failed: ");
401 goto err_listener_register
;
404 bcontainer
->initialized
= true;
407 ret
= ioctl(devfd
, VFIO_DEVICE_GET_INFO
, &dev_info
);
409 error_setg_errno(errp
, errno
, "error getting device info");
410 goto err_listener_register
;
414 * TODO: examine RAM_BLOCK_DISCARD stuff, should we do group level
415 * for discarding incompatibility check as well?
417 if (vbasedev
->ram_block_discard_allowed
) {
418 iommufd_cdev_ram_block_discard_disable(false);
422 vbasedev
->num_irqs
= dev_info
.num_irqs
;
423 vbasedev
->num_regions
= dev_info
.num_regions
;
424 vbasedev
->flags
= dev_info
.flags
;
425 vbasedev
->reset_works
= !!(dev_info
.flags
& VFIO_DEVICE_FLAGS_RESET
);
426 vbasedev
->bcontainer
= bcontainer
;
427 QLIST_INSERT_HEAD(&bcontainer
->device_list
, vbasedev
, container_next
);
428 QLIST_INSERT_HEAD(&vfio_device_list
, vbasedev
, global_next
);
430 trace_iommufd_cdev_device_info(vbasedev
->name
, devfd
, vbasedev
->num_irqs
,
431 vbasedev
->num_regions
, vbasedev
->flags
);
434 err_listener_register
:
435 iommufd_cdev_ram_block_discard_disable(false);
437 iommufd_cdev_detach_container(vbasedev
, container
);
438 err_attach_container
:
439 iommufd_cdev_container_destroy(container
);
441 vfio_put_address_space(space
);
442 iommufd_cdev_unbind_and_disconnect(vbasedev
);
448 static void iommufd_cdev_detach(VFIODevice
*vbasedev
)
450 VFIOContainerBase
*bcontainer
= vbasedev
->bcontainer
;
451 VFIOAddressSpace
*space
= bcontainer
->space
;
452 VFIOIOMMUFDContainer
*container
= container_of(bcontainer
,
453 VFIOIOMMUFDContainer
,
455 QLIST_REMOVE(vbasedev
, global_next
);
456 QLIST_REMOVE(vbasedev
, container_next
);
457 vbasedev
->bcontainer
= NULL
;
459 if (!vbasedev
->ram_block_discard_allowed
) {
460 iommufd_cdev_ram_block_discard_disable(false);
463 iommufd_cdev_detach_container(vbasedev
, container
);
464 iommufd_cdev_container_destroy(container
);
465 vfio_put_address_space(space
);
467 iommufd_cdev_unbind_and_disconnect(vbasedev
);
471 const VFIOIOMMUOps vfio_iommufd_ops
= {
472 .dma_map
= iommufd_cdev_map
,
473 .dma_unmap
= iommufd_cdev_unmap
,
474 .attach_device
= iommufd_cdev_attach
,
475 .detach_device
= iommufd_cdev_detach
,