vfio/spapr: Move prereg_listener into spapr container
[qemu/kevin.git] / hw / vfio / spapr.c
blob68c3dd6c75678dcfa901b8e7bc241a56047c0fbe
1 /*
2 * DMA memory preregistration
4 * Authors:
5 * Alexey Kardashevskiy <aik@ozlabs.ru>
7 * This work is licensed under the terms of the GNU GPL, version 2. See
8 * the COPYING file in the top-level directory.
9 */
11 #include "qemu/osdep.h"
12 #include <sys/ioctl.h>
13 #include <linux/vfio.h>
14 #ifdef CONFIG_KVM
15 #include <linux/kvm.h>
16 #endif
17 #include "sysemu/kvm.h"
18 #include "exec/address-spaces.h"
20 #include "hw/vfio/vfio-common.h"
21 #include "hw/hw.h"
22 #include "exec/ram_addr.h"
23 #include "qemu/error-report.h"
24 #include "qapi/error.h"
25 #include "trace.h"
/*
 * sPAPR-specific container state: embeds the generic VFIOContainer and
 * adds the memory listener used to preregister guest RAM with the host
 * VFIO_SPAPR_TCE_v2_IOMMU driver (one listener per container).
 */
typedef struct VFIOSpaprContainer {
    VFIOContainer container;
    MemoryListener prereg_listener;
} VFIOSpaprContainer;
32 static bool vfio_prereg_listener_skipped_section(MemoryRegionSection *section)
34 if (memory_region_is_iommu(section->mr)) {
35 hw_error("Cannot possibly preregister IOMMU memory");
38 return !memory_region_is_ram(section->mr) ||
39 memory_region_is_ram_device(section->mr);
42 static void *vfio_prereg_gpa_to_vaddr(MemoryRegionSection *section, hwaddr gpa)
44 return memory_region_get_ram_ptr(section->mr) +
45 section->offset_within_region +
46 (gpa - section->offset_within_address_space);
49 static void vfio_prereg_listener_region_add(MemoryListener *listener,
50 MemoryRegionSection *section)
52 VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
53 prereg_listener);
54 VFIOContainer *container = &scontainer->container;
55 VFIOContainerBase *bcontainer = &container->bcontainer;
56 const hwaddr gpa = section->offset_within_address_space;
57 hwaddr end;
58 int ret;
59 hwaddr page_mask = qemu_real_host_page_mask();
60 struct vfio_iommu_spapr_register_memory reg = {
61 .argsz = sizeof(reg),
62 .flags = 0,
65 if (vfio_prereg_listener_skipped_section(section)) {
66 trace_vfio_prereg_listener_region_add_skip(
67 section->offset_within_address_space,
68 section->offset_within_address_space +
69 int128_get64(int128_sub(section->size, int128_one())));
70 return;
73 if (unlikely((section->offset_within_address_space & ~page_mask) ||
74 (section->offset_within_region & ~page_mask) ||
75 (int128_get64(section->size) & ~page_mask))) {
76 error_report("%s received unaligned region", __func__);
77 return;
80 end = section->offset_within_address_space + int128_get64(section->size);
81 if (gpa >= end) {
82 return;
85 memory_region_ref(section->mr);
87 reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
88 reg.size = end - gpa;
90 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
91 trace_vfio_prereg_register(reg.vaddr, reg.size, ret ? -errno : 0);
92 if (ret) {
94 * On the initfn path, store the first error in the container so we
95 * can gracefully fail. Runtime, there's not much we can do other
96 * than throw a hardware error.
98 if (!bcontainer->initialized) {
99 if (!bcontainer->error) {
100 error_setg_errno(&bcontainer->error, -ret,
101 "Memory registering failed");
103 } else {
104 hw_error("vfio: Memory registering failed, unable to continue");
109 static void vfio_prereg_listener_region_del(MemoryListener *listener,
110 MemoryRegionSection *section)
112 VFIOSpaprContainer *scontainer = container_of(listener, VFIOSpaprContainer,
113 prereg_listener);
114 VFIOContainer *container = &scontainer->container;
115 const hwaddr gpa = section->offset_within_address_space;
116 hwaddr end;
117 int ret;
118 hwaddr page_mask = qemu_real_host_page_mask();
119 struct vfio_iommu_spapr_register_memory reg = {
120 .argsz = sizeof(reg),
121 .flags = 0,
124 if (vfio_prereg_listener_skipped_section(section)) {
125 trace_vfio_prereg_listener_region_del_skip(
126 section->offset_within_address_space,
127 section->offset_within_address_space +
128 int128_get64(int128_sub(section->size, int128_one())));
129 return;
132 if (unlikely((section->offset_within_address_space & ~page_mask) ||
133 (section->offset_within_region & ~page_mask) ||
134 (int128_get64(section->size) & ~page_mask))) {
135 error_report("%s received unaligned region", __func__);
136 return;
139 end = section->offset_within_address_space + int128_get64(section->size);
140 if (gpa >= end) {
141 return;
144 reg.vaddr = (uintptr_t) vfio_prereg_gpa_to_vaddr(section, gpa);
145 reg.size = end - gpa;
147 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY, &reg);
148 trace_vfio_prereg_unregister(reg.vaddr, reg.size, ret ? -errno : 0);
151 static const MemoryListener vfio_prereg_listener = {
152 .name = "vfio-pre-reg",
153 .region_add = vfio_prereg_listener_region_add,
154 .region_del = vfio_prereg_listener_region_del,
157 static void vfio_host_win_add(VFIOContainer *container, hwaddr min_iova,
158 hwaddr max_iova, uint64_t iova_pgsizes)
160 VFIOHostDMAWindow *hostwin;
162 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
163 if (ranges_overlap(hostwin->min_iova,
164 hostwin->max_iova - hostwin->min_iova + 1,
165 min_iova,
166 max_iova - min_iova + 1)) {
167 hw_error("%s: Overlapped IOMMU are not enabled", __func__);
171 hostwin = g_malloc0(sizeof(*hostwin));
173 hostwin->min_iova = min_iova;
174 hostwin->max_iova = max_iova;
175 hostwin->iova_pgsizes = iova_pgsizes;
176 QLIST_INSERT_HEAD(&container->hostwin_list, hostwin, hostwin_next);
179 static int vfio_host_win_del(VFIOContainer *container,
180 hwaddr min_iova, hwaddr max_iova)
182 VFIOHostDMAWindow *hostwin;
184 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
185 if (hostwin->min_iova == min_iova && hostwin->max_iova == max_iova) {
186 QLIST_REMOVE(hostwin, hostwin_next);
187 g_free(hostwin);
188 return 0;
192 return -1;
195 static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
196 hwaddr iova, hwaddr end)
198 VFIOHostDMAWindow *hostwin;
199 bool hostwin_found = false;
201 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
202 if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
203 hostwin_found = true;
204 break;
208 return hostwin_found ? hostwin : NULL;
211 static int vfio_spapr_remove_window(VFIOContainer *container,
212 hwaddr offset_within_address_space)
214 struct vfio_iommu_spapr_tce_remove remove = {
215 .argsz = sizeof(remove),
216 .start_addr = offset_within_address_space,
218 int ret;
220 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
221 if (ret) {
222 error_report("Failed to remove window at %"PRIx64,
223 (uint64_t)remove.start_addr);
224 return -errno;
227 trace_vfio_spapr_remove_window(offset_within_address_space);
229 return 0;
232 static int vfio_spapr_create_window(VFIOContainer *container,
233 MemoryRegionSection *section,
234 hwaddr *pgsize)
236 int ret = 0;
237 VFIOContainerBase *bcontainer = &container->bcontainer;
238 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
239 uint64_t pagesize = memory_region_iommu_get_min_page_size(iommu_mr), pgmask;
240 unsigned entries, bits_total, bits_per_level, max_levels;
241 struct vfio_iommu_spapr_tce_create create = { .argsz = sizeof(create) };
242 long rampagesize = qemu_minrampagesize();
245 * The host might not support the guest supported IOMMU page size,
246 * so we will use smaller physical IOMMU pages to back them.
248 if (pagesize > rampagesize) {
249 pagesize = rampagesize;
251 pgmask = bcontainer->pgsizes & (pagesize | (pagesize - 1));
252 pagesize = pgmask ? (1ULL << (63 - clz64(pgmask))) : 0;
253 if (!pagesize) {
254 error_report("Host doesn't support page size 0x%"PRIx64
255 ", the supported mask is 0x%lx",
256 memory_region_iommu_get_min_page_size(iommu_mr),
257 bcontainer->pgsizes);
258 return -EINVAL;
262 * FIXME: For VFIO iommu types which have KVM acceleration to
263 * avoid bouncing all map/unmaps through qemu this way, this
264 * would be the right place to wire that up (tell the KVM
265 * device emulation the VFIO iommu handles to use).
267 create.window_size = int128_get64(section->size);
268 create.page_shift = ctz64(pagesize);
270 * SPAPR host supports multilevel TCE tables. We try to guess optimal
271 * levels number and if this fails (for example due to the host memory
272 * fragmentation), we increase levels. The DMA address structure is:
273 * rrrrrrrr rxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx iiiiiiii
274 * where:
275 * r = reserved (bits >= 55 are reserved in the existing hardware)
276 * i = IOMMU page offset (64K in this example)
277 * x = bits to index a TCE which can be split to equal chunks to index
278 * within the level.
279 * The aim is to split "x" to smaller possible number of levels.
281 entries = create.window_size >> create.page_shift;
282 /* bits_total is number of "x" needed */
283 bits_total = ctz64(entries * sizeof(uint64_t));
285 * bits_per_level is a safe guess of how much we can allocate per level:
286 * 8 is the current minimum for CONFIG_FORCE_MAX_ZONEORDER and MAX_ORDER
287 * is usually bigger than that.
288 * Below we look at qemu_real_host_page_size as TCEs are allocated from
289 * system pages.
291 bits_per_level = ctz64(qemu_real_host_page_size()) + 8;
292 create.levels = bits_total / bits_per_level;
293 if (bits_total % bits_per_level) {
294 ++create.levels;
296 max_levels = (64 - create.page_shift) / ctz64(qemu_real_host_page_size());
297 for ( ; create.levels <= max_levels; ++create.levels) {
298 ret = ioctl(container->fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
299 if (!ret) {
300 break;
303 if (ret) {
304 error_report("Failed to create a window, ret = %d (%m)", ret);
305 return -errno;
308 if (create.start_addr != section->offset_within_address_space) {
309 vfio_spapr_remove_window(container, create.start_addr);
311 error_report("Host doesn't support DMA window at %"HWADDR_PRIx", must be %"PRIx64,
312 section->offset_within_address_space,
313 (uint64_t)create.start_addr);
314 return -EINVAL;
316 trace_vfio_spapr_create_window(create.page_shift,
317 create.levels,
318 create.window_size,
319 create.start_addr);
320 *pgsize = pagesize;
322 return 0;
325 static int
326 vfio_spapr_container_add_section_window(VFIOContainerBase *bcontainer,
327 MemoryRegionSection *section,
328 Error **errp)
330 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
331 bcontainer);
332 VFIOHostDMAWindow *hostwin;
333 hwaddr pgsize = 0;
334 int ret;
337 * VFIO_SPAPR_TCE_IOMMU supports a single host window between
338 * [dma32_window_start, dma32_window_size), we need to ensure
339 * the section fall in this range.
341 if (container->iommu_type == VFIO_SPAPR_TCE_IOMMU) {
342 hwaddr iova, end;
344 iova = section->offset_within_address_space;
345 end = iova + int128_get64(section->size) - 1;
347 if (!vfio_find_hostwin(container, iova, end)) {
348 error_setg(errp, "Container %p can't map guest IOVA region"
349 " 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container,
350 iova, end);
351 return -EINVAL;
353 return 0;
356 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
357 return 0;
360 /* For now intersections are not allowed, we may relax this later */
361 QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
362 if (ranges_overlap(hostwin->min_iova,
363 hostwin->max_iova - hostwin->min_iova + 1,
364 section->offset_within_address_space,
365 int128_get64(section->size))) {
366 error_setg(errp,
367 "region [0x%"PRIx64",0x%"PRIx64"] overlaps with existing"
368 "host DMA window [0x%"PRIx64",0x%"PRIx64"]",
369 section->offset_within_address_space,
370 section->offset_within_address_space +
371 int128_get64(section->size) - 1,
372 hostwin->min_iova, hostwin->max_iova);
373 return -EINVAL;
377 ret = vfio_spapr_create_window(container, section, &pgsize);
378 if (ret) {
379 error_setg_errno(errp, -ret, "Failed to create SPAPR window");
380 return ret;
383 vfio_host_win_add(container, section->offset_within_address_space,
384 section->offset_within_address_space +
385 int128_get64(section->size) - 1, pgsize);
386 #ifdef CONFIG_KVM
387 if (kvm_enabled()) {
388 VFIOGroup *group;
389 IOMMUMemoryRegion *iommu_mr = IOMMU_MEMORY_REGION(section->mr);
390 struct kvm_vfio_spapr_tce param;
391 struct kvm_device_attr attr = {
392 .group = KVM_DEV_VFIO_GROUP,
393 .attr = KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE,
394 .addr = (uint64_t)(unsigned long)&param,
397 if (!memory_region_iommu_get_attr(iommu_mr, IOMMU_ATTR_SPAPR_TCE_FD,
398 &param.tablefd)) {
399 QLIST_FOREACH(group, &container->group_list, container_next) {
400 param.groupfd = group->fd;
401 if (ioctl(vfio_kvm_device_fd, KVM_SET_DEVICE_ATTR, &attr)) {
402 error_setg_errno(errp, errno,
403 "vfio: failed GROUP_SET_SPAPR_TCE for "
404 "KVM VFIO device %d and group fd %d",
405 param.tablefd, param.groupfd);
406 return -errno;
408 trace_vfio_spapr_group_attach(param.groupfd, param.tablefd);
412 #endif
413 return 0;
416 static void
417 vfio_spapr_container_del_section_window(VFIOContainerBase *bcontainer,
418 MemoryRegionSection *section)
420 VFIOContainer *container = container_of(bcontainer, VFIOContainer,
421 bcontainer);
423 if (container->iommu_type != VFIO_SPAPR_TCE_v2_IOMMU) {
424 return;
427 vfio_spapr_remove_window(container,
428 section->offset_within_address_space);
429 if (vfio_host_win_del(container,
430 section->offset_within_address_space,
431 section->offset_within_address_space +
432 int128_get64(section->size) - 1) < 0) {
433 hw_error("%s: Cannot delete missing window at %"HWADDR_PRIx,
434 __func__, section->offset_within_address_space);
438 static VFIOIOMMUOps vfio_iommu_spapr_ops;
440 static void setup_spapr_ops(VFIOContainerBase *bcontainer)
442 vfio_iommu_spapr_ops = *bcontainer->ops;
443 vfio_iommu_spapr_ops.add_window = vfio_spapr_container_add_section_window;
444 vfio_iommu_spapr_ops.del_window = vfio_spapr_container_del_section_window;
445 bcontainer->ops = &vfio_iommu_spapr_ops;
448 int vfio_spapr_container_init(VFIOContainer *container, Error **errp)
450 VFIOContainerBase *bcontainer = &container->bcontainer;
451 VFIOSpaprContainer *scontainer = container_of(container, VFIOSpaprContainer,
452 container);
453 struct vfio_iommu_spapr_tce_info info;
454 bool v2 = container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU;
455 int ret, fd = container->fd;
457 QLIST_INIT(&container->hostwin_list);
460 * The host kernel code implementing VFIO_IOMMU_DISABLE is called
461 * when container fd is closed so we do not call it explicitly
462 * in this file.
464 if (!v2) {
465 ret = ioctl(fd, VFIO_IOMMU_ENABLE);
466 if (ret) {
467 error_setg_errno(errp, errno, "failed to enable container");
468 return -errno;
470 } else {
471 scontainer->prereg_listener = vfio_prereg_listener;
473 memory_listener_register(&scontainer->prereg_listener,
474 &address_space_memory);
475 if (bcontainer->error) {
476 ret = -1;
477 error_propagate_prepend(errp, bcontainer->error,
478 "RAM memory listener initialization failed: ");
479 goto listener_unregister_exit;
483 info.argsz = sizeof(info);
484 ret = ioctl(fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
485 if (ret) {
486 error_setg_errno(errp, errno,
487 "VFIO_IOMMU_SPAPR_TCE_GET_INFO failed");
488 ret = -errno;
489 goto listener_unregister_exit;
492 if (v2) {
493 bcontainer->pgsizes = info.ddw.pgsizes;
495 * There is a default window in just created container.
496 * To make region_add/del simpler, we better remove this
497 * window now and let those iommu_listener callbacks
498 * create/remove them when needed.
500 ret = vfio_spapr_remove_window(container, info.dma32_window_start);
501 if (ret) {
502 error_setg_errno(errp, -ret,
503 "failed to remove existing window");
504 goto listener_unregister_exit;
506 } else {
507 /* The default table uses 4K pages */
508 bcontainer->pgsizes = 0x1000;
509 vfio_host_win_add(container, info.dma32_window_start,
510 info.dma32_window_start +
511 info.dma32_window_size - 1,
512 0x1000);
515 setup_spapr_ops(bcontainer);
517 return 0;
519 listener_unregister_exit:
520 if (v2) {
521 memory_listener_unregister(&scontainer->prereg_listener);
523 return ret;
526 void vfio_spapr_container_deinit(VFIOContainer *container)
528 VFIOHostDMAWindow *hostwin, *next;
530 if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
531 VFIOSpaprContainer *scontainer = container_of(container,
532 VFIOSpaprContainer,
533 container);
534 memory_listener_unregister(&scontainer->prereg_listener);
536 QLIST_FOREACH_SAFE(hostwin, &container->hostwin_list, hostwin_next,
537 next) {
538 QLIST_REMOVE(hostwin, hostwin_next);
539 g_free(hostwin);