/*
 * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
 *     Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */
#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/file.h>
#include <linux/anon_inodes.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"
static struct vfio {
	struct class			*class;
	struct list_head		iommu_drivers_list;
	struct mutex			iommu_drivers_lock;
	struct list_head		group_list;
	struct idr			group_idr;
	struct mutex			group_lock;
	struct cdev			group_cdev;
	struct device			*dev;
	dev_t				devt;
	struct cdev			cdev;
	wait_queue_head_t		release_q;
} vfio;

struct vfio_iommu_driver {
	const struct vfio_iommu_driver_ops	*ops;
	struct list_head			vfio_next;
};

struct vfio_container {
	struct kref			kref;
	struct list_head		group_list;
	struct rw_semaphore		group_lock;
	struct vfio_iommu_driver	*iommu_driver;
	void				*iommu_data;
};

struct vfio_group {
	struct kref			kref;
	int				minor;
	atomic_t			container_users;
	struct iommu_group		*iommu_group;
	struct vfio_container		*container;
	struct list_head		device_list;
	struct mutex			device_lock;
	struct notifier_block		nb;
	struct list_head		vfio_next;
	struct list_head		container_next;
	atomic_t			opened;
};

struct vfio_device {
	struct kref			kref;
	struct device			*dev;
	const struct vfio_device_ops	*ops;
	struct vfio_group		*group;
	struct list_head		group_next;
	void				*device_data;
};
/**
 * IOMMU driver registration
 */
int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver, *tmp;

	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
	if (!driver)
		return -ENOMEM;

	driver->ops = ops;

	mutex_lock(&vfio.iommu_drivers_lock);

	/* Check for duplicates */
	list_for_each_entry(tmp, &vfio.iommu_drivers_list, vfio_next) {
		if (tmp->ops == ops) {
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return -EINVAL;
		}
	}

	list_add(&driver->vfio_next, &vfio.iommu_drivers_list);

	mutex_unlock(&vfio.iommu_drivers_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_register_iommu_driver);
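/*
 * Usage sketch: an IOMMU backend (vfio_iommu_type1, for example) registers
 * a vfio_iommu_driver_ops from its module init and unregisters it on exit.
 * The "example" helper names below are hypothetical; the ops field names
 * are assumed to follow include/linux/vfio.h.
 *
 *	static const struct vfio_iommu_driver_ops example_iommu_driver_ops = {
 *		.name		= "example",
 *		.owner		= THIS_MODULE,
 *		.open		= example_iommu_open,
 *		.release	= example_iommu_release,
 *		.ioctl		= example_iommu_ioctl,
 *		.attach_group	= example_iommu_attach_group,
 *		.detach_group	= example_iommu_detach_group,
 *	};
 *
 *	static int __init example_iommu_init(void)
 *	{
 *		return vfio_register_iommu_driver(&example_iommu_driver_ops);
 *	}
 *
 *	static void __exit example_iommu_exit(void)
 *	{
 *		vfio_unregister_iommu_driver(&example_iommu_driver_ops);
 *	}
 */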
void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
{
	struct vfio_iommu_driver *driver;

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		if (driver->ops == ops) {
			list_del(&driver->vfio_next);
			mutex_unlock(&vfio.iommu_drivers_lock);
			kfree(driver);
			return;
		}
	}
	mutex_unlock(&vfio.iommu_drivers_lock);
}
EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
/**
 * Group minor allocation/free - both called with vfio.group_lock held
 */
static int vfio_alloc_group_minor(struct vfio_group *group)
{
	/* index 0 is used by /dev/vfio/vfio */
	return idr_alloc(&vfio.group_idr, group, 1, MINORMASK + 1, GFP_KERNEL);
}

static void vfio_free_group_minor(int minor)
{
	idr_remove(&vfio.group_idr, minor);
}

static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data);
static void vfio_group_get(struct vfio_group *group);
/**
 * Container objects - containers are created when /dev/vfio/vfio is
 * opened, but their lifecycle extends until the last user is done, so
 * it's freed via kref.  Must support container/group/device being
 * closed in any order.
 */
static void vfio_container_get(struct vfio_container *container)
{
	kref_get(&container->kref);
}

static void vfio_container_release(struct kref *kref)
{
	struct vfio_container *container;
	container = container_of(kref, struct vfio_container, kref);

	kfree(container);
}

static void vfio_container_put(struct vfio_container *container)
{
	kref_put(&container->kref, vfio_container_release);
}

static void vfio_group_unlock_and_free(struct vfio_group *group)
{
	mutex_unlock(&vfio.group_lock);
	/*
	 * Unregister outside of lock.  A spurious callback is harmless now
	 * that the group is no longer in vfio.group_list.
	 */
	iommu_group_unregister_notifier(group->iommu_group, &group->nb);
	kfree(group);
}
/**
 * Group objects - create, release, get, put, search
 */
static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group)
{
	struct vfio_group *group, *tmp;
	struct device *dev;
	int ret, minor;

	group = kzalloc(sizeof(*group), GFP_KERNEL);
	if (!group)
		return ERR_PTR(-ENOMEM);

	kref_init(&group->kref);
	INIT_LIST_HEAD(&group->device_list);
	mutex_init(&group->device_lock);
	atomic_set(&group->container_users, 0);
	atomic_set(&group->opened, 0);
	group->iommu_group = iommu_group;

	group->nb.notifier_call = vfio_iommu_group_notifier;

	/*
	 * blocking notifiers acquire a rwsem around registering and hold
	 * it around callback.  Therefore, need to register outside of
	 * vfio.group_lock to avoid A-B/B-A contention.  Our callback won't
	 * do anything unless it can find the group in vfio.group_list, so
	 * no harm in registering early.
	 */
	ret = iommu_group_register_notifier(iommu_group, &group->nb);
	if (ret) {
		kfree(group);
		return ERR_PTR(ret);
	}

	mutex_lock(&vfio.group_lock);

	minor = vfio_alloc_group_minor(group);
	if (minor < 0) {
		vfio_group_unlock_and_free(group);
		return ERR_PTR(minor);
	}

	/* Did we race creating this group? */
	list_for_each_entry(tmp, &vfio.group_list, vfio_next) {
		if (tmp->iommu_group == iommu_group) {
			vfio_group_get(tmp);
			vfio_free_group_minor(minor);
			vfio_group_unlock_and_free(group);
			return tmp;
		}
	}

	dev = device_create(vfio.class, NULL, MKDEV(MAJOR(vfio.devt), minor),
			    group, "%d", iommu_group_id(iommu_group));
	if (IS_ERR(dev)) {
		vfio_free_group_minor(minor);
		vfio_group_unlock_and_free(group);
		return (struct vfio_group *)dev; /* ERR_PTR */
	}

	group->minor = minor;

	list_add(&group->vfio_next, &vfio.group_list);

	mutex_unlock(&vfio.group_lock);

	return group;
}
/* called with vfio.group_lock held */
static void vfio_group_release(struct kref *kref)
{
	struct vfio_group *group = container_of(kref, struct vfio_group, kref);

	WARN_ON(!list_empty(&group->device_list));

	device_destroy(vfio.class, MKDEV(MAJOR(vfio.devt), group->minor));
	list_del(&group->vfio_next);
	vfio_free_group_minor(group->minor);
	vfio_group_unlock_and_free(group);
}

static void vfio_group_put(struct vfio_group *group)
{
	kref_put_mutex(&group->kref, vfio_group_release, &vfio.group_lock);
}

/* Assume group_lock or group reference is held */
static void vfio_group_get(struct vfio_group *group)
{
	kref_get(&group->kref);
}
/*
 * Not really a try as we will sleep for mutex, but we need to make
 * sure the group pointer is valid under lock and get a reference.
 */
static struct vfio_group *vfio_group_try_get(struct vfio_group *group)
{
	struct vfio_group *target = group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group == target) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}
static
struct vfio_group *vfio_group_get_from_iommu(struct iommu_group *iommu_group)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	list_for_each_entry(group, &vfio.group_list, vfio_next) {
		if (group->iommu_group == iommu_group) {
			vfio_group_get(group);
			mutex_unlock(&vfio.group_lock);
			return group;
		}
	}
	mutex_unlock(&vfio.group_lock);

	return NULL;
}

static struct vfio_group *vfio_group_get_from_minor(int minor)
{
	struct vfio_group *group;

	mutex_lock(&vfio.group_lock);
	group = idr_find(&vfio.group_idr, minor);
	if (!group) {
		mutex_unlock(&vfio.group_lock);
		return NULL;
	}
	vfio_group_get(group);
	mutex_unlock(&vfio.group_lock);

	return group;
}
/**
 * Device objects - create, release, get, put, search
 */
static
struct vfio_device *vfio_group_create_device(struct vfio_group *group,
					     struct device *dev,
					     const struct vfio_device_ops *ops,
					     void *device_data)
{
	struct vfio_device *device;
	int ret;

	device = kzalloc(sizeof(*device), GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	kref_init(&device->kref);
	device->dev = dev;
	device->group = group;
	device->ops = ops;
	device->device_data = device_data;

	ret = dev_set_drvdata(dev, device);
	if (ret) {
		kfree(device);
		return ERR_PTR(ret);
	}

	/* No need to get group_lock, caller has group reference */
	vfio_group_get(group);

	mutex_lock(&group->device_lock);
	list_add(&device->group_next, &group->device_list);
	mutex_unlock(&group->device_lock);

	return device;
}
static void vfio_device_release(struct kref *kref)
{
	struct vfio_device *device = container_of(kref,
						  struct vfio_device, kref);
	struct vfio_group *group = device->group;

	list_del(&device->group_next);
	mutex_unlock(&group->device_lock);

	dev_set_drvdata(device->dev, NULL);

	kfree(device);

	/* vfio_del_group_dev may be waiting for this device */
	wake_up(&vfio.release_q);
}

/* Device reference always implies a group reference */
void vfio_device_put(struct vfio_device *device)
{
	struct vfio_group *group = device->group;
	kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
	vfio_group_put(group);
}
EXPORT_SYMBOL_GPL(vfio_device_put);

static void vfio_device_get(struct vfio_device *device)
{
	vfio_group_get(device->group);
	kref_get(&device->kref);
}
static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
						 struct device *dev)
{
	struct vfio_device *device;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (device->dev == dev) {
			vfio_device_get(device);
			mutex_unlock(&group->device_lock);
			return device;
		}
	}
	mutex_unlock(&group->device_lock);
	return NULL;
}
/*
 * Whitelist some drivers that we know are safe (no dma) or just sit on
 * a device.  It's not always practical to leave a device within a group
 * driverless as it could get re-bound to something unsafe.
 */
static const char * const vfio_driver_whitelist[] = { "pci-stub", "pcieport" };

static bool vfio_whitelisted_driver(struct device_driver *drv)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(vfio_driver_whitelist); i++) {
		if (!strcmp(drv->name, vfio_driver_whitelist[i]))
			return true;
	}

	return false;
}

/*
 * A vfio group is viable for use by userspace if all devices are either
 * driver-less or bound to a vfio or whitelisted driver.  We test the
 * latter by the existence of a struct vfio_device matching the dev.
 */
static int vfio_dev_viable(struct device *dev, void *data)
{
	struct vfio_group *group = data;
	struct vfio_device *device;
	struct device_driver *drv = ACCESS_ONCE(dev->driver);

	if (!drv || vfio_whitelisted_driver(drv))
		return 0;

	device = vfio_group_get_device(group, dev);
	if (device) {
		vfio_device_put(device);
		return 0;
	}

	return -EINVAL;
}
/**
 * Async device support
 */
static int vfio_group_nb_add_dev(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	/* Do we already know about it?  We shouldn't */
	device = vfio_group_get_device(group, dev);
	if (WARN_ON_ONCE(device)) {
		vfio_device_put(device);
		return 0;
	}

	/* Nothing to do for idle groups */
	if (!atomic_read(&group->container_users))
		return 0;

	/* TODO Prevent device auto probing */
	WARN("Device %s added to live group %d!\n", dev_name(dev),
	     iommu_group_id(group->iommu_group));

	return 0;
}

static int vfio_group_nb_verify(struct vfio_group *group, struct device *dev)
{
	/* We don't care what happens when the group isn't in use */
	if (!atomic_read(&group->container_users))
		return 0;

	return vfio_dev_viable(dev, group);
}
static int vfio_iommu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct vfio_group *group = container_of(nb, struct vfio_group, nb);
	struct device *dev = data;

	/*
	 * Need to go through a group_lock lookup to get a reference or we
	 * risk racing a group being removed.  Ignore spurious notifies.
	 */
	group = vfio_group_try_get(group);
	if (!group)
		return NOTIFY_OK;

	switch (action) {
	case IOMMU_GROUP_NOTIFY_ADD_DEVICE:
		vfio_group_nb_add_dev(group, dev);
		break;
	case IOMMU_GROUP_NOTIFY_DEL_DEVICE:
		/*
		 * Nothing to do here.  If the device is in use, then the
		 * vfio sub-driver should block the remove callback until
		 * it is unused.  If the device is unused or attached to a
		 * stub driver, then it should be released and we don't
		 * care that it will be going away.
		 */
		break;
	case IOMMU_GROUP_NOTIFY_BIND_DRIVER:
		pr_debug("%s: Device %s, group %d binding to driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		break;
	case IOMMU_GROUP_NOTIFY_BOUND_DRIVER:
		pr_debug("%s: Device %s, group %d bound to driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		BUG_ON(vfio_group_nb_verify(group, dev));
		break;
	case IOMMU_GROUP_NOTIFY_UNBIND_DRIVER:
		pr_debug("%s: Device %s, group %d unbinding from driver %s\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group), dev->driver->name);
		break;
	case IOMMU_GROUP_NOTIFY_UNBOUND_DRIVER:
		pr_debug("%s: Device %s, group %d unbound from driver\n",
			 __func__, dev_name(dev),
			 iommu_group_id(group->iommu_group));
		/*
		 * XXX An unbound device in a live group is ok, but we'd
		 * really like to avoid the above BUG_ON by preventing other
		 * drivers from binding to it.  Once that occurs, we have to
		 * stop the system to maintain isolation.  At a minimum, we'd
		 * want a toggle to disable driver auto probe for this device.
		 */
		break;
	}

	vfio_group_put(group);
	return NOTIFY_OK;
}
int vfio_add_group_dev(struct device *dev,
		       const struct vfio_device_ops *ops, void *device_data)
{
	struct iommu_group *iommu_group;
	struct vfio_group *group;
	struct vfio_device *device;

	iommu_group = iommu_group_get(dev);
	if (!iommu_group)
		return -EINVAL;

	group = vfio_group_get_from_iommu(iommu_group);
	if (!group) {
		group = vfio_create_group(iommu_group);
		if (IS_ERR(group)) {
			iommu_group_put(iommu_group);
			return PTR_ERR(group);
		}
	}

	device = vfio_group_get_device(group, dev);
	if (device) {
		WARN(1, "Device %s already exists on group %d\n",
		     dev_name(dev), iommu_group_id(iommu_group));
		vfio_device_put(device);
		vfio_group_put(group);
		iommu_group_put(iommu_group);
		return -EBUSY;
	}

	device = vfio_group_create_device(group, dev, ops, device_data);
	if (IS_ERR(device)) {
		vfio_group_put(group);
		iommu_group_put(iommu_group);
		return PTR_ERR(device);
	}

	/*
	 * Added device holds reference to iommu_group and vfio_device
	 * (which in turn holds reference to vfio_group).  Drop extra
	 * group reference used while acquiring device.
	 */
	vfio_group_put(group);

	return 0;
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
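/*
 * Usage sketch: a vfio bus driver (vfio-pci style) typically adds the
 * device from its probe callback and removes it on remove, reclaiming its
 * device_data pointer.  Apart from vfio_add_group_dev/vfio_del_group_dev,
 * the names below are hypothetical.
 *
 *	static int example_probe(struct pci_dev *pdev,
 *				 const struct pci_device_id *id)
 *	{
 *		struct example_device *vdev;
 *		int ret;
 *
 *		vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
 *		if (!vdev)
 *			return -ENOMEM;
 *
 *		vdev->pdev = pdev;
 *		ret = vfio_add_group_dev(&pdev->dev, &example_vfio_ops, vdev);
 *		if (ret)
 *			kfree(vdev);
 *		return ret;
 *	}
 *
 *	static void example_remove(struct pci_dev *pdev)
 *	{
 *		struct example_device *vdev = vfio_del_group_dev(&pdev->dev);
 *
 *		kfree(vdev);
 *	}
 */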
/**
 * Get a reference to the vfio_device for a device that is known to
 * be bound to a vfio driver.  The driver implicitly holds a
 * vfio_device reference between vfio_add_group_dev and
 * vfio_del_group_dev.  We can therefore use drvdata to increment
 * that reference from the struct device.  This additional
 * reference must be released by calling vfio_device_put.
 */
struct vfio_device *vfio_device_get_from_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);

	vfio_device_get(device);

	return device;
}
EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);

/*
 * Caller must hold a reference to the vfio_device
 */
void *vfio_device_data(struct vfio_device *device)
{
	return device->device_data;
}
EXPORT_SYMBOL_GPL(vfio_device_data);

/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
	struct vfio_device *device;

	device = vfio_group_get_device(group, dev);
	if (!device)
		return false;

	vfio_device_put(device);
	return true;
}
/*
 * Decrement the device reference count and wait for the device to be
 * removed.  Open file descriptors for the device... */
void *vfio_del_group_dev(struct device *dev)
{
	struct vfio_device *device = dev_get_drvdata(dev);
	struct vfio_group *group = device->group;
	struct iommu_group *iommu_group = group->iommu_group;
	void *device_data = device->device_data;

	/*
	 * The group exists so long as we have a device reference.  Get
	 * a group reference and use it to scan for the device going away.
	 */
	vfio_group_get(group);

	vfio_device_put(device);

	/* TODO send a signal to encourage this to be released */
	wait_event(vfio.release_q, !vfio_dev_present(group, dev));

	vfio_group_put(group);

	iommu_group_put(iommu_group);

	return device_data;
}
EXPORT_SYMBOL_GPL(vfio_del_group_dev);
/**
 * VFIO base fd, /dev/vfio/vfio
 */
static long vfio_ioctl_check_extension(struct vfio_container *container,
				       unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = 0;

	down_read(&container->group_lock);

	driver = container->iommu_driver;

	switch (arg) {
		/* No base extensions yet */
	default:
		/*
		 * If no driver is set, poll all registered drivers for
		 * extensions and return the first positive result.  If
		 * a driver is already set, further queries will be passed
		 * only to that driver.
		 */
		if (!driver) {
			mutex_lock(&vfio.iommu_drivers_lock);
			list_for_each_entry(driver, &vfio.iommu_drivers_list,
					    vfio_next) {
				if (!try_module_get(driver->ops->owner))
					continue;

				ret = driver->ops->ioctl(NULL,
							 VFIO_CHECK_EXTENSION,
							 arg);
				module_put(driver->ops->owner);
				if (ret > 0)
					break;
			}
			mutex_unlock(&vfio.iommu_drivers_lock);
		} else
			ret = driver->ops->ioctl(container->iommu_data,
						 VFIO_CHECK_EXTENSION, arg);
	}

	up_read(&container->group_lock);

	return ret;
}
/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
					  struct vfio_iommu_driver *driver,
					  void *data)
{
	struct vfio_group *group;
	int ret = -ENODEV;

	list_for_each_entry(group, &container->group_list, container_next) {
		ret = driver->ops->attach_group(data, group->iommu_group);
		if (ret)
			goto unwind;
	}

	return ret;

unwind:
	list_for_each_entry_continue_reverse(group, &container->group_list,
					     container_next) {
		driver->ops->detach_group(data, group->iommu_group);
	}

	return ret;
}
static long vfio_ioctl_set_iommu(struct vfio_container *container,
				 unsigned long arg)
{
	struct vfio_iommu_driver *driver;
	long ret = -ENODEV;

	down_write(&container->group_lock);

	/*
	 * The container is designed to be an unprivileged interface while
	 * the group can be assigned to specific users.  Therefore, only by
	 * adding a group to a container does the user get the privilege of
	 * enabling the iommu, which may allocate finite resources.  There
	 * is no unset_iommu, but by removing all the groups from a container,
	 * the container is deprivileged and returns to an unset state.
	 */
	if (list_empty(&container->group_list) || container->iommu_driver) {
		up_write(&container->group_lock);
		return -EINVAL;
	}

	mutex_lock(&vfio.iommu_drivers_lock);
	list_for_each_entry(driver, &vfio.iommu_drivers_list, vfio_next) {
		void *data;

		if (!try_module_get(driver->ops->owner))
			continue;

		/*
		 * The arg magic for SET_IOMMU is the same as CHECK_EXTENSION,
		 * so test which iommu driver reported support for this
		 * extension and call open on them.  We also pass them the
		 * magic, allowing a single driver to support multiple
		 * interfaces if they'd like.
		 */
		if (driver->ops->ioctl(NULL, VFIO_CHECK_EXTENSION, arg) <= 0) {
			module_put(driver->ops->owner);
			continue;
		}

		/* module reference holds the driver we're working on */
		mutex_unlock(&vfio.iommu_drivers_lock);

		data = driver->ops->open(arg);
		if (IS_ERR(data)) {
			ret = PTR_ERR(data);
			module_put(driver->ops->owner);
			goto skip_drivers_unlock;
		}

		ret = __vfio_container_attach_groups(container, driver, data);
		if (!ret) {
			container->iommu_driver = driver;
			container->iommu_data = data;
		} else {
			driver->ops->release(data);
			module_put(driver->ops->owner);
		}

		goto skip_drivers_unlock;
	}

	mutex_unlock(&vfio.iommu_drivers_lock);

skip_drivers_unlock:
	up_write(&container->group_lock);

	return ret;
}
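/*
 * Usage sketch (userspace): setting an IOMMU only succeeds once at least
 * one group has been attached to the container.  A minimal sequence,
 * assuming the type1 backend and a group numbered 26, looks roughly like:
 *
 *	container = open("/dev/vfio/vfio", O_RDWR);
 *	if (ioctl(container, VFIO_GET_API_VERSION) != VFIO_API_VERSION)
 *		return -1;	// unknown API version
 *	if (!ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
 *		return -1;	// type1 backend not available
 *
 *	group = open("/dev/vfio/26", O_RDWR);
 *	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
 *	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
 */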
static long vfio_fops_unl_ioctl(struct file *filep,
				unsigned int cmd, unsigned long arg)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	void *data;
	long ret = -EINVAL;

	if (!container)
		return ret;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		ret = VFIO_API_VERSION;
		break;
	case VFIO_CHECK_EXTENSION:
		ret = vfio_ioctl_check_extension(container, arg);
		break;
	case VFIO_SET_IOMMU:
		ret = vfio_ioctl_set_iommu(container, arg);
		break;
	default:
		down_read(&container->group_lock);

		driver = container->iommu_driver;
		data = container->iommu_data;

		if (driver) /* passthrough all unrecognized ioctls */
			ret = driver->ops->ioctl(data, cmd, arg);

		up_read(&container->group_lock);
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_fops_compat_ioctl(struct file *filep,
				   unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_container *container;

	container = kzalloc(sizeof(*container), GFP_KERNEL);
	if (!container)
		return -ENOMEM;

	INIT_LIST_HEAD(&container->group_list);
	init_rwsem(&container->group_lock);
	kref_init(&container->kref);

	filep->private_data = container;

	return 0;
}

static int vfio_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_container *container = filep->private_data;

	filep->private_data = NULL;

	vfio_container_put(container);

	return 0;
}
/*
 * Once an iommu driver is set, we optionally pass read/write/mmap
 * on to the driver, allowing management interfaces beyond ioctl.
 */
static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
			      size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->read))
		ret = driver->ops->read(container->iommu_data,
					buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
			       size_t count, loff_t *ppos)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	ssize_t ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->write))
		ret = driver->ops->write(container->iommu_data,
					 buf, count, ppos);

	up_read(&container->group_lock);

	return ret;
}

static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_container *container = filep->private_data;
	struct vfio_iommu_driver *driver;
	int ret = -EINVAL;

	down_read(&container->group_lock);

	driver = container->iommu_driver;
	if (likely(driver && driver->ops->mmap))
		ret = driver->ops->mmap(container->iommu_data, vma);

	up_read(&container->group_lock);

	return ret;
}
static const struct file_operations vfio_fops = {
	.owner		= THIS_MODULE,
	.open		= vfio_fops_open,
	.release	= vfio_fops_release,
	.read		= vfio_fops_read,
	.write		= vfio_fops_write,
	.unlocked_ioctl	= vfio_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_fops_compat_ioctl,
#endif
	.mmap		= vfio_fops_mmap,
};
/**
 * VFIO Group fd, /dev/vfio/$GROUP
 */
static void __vfio_group_unset_container(struct vfio_group *group)
{
	struct vfio_container *container = group->container;
	struct vfio_iommu_driver *driver;

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver)
		driver->ops->detach_group(container->iommu_data,
					  group->iommu_group);

	group->container = NULL;
	list_del(&group->container_next);

	/* Detaching the last group deprivileges a container, remove iommu */
	if (driver && list_empty(&container->group_list)) {
		driver->ops->release(container->iommu_data);
		module_put(driver->ops->owner);
		container->iommu_driver = NULL;
		container->iommu_data = NULL;
	}

	up_write(&container->group_lock);

	vfio_container_put(container);
}

/*
 * VFIO_GROUP_UNSET_CONTAINER should fail if there are other users or
 * if there was no container to unset.  Since the ioctl is called on
 * the group, we know that still exists, therefore the only valid
 * transition here is 1->0.
 */
static int vfio_group_unset_container(struct vfio_group *group)
{
	int users = atomic_cmpxchg(&group->container_users, 1, 0);

	if (!users)
		return -EINVAL;
	if (users != 1)
		return -EBUSY;

	__vfio_group_unset_container(group);

	return 0;
}

/*
 * When removing container users, anything that removes the last user
 * implicitly removes the group from the container.  That is, if the
 * group file descriptor is closed, as well as any device file descriptors,
 * the group is free.
 */
static void vfio_group_try_dissolve_container(struct vfio_group *group)
{
	if (0 == atomic_dec_if_positive(&group->container_users))
		__vfio_group_unset_container(group);
}
static int vfio_group_set_container(struct vfio_group *group, int container_fd)
{
	struct fd f;
	struct vfio_container *container;
	struct vfio_iommu_driver *driver;
	int ret = 0;

	if (atomic_read(&group->container_users))
		return -EINVAL;

	f = fdget(container_fd);
	if (!f.file)
		return -EBADF;

	/* Sanity check, is this really our fd? */
	if (f.file->f_op != &vfio_fops) {
		fdput(f);
		return -EINVAL;
	}

	container = f.file->private_data;
	WARN_ON(!container); /* fget ensures we don't race vfio_release */

	down_write(&container->group_lock);

	driver = container->iommu_driver;
	if (driver) {
		ret = driver->ops->attach_group(container->iommu_data,
						group->iommu_group);
		if (ret)
			goto unlock_out;
	}

	group->container = container;
	list_add(&group->container_next, &container->group_list);

	/* Get a reference on the container and mark a user within the group */
	vfio_container_get(container);
	atomic_inc(&group->container_users);

unlock_out:
	up_write(&container->group_lock);
	fdput(f);
	return ret;
}
static bool vfio_group_viable(struct vfio_group *group)
{
	return (iommu_group_for_each_dev(group->iommu_group,
					 group, vfio_dev_viable) == 0);
}

static const struct file_operations vfio_device_fops;
static int vfio_group_get_device_fd(struct vfio_group *group, char *buf)
{
	struct vfio_device *device;
	struct file *filep;
	int ret = -ENODEV;

	if (0 == atomic_read(&group->container_users) ||
	    !group->container->iommu_driver || !vfio_group_viable(group))
		return -EINVAL;

	mutex_lock(&group->device_lock);
	list_for_each_entry(device, &group->device_list, group_next) {
		if (strcmp(dev_name(device->dev), buf))
			continue;

		ret = device->ops->open(device->device_data);
		if (ret)
			break;

		/*
		 * We can't use anon_inode_getfd() because we need to modify
		 * the f_mode flags directly to allow more than just ioctls
		 */
		ret = get_unused_fd_flags(O_CLOEXEC);
		if (ret < 0) {
			device->ops->release(device->device_data);
			break;
		}

		filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
					   device, O_RDWR);
		if (IS_ERR(filep)) {
			put_unused_fd(ret);
			ret = PTR_ERR(filep);
			device->ops->release(device->device_data);
			break;
		}

		/*
		 * TODO: add an anon_inode interface to do this.
		 * Appears to be missing by lack of need rather than
		 * explicitly prevented.  Now there's need.
		 */
		filep->f_mode |= (FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);

		vfio_device_get(device);
		atomic_inc(&group->container_users);

		fd_install(ret, filep);
		break;
	}
	mutex_unlock(&group->device_lock);

	return ret;
}
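/*
 * Usage sketch (userspace): once the group is viable and attached to an
 * iommu-enabled container, a device fd is requested by device name.  The
 * PCI address below is only an example.
 *
 *	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
 *	ioctl(device, VFIO_DEVICE_GET_INFO, &device_info);
 *
 * read/write/mmap and further ioctls on the returned fd are forwarded to
 * the vfio bus driver through the vfio_device_fops defined below.
 */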
static long vfio_group_fops_unl_ioctl(struct file *filep,
				      unsigned int cmd, unsigned long arg)
{
	struct vfio_group *group = filep->private_data;
	long ret = -ENOTTY;

	switch (cmd) {
	case VFIO_GROUP_GET_STATUS:
	{
		struct vfio_group_status status;
		unsigned long minsz;

		minsz = offsetofend(struct vfio_group_status, flags);

		if (copy_from_user(&status, (void __user *)arg, minsz))
			return -EFAULT;

		if (status.argsz < minsz)
			return -EINVAL;

		status.flags = 0;

		if (vfio_group_viable(group))
			status.flags |= VFIO_GROUP_FLAGS_VIABLE;

		if (group->container)
			status.flags |= VFIO_GROUP_FLAGS_CONTAINER_SET;

		if (copy_to_user((void __user *)arg, &status, minsz))
			return -EFAULT;

		ret = 0;
		break;
	}
	case VFIO_GROUP_SET_CONTAINER:
	{
		int fd;

		if (get_user(fd, (int __user *)arg))
			return -EFAULT;

		if (fd < 0)
			return -EINVAL;

		ret = vfio_group_set_container(group, fd);
		break;
	}
	case VFIO_GROUP_UNSET_CONTAINER:
		ret = vfio_group_unset_container(group);
		break;
	case VFIO_GROUP_GET_DEVICE_FD:
	{
		char *buf;

		buf = strndup_user((const char __user *)arg, PAGE_SIZE);
		if (IS_ERR(buf))
			return PTR_ERR(buf);

		ret = vfio_group_get_device_fd(group, buf);
		kfree(buf);
		break;
	}
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static long vfio_group_fops_compat_ioctl(struct file *filep,
					 unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_group_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */
static int vfio_group_fops_open(struct inode *inode, struct file *filep)
{
	struct vfio_group *group;
	int opened;

	group = vfio_group_get_from_minor(iminor(inode));
	if (!group)
		return -ENODEV;

	/* Do we need multiple instances of the group open?  Seems not. */
	opened = atomic_cmpxchg(&group->opened, 0, 1);
	if (opened) {
		vfio_group_put(group);
		return -EBUSY;
	}

	/* Is something still in use from a previous open? */
	if (group->container) {
		atomic_dec(&group->opened);
		vfio_group_put(group);
		return -EBUSY;
	}

	filep->private_data = group;

	return 0;
}

static int vfio_group_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	filep->private_data = NULL;

	vfio_group_try_dissolve_container(group);

	atomic_dec(&group->opened);

	vfio_group_put(group);

	return 0;
}

static const struct file_operations vfio_group_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= vfio_group_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_group_fops_compat_ioctl,
#endif
	.open		= vfio_group_fops_open,
	.release	= vfio_group_fops_release,
};
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device *device = filep->private_data;

	device->ops->release(device->device_data);

	vfio_group_try_dissolve_container(device->group);

	vfio_device_put(device);

	return 0;
}

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->ioctl))
		return -EINVAL;

	return device->ops->ioctl(device->device_data, cmd, arg);
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device->device_data, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device->device_data, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device *device = filep->private_data;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device->device_data, vma);
}
#ifdef CONFIG_COMPAT
static long vfio_device_fops_compat_ioctl(struct file *filep,
					  unsigned int cmd, unsigned long arg)
{
	arg = (unsigned long)compat_ptr(arg);
	return vfio_device_fops_unl_ioctl(filep, cmd, arg);
}
#endif	/* CONFIG_COMPAT */

static const struct file_operations vfio_device_fops = {
	.owner		= THIS_MODULE,
	.release	= vfio_device_fops_release,
	.read		= vfio_device_fops_read,
	.write		= vfio_device_fops_write,
	.unlocked_ioctl	= vfio_device_fops_unl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= vfio_device_fops_compat_ioctl,
#endif
	.mmap		= vfio_device_fops_mmap,
};
/**
 * External user API, exported by symbols to be linked dynamically.
 *
 * The protocol includes:
 *  1. do normal VFIO init operation:
 *	- opening a new container;
 *	- attaching group(s) to it;
 *	- setting an IOMMU driver for a container.
 * When IOMMU is set for a container, all groups in it are
 * considered ready to use by an external user.
 *
 * 2. User space passes a group fd to an external user.
 * The external user calls vfio_group_get_external_user()
 * to verify that:
 *	- the group is initialized;
 *	- IOMMU is set for it.
 * If both checks passed, vfio_group_get_external_user()
 * increments the container user counter to prevent
 * the VFIO group from disposal before KVM exits.
 *
 * 3. The external user calls vfio_external_user_iommu_id()
 * to know an IOMMU ID.
 *
 * 4. When the external KVM finishes, it calls
 * vfio_group_put_external_user() to release the VFIO group.
 * This call decrements the container user counter.
 */
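/*
 * Usage sketch: an in-kernel consumer such as KVM, given a group file,
 * might use this interface roughly as follows:
 *
 *	struct vfio_group *grp;
 *	int iommu_id;
 *
 *	grp = vfio_group_get_external_user(filp);
 *	if (IS_ERR(grp))
 *		return PTR_ERR(grp);
 *
 *	iommu_id = vfio_external_user_iommu_id(grp);
 *
 *	// use the group, then release it when finished
 *	vfio_group_put_external_user(grp);
 */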
struct vfio_group *vfio_group_get_external_user(struct file *filep)
{
	struct vfio_group *group = filep->private_data;

	if (filep->f_op != &vfio_group_fops)
		return ERR_PTR(-EINVAL);

	if (!atomic_inc_not_zero(&group->container_users))
		return ERR_PTR(-EINVAL);

	if (!group->container->iommu_driver ||
			!vfio_group_viable(group)) {
		atomic_dec(&group->container_users);
		return ERR_PTR(-EINVAL);
	}

	vfio_group_get(group);

	return group;
}
EXPORT_SYMBOL_GPL(vfio_group_get_external_user);

void vfio_group_put_external_user(struct vfio_group *group)
{
	vfio_group_put(group);
	vfio_group_try_dissolve_container(group);
}
EXPORT_SYMBOL_GPL(vfio_group_put_external_user);

int vfio_external_user_iommu_id(struct vfio_group *group)
{
	return iommu_group_id(group->iommu_group);
}
EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
/**
 * Module/class support
 */
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
	if (mode && (MINOR(dev->devt) == 0))
		*mode = S_IRUGO | S_IWUGO;

	return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}
static int __init vfio_init(void)
{
	int ret;

	idr_init(&vfio.group_idr);
	mutex_init(&vfio.group_lock);
	mutex_init(&vfio.iommu_drivers_lock);
	INIT_LIST_HEAD(&vfio.group_list);
	INIT_LIST_HEAD(&vfio.iommu_drivers_list);
	init_waitqueue_head(&vfio.release_q);

	vfio.class = class_create(THIS_MODULE, "vfio");
	if (IS_ERR(vfio.class)) {
		ret = PTR_ERR(vfio.class);
		goto err_class;
	}

	vfio.class->devnode = vfio_devnode;

	ret = alloc_chrdev_region(&vfio.devt, 0, MINORMASK, "vfio");
	if (ret)
		goto err_base_chrdev;

	cdev_init(&vfio.cdev, &vfio_fops);
	ret = cdev_add(&vfio.cdev, vfio.devt, 1);
	if (ret)
		goto err_base_cdev;

	vfio.dev = device_create(vfio.class, NULL, vfio.devt, NULL, "vfio");
	if (IS_ERR(vfio.dev)) {
		ret = PTR_ERR(vfio.dev);
		goto err_base_dev;
	}

	/* /dev/vfio/$GROUP */
	cdev_init(&vfio.group_cdev, &vfio_group_fops);
	ret = cdev_add(&vfio.group_cdev,
		       MKDEV(MAJOR(vfio.devt), 1), MINORMASK - 1);
	if (ret)
		goto err_groups_cdev;

	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");

	/*
	 * Attempt to load known iommu-drivers.  This gives us a working
	 * environment without the user needing to explicitly load iommu
	 * drivers.
	 */
	request_module_nowait("vfio_iommu_type1");
	request_module_nowait("vfio_iommu_spapr_tce");

	return 0;

err_groups_cdev:
	device_destroy(vfio.class, vfio.devt);
err_base_dev:
	cdev_del(&vfio.cdev);
err_base_cdev:
	unregister_chrdev_region(vfio.devt, MINORMASK);
err_base_chrdev:
	class_destroy(vfio.class);
err_class:
	return ret;
}
static void __exit vfio_cleanup(void)
{
	WARN_ON(!list_empty(&vfio.group_list));

	idr_destroy(&vfio.group_idr);
	cdev_del(&vfio.group_cdev);
	device_destroy(vfio.class, vfio.devt);
	cdev_del(&vfio.cdev);
	unregister_chrdev_region(vfio.devt, MINORMASK);
	class_destroy(vfio.class);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);