/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2006-9 Red Hat, Inc
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>

#include "irq.h"
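/*
 * This file implements legacy PCI device assignment for KVM: ioctls that
 * bind a host PCI device to a VM, wire its INTx/MSI/MSI-X interrupts into
 * the guest, and tear the assignment down again.  Host interrupts are
 * latched in a hard-IRQ handler (kvm_assigned_dev_intr) and injected into
 * the guest from a work item (kvm_assigned_dev_interrupt_work_handler).
 */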
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
						      int assigned_dev_id)
{
	struct list_head *ptr;
	struct kvm_assigned_dev_kernel *match;

	list_for_each(ptr, head) {
		match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
		if (match->assigned_dev_id == assigned_dev_id)
			return match;
	}
	return NULL;
}
static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
				    *assigned_dev, int irq)
{
	int i, index;
	struct msix_entry *host_msix_entries;

	host_msix_entries = assigned_dev->host_msix_entries;

	index = -1;
	for (i = 0; i < assigned_dev->entries_nr; i++)
		if (irq == host_msix_entries[i].vector) {
			index = i;
			break;
		}
	if (index < 0)
		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");

	return index;
}
static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
{
	struct kvm_assigned_dev_kernel *assigned_dev;
	struct kvm *kvm;
	int i;

	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
				    interrupt_work);
	kvm = assigned_dev->kvm;

	spin_lock_irq(&assigned_dev->assigned_dev_lock);
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		struct kvm_guest_msix_entry *guest_entries =
			assigned_dev->guest_msix_entries;
		for (i = 0; i < assigned_dev->entries_nr; i++) {
			if (!(guest_entries[i].flags &
					KVM_ASSIGNED_MSIX_PENDING))
				continue;
			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
			kvm_set_irq(assigned_dev->kvm,
				    assigned_dev->irq_source_id,
				    guest_entries[i].vector, 1);
		}
	} else
		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
			    assigned_dev->guest_irq, 1);

	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
}
static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
	unsigned long flags;
	struct kvm_assigned_dev_kernel *assigned_dev =
		(struct kvm_assigned_dev_kernel *) dev_id;

	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int index = find_index_from_host_irq(assigned_dev, irq);
		if (index < 0)
			goto out;
		assigned_dev->guest_msix_entries[index].flags |=
			KVM_ASSIGNED_MSIX_PENDING;
	}

	schedule_work(&assigned_dev->interrupt_work);

	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
		disable_irq_nosync(irq);
		assigned_dev->host_irq_disabled = true;
	}

out:
	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
	return IRQ_HANDLED;
}
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
	struct kvm_assigned_dev_kernel *dev;
	unsigned long flags;

	if (kian->gsi == -1)
		return;

	dev = container_of(kian, struct kvm_assigned_dev_kernel,
			   ack_notifier);

	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

	/* The guest irq may be shared so this ack may be
	 * from another device.
	 */
	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
	if (dev->host_irq_disabled) {
		enable_irq(dev->host_irq);
		dev->host_irq_disabled = false;
	}
	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
}
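/*
 * Tearing down an assignment mirrors the two halves of setting it up: the
 * guest side below drops the ack notifier and the irq source id, while the
 * host side frees the host IRQ(s) and disables MSI/MSI-X on the device.
 */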
static void deassign_guest_irq(struct kvm *kvm,
			       struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
	assigned_dev->ack_notifier.gsi = -1;

	if (assigned_dev->irq_source_id != -1)
		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
	assigned_dev->irq_source_id = -1;
	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}
/* This function implicitly holds the kvm->lock mutex because of cancel_work_sync() */
static void deassign_host_irq(struct kvm *kvm,
			      struct kvm_assigned_dev_kernel *assigned_dev)
{
	/*
	 * In kvm_free_device_irq, cancel_work_sync() returns true if:
	 * 1. the work was scheduled and has now been cancelled, or
	 * 2. the work callback has already been executed.
	 *
	 * The first case guarantees that the irq is disabled and no more
	 * events can arrive.  In the second case the irq may still be enabled
	 * (e.g. for MSI), so we disable it here to prevent further events.
	 *
	 * Note that this may result in a nested disable if the interrupt type
	 * is INTx, which is fine because we are about to free it anyway.
	 *
	 * If this function is called as part of VM destruction, make sure the
	 * kvm state is still valid at this point, since we may also have to
	 * wait for interrupt_work to complete.
	 */
	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
		int i;

		for (i = 0; i < assigned_dev->entries_nr; i++)
			disable_irq_nosync(assigned_dev->
					   host_msix_entries[i].vector);

		cancel_work_sync(&assigned_dev->interrupt_work);

		for (i = 0; i < assigned_dev->entries_nr; i++)
			free_irq(assigned_dev->host_msix_entries[i].vector,
				 (void *)assigned_dev);

		assigned_dev->entries_nr = 0;
		kfree(assigned_dev->host_msix_entries);
		kfree(assigned_dev->guest_msix_entries);
		pci_disable_msix(assigned_dev->dev);
	} else {
		/* Deal with MSI and INTx */
		disable_irq_nosync(assigned_dev->host_irq);
		cancel_work_sync(&assigned_dev->interrupt_work);

		free_irq(assigned_dev->host_irq, (void *)assigned_dev);

		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
			pci_disable_msi(assigned_dev->dev);
	}

	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}
static int kvm_deassign_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *assigned_dev,
			    unsigned long irq_requested_type)
{
	unsigned long guest_irq_type, host_irq_type;

	if (!irqchip_in_kernel(kvm))
		return -EINVAL;
	/* no irq assignment to deassign */
	if (!assigned_dev->irq_requested_type)
		return -ENXIO;

	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

	if (host_irq_type)
		deassign_host_irq(kvm, assigned_dev);
	if (guest_irq_type)
		deassign_guest_irq(kvm, assigned_dev);

	return 0;
}
static void kvm_free_assigned_irq(struct kvm *kvm,
				  struct kvm_assigned_dev_kernel *assigned_dev)
{
	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}
static void kvm_free_assigned_device(struct kvm *kvm,
				     struct kvm_assigned_dev_kernel
				     *assigned_dev)
{
	kvm_free_assigned_irq(kvm, assigned_dev);

	pci_reset_function(assigned_dev->dev);

	pci_release_regions(assigned_dev->dev);
	pci_disable_device(assigned_dev->dev);
	pci_dev_put(assigned_dev->dev);

	list_del(&assigned_dev->list);
	kfree(assigned_dev);
}
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
	struct list_head *ptr, *ptr2;
	struct kvm_assigned_dev_kernel *assigned_dev;

	list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
		assigned_dev = list_entry(ptr,
					  struct kvm_assigned_dev_kernel,
					  list);

		kvm_free_assigned_device(kvm, assigned_dev);
	}
}
static int assigned_device_enable_host_intx(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	dev->host_irq = dev->dev->irq;
	/* Even though this is PCI, we don't want to use shared
	 * interrupts. Sharing host devices with guest-assigned devices
	 * on the same interrupt line is not a happy situation: there
	 * are going to be long delays in accepting, acking, etc.
	 */
	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
			0, "kvm_assigned_intx_device", (void *)dev))
		return -EIO;
	return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
					   struct kvm_assigned_dev_kernel *dev)
{
	int r;

	if (!dev->dev->msi_enabled) {
		r = pci_enable_msi(dev->dev);
		if (r)
			return r;
	}

	dev->host_irq = dev->dev->irq;
	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
			"kvm_assigned_msi_device", (void *)dev)) {
		pci_disable_msi(dev->dev);
		return -EIO;
	}

	return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
					    struct kvm_assigned_dev_kernel *dev)
{
	int i, r = -EINVAL;

	/* host_msix_entries and guest_msix_entries should have been
	 * initialized */
	if (dev->entries_nr == 0)
		return r;

	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
	if (r)
		return r;

	for (i = 0; i < dev->entries_nr; i++) {
		r = request_irq(dev->host_msix_entries[i].vector,
				kvm_assigned_dev_intr, 0,
				"kvm_assigned_msix_device",
				(void *)dev);
		if (r)
			goto err;
	}

	return 0;
err:
	for (i -= 1; i >= 0; i--)
		free_irq(dev->host_msix_entries[i].vector, (void *)dev);
	pci_disable_msix(dev->dev);
	return r;
}
#endif
static int assigned_device_enable_guest_intx(struct kvm *kvm,
				struct kvm_assigned_dev_kernel *dev,
				struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = irq->guest_irq;
	return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	dev->host_irq_disabled = false;
	return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
			struct kvm_assigned_dev_kernel *dev,
			struct kvm_assigned_irq *irq)
{
	dev->guest_irq = irq->guest_irq;
	dev->ack_notifier.gsi = -1;
	dev->host_irq_disabled = false;
	return 0;
}
#endif
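/*
 * Request the host-side interrupt (INTx, MSI or MSI-X) for an assigned
 * device and record which host delivery type is in use.  The guest-side
 * routing is set up separately by assign_guest_irq().
 */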
static int assign_host_irq(struct kvm *kvm,
			   struct kvm_assigned_dev_kernel *dev,
			   __u32 host_irq_type)
{
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
		return r;

	switch (host_irq_type) {
	case KVM_DEV_IRQ_HOST_INTX:
		r = assigned_device_enable_host_intx(kvm, dev);
		break;
#ifdef __KVM_HAVE_MSI
	case KVM_DEV_IRQ_HOST_MSI:
		r = assigned_device_enable_host_msi(kvm, dev);
		break;
#endif
#ifdef __KVM_HAVE_MSIX
	case KVM_DEV_IRQ_HOST_MSIX:
		r = assigned_device_enable_host_msix(kvm, dev);
		break;
#endif
	default:
		r = -EINVAL;
	}

	if (!r)
		dev->irq_requested_type |= host_irq_type;

	return r;
}
static int assign_guest_irq(struct kvm *kvm,
			    struct kvm_assigned_dev_kernel *dev,
			    struct kvm_assigned_irq *irq,
			    unsigned long guest_irq_type)
{
	int id;
	int r = -EEXIST;

	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
		return r;

	id = kvm_request_irq_source_id(kvm);
	if (id < 0)
		return id;

	dev->irq_source_id = id;

	switch (guest_irq_type) {
	case KVM_DEV_IRQ_GUEST_INTX:
		r = assigned_device_enable_guest_intx(kvm, dev, irq);
		break;
#ifdef __KVM_HAVE_MSI
	case KVM_DEV_IRQ_GUEST_MSI:
		r = assigned_device_enable_guest_msi(kvm, dev, irq);
		break;
#endif
#ifdef __KVM_HAVE_MSIX
	case KVM_DEV_IRQ_GUEST_MSIX:
		r = assigned_device_enable_guest_msix(kvm, dev, irq);
		break;
#endif
	default:
		r = -EINVAL;
	}

	if (!r) {
		dev->irq_requested_type |= guest_irq_type;
		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
	} else
		kvm_free_irq_source_id(kvm, dev->irq_source_id);

	return r;
}
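/*
 * KVM_ASSIGN_DEV_IRQ: the flags word may carry at most one host delivery
 * type (KVM_DEV_IRQ_HOST_*) and at most one guest delivery type
 * (KVM_DEV_IRQ_GUEST_*); the host side is configured before the guest side.
 */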
/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
				   struct kvm_assigned_irq *assigned_irq)
{
	int r = -EINVAL;
	struct kvm_assigned_dev_kernel *match;
	unsigned long host_irq_type, guest_irq_type;

	if (!capable(CAP_SYS_RAWIO))
		return -EPERM;

	if (!irqchip_in_kernel(kvm))
		return r;

	mutex_lock(&kvm->lock);
	r = -ENODEV;
	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

	r = -EINVAL;
	/* can only assign one type at a time */
	if (hweight_long(host_irq_type) > 1)
		goto out;
	if (hweight_long(guest_irq_type) > 1)
		goto out;
	if (host_irq_type == 0 && guest_irq_type == 0)
		goto out;

	r = 0;
	if (host_irq_type)
		r = assign_host_irq(kvm, match, host_irq_type);
	if (r)
		goto out;

	if (guest_irq_type)
		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);

out:
	mutex_unlock(&kvm->lock);
	return r;
}
static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
					 struct kvm_assigned_irq
					 *assigned_irq)
{
	int r = -ENODEV;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_irq->assigned_dev_id);
	if (!match)
		goto out;

	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
out:
	mutex_unlock(&kvm->lock);
	return r;
}
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
				      struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0, idx;
	struct kvm_assigned_dev_kernel *match;
	struct pci_dev *dev;

	mutex_lock(&kvm->lock);
	idx = srcu_read_lock(&kvm->srcu);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (match) {
		/* device already assigned */
		r = -EEXIST;
		goto out;
	}

	match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
	if (match == NULL) {
		printk(KERN_INFO "%s: Couldn't allocate memory\n",
		       __func__);
		r = -ENOMEM;
		goto out;
	}
	dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
					  assigned_dev->busnr,
					  assigned_dev->devfn);
	if (!dev) {
		printk(KERN_INFO "%s: host device not found\n", __func__);
		r = -EINVAL;
		goto out_free;
	}
	if (pci_enable_device(dev)) {
		printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
		r = -EBUSY;
		goto out_put;
	}
	r = pci_request_regions(dev, "kvm_assigned_device");
	if (r) {
		printk(KERN_INFO "%s: Could not get access to device regions\n",
		       __func__);
		goto out_disable;
	}

	pci_reset_function(dev);

	match->assigned_dev_id = assigned_dev->assigned_dev_id;
	match->host_segnr = assigned_dev->segnr;
	match->host_busnr = assigned_dev->busnr;
	match->host_devfn = assigned_dev->devfn;
	match->flags = assigned_dev->flags;
	match->dev = dev;
	spin_lock_init(&match->assigned_dev_lock);
	match->irq_source_id = -1;
	match->kvm = kvm;
	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
	INIT_WORK(&match->interrupt_work,
		  kvm_assigned_dev_interrupt_work_handler);

	list_add(&match->list, &kvm->arch.assigned_dev_head);

	if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
		if (!kvm->arch.iommu_domain) {
			r = kvm_iommu_map_guest(kvm);
			if (r)
				goto out_list_del;
		}
		r = kvm_assign_device(kvm, match);
		if (r)
			goto out_list_del;
	}

out:
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
out_list_del:
	list_del(&match->list);
	pci_release_regions(dev);
out_disable:
	pci_disable_device(dev);
out_put:
	pci_dev_put(dev);
out_free:
	kfree(match);
	srcu_read_unlock(&kvm->srcu, idx);
	mutex_unlock(&kvm->lock);
	return r;
}
static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
					struct kvm_assigned_pci_dev *assigned_dev)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *match;

	mutex_lock(&kvm->lock);

	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      assigned_dev->assigned_dev_id);
	if (!match) {
		printk(KERN_INFO "%s: device hasn't been assigned before, "
		  "so cannot be deassigned\n", __func__);
		r = -EINVAL;
		goto out;
	}

	if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
		kvm_deassign_device(kvm, match);

	kvm_free_assigned_device(kvm, match);

out:
	mutex_unlock(&kvm->lock);
	return r;
}
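/*
 * MSI-X assignment is staged from userspace: the vector count is set first
 * (KVM_ASSIGN_SET_MSIX_NR), each vector is then mapped to a guest GSI
 * (KVM_ASSIGN_SET_MSIX_ENTRY), and only afterwards are the host interrupts
 * actually requested via KVM_ASSIGN_DEV_IRQ.
 */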
#ifdef __KVM_HAVE_MSIX
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
				    struct kvm_assigned_msix_nr *entry_nr)
{
	int r = 0;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      entry_nr->assigned_dev_id);
	if (!adev) {
		r = -EINVAL;
		goto msix_nr_out;
	}

	if (adev->entries_nr == 0) {
		adev->entries_nr = entry_nr->entry_nr;
		if (adev->entries_nr == 0 ||
		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
			r = -EINVAL;
			goto msix_nr_out;
		}

		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
						  entry_nr->entry_nr,
						  GFP_KERNEL);
		if (!adev->host_msix_entries) {
			r = -ENOMEM;
			goto msix_nr_out;
		}
		adev->guest_msix_entries = kzalloc(
				sizeof(struct kvm_guest_msix_entry) *
				entry_nr->entry_nr, GFP_KERNEL);
		if (!adev->guest_msix_entries) {
			kfree(adev->host_msix_entries);
			r = -ENOMEM;
			goto msix_nr_out;
		}
	} else /* setting the MSI-X entry count twice is not allowed */
		r = -EINVAL;
msix_nr_out:
	mutex_unlock(&kvm->lock);
	return r;
}
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
				       struct kvm_assigned_msix_entry *entry)
{
	int r = 0, i;
	struct kvm_assigned_dev_kernel *adev;

	mutex_lock(&kvm->lock);

	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
				      entry->assigned_dev_id);

	if (!adev) {
		r = -EINVAL;
		goto msix_entry_out;
	}

	for (i = 0; i < adev->entries_nr; i++)
		if (adev->guest_msix_entries[i].vector == 0 ||
		    adev->guest_msix_entries[i].entry == entry->entry) {
			adev->guest_msix_entries[i].entry = entry->entry;
			adev->guest_msix_entries[i].vector = entry->gsi;
			adev->host_msix_entries[i].entry = entry->entry;
			break;
		}
	if (i == adev->entries_nr) {
		r = -ENOSPC;
		goto msix_entry_out;
	}

msix_entry_out:
	mutex_unlock(&kvm->lock);
	return r;
}
#endif
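/*
 * Illustrative userspace sketch of the ioctl sequence handled below (not
 * taken from this file; vm_fd is assumed to be a VM file descriptor from
 * KVM_CREATE_VM, and the ID/IRQ numbers are arbitrary examples):
 *
 *	struct kvm_assigned_pci_dev dev = {
 *		.assigned_dev_id = 1,
 *		.segnr = 0, .busnr = 1, .devfn = 0,
 *		.flags = KVM_DEV_ASSIGN_ENABLE_IOMMU,
 *	};
 *	ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &dev);
 *
 *	struct kvm_assigned_irq irq = {
 *		.assigned_dev_id = 1,
 *		.guest_irq = 10,
 *		.flags = KVM_DEV_IRQ_HOST_INTX | KVM_DEV_IRQ_GUEST_INTX,
 *	};
 *	ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
 *
 * MSI-X additionally requires KVM_ASSIGN_SET_MSIX_NR followed by one
 * KVM_ASSIGN_SET_MSIX_ENTRY per vector before KVM_ASSIGN_DEV_IRQ.
 */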
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
				  unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	int r = -ENOTTY;

	switch (ioctl) {
	case KVM_ASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_IRQ: {
		r = -EOPNOTSUPP;
		break;
	}
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
	case KVM_ASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
	case KVM_DEASSIGN_DEV_IRQ: {
		struct kvm_assigned_irq assigned_irq;

		r = -EFAULT;
		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
			goto out;
		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
		if (r)
			goto out;
		break;
	}
#endif
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
	case KVM_DEASSIGN_PCI_DEVICE: {
		struct kvm_assigned_pci_dev assigned_dev;

		r = -EFAULT;
		if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
			goto out;
		r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
		if (r)
			goto out;
		break;
	}
#endif
#ifdef KVM_CAP_IRQ_ROUTING
	case KVM_SET_GSI_ROUTING: {
		struct kvm_irq_routing routing;
		struct kvm_irq_routing __user *urouting;
		struct kvm_irq_routing_entry *entries;

		r = -EFAULT;
		if (copy_from_user(&routing, argp, sizeof(routing)))
			goto out;
		r = -EINVAL;
		if (routing.nr >= KVM_MAX_IRQ_ROUTES)
			goto out;
		if (routing.flags)
			goto out;
		r = -ENOMEM;
		entries = vmalloc(routing.nr * sizeof(*entries));
		if (!entries)
			goto out;
		r = -EFAULT;
		urouting = argp;
		if (copy_from_user(entries, urouting->entries,
				   routing.nr * sizeof(*entries)))
			goto out_free_irq_routing;
		r = kvm_set_irq_routing(kvm, entries, routing.nr,
					routing.flags);
	out_free_irq_routing:
		vfree(entries);
		break;
	}
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
	case KVM_ASSIGN_SET_MSIX_NR: {
		struct kvm_assigned_msix_nr entry_nr;
		r = -EFAULT;
		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
			goto out;
		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
		if (r)
			goto out;
		break;
	}
	case KVM_ASSIGN_SET_MSIX_ENTRY: {
		struct kvm_assigned_msix_entry entry;
		r = -EFAULT;
		if (copy_from_user(&entry, argp, sizeof entry))
			goto out;
		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);