/*
 * Kernel-based Virtual Machine - device assignment support
 *
 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */
#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/pci.h>
#include <linux/interrupt.h>
#include <linux/slab.h>
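
/*
 * Look up an assigned device by its user-supplied assigned_dev_id.  The
 * ioctl paths in this file take kvm->lock before walking the list, so the
 * list cannot change underneath the search.
 */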
static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
                                                             int assigned_dev_id)
{
        struct list_head *ptr;
        struct kvm_assigned_dev_kernel *match;

        list_for_each(ptr, head) {
                match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
                if (match->assigned_dev_id == assigned_dev_id)
                        return match;
        }
        return NULL;
}
static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
                                    *assigned_dev, int irq)
{
        int i, index;
        struct msix_entry *host_msix_entries;

        host_msix_entries = assigned_dev->host_msix_entries;

        index = -1;
        for (i = 0; i < assigned_dev->entries_nr; i++)
                if (irq == host_msix_entries[i].vector) {
                        index = i;
                        break;
                }
        if (index < 0)
                printk(KERN_WARNING
                       "Failed to find correlated MSI-X entry!\n");

        return index;
}
static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
{
        struct kvm_assigned_dev_kernel *assigned_dev;
        int i;

        assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
                                    interrupt_work);

        spin_lock_irq(&assigned_dev->assigned_dev_lock);
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                struct kvm_guest_msix_entry *guest_entries =
                        assigned_dev->guest_msix_entries;

                for (i = 0; i < assigned_dev->entries_nr; i++) {
                        if (!(guest_entries[i].flags &
                                        KVM_ASSIGNED_MSIX_PENDING))
                                continue;
                        guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
                        kvm_set_irq(assigned_dev->kvm,
                                    assigned_dev->irq_source_id,
                                    guest_entries[i].vector, 1);
                }
        } else
                kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
                            assigned_dev->guest_irq, 1);

        spin_unlock_irq(&assigned_dev->assigned_dev_lock);
}
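
/*
 * Host-side interrupt handler for an assigned device.  For MSI-X the
 * matching guest MSI-X entry is marked pending; for INTx the host line is
 * masked with disable_irq_nosync() until the guest acknowledges the
 * interrupt (see kvm_assigned_dev_ack_irq() below), which prevents a
 * level-triggered line from re-firing before the guest has serviced the
 * device.
 */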
static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
{
        unsigned long flags;
        struct kvm_assigned_dev_kernel *assigned_dev =
                (struct kvm_assigned_dev_kernel *) dev_id;

        spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                int index = find_index_from_host_irq(assigned_dev, irq);

                if (index < 0)
                        goto out;
                assigned_dev->guest_msix_entries[index].flags |=
                        KVM_ASSIGNED_MSIX_PENDING;
        }

        schedule_work(&assigned_dev->interrupt_work);

        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
                disable_irq_nosync(irq);
                assigned_dev->host_irq_disabled = true;
        }

out:
        spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
        return IRQ_HANDLED;
}
/* Ack the irq line for an assigned device */
static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
{
        unsigned long flags;
        struct kvm_assigned_dev_kernel *dev;

        dev = container_of(kian, struct kvm_assigned_dev_kernel,
                           ack_notifier);

        kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);

        /* The guest irq may be shared so this ack may be
         * from another device.
         */
        spin_lock_irqsave(&dev->assigned_dev_lock, flags);
        if (dev->host_irq_disabled) {
                enable_irq(dev->host_irq);
                dev->host_irq_disabled = false;
        }
        spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
}
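
/*
 * Tear down the guest-facing half of an IRQ assignment: unregister the ack
 * notifier and release the IRQ source id.  The host-facing half is handled
 * separately by deassign_host_irq() below.
 */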
static void deassign_guest_irq(struct kvm *kvm,
                               struct kvm_assigned_dev_kernel *assigned_dev)
{
        kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
        assigned_dev->ack_notifier.gsi = -1;

        if (assigned_dev->irq_source_id != -1)
                kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
        assigned_dev->irq_source_id = -1;
        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
}
/* The function implicitly holds the kvm->lock mutex due to cancel_work_sync(). */
static void deassign_host_irq(struct kvm *kvm,
                              struct kvm_assigned_dev_kernel *assigned_dev)
{
        /*
         * In kvm_free_device_irq, cancel_work_sync() returns true if:
         * 1. the work was scheduled and has now been cancelled, or
         * 2. the work callback has been executed.
         *
         * The first case guarantees that the irq is disabled and no more
         * events will arrive.  In the second case, however, the irq may have
         * been re-enabled (e.g. for MSI), so we disable it here to prevent
         * further events.
         *
         * Note that this may result in a nested disable if the interrupt
         * type is INTx, but that is fine since we are about to free it.
         *
         * If this function is called as part of VM destruction, make sure
         * the kvm state is still valid at this point, because we may also
         * have to wait for interrupt_work to complete.
         */
        if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
                int i;

                for (i = 0; i < assigned_dev->entries_nr; i++)
                        disable_irq_nosync(assigned_dev->
                                           host_msix_entries[i].vector);

                cancel_work_sync(&assigned_dev->interrupt_work);

                for (i = 0; i < assigned_dev->entries_nr; i++)
                        free_irq(assigned_dev->host_msix_entries[i].vector,
                                 (void *)assigned_dev);

                assigned_dev->entries_nr = 0;
                kfree(assigned_dev->host_msix_entries);
                kfree(assigned_dev->guest_msix_entries);
                pci_disable_msix(assigned_dev->dev);
        } else {
                /* Deal with MSI and INTx */
                disable_irq_nosync(assigned_dev->host_irq);
                cancel_work_sync(&assigned_dev->interrupt_work);

                free_irq(assigned_dev->host_irq, (void *)assigned_dev);

                if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
                        pci_disable_msi(assigned_dev->dev);
        }

        assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
}
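
/*
 * Deassign whichever halves of the assignment are selected in
 * irq_requested_type: KVM_DEV_IRQ_HOST_MASK bits tear down the host-side
 * interrupt, KVM_DEV_IRQ_GUEST_MASK bits tear down the guest-side routing.
 */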
static int kvm_deassign_irq(struct kvm *kvm,
                            struct kvm_assigned_dev_kernel *assigned_dev,
                            unsigned long irq_requested_type)
{
        unsigned long guest_irq_type, host_irq_type;

        if (!irqchip_in_kernel(kvm))
                return -EINVAL;
        /* no irq assignment to deassign */
        if (!assigned_dev->irq_requested_type)
                return -ENXIO;

        host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
        guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;

        if (host_irq_type)
                deassign_host_irq(kvm, assigned_dev);
        if (guest_irq_type)
                deassign_guest_irq(kvm, assigned_dev);

        return 0;
}
static void kvm_free_assigned_irq(struct kvm *kvm,
                                  struct kvm_assigned_dev_kernel *assigned_dev)
{
        kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
}
static void kvm_free_assigned_device(struct kvm *kvm,
                                     struct kvm_assigned_dev_kernel
                                     *assigned_dev)
{
        kvm_free_assigned_irq(kvm, assigned_dev);

        pci_reset_function(assigned_dev->dev);

        pci_release_regions(assigned_dev->dev);
        pci_disable_device(assigned_dev->dev);
        pci_dev_put(assigned_dev->dev);

        list_del(&assigned_dev->list);
        kfree(assigned_dev);
}
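
/*
 * Release every device still assigned to this VM.  The list is walked with
 * list_for_each_safe() because kvm_free_assigned_device() removes and frees
 * each entry as it goes.
 */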
void kvm_free_all_assigned_devices(struct kvm *kvm)
{
        struct list_head *ptr, *ptr2;
        struct kvm_assigned_dev_kernel *assigned_dev;

        list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
                assigned_dev = list_entry(ptr,
                                          struct kvm_assigned_dev_kernel,
                                          list);

                kvm_free_assigned_device(kvm, assigned_dev);
        }
}
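
/*
 * The assigned_device_enable_host_* helpers below request the host-side
 * interrupt (INTx, MSI or MSI-X) and install kvm_assigned_dev_intr() as the
 * handler; the assigned_device_enable_guest_* helpers record how the
 * interrupt should be delivered into the guest.
 */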
static int assigned_device_enable_host_intx(struct kvm *kvm,
                                            struct kvm_assigned_dev_kernel *dev)
{
        dev->host_irq = dev->dev->irq;
        /* Even though this is PCI, we don't want to use shared
         * interrupts. Sharing host devices with guest-assigned devices
         * on the same interrupt line is not a happy situation: there
         * are going to be long delays in accepting, acking, etc.
         */
        if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
                        0, "kvm_assigned_intx_device", (void *)dev))
                return -EIO;
        return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_host_msi(struct kvm *kvm,
                                           struct kvm_assigned_dev_kernel *dev)
{
        int r;

        if (!dev->dev->msi_enabled) {
                r = pci_enable_msi(dev->dev);
                if (r)
                        return r;
        }

        dev->host_irq = dev->dev->irq;
        if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
                        "kvm_assigned_msi_device", (void *)dev)) {
                pci_disable_msi(dev->dev);
                return -EIO;
        }

        return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_host_msix(struct kvm *kvm,
                                            struct kvm_assigned_dev_kernel *dev)
{
        int i, r;

        /* host_msix_entries and guest_msix_entries should have been
         * initialized */
        if (dev->entries_nr == 0)
                return -EINVAL;

        r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
        if (r)
                return r;

        for (i = 0; i < dev->entries_nr; i++) {
                r = request_irq(dev->host_msix_entries[i].vector,
                                kvm_assigned_dev_intr, 0,
                                "kvm_assigned_msix_device",
                                (void *)dev);
                if (r)
                        goto err;
        }

        return 0;
err:
        for (i -= 1; i >= 0; i--)
                free_irq(dev->host_msix_entries[i].vector, (void *)dev);
        pci_disable_msix(dev->dev);
        return r;
}
#endif
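
/*
 * For INTx the ack notifier's gsi tracks the guest irq so the host line can
 * be re-enabled when the guest acknowledges it; for MSI and MSI-X the gsi
 * is set to -1 and host_irq_disabled is cleared, since message-signalled
 * interrupts do not use that mask/ack handshake.
 */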
static int assigned_device_enable_guest_intx(struct kvm *kvm,
                                struct kvm_assigned_dev_kernel *dev,
                                struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = irq->guest_irq;
        return 0;
}
#ifdef __KVM_HAVE_MSI
static int assigned_device_enable_guest_msi(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *dev,
                        struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = -1;
        dev->host_irq_disabled = false;
        return 0;
}
#endif
#ifdef __KVM_HAVE_MSIX
static int assigned_device_enable_guest_msix(struct kvm *kvm,
                        struct kvm_assigned_dev_kernel *dev,
                        struct kvm_assigned_irq *irq)
{
        dev->guest_irq = irq->guest_irq;
        dev->ack_notifier.gsi = -1;
        dev->host_irq_disabled = false;
        return 0;
}
#endif
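
/*
 * Enable the host side of an assignment.  Exactly one of the
 * KVM_DEV_IRQ_HOST_* types may be active at a time; a second request is
 * rejected until the existing one has been deassigned.
 */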
static int assign_host_irq(struct kvm *kvm,
                           struct kvm_assigned_dev_kernel *dev,
                           __u32 host_irq_type)
{
        int r = -EEXIST;

        if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
                return r;

        switch (host_irq_type) {
        case KVM_DEV_IRQ_HOST_INTX:
                r = assigned_device_enable_host_intx(kvm, dev);
                break;
#ifdef __KVM_HAVE_MSI
        case KVM_DEV_IRQ_HOST_MSI:
                r = assigned_device_enable_host_msi(kvm, dev);
                break;
#endif
#ifdef __KVM_HAVE_MSIX
        case KVM_DEV_IRQ_HOST_MSIX:
                r = assigned_device_enable_host_msix(kvm, dev);
                break;
#endif
        default:
                r = -EINVAL;
        }

        if (!r)
                dev->irq_requested_type |= host_irq_type;

        return r;
}
static int assign_guest_irq(struct kvm *kvm,
                            struct kvm_assigned_dev_kernel *dev,
                            struct kvm_assigned_irq *irq,
                            unsigned long guest_irq_type)
{
        int id;
        int r = -EEXIST;

        if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
                return r;

        id = kvm_request_irq_source_id(kvm);
        if (id < 0)
                return id;

        dev->irq_source_id = id;

        switch (guest_irq_type) {
        case KVM_DEV_IRQ_GUEST_INTX:
                r = assigned_device_enable_guest_intx(kvm, dev, irq);
                break;
#ifdef __KVM_HAVE_MSI
        case KVM_DEV_IRQ_GUEST_MSI:
                r = assigned_device_enable_guest_msi(kvm, dev, irq);
                break;
#endif
#ifdef __KVM_HAVE_MSIX
        case KVM_DEV_IRQ_GUEST_MSIX:
                r = assigned_device_enable_guest_msix(kvm, dev, irq);
                break;
#endif
        default:
                r = -EINVAL;
        }

        if (!r) {
                dev->irq_requested_type |= guest_irq_type;
                kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
        } else
                kvm_free_irq_source_id(kvm, dev->irq_source_id);

        return r;
}
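
/*
 * A caller selects one host type and one guest type in a single
 * KVM_ASSIGN_DEV_IRQ call, e.g. (illustrative userspace sketch; variable
 * names are hypothetical, the structure fields are from <linux/kvm.h>):
 *
 *      struct kvm_assigned_irq irq = {
 *              .assigned_dev_id = dev_id,
 *              .guest_irq       = guest_gsi,
 *              .flags           = KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_MSI,
 *      };
 *      ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);
 */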
/* TODO: deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
                                   struct kvm_assigned_irq *assigned_irq)
{
        int r = -EINVAL;
        struct kvm_assigned_dev_kernel *match;
        unsigned long host_irq_type, guest_irq_type;

        if (!irqchip_in_kernel(kvm))
                return r;

        mutex_lock(&kvm->lock);
        r = -ENODEV;
        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_irq->assigned_dev_id);
        if (!match)
                goto out;

        host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
        guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);

        r = -EINVAL;
        /* can only assign one type at a time */
        if (hweight_long(host_irq_type) > 1)
                goto out;
        if (hweight_long(guest_irq_type) > 1)
                goto out;
        if (host_irq_type == 0 && guest_irq_type == 0)
                goto out;

        r = 0;
        if (host_irq_type)
                r = assign_host_irq(kvm, match, host_irq_type);
        if (r)
                goto out;

        if (guest_irq_type)
                r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);

out:
        mutex_unlock(&kvm->lock);
        return r;
}
static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
                                         struct kvm_assigned_irq
                                         *assigned_irq)
{
        int r = -ENODEV;
        struct kvm_assigned_dev_kernel *match;

        mutex_lock(&kvm->lock);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_irq->assigned_dev_id);
        if (!match)
                goto out;

        r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
out:
        mutex_unlock(&kvm->lock);
        return r;
}
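
/*
 * Claim a host PCI device for a guest: look it up by segment/bus/devfn,
 * enable it, reserve its regions, reset it, add it to the per-VM list and,
 * if KVM_DEV_ASSIGN_ENABLE_IOMMU is set, map the guest's memory in the
 * IOMMU and attach the device to that domain.
 */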
static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
                                      struct kvm_assigned_pci_dev *assigned_dev)
{
        int r = 0, idx;
        struct kvm_assigned_dev_kernel *match;
        struct pci_dev *dev;

        mutex_lock(&kvm->lock);
        idx = srcu_read_lock(&kvm->srcu);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_dev->assigned_dev_id);
        if (match) {
                /* device already assigned */
                r = -EEXIST;
                goto out;
        }

        match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
        if (match == NULL) {
                printk(KERN_INFO "%s: Couldn't allocate memory\n",
                       __func__);
                r = -ENOMEM;
                goto out;
        }
        dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
                                          assigned_dev->busnr,
                                          assigned_dev->devfn);
        if (!dev) {
                printk(KERN_INFO "%s: host device not found\n", __func__);
                r = -EINVAL;
                goto out_free;
        }
        if (pci_enable_device(dev)) {
                printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
                r = -EBUSY;
                goto out_put;
        }
        r = pci_request_regions(dev, "kvm_assigned_device");
        if (r) {
                printk(KERN_INFO "%s: Could not get access to device regions\n",
                       __func__);
                goto out_disable;
        }

        pci_reset_function(dev);

        match->assigned_dev_id = assigned_dev->assigned_dev_id;
        match->host_segnr = assigned_dev->segnr;
        match->host_busnr = assigned_dev->busnr;
        match->host_devfn = assigned_dev->devfn;
        match->flags = assigned_dev->flags;
        match->dev = dev;
        spin_lock_init(&match->assigned_dev_lock);
        match->irq_source_id = -1;
        match->kvm = kvm;
        match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
        INIT_WORK(&match->interrupt_work,
                  kvm_assigned_dev_interrupt_work_handler);

        list_add(&match->list, &kvm->arch.assigned_dev_head);

        if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) {
                if (!kvm->arch.iommu_domain) {
                        r = kvm_iommu_map_guest(kvm);
                        if (r)
                                goto out_list_del;
                }
                r = kvm_assign_device(kvm, match);
                if (r)
                        goto out_list_del;
        }

out:
        srcu_read_unlock(&kvm->srcu, idx);
        mutex_unlock(&kvm->lock);
        return r;
out_list_del:
        list_del(&match->list);
        pci_release_regions(dev);
out_disable:
        pci_disable_device(dev);
out_put:
        pci_dev_put(dev);
out_free:
        kfree(match);
        srcu_read_unlock(&kvm->srcu, idx);
        mutex_unlock(&kvm->lock);
        return r;
}
static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
                                        struct kvm_assigned_pci_dev *assigned_dev)
{
        int r = 0;
        struct kvm_assigned_dev_kernel *match;

        mutex_lock(&kvm->lock);

        match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                      assigned_dev->assigned_dev_id);
        if (!match) {
                printk(KERN_INFO "%s: device hasn't been assigned before, "
                       "so cannot be deassigned\n", __func__);
                r = -EINVAL;
                goto out;
        }

        if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)
                kvm_deassign_device(kvm, match);

        kvm_free_assigned_device(kvm, match);

out:
        mutex_unlock(&kvm->lock);
        return r;
}
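
/*
 * MSI-X assignment is done in two steps from userspace: the number of
 * vectors is set first (KVM_ASSIGN_SET_MSIX_NR, at most once per device),
 * then each entry/GSI pair is filled in with KVM_ASSIGN_SET_MSIX_ENTRY
 * before the interrupt is finally enabled with KVM_ASSIGN_DEV_IRQ.
 */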
#ifdef __KVM_HAVE_MSIX
static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
                                    struct kvm_assigned_msix_nr *entry_nr)
{
        int r = 0;
        struct kvm_assigned_dev_kernel *adev;

        mutex_lock(&kvm->lock);

        adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                     entry_nr->assigned_dev_id);
        if (!adev) {
                r = -EINVAL;
                goto msix_nr_out;
        }

        if (adev->entries_nr == 0) {
                adev->entries_nr = entry_nr->entry_nr;
                if (adev->entries_nr == 0 ||
                    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
                        r = -EINVAL;
                        goto msix_nr_out;
                }

                adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
                                                  entry_nr->entry_nr,
                                                  GFP_KERNEL);
                if (!adev->host_msix_entries) {
                        r = -ENOMEM;
                        goto msix_nr_out;
                }
                adev->guest_msix_entries = kzalloc(
                                sizeof(struct kvm_guest_msix_entry) *
                                entry_nr->entry_nr, GFP_KERNEL);
                if (!adev->guest_msix_entries) {
                        kfree(adev->host_msix_entries);
                        r = -ENOMEM;
                        goto msix_nr_out;
                }
        } else /* Not allowed to set the MSI-X entry count twice */
                r = -EINVAL;
msix_nr_out:
        mutex_unlock(&kvm->lock);
        return r;
}
static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
                                       struct kvm_assigned_msix_entry *entry)
{
        int r = 0, i;
        struct kvm_assigned_dev_kernel *adev;

        mutex_lock(&kvm->lock);

        adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
                                     entry->assigned_dev_id);
        if (!adev) {
                r = -EINVAL;
                goto msix_entry_out;
        }

        for (i = 0; i < adev->entries_nr; i++)
                if (adev->guest_msix_entries[i].vector == 0 ||
                    adev->guest_msix_entries[i].entry == entry->entry) {
                        adev->guest_msix_entries[i].entry = entry->entry;
                        adev->guest_msix_entries[i].vector = entry->gsi;
                        adev->host_msix_entries[i].entry = entry->entry;
                        break;
                }
        if (i == adev->entries_nr) {
                r = -ENOSPC;
                goto msix_entry_out;
        }

msix_entry_out:
        mutex_unlock(&kvm->lock);
        return r;
}
#endif
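
/*
 * Top-level dispatcher for the device assignment ioctls issued on a VM fd.
 * A typical (illustrative, hypothetical variable names) userspace sequence:
 *
 *      ioctl(vm_fd, KVM_ASSIGN_PCI_DEVICE, &pci_dev);    // claim host device
 *      ioctl(vm_fd, KVM_ASSIGN_DEV_IRQ, &irq);           // wire up interrupt
 *      ...
 *      ioctl(vm_fd, KVM_DEASSIGN_DEV_IRQ, &irq);
 *      ioctl(vm_fd, KVM_DEASSIGN_PCI_DEVICE, &pci_dev);  // give it back
 */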
long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
                                  unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        int r = -ENOTTY;

        switch (ioctl) {
        case KVM_ASSIGN_PCI_DEVICE: {
                struct kvm_assigned_pci_dev assigned_dev;

                r = -EFAULT;
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
                        goto out;
                r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
                if (r)
                        goto out;
                break;
        }
        case KVM_ASSIGN_IRQ: {
                r = -EOPNOTSUPP;
                break;
        }
#ifdef KVM_CAP_ASSIGN_DEV_IRQ
        case KVM_ASSIGN_DEV_IRQ: {
                struct kvm_assigned_irq assigned_irq;

                r = -EFAULT;
                if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
                        goto out;
                r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
                if (r)
                        goto out;
                break;
        }
        case KVM_DEASSIGN_DEV_IRQ: {
                struct kvm_assigned_irq assigned_irq;

                r = -EFAULT;
                if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
                        goto out;
                r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
                if (r)
                        goto out;
                break;
        }
#endif
#ifdef KVM_CAP_DEVICE_DEASSIGNMENT
        case KVM_DEASSIGN_PCI_DEVICE: {
                struct kvm_assigned_pci_dev assigned_dev;

                r = -EFAULT;
                if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
                        goto out;
                r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
                if (r)
                        goto out;
                break;
        }
#endif
#ifdef KVM_CAP_IRQ_ROUTING
        case KVM_SET_GSI_ROUTING: {
                struct kvm_irq_routing routing;
                struct kvm_irq_routing __user *urouting;
                struct kvm_irq_routing_entry *entries;

                r = -EFAULT;
                if (copy_from_user(&routing, argp, sizeof(routing)))
                        goto out;
                r = -EINVAL;
                if (routing.nr >= KVM_MAX_IRQ_ROUTES)
                        goto out;
                r = -ENOMEM;
                entries = vmalloc(routing.nr * sizeof(*entries));
                if (!entries)
                        goto out;
                r = -EFAULT;
                urouting = argp;
                if (copy_from_user(entries, urouting->entries,
                                   routing.nr * sizeof(*entries)))
                        goto out_free_irq_routing;
                r = kvm_set_irq_routing(kvm, entries, routing.nr,
                                        routing.flags);
        out_free_irq_routing:
                vfree(entries);
                break;
        }
#endif /* KVM_CAP_IRQ_ROUTING */
#ifdef __KVM_HAVE_MSIX
        case KVM_ASSIGN_SET_MSIX_NR: {
                struct kvm_assigned_msix_nr entry_nr;

                r = -EFAULT;
                if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
                        goto out;
                r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
                if (r)
                        goto out;
                break;
        }
        case KVM_ASSIGN_SET_MSIX_ENTRY: {
                struct kvm_assigned_msix_entry entry;

                r = -EFAULT;
                if (copy_from_user(&entry, argp, sizeof entry))
                        goto out;
                r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);