2 * Kernel-based Virtual Machine - device assignment support
4 * Copyright (C) 2006-9 Red Hat, Inc
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
11 #include <linux/kvm_host.h>
12 #include <linux/kvm.h>
13 #include <linux/uaccess.h>
14 #include <linux/vmalloc.h>
15 #include <linux/errno.h>
16 #include <linux/spinlock.h>
17 #include <linux/pci.h>
18 #include <linux/interrupt.h>
/*
 * kvm_find_assigned_dev - search a list of assigned devices by id.
 *
 * Walks the list rooted at head with list_for_each(), maps each node back to
 * its enclosing struct kvm_assigned_dev_kernel through the list member, and
 * compares that entry's assigned_dev_id with the requested assigned_dev_id.
 *
 * NOTE(review): the assigned_dev_id parameter declaration, the braces and
 * every return statement are absent from this listing. Presumably the
 * matching entry is returned, and NULL when nothing matches; confirm
 * against the complete file.
 */
21 static struct kvm_assigned_dev_kernel
*kvm_find_assigned_dev(struct list_head
*head
,
24 struct list_head
*ptr
;
25 struct kvm_assigned_dev_kernel
*match
;
27 list_for_each(ptr
, head
) {
28 match
= list_entry(ptr
, struct kvm_assigned_dev_kernel
, list
);
29 if (match
->assigned_dev_id
== assigned_dev_id
)
/*
 * find_index_from_host_irq - map a host irq number to an MSI-X table slot.
 *
 * Scans assigned_dev->host_msix_entries[0 .. entries_nr) for an entry whose
 * vector equals irq. A KERN_WARNING is printed on a path whose guarding
 * condition is not visible here (presumably: no entry matched).
 *
 * NOTE(review): the declaration of i, the body of the match branch and all
 * return statements are missing from this listing. kvm_assigned_dev_intr()
 * uses the result directly as an index into guest_msix_entries, so verify
 * what is returned when no vector matches.
 */
35 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
36 *assigned_dev
, int irq
)
39 struct msix_entry
*host_msix_entries
;
41 host_msix_entries
= assigned_dev
->host_msix_entries
;
44 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++)
45 if (irq
== host_msix_entries
[i
].vector
) {
50 printk(KERN_WARNING
"Fail to find correlated MSI-X entry!\n");
/*
 * kvm_assigned_dev_interrupt_work_handler - inject a pending host interrupt
 * into the guest from work-queue context.
 *
 * Recovers the owning struct kvm_assigned_dev_kernel from work with
 * container_of(), then, holding assigned_dev_lock via spin_lock_irq():
 *  - when irq_requested_type has KVM_DEV_IRQ_HOST_MSIX set, walks
 *    guest_msix_entries, tests KVM_ASSIGNED_MSIX_PENDING on each entry,
 *    clears that flag and raises the entry vector with kvm_set_irq(..., 1);
 *  - a second kvm_set_irq(..., guest_irq, 1) call raises the single guest
 *    irq (presumably the non-MSI-X branch; the else is not visible).
 *
 * NOTE(review): the container_of() member argument, the declarations of kvm
 * and i, and the statement controlled by the PENDING test are missing from
 * this listing; presumably entries that are not pending are skipped.
 * Confirm against the complete file.
 */
57 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct
*work
)
59 struct kvm_assigned_dev_kernel
*assigned_dev
;
63 assigned_dev
= container_of(work
, struct kvm_assigned_dev_kernel
,
65 kvm
= assigned_dev
->kvm
;
67 spin_lock_irq(&assigned_dev
->assigned_dev_lock
);
68 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSIX
) {
69 struct kvm_guest_msix_entry
*guest_entries
=
70 assigned_dev
->guest_msix_entries
;
71 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++) {
72 if (!(guest_entries
[i
].flags
&
73 KVM_ASSIGNED_MSIX_PENDING
))
75 guest_entries
[i
].flags
&= ~KVM_ASSIGNED_MSIX_PENDING
;
76 kvm_set_irq(assigned_dev
->kvm
,
77 assigned_dev
->irq_source_id
,
78 guest_entries
[i
].vector
, 1);
81 kvm_set_irq(assigned_dev
->kvm
, assigned_dev
->irq_source_id
,
82 assigned_dev
->guest_irq
, 1);
84 spin_unlock_irq(&assigned_dev
->assigned_dev_lock
);
/*
 * kvm_assigned_dev_intr - host interrupt handler for an assigned device.
 *
 * dev_id is the struct kvm_assigned_dev_kernel that was passed as the cookie
 * to request_irq(). With assigned_dev_lock held (spin_lock_irqsave):
 *  - for KVM_DEV_IRQ_HOST_MSIX, looks up the MSI-X slot for irq with
 *    find_index_from_host_irq() and sets KVM_ASSIGNED_MSIX_PENDING on the
 *    corresponding guest_msix_entries element;
 *  - schedules interrupt_work, whose handler performs the kvm_set_irq()
 *    calls;
 *  - for KVM_DEV_IRQ_GUEST_INTX, masks the host line with
 *    disable_irq_nosync() and records that in host_irq_disabled, which
 *    kvm_assigned_dev_ack_irq() later tests before re-enabling the line.
 *
 * NOTE(review): the declaration of flags, any validation of index before it
 * is used as an array subscript, and the return statement are missing from
 * this listing.
 */
87 static irqreturn_t
kvm_assigned_dev_intr(int irq
, void *dev_id
)
90 struct kvm_assigned_dev_kernel
*assigned_dev
=
91 (struct kvm_assigned_dev_kernel
*) dev_id
;
93 spin_lock_irqsave(&assigned_dev
->assigned_dev_lock
, flags
);
94 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSIX
) {
95 int index
= find_index_from_host_irq(assigned_dev
, irq
);
98 assigned_dev
->guest_msix_entries
[index
].flags
|=
99 KVM_ASSIGNED_MSIX_PENDING
;
102 schedule_work(&assigned_dev
->interrupt_work
);
104 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_INTX
) {
105 disable_irq_nosync(irq
);
106 assigned_dev
->host_irq_disabled
= true;
110 spin_unlock_irqrestore(&assigned_dev
->assigned_dev_lock
, flags
);
/*
 * kvm_assigned_dev_ack_irq - irq ack notifier callback.
 *
 * Installed as ack_notifier.irq_acked in kvm_vm_ioctl_assign_device().
 * Recovers the device from kian with container_of(), lowers the guest line
 * with kvm_set_irq(..., guest_irq, 0), then, under assigned_dev_lock,
 * re-enables the host irq and clears host_irq_disabled if that flag was set
 * (kvm_assigned_dev_intr() sets it after masking the line).
 *
 * NOTE(review): in this listing the comment that starts on the line
 * numbered 128 has lost its closing delimiter, and the container_of()
 * member argument, the flags declaration and any early-exit checks are not
 * visible.
 */
114 /* Ack the irq line for an assigned device */
115 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier
*kian
)
117 struct kvm_assigned_dev_kernel
*dev
;
123 dev
= container_of(kian
, struct kvm_assigned_dev_kernel
,
126 kvm_set_irq(dev
->kvm
, dev
->irq_source_id
, dev
->guest_irq
, 0);
128 /* The guest irq may be shared so this ack may be
129 * from another device.
131 spin_lock_irqsave(&dev
->assigned_dev_lock
, flags
);
132 if (dev
->host_irq_disabled
) {
133 enable_irq(dev
->host_irq
);
134 dev
->host_irq_disabled
= false;
136 spin_unlock_irqrestore(&dev
->assigned_dev_lock
, flags
);
/*
 * deassign_guest_irq - undo the guest side of an irq assignment.
 *
 * Unregisters the device ack notifier and resets its gsi to -1, releases the
 * irq source id through kvm_free_irq_source_id() unless it is already -1,
 * leaves irq_source_id at -1, and clears the KVM_DEV_IRQ_GUEST_MASK bits in
 * irq_requested_type.
 */
139 static void deassign_guest_irq(struct kvm
*kvm
,
140 struct kvm_assigned_dev_kernel
*assigned_dev
)
142 kvm_unregister_irq_ack_notifier(kvm
, &assigned_dev
->ack_notifier
);
143 assigned_dev
->ack_notifier
.gsi
= -1;
145 if (assigned_dev
->irq_source_id
!= -1)
146 kvm_free_irq_source_id(kvm
, assigned_dev
->irq_source_id
);
147 assigned_dev
->irq_source_id
= -1;
148 assigned_dev
->irq_requested_type
&= ~(KVM_DEV_IRQ_GUEST_MASK
);
/*
 * deassign_host_irq - tear down the host side of an irq assignment.
 *
 * MSI-X (KVM_DEV_IRQ_HOST_MSIX set): disables every host vector with
 * disable_irq_nosync(), waits for interrupt_work with cancel_work_sync(),
 * frees each vector, zeroes entries_nr, kfree()s host_msix_entries and
 * guest_msix_entries, and calls pci_disable_msix().
 * MSI and INTx: the same disable, cancel, free_irq sequence on host_irq,
 * followed by pci_disable_msi() when KVM_DEV_IRQ_HOST_MSI is set.
 * Finally the KVM_DEV_IRQ_HOST_MASK bits are cleared from
 * irq_requested_type.
 *
 * NOTE(review): the two entry-array pointers are not visibly reset to NULL
 * after kfree(); confirm nothing dereferences or frees them again. In this
 * listing the long explanatory comment below has lost its opening
 * delimiter, and the declaration of i and the else keyword separating the
 * two branches are not visible.
 */
151 /* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
152 static void deassign_host_irq(struct kvm
*kvm
,
153 struct kvm_assigned_dev_kernel
*assigned_dev
)
156 * In kvm_free_device_irq, cancel_work_sync return true if:
157 * 1. work is scheduled, and then cancelled.
158 * 2. work callback is executed.
160 * The first one ensured that the irq is disabled and no more events
161 * would happen. But for the second one, the irq may be enabled (e.g.
162 * for MSI). So we disable irq here to prevent further events.
164 * Notice this maybe result in nested disable if the interrupt type is
165 * INTx, but it's OK for we are going to free it.
167 * If this function is a part of VM destroy, please ensure that till
168 * now, the kvm state is still legal for probably we also have to wait
169 * interrupt_work done.
171 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSIX
) {
173 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++)
174 disable_irq_nosync(assigned_dev
->
175 host_msix_entries
[i
].vector
);
177 cancel_work_sync(&assigned_dev
->interrupt_work
);
179 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++)
180 free_irq(assigned_dev
->host_msix_entries
[i
].vector
,
181 (void *)assigned_dev
);
183 assigned_dev
->entries_nr
= 0;
184 kfree(assigned_dev
->host_msix_entries
);
185 kfree(assigned_dev
->guest_msix_entries
);
186 pci_disable_msix(assigned_dev
->dev
);
188 /* Deal with MSI and INTx */
189 disable_irq_nosync(assigned_dev
->host_irq
);
190 cancel_work_sync(&assigned_dev
->interrupt_work
);
192 free_irq(assigned_dev
->host_irq
, (void *)assigned_dev
);
194 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSI
)
195 pci_disable_msi(assigned_dev
->dev
);
198 assigned_dev
->irq_requested_type
&= ~(KVM_DEV_IRQ_HOST_MASK
);
/*
 * kvm_deassign_irq - deassign the host and, or, guest irq of a device.
 *
 * Tests irqchip_in_kernel() and whether the device has any irq assignment
 * at all (the bodies of both checks are not visible here). Splits the
 * irq_requested_type argument into its KVM_DEV_IRQ_HOST_MASK and
 * KVM_DEV_IRQ_GUEST_MASK parts and calls deassign_host_irq() and
 * deassign_guest_irq().
 *
 * NOTE(review): the conditions that select each deassign call and all
 * return statements are missing from this listing; presumably each call is
 * made only when the corresponding part of the mask is non-zero. Confirm
 * against the complete file.
 */
201 static int kvm_deassign_irq(struct kvm
*kvm
,
202 struct kvm_assigned_dev_kernel
*assigned_dev
,
203 unsigned long irq_requested_type
)
205 unsigned long guest_irq_type
, host_irq_type
;
207 if (!irqchip_in_kernel(kvm
))
209 /* no irq assignment to deassign */
210 if (!assigned_dev
->irq_requested_type
)
213 host_irq_type
= irq_requested_type
& KVM_DEV_IRQ_HOST_MASK
;
214 guest_irq_type
= irq_requested_type
& KVM_DEV_IRQ_GUEST_MASK
;
217 deassign_host_irq(kvm
, assigned_dev
);
219 deassign_guest_irq(kvm
, assigned_dev
);
/*
 * kvm_free_assigned_irq - release every irq currently assigned to a device.
 *
 * Passes the device's own irq_requested_type to kvm_deassign_irq(), so every
 * host and guest type recorded there is requested for deassignment. The int
 * result of kvm_deassign_irq() is not used.
 */
224 static void kvm_free_assigned_irq(struct kvm
*kvm
,
225 struct kvm_assigned_dev_kernel
*assigned_dev
)
227 kvm_deassign_irq(kvm
, assigned_dev
, assigned_dev
->irq_requested_type
);
/*
 * kvm_free_assigned_device - release the resources held for one device.
 *
 * In order: frees its irqs (kvm_free_assigned_irq), resets the PCI function,
 * releases its regions, disables the PCI device, drops the pci_dev reference
 * with pci_dev_put(), and unlinks the entry from its list.
 *
 * NOTE(review): the line naming the second parameter and any kfree() of
 * assigned_dev are not visible in this listing; confirm that the structure
 * allocated with kzalloc() in kvm_vm_ioctl_assign_device() is freed.
 */
230 static void kvm_free_assigned_device(struct kvm
*kvm
,
231 struct kvm_assigned_dev_kernel
234 kvm_free_assigned_irq(kvm
, assigned_dev
);
236 pci_reset_function(assigned_dev
->dev
);
238 pci_release_regions(assigned_dev
->dev
);
239 pci_disable_device(assigned_dev
->dev
);
240 pci_dev_put(assigned_dev
->dev
);
242 list_del(&assigned_dev
->list
);
/*
 * kvm_free_all_assigned_devices - free every device assigned to a VM.
 *
 * Iterates kvm->arch.assigned_dev_head with list_for_each_safe(), which is
 * needed because kvm_free_assigned_device() removes the current entry from
 * the list with list_del().
 *
 * NOTE(review): the list_entry() member argument and the closing braces are
 * missing from this listing.
 */
246 void kvm_free_all_assigned_devices(struct kvm
*kvm
)
248 struct list_head
*ptr
, *ptr2
;
249 struct kvm_assigned_dev_kernel
*assigned_dev
;
251 list_for_each_safe(ptr
, ptr2
, &kvm
->arch
.assigned_dev_head
) {
252 assigned_dev
= list_entry(ptr
,
253 struct kvm_assigned_dev_kernel
,
256 kvm_free_assigned_device(kvm
, assigned_dev
);
/*
 * assigned_device_enable_host_intx - claim the legacy INTx line of a device.
 *
 * Copies dev->dev->irq into host_irq and calls request_irq() with flags 0
 * (no sharing flag, for the reason given in the existing comment), handler
 * kvm_assigned_dev_intr, and dev as the cookie.
 *
 * NOTE(review): the return statements are not visible, and the existing
 * comment has lost its closing delimiter in this listing.
 */
260 static int assigned_device_enable_host_intx(struct kvm
*kvm
,
261 struct kvm_assigned_dev_kernel
*dev
)
263 dev
->host_irq
= dev
->dev
->irq
;
264 /* Even though this is PCI, we don't want to use shared
265 * interrupts. Sharing host devices with guest-assigned devices
266 * on the same interrupt line is not a happy situation: there
267 * are going to be long delays in accepting, acking, etc.
269 if (request_irq(dev
->host_irq
, kvm_assigned_dev_intr
,
270 0, "kvm_assigned_intx_device", (void *)dev
))
/*
 * assigned_device_enable_host_msi - enable MSI on the host device.
 *
 * Compiled only when __KVM_HAVE_MSI is defined. Calls pci_enable_msi()
 * unless dev->dev->msi_enabled is already set, records dev->dev->irq as
 * host_irq, then registers kvm_assigned_dev_intr with request_irq(); if
 * that registration fails, MSI is turned off again with pci_disable_msi().
 *
 * NOTE(review): the declaration of r, the handling of a pci_enable_msi()
 * failure, the return statements and the matching #endif are not visible
 * in this listing.
 */
275 #ifdef __KVM_HAVE_MSI
276 static int assigned_device_enable_host_msi(struct kvm
*kvm
,
277 struct kvm_assigned_dev_kernel
*dev
)
281 if (!dev
->dev
->msi_enabled
) {
282 r
= pci_enable_msi(dev
->dev
);
287 dev
->host_irq
= dev
->dev
->irq
;
288 if (request_irq(dev
->host_irq
, kvm_assigned_dev_intr
, 0,
289 "kvm_assigned_msi_device", (void *)dev
)) {
290 pci_disable_msi(dev
->dev
);
/*
 * assigned_device_enable_host_msix - enable MSI-X on the host device.
 *
 * Compiled only when __KVM_HAVE_MSIX is defined. Checks entries_nr for zero
 * (the outcome of that check is not visible), calls pci_enable_msix() with
 * host_msix_entries and entries_nr, then requests one irq per entry vector
 * with kvm_assigned_dev_intr as the handler.
 *
 * NOTE(review): per the existing FIXME, irqs that were already requested
 * are not released when a later request_irq() fails. The cookie argument of
 * request_irq(), the error checks, the return statements and the matching
 * #endif are missing from this listing.
 */
298 #ifdef __KVM_HAVE_MSIX
299 static int assigned_device_enable_host_msix(struct kvm
*kvm
,
300 struct kvm_assigned_dev_kernel
*dev
)
304 /* host_msix_entries and guest_msix_entries should have been
306 if (dev
->entries_nr
== 0)
309 r
= pci_enable_msix(dev
->dev
, dev
->host_msix_entries
, dev
->entries_nr
);
313 for (i
= 0; i
< dev
->entries_nr
; i
++) {
314 r
= request_irq(dev
->host_msix_entries
[i
].vector
,
315 kvm_assigned_dev_intr
, 0,
316 "kvm_assigned_msix_device",
318 /* FIXME: free requested_irq's on failure */
/*
 * assigned_device_enable_guest_intx - record the guest INTx routing.
 *
 * Stores irq->guest_irq both as the guest irq of the device and as the gsi
 * of its ack notifier.
 *
 * NOTE(review): the return statement is not visible in this listing.
 */
328 static int assigned_device_enable_guest_intx(struct kvm
*kvm
,
329 struct kvm_assigned_dev_kernel
*dev
,
330 struct kvm_assigned_irq
*irq
)
332 dev
->guest_irq
= irq
->guest_irq
;
333 dev
->ack_notifier
.gsi
= irq
->guest_irq
;
/*
 * assigned_device_enable_guest_msi - record the guest MSI routing.
 *
 * Compiled only when __KVM_HAVE_MSI is defined. Stores irq->guest_irq as the
 * guest irq, sets the ack notifier gsi to -1, and clears host_irq_disabled.
 *
 * NOTE(review): the return statement and the matching #endif are not
 * visible in this listing.
 */
337 #ifdef __KVM_HAVE_MSI
338 static int assigned_device_enable_guest_msi(struct kvm
*kvm
,
339 struct kvm_assigned_dev_kernel
*dev
,
340 struct kvm_assigned_irq
*irq
)
342 dev
->guest_irq
= irq
->guest_irq
;
343 dev
->ack_notifier
.gsi
= -1;
344 dev
->host_irq_disabled
= false;
/*
 * assigned_device_enable_guest_msix - record the guest MSI-X routing.
 *
 * Compiled only when __KVM_HAVE_MSIX is defined. Performs the same three
 * stores as the MSI variant: guest_irq from irq->guest_irq, ack notifier gsi
 * set to -1, and host_irq_disabled cleared.
 *
 * NOTE(review): the return statement and the matching #endif are not
 * visible in this listing.
 */
349 #ifdef __KVM_HAVE_MSIX
350 static int assigned_device_enable_guest_msix(struct kvm
*kvm
,
351 struct kvm_assigned_dev_kernel
*dev
,
352 struct kvm_assigned_irq
*irq
)
354 dev
->guest_irq
= irq
->guest_irq
;
355 dev
->ack_notifier
.gsi
= -1;
356 dev
->host_irq_disabled
= false;
/*
 * assign_host_irq - set up the host side of an irq assignment.
 *
 * Checks whether a KVM_DEV_IRQ_HOST_MASK type is already recorded in
 * dev->irq_requested_type, dispatches on host_irq_type to the INTx, MSI
 * (if __KVM_HAVE_MSI) or MSI-X (if __KVM_HAVE_MSIX) enable helper, and ORs
 * host_irq_type into irq_requested_type.
 *
 * NOTE(review): the host_irq_type parameter declaration, the result of the
 * already-assigned check, the break and default arms of the switch, the
 * #endif lines, and the condition under which irq_requested_type is updated
 * are missing from this listing; presumably the update happens only when
 * the helper succeeded. Confirm against the complete file.
 */
361 static int assign_host_irq(struct kvm
*kvm
,
362 struct kvm_assigned_dev_kernel
*dev
,
367 if (dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MASK
)
370 switch (host_irq_type
) {
371 case KVM_DEV_IRQ_HOST_INTX
:
372 r
= assigned_device_enable_host_intx(kvm
, dev
);
374 #ifdef __KVM_HAVE_MSI
375 case KVM_DEV_IRQ_HOST_MSI
:
376 r
= assigned_device_enable_host_msi(kvm
, dev
);
379 #ifdef __KVM_HAVE_MSIX
380 case KVM_DEV_IRQ_HOST_MSIX
:
381 r
= assigned_device_enable_host_msix(kvm
, dev
);
389 dev
->irq_requested_type
|= host_irq_type
;
/*
 * assign_guest_irq - set up the guest side of an irq assignment.
 *
 * Checks whether a KVM_DEV_IRQ_GUEST_MASK type is already recorded, obtains
 * an irq source id with kvm_request_irq_source_id() and stores it in
 * dev->irq_source_id, dispatches on guest_irq_type to the matching
 * assigned_device_enable_guest_* helper, then ORs guest_irq_type into
 * irq_requested_type and registers the ack notifier.
 * kvm_free_irq_source_id() is also called (presumably only on the failure
 * path; the selecting condition is not visible).
 *
 * NOTE(review): the declarations of id and r, the validation of id, the
 * break and default arms of the switch, the success test and the return
 * statement are missing from this listing. No visible line resets
 * dev->irq_source_id to -1 after kvm_free_irq_source_id(); since
 * deassign_guest_irq() frees any id that is not -1, confirm the id cannot
 * be freed a second time.
 */
394 static int assign_guest_irq(struct kvm
*kvm
,
395 struct kvm_assigned_dev_kernel
*dev
,
396 struct kvm_assigned_irq
*irq
,
397 unsigned long guest_irq_type
)
402 if (dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_MASK
)
405 id
= kvm_request_irq_source_id(kvm
);
409 dev
->irq_source_id
= id
;
411 switch (guest_irq_type
) {
412 case KVM_DEV_IRQ_GUEST_INTX
:
413 r
= assigned_device_enable_guest_intx(kvm
, dev
, irq
);
415 #ifdef __KVM_HAVE_MSI
416 case KVM_DEV_IRQ_GUEST_MSI
:
417 r
= assigned_device_enable_guest_msi(kvm
, dev
, irq
);
420 #ifdef __KVM_HAVE_MSIX
421 case KVM_DEV_IRQ_GUEST_MSIX
:
422 r
= assigned_device_enable_guest_msix(kvm
, dev
, irq
);
430 dev
->irq_requested_type
|= guest_irq_type
;
431 kvm_register_irq_ack_notifier(kvm
, &dev
->ack_notifier
);
433 kvm_free_irq_source_id(kvm
, dev
->irq_source_id
);
/*
 * kvm_vm_ioctl_assign_irq - handle an irq assignment request for a device.
 *
 * Tests capable(CAP_SYS_RAWIO) and irqchip_in_kernel() before taking
 * kvm->lock (what is returned when either test fails is not visible).
 * Under the lock it looks up the device named by
 * assigned_irq->assigned_dev_id, splits assigned_irq->flags into host and
 * guest type masks, uses hweight_long() to check that neither mask has more
 * than one bit set and tests for both masks being zero, then calls
 * assign_host_irq() and assign_guest_irq() before unlocking.
 *
 * NOTE(review): the declaration of r, the handling of a failed lookup, the
 * bodies of the validation checks, the conditions guarding the two assign
 * calls, the exit label and the return statement are missing from this
 * listing.
 */
438 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
439 static int kvm_vm_ioctl_assign_irq(struct kvm
*kvm
,
440 struct kvm_assigned_irq
*assigned_irq
)
443 struct kvm_assigned_dev_kernel
*match
;
444 unsigned long host_irq_type
, guest_irq_type
;
446 if (!capable(CAP_SYS_RAWIO
))
449 if (!irqchip_in_kernel(kvm
))
452 mutex_lock(&kvm
->lock
);
454 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
455 assigned_irq
->assigned_dev_id
);
459 host_irq_type
= (assigned_irq
->flags
& KVM_DEV_IRQ_HOST_MASK
);
460 guest_irq_type
= (assigned_irq
->flags
& KVM_DEV_IRQ_GUEST_MASK
);
463 /* can only assign one type at a time */
464 if (hweight_long(host_irq_type
) > 1)
466 if (hweight_long(guest_irq_type
) > 1)
468 if (host_irq_type
== 0 && guest_irq_type
== 0)
473 r
= assign_host_irq(kvm
, match
, host_irq_type
);
478 r
= assign_guest_irq(kvm
, match
, assigned_irq
, guest_irq_type
);
480 mutex_unlock(&kvm
->lock
);
/*
 * kvm_vm_ioctl_deassign_dev_irq - handle an irq deassignment request.
 *
 * Under kvm->lock, looks up the device named by
 * assigned_irq->assigned_dev_id and passes assigned_irq->flags to
 * kvm_deassign_irq(), storing the result in r.
 *
 * NOTE(review): the line naming the second parameter, the declaration of r,
 * the handling of a failed lookup and the return statement are missing from
 * this listing.
 */
484 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm
*kvm
,
485 struct kvm_assigned_irq
489 struct kvm_assigned_dev_kernel
*match
;
491 mutex_lock(&kvm
->lock
);
493 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
494 assigned_irq
->assigned_dev_id
);
498 r
= kvm_deassign_irq(kvm
, match
, assigned_irq
->flags
);
500 mutex_unlock(&kvm
->lock
);
/*
 * kvm_vm_ioctl_assign_device - assign a host PCI device to a VM.
 *
 * Holding kvm->lock and slots_lock (read side), the visible steps are:
 *  1. look up assigned_dev->assigned_dev_id in the list of assigned devices
 *     (the existing comment marks the already-assigned case);
 *  2. kzalloc() a new struct kvm_assigned_dev_kernel;
 *  3. find the host device with pci_get_bus_and_slot(busnr, devfn), enable
 *     it, request its regions and reset the function;
 *  4. copy the id, bus number, devfn and flags into the new entry,
 *     initialise its spinlock, set irq_source_id to -1, install
 *     kvm_assigned_dev_ack_irq as the ack callback and
 *     kvm_assigned_dev_interrupt_work_handler as the work function;
 *  5. add the entry to kvm->arch.assigned_dev_head;
 *  6. when KVM_DEV_ASSIGN_ENABLE_IOMMU is requested, call
 *     kvm_iommu_map_guest() if no iommu_domain exists yet, then
 *     kvm_assign_device().
 * Both locks are released on the normal path and again on an error path
 * that first undoes list_add, pci_request_regions and pci_enable_device.
 *
 * NOTE(review): the declarations of r and dev, every error test, goto,
 * label and return statement, and the stores to match->dev and match->kvm
 * (both fields are read by other functions in this file) are missing from
 * this listing. No kfree() of match or pci_dev_put() of dev is visible on
 * the error path either; confirm against the complete file.
 */
504 static int kvm_vm_ioctl_assign_device(struct kvm
*kvm
,
505 struct kvm_assigned_pci_dev
*assigned_dev
)
508 struct kvm_assigned_dev_kernel
*match
;
511 mutex_lock(&kvm
->lock
);
512 down_read(&kvm
->slots_lock
);
514 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
515 assigned_dev
->assigned_dev_id
);
517 /* device already assigned */
522 match
= kzalloc(sizeof(struct kvm_assigned_dev_kernel
), GFP_KERNEL
);
524 printk(KERN_INFO
"%s: Couldn't allocate memory\n",
529 dev
= pci_get_bus_and_slot(assigned_dev
->busnr
,
530 assigned_dev
->devfn
);
532 printk(KERN_INFO
"%s: host device not found\n", __func__
);
536 if (pci_enable_device(dev
)) {
537 printk(KERN_INFO
"%s: Could not enable PCI device\n", __func__
);
541 r
= pci_request_regions(dev
, "kvm_assigned_device");
543 printk(KERN_INFO
"%s: Could not get access to device regions\n",
548 pci_reset_function(dev
);
550 match
->assigned_dev_id
= assigned_dev
->assigned_dev_id
;
551 match
->host_busnr
= assigned_dev
->busnr
;
552 match
->host_devfn
= assigned_dev
->devfn
;
553 match
->flags
= assigned_dev
->flags
;
555 spin_lock_init(&match
->assigned_dev_lock
);
556 match
->irq_source_id
= -1;
558 match
->ack_notifier
.irq_acked
= kvm_assigned_dev_ack_irq
;
559 INIT_WORK(&match
->interrupt_work
,
560 kvm_assigned_dev_interrupt_work_handler
);
562 list_add(&match
->list
, &kvm
->arch
.assigned_dev_head
);
564 if (assigned_dev
->flags
& KVM_DEV_ASSIGN_ENABLE_IOMMU
) {
565 if (!kvm
->arch
.iommu_domain
) {
566 r
= kvm_iommu_map_guest(kvm
);
570 r
= kvm_assign_device(kvm
, match
);
576 up_read(&kvm
->slots_lock
);
577 mutex_unlock(&kvm
->lock
);
580 list_del(&match
->list
);
581 pci_release_regions(dev
);
583 pci_disable_device(dev
);
588 up_read(&kvm
->slots_lock
);
589 mutex_unlock(&kvm
->lock
);
/*
 * kvm_vm_ioctl_deassign_device - remove a previously assigned device.
 *
 * Under kvm->lock, looks up assigned_dev->assigned_dev_id; a KERN_INFO
 * message covers the case where the device was never assigned (the test
 * that selects it is not visible). For a device whose flags include
 * KVM_DEV_ASSIGN_ENABLE_IOMMU it calls kvm_deassign_device(), then releases
 * everything with kvm_free_assigned_device().
 *
 * NOTE(review): the declaration of the result variable, the failed-lookup
 * test with its error code, the exit label and the return statement are
 * missing from this listing.
 */
593 static int kvm_vm_ioctl_deassign_device(struct kvm
*kvm
,
594 struct kvm_assigned_pci_dev
*assigned_dev
)
597 struct kvm_assigned_dev_kernel
*match
;
599 mutex_lock(&kvm
->lock
);
601 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
602 assigned_dev
->assigned_dev_id
);
604 printk(KERN_INFO
"%s: device hasn't been assigned before, "
605 "so cannot be deassigned\n", __func__
);
610 if (match
->flags
& KVM_DEV_ASSIGN_ENABLE_IOMMU
)
611 kvm_deassign_device(kvm
, match
);
613 kvm_free_assigned_device(kvm
, match
);
616 mutex_unlock(&kvm
->lock
);
/*
 * kvm_vm_ioctl_set_msix_nr - set the number of MSI-X entries of a device.
 *
 * Compiled only when __KVM_HAVE_MSIX is defined. Under kvm->lock, looks up
 * the device; if its entries_nr is still 0, copies entry_nr->entry_nr into
 * it, checks the value against 0 and KVM_MAX_MSIX_PER_DEV, and allocates
 * zeroed host_msix_entries and guest_msix_entries arrays, freeing the host
 * array if the guest allocation fails. A device whose entries_nr is already
 * non-zero takes the else branch (the number may not be set twice).
 *
 * NOTE(review): entries_nr is written before it is range-checked and before
 * the allocations, and no visible line resets it, or the freed
 * host_msix_entries pointer, on those failure paths. deassign_host_irq()
 * loops over entries_nr and kfree()s both arrays, so confirm the failure
 * paths cannot leave a stale count or pointer behind. The declaration of r,
 * the failed-lookup handling, the error codes, the exit label and the
 * return statement are missing from this listing.
 */
621 #ifdef __KVM_HAVE_MSIX
622 static int kvm_vm_ioctl_set_msix_nr(struct kvm
*kvm
,
623 struct kvm_assigned_msix_nr
*entry_nr
)
626 struct kvm_assigned_dev_kernel
*adev
;
628 mutex_lock(&kvm
->lock
);
630 adev
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
631 entry_nr
->assigned_dev_id
);
637 if (adev
->entries_nr
== 0) {
638 adev
->entries_nr
= entry_nr
->entry_nr
;
639 if (adev
->entries_nr
== 0 ||
640 adev
->entries_nr
>= KVM_MAX_MSIX_PER_DEV
) {
645 adev
->host_msix_entries
= kzalloc(sizeof(struct msix_entry
) *
648 if (!adev
->host_msix_entries
) {
652 adev
->guest_msix_entries
= kzalloc(
653 sizeof(struct kvm_guest_msix_entry
) *
654 entry_nr
->entry_nr
, GFP_KERNEL
);
655 if (!adev
->guest_msix_entries
) {
656 kfree(adev
->host_msix_entries
);
660 } else /* Not allowed set MSI-X number twice */
663 mutex_unlock(&kvm
->lock
);
/*
 * kvm_vm_ioctl_set_msix_entry - bind one MSI-X table entry to a guest gsi.
 *
 * Under kvm->lock, looks up the device and scans guest_msix_entries for the
 * first slot whose vector is 0 or whose entry number already equals
 * entry->entry. That slot receives entry->entry and entry->gsi (stored as
 * the guest vector), and the host_msix_entries slot at the same index
 * receives entry->entry. Reaching i == entries_nr means no slot qualified.
 *
 * A guest vector of 0 acts as the free-slot marker here; the arrays are
 * allocated zeroed with kzalloc() in kvm_vm_ioctl_set_msix_nr().
 *
 * NOTE(review): the declarations of r and i, the failed-lookup handling,
 * the break that ends the scan, the error code for the no-slot case, the
 * exit label and the return statement are missing from this listing.
 */
667 static int kvm_vm_ioctl_set_msix_entry(struct kvm
*kvm
,
668 struct kvm_assigned_msix_entry
*entry
)
671 struct kvm_assigned_dev_kernel
*adev
;
673 mutex_lock(&kvm
->lock
);
675 adev
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
676 entry
->assigned_dev_id
);
683 for (i
= 0; i
< adev
->entries_nr
; i
++)
684 if (adev
->guest_msix_entries
[i
].vector
== 0 ||
685 adev
->guest_msix_entries
[i
].entry
== entry
->entry
) {
686 adev
->guest_msix_entries
[i
].entry
= entry
->entry
;
687 adev
->guest_msix_entries
[i
].vector
= entry
->gsi
;
688 adev
->host_msix_entries
[i
].entry
= entry
->entry
;
691 if (i
== adev
->entries_nr
) {
697 mutex_unlock(&kvm
->lock
);
703 long kvm_vm_ioctl_assigned_device(struct kvm
*kvm
, unsigned ioctl
,
706 void __user
*argp
= (void __user
*)arg
;
710 case KVM_ASSIGN_PCI_DEVICE
: {
711 struct kvm_assigned_pci_dev assigned_dev
;
714 if (copy_from_user(&assigned_dev
, argp
, sizeof assigned_dev
))
716 r
= kvm_vm_ioctl_assign_device(kvm
, &assigned_dev
);
721 case KVM_ASSIGN_IRQ
: {
725 #ifdef KVM_CAP_ASSIGN_DEV_IRQ
726 case KVM_ASSIGN_DEV_IRQ
: {
727 struct kvm_assigned_irq assigned_irq
;
730 if (copy_from_user(&assigned_irq
, argp
, sizeof assigned_irq
))
732 r
= kvm_vm_ioctl_assign_irq(kvm
, &assigned_irq
);
737 case KVM_DEASSIGN_DEV_IRQ
: {
738 struct kvm_assigned_irq assigned_irq
;
741 if (copy_from_user(&assigned_irq
, argp
, sizeof assigned_irq
))
743 r
= kvm_vm_ioctl_deassign_dev_irq(kvm
, &assigned_irq
);
749 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
750 case KVM_DEASSIGN_PCI_DEVICE
: {
751 struct kvm_assigned_pci_dev assigned_dev
;
754 if (copy_from_user(&assigned_dev
, argp
, sizeof assigned_dev
))
756 r
= kvm_vm_ioctl_deassign_device(kvm
, &assigned_dev
);
762 #ifdef KVM_CAP_IRQ_ROUTING
763 case KVM_SET_GSI_ROUTING
: {
764 struct kvm_irq_routing routing
;
765 struct kvm_irq_routing __user
*urouting
;
766 struct kvm_irq_routing_entry
*entries
;
769 if (copy_from_user(&routing
, argp
, sizeof(routing
)))
772 if (routing
.nr
>= KVM_MAX_IRQ_ROUTES
)
777 entries
= vmalloc(routing
.nr
* sizeof(*entries
));
782 if (copy_from_user(entries
, urouting
->entries
,
783 routing
.nr
* sizeof(*entries
)))
784 goto out_free_irq_routing
;
785 r
= kvm_set_irq_routing(kvm
, entries
, routing
.nr
,
787 out_free_irq_routing
:
791 #endif /* KVM_CAP_IRQ_ROUTING */
792 #ifdef __KVM_HAVE_MSIX
793 case KVM_ASSIGN_SET_MSIX_NR
: {
794 struct kvm_assigned_msix_nr entry_nr
;
796 if (copy_from_user(&entry_nr
, argp
, sizeof entry_nr
))
798 r
= kvm_vm_ioctl_set_msix_nr(kvm
, &entry_nr
);
803 case KVM_ASSIGN_SET_MSIX_ENTRY
: {
804 struct kvm_assigned_msix_entry entry
;
806 if (copy_from_user(&entry
, argp
, sizeof entry
))
808 r
= kvm_vm_ioctl_set_msix_entry(kvm
, &entry
);