2 * Kernel-based Virtual Machine - device assignment support
4 * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
11 #include <linux/kvm_host.h>
12 #include <linux/kvm.h>
13 #include <linux/uaccess.h>
14 #include <linux/vmalloc.h>
15 #include <linux/errno.h>
16 #include <linux/spinlock.h>
17 #include <linux/pci.h>
18 #include <linux/interrupt.h>
19 #include <linux/slab.h>
22 static struct kvm_assigned_dev_kernel
*kvm_find_assigned_dev(struct list_head
*head
,
25 struct list_head
*ptr
;
26 struct kvm_assigned_dev_kernel
*match
;
28 list_for_each(ptr
, head
) {
29 match
= list_entry(ptr
, struct kvm_assigned_dev_kernel
, list
);
30 if (match
->assigned_dev_id
== assigned_dev_id
)
36 static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
37 *assigned_dev
, int irq
)
40 struct msix_entry
*host_msix_entries
;
42 host_msix_entries
= assigned_dev
->host_msix_entries
;
45 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++)
46 if (irq
== host_msix_entries
[i
].vector
) {
51 printk(KERN_WARNING
"Fail to find correlated MSI-X entry!\n");
58 static irqreturn_t
kvm_assigned_dev_thread(int irq
, void *dev_id
)
60 struct kvm_assigned_dev_kernel
*assigned_dev
= dev_id
;
64 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_INTX
) {
65 spin_lock(&assigned_dev
->intx_lock
);
66 disable_irq_nosync(irq
);
67 assigned_dev
->host_irq_disabled
= true;
68 spin_unlock(&assigned_dev
->intx_lock
);
71 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSIX
) {
72 index
= find_index_from_host_irq(assigned_dev
, irq
);
74 vector
= assigned_dev
->
75 guest_msix_entries
[index
].vector
;
76 kvm_set_irq(assigned_dev
->kvm
,
77 assigned_dev
->irq_source_id
, vector
, 1);
80 kvm_set_irq(assigned_dev
->kvm
, assigned_dev
->irq_source_id
,
81 assigned_dev
->guest_irq
, 1);
86 /* Ack the irq line for an assigned device */
87 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier
*kian
)
89 struct kvm_assigned_dev_kernel
*dev
;
94 dev
= container_of(kian
, struct kvm_assigned_dev_kernel
,
97 kvm_set_irq(dev
->kvm
, dev
->irq_source_id
, dev
->guest_irq
, 0);
99 /* The guest irq may be shared so this ack may be
100 * from another device.
102 spin_lock(&dev
->intx_lock
);
103 if (dev
->host_irq_disabled
) {
104 enable_irq(dev
->host_irq
);
105 dev
->host_irq_disabled
= false;
107 spin_unlock(&dev
->intx_lock
);
110 static void deassign_guest_irq(struct kvm
*kvm
,
111 struct kvm_assigned_dev_kernel
*assigned_dev
)
113 kvm_unregister_irq_ack_notifier(kvm
, &assigned_dev
->ack_notifier
);
114 assigned_dev
->ack_notifier
.gsi
= -1;
116 kvm_set_irq(assigned_dev
->kvm
, assigned_dev
->irq_source_id
,
117 assigned_dev
->guest_irq
, 0);
119 if (assigned_dev
->irq_source_id
!= -1)
120 kvm_free_irq_source_id(kvm
, assigned_dev
->irq_source_id
);
121 assigned_dev
->irq_source_id
= -1;
122 assigned_dev
->irq_requested_type
&= ~(KVM_DEV_IRQ_GUEST_MASK
);
125 /* This function implicitly holds the kvm->lock mutex due to cancel_work_sync() */
126 static void deassign_host_irq(struct kvm
*kvm
,
127 struct kvm_assigned_dev_kernel
*assigned_dev
)
130 * We disable irq here to prevent further events.
132 * Note that this may result in a nested disable if the interrupt type is
133 * INTx, but that is OK because we are going to free it.
135 * If this function is called as part of VM destruction, make sure the kvm
136 * state is still valid at this point, because we may also have to wait
137 * for a currently running IRQ handler.
139 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSIX
) {
141 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++)
142 disable_irq(assigned_dev
->host_msix_entries
[i
].vector
);
144 for (i
= 0; i
< assigned_dev
->entries_nr
; i
++)
145 free_irq(assigned_dev
->host_msix_entries
[i
].vector
,
146 (void *)assigned_dev
);
148 assigned_dev
->entries_nr
= 0;
149 kfree(assigned_dev
->host_msix_entries
);
150 kfree(assigned_dev
->guest_msix_entries
);
151 pci_disable_msix(assigned_dev
->dev
);
153 /* Deal with MSI and INTx */
154 disable_irq(assigned_dev
->host_irq
);
156 free_irq(assigned_dev
->host_irq
, (void *)assigned_dev
);
158 if (assigned_dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MSI
)
159 pci_disable_msi(assigned_dev
->dev
);
162 assigned_dev
->irq_requested_type
&= ~(KVM_DEV_IRQ_HOST_MASK
);
165 static int kvm_deassign_irq(struct kvm
*kvm
,
166 struct kvm_assigned_dev_kernel
*assigned_dev
,
167 unsigned long irq_requested_type
)
169 unsigned long guest_irq_type
, host_irq_type
;
171 if (!irqchip_in_kernel(kvm
))
173 /* no irq assignment to deassign */
174 if (!assigned_dev
->irq_requested_type
)
177 host_irq_type
= irq_requested_type
& KVM_DEV_IRQ_HOST_MASK
;
178 guest_irq_type
= irq_requested_type
& KVM_DEV_IRQ_GUEST_MASK
;
181 deassign_host_irq(kvm
, assigned_dev
);
183 deassign_guest_irq(kvm
, assigned_dev
);
188 static void kvm_free_assigned_irq(struct kvm
*kvm
,
189 struct kvm_assigned_dev_kernel
*assigned_dev
)
191 kvm_deassign_irq(kvm
, assigned_dev
, assigned_dev
->irq_requested_type
);
194 static void kvm_free_assigned_device(struct kvm
*kvm
,
195 struct kvm_assigned_dev_kernel
198 kvm_free_assigned_irq(kvm
, assigned_dev
);
200 pci_reset_function(assigned_dev
->dev
);
202 pci_release_regions(assigned_dev
->dev
);
203 pci_disable_device(assigned_dev
->dev
);
204 pci_dev_put(assigned_dev
->dev
);
206 list_del(&assigned_dev
->list
);
210 void kvm_free_all_assigned_devices(struct kvm
*kvm
)
212 struct list_head
*ptr
, *ptr2
;
213 struct kvm_assigned_dev_kernel
*assigned_dev
;
215 list_for_each_safe(ptr
, ptr2
, &kvm
->arch
.assigned_dev_head
) {
216 assigned_dev
= list_entry(ptr
,
217 struct kvm_assigned_dev_kernel
,
220 kvm_free_assigned_device(kvm
, assigned_dev
);
224 static int assigned_device_enable_host_intx(struct kvm
*kvm
,
225 struct kvm_assigned_dev_kernel
*dev
)
227 dev
->host_irq
= dev
->dev
->irq
;
228 /* Even though this is PCI, we don't want to use shared
229 * interrupts. Sharing host devices with guest-assigned devices
230 * on the same interrupt line is not a happy situation: there
231 * are going to be long delays in accepting, acking, etc.
233 if (request_threaded_irq(dev
->host_irq
, NULL
, kvm_assigned_dev_thread
,
234 IRQF_ONESHOT
, dev
->irq_name
, (void *)dev
))
239 #ifdef __KVM_HAVE_MSI
240 static int assigned_device_enable_host_msi(struct kvm
*kvm
,
241 struct kvm_assigned_dev_kernel
*dev
)
245 if (!dev
->dev
->msi_enabled
) {
246 r
= pci_enable_msi(dev
->dev
);
251 dev
->host_irq
= dev
->dev
->irq
;
252 if (request_threaded_irq(dev
->host_irq
, NULL
, kvm_assigned_dev_thread
,
253 0, dev
->irq_name
, (void *)dev
)) {
254 pci_disable_msi(dev
->dev
);
262 #ifdef __KVM_HAVE_MSIX
263 static int assigned_device_enable_host_msix(struct kvm
*kvm
,
264 struct kvm_assigned_dev_kernel
*dev
)
268 /* host_msix_entries and guest_msix_entries should have been
270 if (dev
->entries_nr
== 0)
273 r
= pci_enable_msix(dev
->dev
, dev
->host_msix_entries
, dev
->entries_nr
);
277 for (i
= 0; i
< dev
->entries_nr
; i
++) {
278 r
= request_threaded_irq(dev
->host_msix_entries
[i
].vector
,
279 NULL
, kvm_assigned_dev_thread
,
280 0, dev
->irq_name
, (void *)dev
);
287 for (i
-= 1; i
>= 0; i
--)
288 free_irq(dev
->host_msix_entries
[i
].vector
, (void *)dev
);
289 pci_disable_msix(dev
->dev
);
295 static int assigned_device_enable_guest_intx(struct kvm
*kvm
,
296 struct kvm_assigned_dev_kernel
*dev
,
297 struct kvm_assigned_irq
*irq
)
299 dev
->guest_irq
= irq
->guest_irq
;
300 dev
->ack_notifier
.gsi
= irq
->guest_irq
;
304 #ifdef __KVM_HAVE_MSI
305 static int assigned_device_enable_guest_msi(struct kvm
*kvm
,
306 struct kvm_assigned_dev_kernel
*dev
,
307 struct kvm_assigned_irq
*irq
)
309 dev
->guest_irq
= irq
->guest_irq
;
310 dev
->ack_notifier
.gsi
= -1;
311 dev
->host_irq_disabled
= false;
316 #ifdef __KVM_HAVE_MSIX
317 static int assigned_device_enable_guest_msix(struct kvm
*kvm
,
318 struct kvm_assigned_dev_kernel
*dev
,
319 struct kvm_assigned_irq
*irq
)
321 dev
->guest_irq
= irq
->guest_irq
;
322 dev
->ack_notifier
.gsi
= -1;
323 dev
->host_irq_disabled
= false;
328 static int assign_host_irq(struct kvm
*kvm
,
329 struct kvm_assigned_dev_kernel
*dev
,
334 if (dev
->irq_requested_type
& KVM_DEV_IRQ_HOST_MASK
)
337 snprintf(dev
->irq_name
, sizeof(dev
->irq_name
), "kvm:%s",
340 switch (host_irq_type
) {
341 case KVM_DEV_IRQ_HOST_INTX
:
342 r
= assigned_device_enable_host_intx(kvm
, dev
);
344 #ifdef __KVM_HAVE_MSI
345 case KVM_DEV_IRQ_HOST_MSI
:
346 r
= assigned_device_enable_host_msi(kvm
, dev
);
349 #ifdef __KVM_HAVE_MSIX
350 case KVM_DEV_IRQ_HOST_MSIX
:
351 r
= assigned_device_enable_host_msix(kvm
, dev
);
359 dev
->irq_requested_type
|= host_irq_type
;
364 static int assign_guest_irq(struct kvm
*kvm
,
365 struct kvm_assigned_dev_kernel
*dev
,
366 struct kvm_assigned_irq
*irq
,
367 unsigned long guest_irq_type
)
372 if (dev
->irq_requested_type
& KVM_DEV_IRQ_GUEST_MASK
)
375 id
= kvm_request_irq_source_id(kvm
);
379 dev
->irq_source_id
= id
;
381 switch (guest_irq_type
) {
382 case KVM_DEV_IRQ_GUEST_INTX
:
383 r
= assigned_device_enable_guest_intx(kvm
, dev
, irq
);
385 #ifdef __KVM_HAVE_MSI
386 case KVM_DEV_IRQ_GUEST_MSI
:
387 r
= assigned_device_enable_guest_msi(kvm
, dev
, irq
);
390 #ifdef __KVM_HAVE_MSIX
391 case KVM_DEV_IRQ_GUEST_MSIX
:
392 r
= assigned_device_enable_guest_msix(kvm
, dev
, irq
);
400 dev
->irq_requested_type
|= guest_irq_type
;
401 kvm_register_irq_ack_notifier(kvm
, &dev
->ack_notifier
);
403 kvm_free_irq_source_id(kvm
, dev
->irq_source_id
);
408 /* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
409 static int kvm_vm_ioctl_assign_irq(struct kvm
*kvm
,
410 struct kvm_assigned_irq
*assigned_irq
)
413 struct kvm_assigned_dev_kernel
*match
;
414 unsigned long host_irq_type
, guest_irq_type
;
416 if (!irqchip_in_kernel(kvm
))
419 mutex_lock(&kvm
->lock
);
421 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
422 assigned_irq
->assigned_dev_id
);
426 host_irq_type
= (assigned_irq
->flags
& KVM_DEV_IRQ_HOST_MASK
);
427 guest_irq_type
= (assigned_irq
->flags
& KVM_DEV_IRQ_GUEST_MASK
);
430 /* can only assign one type at a time */
431 if (hweight_long(host_irq_type
) > 1)
433 if (hweight_long(guest_irq_type
) > 1)
435 if (host_irq_type
== 0 && guest_irq_type
== 0)
440 r
= assign_host_irq(kvm
, match
, host_irq_type
);
445 r
= assign_guest_irq(kvm
, match
, assigned_irq
, guest_irq_type
);
447 mutex_unlock(&kvm
->lock
);
451 static int kvm_vm_ioctl_deassign_dev_irq(struct kvm
*kvm
,
452 struct kvm_assigned_irq
456 struct kvm_assigned_dev_kernel
*match
;
458 mutex_lock(&kvm
->lock
);
460 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
461 assigned_irq
->assigned_dev_id
);
465 r
= kvm_deassign_irq(kvm
, match
, assigned_irq
->flags
);
467 mutex_unlock(&kvm
->lock
);
471 static int kvm_vm_ioctl_assign_device(struct kvm
*kvm
,
472 struct kvm_assigned_pci_dev
*assigned_dev
)
475 struct kvm_assigned_dev_kernel
*match
;
478 mutex_lock(&kvm
->lock
);
479 idx
= srcu_read_lock(&kvm
->srcu
);
481 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
482 assigned_dev
->assigned_dev_id
);
484 /* device already assigned */
489 match
= kzalloc(sizeof(struct kvm_assigned_dev_kernel
), GFP_KERNEL
);
491 printk(KERN_INFO
"%s: Couldn't allocate memory\n",
496 dev
= pci_get_domain_bus_and_slot(assigned_dev
->segnr
,
498 assigned_dev
->devfn
);
500 printk(KERN_INFO
"%s: host device not found\n", __func__
);
504 if (pci_enable_device(dev
)) {
505 printk(KERN_INFO
"%s: Could not enable PCI device\n", __func__
);
509 r
= pci_request_regions(dev
, "kvm_assigned_device");
511 printk(KERN_INFO
"%s: Could not get access to device regions\n",
516 pci_reset_function(dev
);
518 match
->assigned_dev_id
= assigned_dev
->assigned_dev_id
;
519 match
->host_segnr
= assigned_dev
->segnr
;
520 match
->host_busnr
= assigned_dev
->busnr
;
521 match
->host_devfn
= assigned_dev
->devfn
;
522 match
->flags
= assigned_dev
->flags
;
524 spin_lock_init(&match
->intx_lock
);
525 match
->irq_source_id
= -1;
527 match
->ack_notifier
.irq_acked
= kvm_assigned_dev_ack_irq
;
529 list_add(&match
->list
, &kvm
->arch
.assigned_dev_head
);
531 if (assigned_dev
->flags
& KVM_DEV_ASSIGN_ENABLE_IOMMU
) {
532 if (!kvm
->arch
.iommu_domain
) {
533 r
= kvm_iommu_map_guest(kvm
);
537 r
= kvm_assign_device(kvm
, match
);
543 srcu_read_unlock(&kvm
->srcu
, idx
);
544 mutex_unlock(&kvm
->lock
);
547 list_del(&match
->list
);
548 pci_release_regions(dev
);
550 pci_disable_device(dev
);
555 srcu_read_unlock(&kvm
->srcu
, idx
);
556 mutex_unlock(&kvm
->lock
);
560 static int kvm_vm_ioctl_deassign_device(struct kvm
*kvm
,
561 struct kvm_assigned_pci_dev
*assigned_dev
)
564 struct kvm_assigned_dev_kernel
*match
;
566 mutex_lock(&kvm
->lock
);
568 match
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
569 assigned_dev
->assigned_dev_id
);
571 printk(KERN_INFO
"%s: device hasn't been assigned before, "
572 "so cannot be deassigned\n", __func__
);
577 if (match
->flags
& KVM_DEV_ASSIGN_ENABLE_IOMMU
)
578 kvm_deassign_device(kvm
, match
);
580 kvm_free_assigned_device(kvm
, match
);
583 mutex_unlock(&kvm
->lock
);
588 #ifdef __KVM_HAVE_MSIX
589 static int kvm_vm_ioctl_set_msix_nr(struct kvm
*kvm
,
590 struct kvm_assigned_msix_nr
*entry_nr
)
593 struct kvm_assigned_dev_kernel
*adev
;
595 mutex_lock(&kvm
->lock
);
597 adev
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
598 entry_nr
->assigned_dev_id
);
604 if (adev
->entries_nr
== 0) {
605 adev
->entries_nr
= entry_nr
->entry_nr
;
606 if (adev
->entries_nr
== 0 ||
607 adev
->entries_nr
>= KVM_MAX_MSIX_PER_DEV
) {
612 adev
->host_msix_entries
= kzalloc(sizeof(struct msix_entry
) *
615 if (!adev
->host_msix_entries
) {
619 adev
->guest_msix_entries
=
620 kzalloc(sizeof(struct msix_entry
) * entry_nr
->entry_nr
,
622 if (!adev
->guest_msix_entries
) {
623 kfree(adev
->host_msix_entries
);
627 } else /* Not allowed set MSI-X number twice */
630 mutex_unlock(&kvm
->lock
);
634 static int kvm_vm_ioctl_set_msix_entry(struct kvm
*kvm
,
635 struct kvm_assigned_msix_entry
*entry
)
638 struct kvm_assigned_dev_kernel
*adev
;
640 mutex_lock(&kvm
->lock
);
642 adev
= kvm_find_assigned_dev(&kvm
->arch
.assigned_dev_head
,
643 entry
->assigned_dev_id
);
650 for (i
= 0; i
< adev
->entries_nr
; i
++)
651 if (adev
->guest_msix_entries
[i
].vector
== 0 ||
652 adev
->guest_msix_entries
[i
].entry
== entry
->entry
) {
653 adev
->guest_msix_entries
[i
].entry
= entry
->entry
;
654 adev
->guest_msix_entries
[i
].vector
= entry
->gsi
;
655 adev
->host_msix_entries
[i
].entry
= entry
->entry
;
658 if (i
== adev
->entries_nr
) {
664 mutex_unlock(&kvm
->lock
);
670 long kvm_vm_ioctl_assigned_device(struct kvm
*kvm
, unsigned ioctl
,
673 void __user
*argp
= (void __user
*)arg
;
677 case KVM_ASSIGN_PCI_DEVICE
: {
678 struct kvm_assigned_pci_dev assigned_dev
;
681 if (copy_from_user(&assigned_dev
, argp
, sizeof assigned_dev
))
683 r
= kvm_vm_ioctl_assign_device(kvm
, &assigned_dev
);
688 case KVM_ASSIGN_IRQ
: {
692 case KVM_ASSIGN_DEV_IRQ
: {
693 struct kvm_assigned_irq assigned_irq
;
696 if (copy_from_user(&assigned_irq
, argp
, sizeof assigned_irq
))
698 r
= kvm_vm_ioctl_assign_irq(kvm
, &assigned_irq
);
703 case KVM_DEASSIGN_DEV_IRQ
: {
704 struct kvm_assigned_irq assigned_irq
;
707 if (copy_from_user(&assigned_irq
, argp
, sizeof assigned_irq
))
709 r
= kvm_vm_ioctl_deassign_dev_irq(kvm
, &assigned_irq
);
714 case KVM_DEASSIGN_PCI_DEVICE
: {
715 struct kvm_assigned_pci_dev assigned_dev
;
718 if (copy_from_user(&assigned_dev
, argp
, sizeof assigned_dev
))
720 r
= kvm_vm_ioctl_deassign_device(kvm
, &assigned_dev
);
725 #ifdef KVM_CAP_IRQ_ROUTING
726 case KVM_SET_GSI_ROUTING
: {
727 struct kvm_irq_routing routing
;
728 struct kvm_irq_routing __user
*urouting
;
729 struct kvm_irq_routing_entry
*entries
;
732 if (copy_from_user(&routing
, argp
, sizeof(routing
)))
735 if (routing
.nr
>= KVM_MAX_IRQ_ROUTES
)
740 entries
= vmalloc(routing
.nr
* sizeof(*entries
));
745 if (copy_from_user(entries
, urouting
->entries
,
746 routing
.nr
* sizeof(*entries
)))
747 goto out_free_irq_routing
;
748 r
= kvm_set_irq_routing(kvm
, entries
, routing
.nr
,
750 out_free_irq_routing
:
754 #endif /* KVM_CAP_IRQ_ROUTING */
755 #ifdef __KVM_HAVE_MSIX
756 case KVM_ASSIGN_SET_MSIX_NR
: {
757 struct kvm_assigned_msix_nr entry_nr
;
759 if (copy_from_user(&entry_nr
, argp
, sizeof entry_nr
))
761 r
= kvm_vm_ioctl_set_msix_nr(kvm
, &entry_nr
);
766 case KVM_ASSIGN_SET_MSIX_ENTRY
: {
767 struct kvm_assigned_msix_entry entry
;
769 if (copy_from_user(&entry
, argp
, sizeof entry
))
771 r
= kvm_vm_ioctl_set_msix_entry(kvm
, &entry
);