4 * This module allows virtio devices to be used over a virtual PCI device.
5 * This can be used with QEMU based VMMs like KVM or Xen.
7 * Copyright IBM Corp. 2007
10 * Anthony Liguori <aliguori@us.ibm.com>
12 * This work is licensed under the terms of the GNU GPL, version 2 or later.
13 * See the COPYING file in the top-level directory.
17 #include <linux/module.h>
18 #include <linux/list.h>
19 #include <linux/pci.h>
20 #include <linux/slab.h>
21 #include <linux/interrupt.h>
22 #include <linux/virtio.h>
23 #include <linux/virtio_config.h>
24 #include <linux/virtio_ring.h>
25 #include <linux/virtio_pci.h>
26 #include <linux/highmem.h>
27 #include <linux/spinlock.h>
29 MODULE_AUTHOR("Anthony Liguori <aliguori@us.ibm.com>");
30 MODULE_DESCRIPTION("virtio-pci");
31 MODULE_LICENSE("GPL");
34 /* Our device structure */
35 struct virtio_pci_device
37 struct virtio_device vdev
;
38 struct pci_dev
*pci_dev
;
40 /* the IO mapping for the PCI config space */
43 /* a list of queues so we can dispatch IRQs */
45 struct list_head virtqueues
;
50 struct msix_entry
*msix_entries
;
51 /* Name strings for interrupts. This size should be enough,
52 * and I'm too lazy to allocate each name separately. */
53 char (*msix_names
)[256];
54 /* Number of available vectors */
55 unsigned msix_vectors
;
56 /* Vectors allocated, excluding per-vq vectors if any */
57 unsigned msix_used_vectors
;
58 /* Whether we have vector per vq */
/* Constants for MSI-X */
/* Use the first vector for configuration changes, the second and the rest
 * for virtqueues. Thus, we need at least 2 vectors for MSI-X. */
enum {
	VP_MSIX_CONFIG_VECTOR = 0,
	VP_MSIX_VQ_VECTOR = 1,
};
70 struct virtio_pci_vq_info
72 /* the actual virtqueue */
75 /* the number of entries in the queue */
78 /* the index of the queue */
81 /* the virtual address of the ring queue */
84 /* the list node for the virtqueues list */
85 struct list_head node
;
87 /* MSI-X vector (or none) */
91 /* Qumranet donated their vendor ID for devices 0x1000 thru 0x10FF. */
92 static struct pci_device_id virtio_pci_id_table
[] = {
93 { 0x1af4, PCI_ANY_ID
, PCI_ANY_ID
, PCI_ANY_ID
, 0, 0, 0 },
97 MODULE_DEVICE_TABLE(pci
, virtio_pci_id_table
);
99 /* Convert a generic virtio device to our structure */
100 static struct virtio_pci_device
*to_vp_device(struct virtio_device
*vdev
)
102 return container_of(vdev
, struct virtio_pci_device
, vdev
);
105 /* virtio config->get_features() implementation */
106 static u32
vp_get_features(struct virtio_device
*vdev
)
108 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
110 /* When someone needs more than 32 feature bits, we'll need to
111 * steal a bit to indicate that the rest are somewhere else. */
112 return ioread32(vp_dev
->ioaddr
+ VIRTIO_PCI_HOST_FEATURES
);
115 /* virtio config->finalize_features() implementation */
116 static void vp_finalize_features(struct virtio_device
*vdev
)
118 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
120 /* Give virtio_ring a chance to accept features. */
121 vring_transport_features(vdev
);
123 /* We only support 32 feature bits. */
124 BUILD_BUG_ON(ARRAY_SIZE(vdev
->features
) != 1);
125 iowrite32(vdev
->features
[0], vp_dev
->ioaddr
+VIRTIO_PCI_GUEST_FEATURES
);
128 /* virtio config->get() implementation */
129 static void vp_get(struct virtio_device
*vdev
, unsigned offset
,
130 void *buf
, unsigned len
)
132 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
133 void __iomem
*ioaddr
= vp_dev
->ioaddr
+
134 VIRTIO_PCI_CONFIG(vp_dev
) + offset
;
138 for (i
= 0; i
< len
; i
++)
139 ptr
[i
] = ioread8(ioaddr
+ i
);
142 /* the config->set() implementation. it's symmetric to the config->get()
144 static void vp_set(struct virtio_device
*vdev
, unsigned offset
,
145 const void *buf
, unsigned len
)
147 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
148 void __iomem
*ioaddr
= vp_dev
->ioaddr
+
149 VIRTIO_PCI_CONFIG(vp_dev
) + offset
;
153 for (i
= 0; i
< len
; i
++)
154 iowrite8(ptr
[i
], ioaddr
+ i
);
157 /* config->{get,set}_status() implementations */
158 static u8
vp_get_status(struct virtio_device
*vdev
)
160 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
161 return ioread8(vp_dev
->ioaddr
+ VIRTIO_PCI_STATUS
);
164 static void vp_set_status(struct virtio_device
*vdev
, u8 status
)
166 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
167 /* We should never be setting status to 0. */
169 iowrite8(status
, vp_dev
->ioaddr
+ VIRTIO_PCI_STATUS
);
172 static void vp_reset(struct virtio_device
*vdev
)
174 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
175 /* 0 status means a reset. */
176 iowrite8(0, vp_dev
->ioaddr
+ VIRTIO_PCI_STATUS
);
179 /* the notify function used when creating a virt queue */
180 static void vp_notify(struct virtqueue
*vq
)
182 struct virtio_pci_device
*vp_dev
= to_vp_device(vq
->vdev
);
183 struct virtio_pci_vq_info
*info
= vq
->priv
;
185 /* we write the queue's selector into the notification register to
186 * signal the other end */
187 iowrite16(info
->queue_index
, vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_NOTIFY
);
190 /* Handle a configuration change: Tell driver if it wants to know. */
191 static irqreturn_t
vp_config_changed(int irq
, void *opaque
)
193 struct virtio_pci_device
*vp_dev
= opaque
;
194 struct virtio_driver
*drv
;
195 drv
= container_of(vp_dev
->vdev
.dev
.driver
,
196 struct virtio_driver
, driver
);
198 if (drv
&& drv
->config_changed
)
199 drv
->config_changed(&vp_dev
->vdev
);
203 /* Notify all virtqueues on an interrupt. */
204 static irqreturn_t
vp_vring_interrupt(int irq
, void *opaque
)
206 struct virtio_pci_device
*vp_dev
= opaque
;
207 struct virtio_pci_vq_info
*info
;
208 irqreturn_t ret
= IRQ_NONE
;
211 spin_lock_irqsave(&vp_dev
->lock
, flags
);
212 list_for_each_entry(info
, &vp_dev
->virtqueues
, node
) {
213 if (vring_interrupt(irq
, info
->vq
) == IRQ_HANDLED
)
216 spin_unlock_irqrestore(&vp_dev
->lock
, flags
);
221 /* A small wrapper to also acknowledge the interrupt when it's handled.
222 * I really need an EIO hook for the vring so I can ack the interrupt once we
223 * know that we'll be handling the IRQ but before we invoke the callback since
224 * the callback may notify the host which results in the host attempting to
225 * raise an interrupt that we would then mask once we acknowledged the
227 static irqreturn_t
vp_interrupt(int irq
, void *opaque
)
229 struct virtio_pci_device
*vp_dev
= opaque
;
232 /* reading the ISR has the effect of also clearing it so it's very
233 * important to save off the value. */
234 isr
= ioread8(vp_dev
->ioaddr
+ VIRTIO_PCI_ISR
);
236 /* It's definitely not us if the ISR was not high */
240 /* Configuration change? Tell driver if it wants to know. */
241 if (isr
& VIRTIO_PCI_ISR_CONFIG
)
242 vp_config_changed(irq
, opaque
);
244 return vp_vring_interrupt(irq
, opaque
);
247 static void vp_free_vectors(struct virtio_device
*vdev
)
249 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
252 if (vp_dev
->intx_enabled
) {
253 free_irq(vp_dev
->pci_dev
->irq
, vp_dev
);
254 vp_dev
->intx_enabled
= 0;
257 for (i
= 0; i
< vp_dev
->msix_used_vectors
; ++i
)
258 free_irq(vp_dev
->msix_entries
[i
].vector
, vp_dev
);
260 if (vp_dev
->msix_enabled
) {
261 /* Disable the vector used for configuration */
262 iowrite16(VIRTIO_MSI_NO_VECTOR
,
263 vp_dev
->ioaddr
+ VIRTIO_MSI_CONFIG_VECTOR
);
264 /* Flush the write out to device */
265 ioread16(vp_dev
->ioaddr
+ VIRTIO_MSI_CONFIG_VECTOR
);
267 pci_disable_msix(vp_dev
->pci_dev
);
268 vp_dev
->msix_enabled
= 0;
269 vp_dev
->msix_vectors
= 0;
272 vp_dev
->msix_used_vectors
= 0;
273 kfree(vp_dev
->msix_names
);
274 vp_dev
->msix_names
= NULL
;
275 kfree(vp_dev
->msix_entries
);
276 vp_dev
->msix_entries
= NULL
;
279 static int vp_request_msix_vectors(struct virtio_device
*vdev
, int nvectors
,
282 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
283 const char *name
= dev_name(&vp_dev
->vdev
.dev
);
287 vp_dev
->msix_entries
= kmalloc(nvectors
* sizeof *vp_dev
->msix_entries
,
289 if (!vp_dev
->msix_entries
)
291 vp_dev
->msix_names
= kmalloc(nvectors
* sizeof *vp_dev
->msix_names
,
293 if (!vp_dev
->msix_names
)
296 for (i
= 0; i
< nvectors
; ++i
)
297 vp_dev
->msix_entries
[i
].entry
= i
;
299 /* pci_enable_msix returns positive if we can't get this many. */
300 err
= pci_enable_msix(vp_dev
->pci_dev
, vp_dev
->msix_entries
, nvectors
);
305 vp_dev
->msix_vectors
= nvectors
;
306 vp_dev
->msix_enabled
= 1;
308 /* Set the vector used for configuration */
309 v
= vp_dev
->msix_used_vectors
;
310 snprintf(vp_dev
->msix_names
[v
], sizeof *vp_dev
->msix_names
,
312 err
= request_irq(vp_dev
->msix_entries
[v
].vector
,
313 vp_config_changed
, 0, vp_dev
->msix_names
[v
],
317 ++vp_dev
->msix_used_vectors
;
319 iowrite16(v
, vp_dev
->ioaddr
+ VIRTIO_MSI_CONFIG_VECTOR
);
320 /* Verify we had enough resources to assign the vector */
321 v
= ioread16(vp_dev
->ioaddr
+ VIRTIO_MSI_CONFIG_VECTOR
);
322 if (v
== VIRTIO_MSI_NO_VECTOR
) {
327 if (!per_vq_vectors
) {
328 /* Shared vector for all VQs */
329 v
= vp_dev
->msix_used_vectors
;
330 snprintf(vp_dev
->msix_names
[v
], sizeof *vp_dev
->msix_names
,
331 "%s-virtqueues", name
);
332 err
= request_irq(vp_dev
->msix_entries
[v
].vector
,
333 vp_vring_interrupt
, 0, vp_dev
->msix_names
[v
],
337 ++vp_dev
->msix_used_vectors
;
341 vp_free_vectors(vdev
);
345 static int vp_request_intx(struct virtio_device
*vdev
)
348 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
350 err
= request_irq(vp_dev
->pci_dev
->irq
, vp_interrupt
,
351 IRQF_SHARED
, dev_name(&vdev
->dev
), vp_dev
);
353 vp_dev
->intx_enabled
= 1;
357 static struct virtqueue
*setup_vq(struct virtio_device
*vdev
, unsigned index
,
358 void (*callback
)(struct virtqueue
*vq
),
362 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
363 struct virtio_pci_vq_info
*info
;
364 struct virtqueue
*vq
;
365 unsigned long flags
, size
;
369 /* Select the queue we're interested in */
370 iowrite16(index
, vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_SEL
);
372 /* Check if queue is either not available or already active. */
373 num
= ioread16(vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_NUM
);
374 if (!num
|| ioread32(vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_PFN
))
375 return ERR_PTR(-ENOENT
);
377 /* allocate and fill out our structure that represents an active queue */
379 info
= kmalloc(sizeof(struct virtio_pci_vq_info
), GFP_KERNEL
);
381 return ERR_PTR(-ENOMEM
);
383 info
->queue_index
= index
;
385 info
->msix_vector
= msix_vec
;
387 size
= PAGE_ALIGN(vring_size(num
, VIRTIO_PCI_VRING_ALIGN
));
388 info
->queue
= alloc_pages_exact(size
, GFP_KERNEL
|__GFP_ZERO
);
389 if (info
->queue
== NULL
) {
394 /* activate the queue */
395 iowrite32(virt_to_phys(info
->queue
) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT
,
396 vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_PFN
);
398 /* create the vring */
399 vq
= vring_new_virtqueue(info
->num
, VIRTIO_PCI_VRING_ALIGN
,
400 vdev
, info
->queue
, vp_notify
, callback
, name
);
403 goto out_activate_queue
;
409 if (msix_vec
!= VIRTIO_MSI_NO_VECTOR
) {
410 iowrite16(msix_vec
, vp_dev
->ioaddr
+ VIRTIO_MSI_QUEUE_VECTOR
);
411 msix_vec
= ioread16(vp_dev
->ioaddr
+ VIRTIO_MSI_QUEUE_VECTOR
);
412 if (msix_vec
== VIRTIO_MSI_NO_VECTOR
) {
418 spin_lock_irqsave(&vp_dev
->lock
, flags
);
419 list_add(&info
->node
, &vp_dev
->virtqueues
);
420 spin_unlock_irqrestore(&vp_dev
->lock
, flags
);
425 vring_del_virtqueue(vq
);
427 iowrite32(0, vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_PFN
);
428 free_pages_exact(info
->queue
, size
);
434 static void vp_del_vq(struct virtqueue
*vq
)
436 struct virtio_pci_device
*vp_dev
= to_vp_device(vq
->vdev
);
437 struct virtio_pci_vq_info
*info
= vq
->priv
;
438 unsigned long flags
, size
;
440 spin_lock_irqsave(&vp_dev
->lock
, flags
);
441 list_del(&info
->node
);
442 spin_unlock_irqrestore(&vp_dev
->lock
, flags
);
444 iowrite16(info
->queue_index
, vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_SEL
);
446 if (vp_dev
->msix_enabled
) {
447 iowrite16(VIRTIO_MSI_NO_VECTOR
,
448 vp_dev
->ioaddr
+ VIRTIO_MSI_QUEUE_VECTOR
);
449 /* Flush the write out to device */
450 ioread8(vp_dev
->ioaddr
+ VIRTIO_PCI_ISR
);
453 vring_del_virtqueue(vq
);
455 /* Select and deactivate the queue */
456 iowrite32(0, vp_dev
->ioaddr
+ VIRTIO_PCI_QUEUE_PFN
);
458 size
= PAGE_ALIGN(vring_size(info
->num
, VIRTIO_PCI_VRING_ALIGN
));
459 free_pages_exact(info
->queue
, size
);
463 /* the config->del_vqs() implementation */
464 static void vp_del_vqs(struct virtio_device
*vdev
)
466 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
467 struct virtqueue
*vq
, *n
;
468 struct virtio_pci_vq_info
*info
;
470 list_for_each_entry_safe(vq
, n
, &vdev
->vqs
, list
) {
472 if (vp_dev
->per_vq_vectors
&&
473 info
->msix_vector
!= VIRTIO_MSI_NO_VECTOR
)
474 free_irq(vp_dev
->msix_entries
[info
->msix_vector
].vector
,
478 vp_dev
->per_vq_vectors
= false;
480 vp_free_vectors(vdev
);
483 static int vp_try_to_find_vqs(struct virtio_device
*vdev
, unsigned nvqs
,
484 struct virtqueue
*vqs
[],
485 vq_callback_t
*callbacks
[],
490 struct virtio_pci_device
*vp_dev
= to_vp_device(vdev
);
492 int i
, err
, nvectors
, allocated_vectors
;
495 /* Old style: one normal interrupt for change and all vqs. */
496 err
= vp_request_intx(vdev
);
500 if (per_vq_vectors
) {
501 /* Best option: one for change interrupt, one per vq. */
503 for (i
= 0; i
< nvqs
; ++i
)
507 /* Second best: one for change, shared for all vqs. */
511 err
= vp_request_msix_vectors(vdev
, nvectors
, per_vq_vectors
);
516 vp_dev
->per_vq_vectors
= per_vq_vectors
;
517 allocated_vectors
= vp_dev
->msix_used_vectors
;
518 for (i
= 0; i
< nvqs
; ++i
) {
519 if (!callbacks
[i
] || !vp_dev
->msix_enabled
)
520 msix_vec
= VIRTIO_MSI_NO_VECTOR
;
521 else if (vp_dev
->per_vq_vectors
)
522 msix_vec
= allocated_vectors
++;
524 msix_vec
= VP_MSIX_VQ_VECTOR
;
525 vqs
[i
] = setup_vq(vdev
, i
, callbacks
[i
], names
[i
], msix_vec
);
526 if (IS_ERR(vqs
[i
])) {
527 err
= PTR_ERR(vqs
[i
]);
531 if (!vp_dev
->per_vq_vectors
|| msix_vec
== VIRTIO_MSI_NO_VECTOR
)
534 /* allocate per-vq irq if available and necessary */
535 snprintf(vp_dev
->msix_names
[msix_vec
],
536 sizeof *vp_dev
->msix_names
,
538 dev_name(&vp_dev
->vdev
.dev
), names
[i
]);
539 err
= request_irq(vp_dev
->msix_entries
[msix_vec
].vector
,
541 vp_dev
->msix_names
[msix_vec
],
557 /* the config->find_vqs() implementation */
558 static int vp_find_vqs(struct virtio_device
*vdev
, unsigned nvqs
,
559 struct virtqueue
*vqs
[],
560 vq_callback_t
*callbacks
[],
565 /* Try MSI-X with one vector per queue. */
566 err
= vp_try_to_find_vqs(vdev
, nvqs
, vqs
, callbacks
, names
, true, true);
569 /* Fallback: MSI-X with one vector for config, one shared for queues. */
570 err
= vp_try_to_find_vqs(vdev
, nvqs
, vqs
, callbacks
, names
,
574 /* Finally fall back to regular interrupts. */
575 return vp_try_to_find_vqs(vdev
, nvqs
, vqs
, callbacks
, names
,
579 static struct virtio_config_ops virtio_pci_config_ops
= {
582 .get_status
= vp_get_status
,
583 .set_status
= vp_set_status
,
585 .find_vqs
= vp_find_vqs
,
586 .del_vqs
= vp_del_vqs
,
587 .get_features
= vp_get_features
,
588 .finalize_features
= vp_finalize_features
,
591 static void virtio_pci_release_dev(struct device
*_d
)
593 struct virtio_device
*dev
= container_of(_d
, struct virtio_device
,
595 struct virtio_pci_device
*vp_dev
= to_vp_device(dev
);
600 /* the PCI probing function */
601 static int __devinit
virtio_pci_probe(struct pci_dev
*pci_dev
,
602 const struct pci_device_id
*id
)
604 struct virtio_pci_device
*vp_dev
;
607 /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. */
608 if (pci_dev
->device
< 0x1000 || pci_dev
->device
> 0x103f)
611 if (pci_dev
->revision
!= VIRTIO_PCI_ABI_VERSION
) {
612 printk(KERN_ERR
"virtio_pci: expected ABI version %d, got %d\n",
613 VIRTIO_PCI_ABI_VERSION
, pci_dev
->revision
);
617 /* allocate our structure and fill it out */
618 vp_dev
= kzalloc(sizeof(struct virtio_pci_device
), GFP_KERNEL
);
622 vp_dev
->vdev
.dev
.parent
= &pci_dev
->dev
;
623 vp_dev
->vdev
.dev
.release
= virtio_pci_release_dev
;
624 vp_dev
->vdev
.config
= &virtio_pci_config_ops
;
625 vp_dev
->pci_dev
= pci_dev
;
626 INIT_LIST_HEAD(&vp_dev
->virtqueues
);
627 spin_lock_init(&vp_dev
->lock
);
629 /* Disable MSI/MSIX to bring device to a known good state. */
630 pci_msi_off(pci_dev
);
632 /* enable the device */
633 err
= pci_enable_device(pci_dev
);
637 err
= pci_request_regions(pci_dev
, "virtio-pci");
639 goto out_enable_device
;
641 vp_dev
->ioaddr
= pci_iomap(pci_dev
, 0, 0);
642 if (vp_dev
->ioaddr
== NULL
)
643 goto out_req_regions
;
645 pci_set_drvdata(pci_dev
, vp_dev
);
646 pci_set_master(pci_dev
);
648 /* we use the subsystem vendor/device id as the virtio vendor/device
649 * id. this allows us to use the same PCI vendor/device id for all
650 * virtio devices and to identify the particular virtio driver by
651 * the subsystem ids */
652 vp_dev
->vdev
.id
.vendor
= pci_dev
->subsystem_vendor
;
653 vp_dev
->vdev
.id
.device
= pci_dev
->subsystem_device
;
655 /* finally register the virtio device */
656 err
= register_virtio_device(&vp_dev
->vdev
);
658 goto out_set_drvdata
;
663 pci_set_drvdata(pci_dev
, NULL
);
664 pci_iounmap(pci_dev
, vp_dev
->ioaddr
);
666 pci_release_regions(pci_dev
);
668 pci_disable_device(pci_dev
);
674 static void __devexit
virtio_pci_remove(struct pci_dev
*pci_dev
)
676 struct virtio_pci_device
*vp_dev
= pci_get_drvdata(pci_dev
);
678 unregister_virtio_device(&vp_dev
->vdev
);
680 vp_del_vqs(&vp_dev
->vdev
);
681 pci_set_drvdata(pci_dev
, NULL
);
682 pci_iounmap(pci_dev
, vp_dev
->ioaddr
);
683 pci_release_regions(pci_dev
);
684 pci_disable_device(pci_dev
);
688 static int virtio_pci_suspend(struct pci_dev
*pci_dev
, pm_message_t state
)
690 pci_save_state(pci_dev
);
691 pci_set_power_state(pci_dev
, PCI_D3hot
);
695 static int virtio_pci_resume(struct pci_dev
*pci_dev
)
697 pci_restore_state(pci_dev
);
698 pci_set_power_state(pci_dev
, PCI_D0
);
703 static struct pci_driver virtio_pci_driver
= {
704 .name
= "virtio-pci",
705 .id_table
= virtio_pci_id_table
,
706 .probe
= virtio_pci_probe
,
707 .remove
= __devexit_p(virtio_pci_remove
),
709 .suspend
= virtio_pci_suspend
,
710 .resume
= virtio_pci_resume
,
714 static int __init
virtio_pci_init(void)
716 return pci_register_driver(&virtio_pci_driver
);
719 module_init(virtio_pci_init
);
721 static void __exit
virtio_pci_exit(void)
723 pci_unregister_driver(&virtio_pci_driver
);
726 module_exit(virtio_pci_exit
);