qemu-kvm: Fix and clean up msix vector use/unuse hooks
[qemu/qemu-dev-zwu.git] / hw / msix.c
blob 1bdffb6ba1137edd5244181b48642e6b89b97dab
/*
 * MSI-X device support
 *
 * This module includes support for MSI-X in PCI devices.
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include "hw.h"
#include "msix.h"
#include "pci.h"
#include "range.h"
#include "kvm.h"

/* MSI-X capability structure */
#define MSIX_TABLE_OFFSET 4
#define MSIX_PBA_OFFSET 8
#define MSIX_CAP_LENGTH 12

/* MSI enable bit and maskall bit are in byte 1 in FLAGS register */
#define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1)
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
#define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)

/* MSI-X table format */
#define MSIX_MSG_ADDR 0
#define MSIX_MSG_UPPER_ADDR 4
#define MSIX_MSG_DATA 8
#define MSIX_VECTOR_CTRL 12
#define MSIX_ENTRY_SIZE 16
#define MSIX_VECTOR_MASK 0x1

/* How much space does an MSI-X table need? */
/* The spec requires giving the table structure
 * a 4K aligned region all by itself. */
#define MSIX_PAGE_SIZE 0x1000
/* Reserve second half of the page for pending bits */
#define MSIX_PAGE_PENDING (MSIX_PAGE_SIZE / 2)
#define MSIX_MAX_ENTRIES 32
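
/* Layout note: with 16-byte entries and at most MSIX_MAX_ENTRIES (32)
 * vectors, the table needs at most 512 bytes and fits in the first half of
 * the page; the pending bits live in the second half, starting at
 * MSIX_PAGE_PENDING. */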

/* Flag for interrupt controller to declare MSI-X support */
int msix_supported;

/* KVM specific MSIX helpers */
static void kvm_msix_free(PCIDevice *dev)
{
    int vector, changed = 0;

    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
        if (dev->msix_entry_used[vector]) {
            kvm_msi_message_del(&dev->msix_irq_entries[vector]);
            changed = 1;
        }
    }
    if (changed) {
        kvm_commit_irq_routes();
    }
}

static void kvm_msix_message_from_vector(PCIDevice *dev, unsigned vector,
                                         KVMMsiMessage *kmm)
{
    uint8_t *table_entry = dev->msix_table_page + vector * MSIX_ENTRY_SIZE;

    kmm->addr_lo = pci_get_long(table_entry + MSIX_MSG_ADDR);
    kmm->addr_hi = pci_get_long(table_entry + MSIX_MSG_UPPER_ADDR);
    kmm->data = pci_get_long(table_entry + MSIX_MSG_DATA);
}

static void kvm_msix_update(PCIDevice *dev, int vector,
                            int was_masked, int is_masked)
{
    KVMMsiMessage e = {}, *entry;
    int mask_cleared = was_masked && !is_masked;
    /* It is only legal to change an entry when it is masked. Therefore, it is
     * enough to update the routing in kernel when mask is being cleared. */
    if (!mask_cleared) {
        return;
    }
    if (!dev->msix_entry_used[vector]) {
        return;
    }

    entry = dev->msix_irq_entries + vector;
    e.gsi = entry->gsi;
    kvm_msix_message_from_vector(dev, vector, &e);
    if (memcmp(entry, &e, sizeof e) != 0) {
        int r;

        r = kvm_msi_message_update(entry, &e);
        if (r) {
            fprintf(stderr, "%s: kvm_update_msix failed: %s\n", __func__,
                    strerror(-r));
            exit(1);
        }
        *entry = e;
        r = kvm_commit_irq_routes();
        if (r) {
            fprintf(stderr, "%s: kvm_commit_irq_routes failed: %s\n", __func__,
                    strerror(-r));
            exit(1);
        }
    }
}

static int kvm_msix_vector_add(PCIDevice *dev, unsigned vector)
{
    KVMMsiMessage *kmm = dev->msix_irq_entries + vector;
    int r;

    if (!kvm_has_gsi_routing()) {
        fprintf(stderr, "Warning: no MSI-X support found. "
                "At least kernel 2.6.30 is required for MSI-X support.\n");
        return -EOPNOTSUPP;
    }

    r = kvm_get_irq_route_gsi();
    if (r < 0) {
        fprintf(stderr, "%s: kvm_get_irq_route_gsi failed: %s\n",
                __func__, strerror(-r));
        return r;
    }
    kmm->gsi = r;

    kvm_msix_message_from_vector(dev, vector, kmm);
    r = kvm_msi_message_add(kmm);
    if (r < 0) {
        fprintf(stderr, "%s: kvm_add_msix failed: %s\n", __func__, strerror(-r));
        return r;
    }

    r = kvm_commit_irq_routes();
    if (r < 0) {
        fprintf(stderr, "%s: kvm_commit_irq_routes failed: %s\n",
                __func__, strerror(-r));
        return r;
    }
    return 0;
}

static void kvm_msix_vector_del(PCIDevice *dev, unsigned vector)
{
    kvm_msi_message_del(&dev->msix_irq_entries[vector]);
    kvm_commit_irq_routes();
}

/* Add MSI-X capability to the config space for the device. */
/* Given a bar and its size, add MSI-X table on top of it
 * and fill MSI-X capability in the config space.
 * Original bar size must be a power of 2 or 0.
 * New bar size is returned. */
static int msix_add_config(struct PCIDevice *pdev, unsigned short nentries,
                           unsigned bar_nr, unsigned bar_size)
{
    int config_offset;
    uint8_t *config;

    pdev->msix_bar_size = bar_size;

    config_offset = pci_find_capability(pdev, PCI_CAP_ID_MSIX);

    if (!config_offset) {
        uint32_t new_size;

        if (nentries < 1 || nentries > PCI_MSIX_FLAGS_QSIZE + 1)
            return -EINVAL;
        if (bar_size > 0x80000000)
            return -ENOSPC;

        /* Add space for MSI-X structures */
        if (!bar_size) {
            new_size = MSIX_PAGE_SIZE;
        } else if (bar_size < MSIX_PAGE_SIZE) {
            bar_size = MSIX_PAGE_SIZE;
            new_size = MSIX_PAGE_SIZE * 2;
        } else {
            new_size = bar_size * 2;
        }

        pdev->msix_bar_size = new_size;
        config_offset = pci_add_capability(pdev, PCI_CAP_ID_MSIX,
                                           0, MSIX_CAP_LENGTH);
        if (config_offset < 0)
            return config_offset;
        config = pdev->config + config_offset;

        pci_set_word(config + PCI_MSIX_FLAGS, nentries - 1);
        /* Table on top of BAR */
        pci_set_long(config + MSIX_TABLE_OFFSET, bar_size | bar_nr);
        /* Pending bits on top of that */
        pci_set_long(config + MSIX_PBA_OFFSET, (bar_size + MSIX_PAGE_PENDING) |
                     bar_nr);
    }
    pdev->msix_cap = config_offset;
    /* Make flags bit writeable. */
    pdev->wmask[config_offset + MSIX_CONTROL_OFFSET] |= MSIX_ENABLE_MASK |
                                                        MSIX_MASKALL_MASK;
    return 0;
}
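
/* Worked example of the sizing logic above: a BAR that is already
 * MSIX_PAGE_SIZE (0x1000) bytes takes the final branch, so it grows to
 * 0x2000 bytes with the table at offset 0x1000 and the pending bits at
 * 0x1000 + MSIX_PAGE_PENDING = 0x1800. With bar_size == 0, the device gets
 * a single MSIX_PAGE_SIZE BAR that holds only the MSI-X structures. */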

static uint32_t msix_mmio_readl(void *opaque, target_phys_addr_t addr)
{
    PCIDevice *dev = opaque;
    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
    void *page = dev->msix_table_page;

    return pci_get_long(page + offset);
}

static uint32_t msix_mmio_read_unallowed(void *opaque, target_phys_addr_t addr)
{
    fprintf(stderr, "MSI-X: only dword read is allowed!\n");
    return 0;
}

static uint8_t msix_pending_mask(int vector)
{
    return 1 << (vector % 8);
}

static uint8_t *msix_pending_byte(PCIDevice *dev, int vector)
{
    return dev->msix_table_page + MSIX_PAGE_PENDING + vector / 8;
}

static int msix_is_pending(PCIDevice *dev, int vector)
{
    return *msix_pending_byte(dev, vector) & msix_pending_mask(vector);
}

static void msix_set_pending(PCIDevice *dev, int vector)
{
    *msix_pending_byte(dev, vector) |= msix_pending_mask(vector);
}

static void msix_clr_pending(PCIDevice *dev, int vector)
{
    *msix_pending_byte(dev, vector) &= ~msix_pending_mask(vector);
}

static int msix_function_masked(PCIDevice *dev)
{
    return dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] & MSIX_MASKALL_MASK;
}

static int msix_is_masked(PCIDevice *dev, int vector)
{
    unsigned offset = vector * MSIX_ENTRY_SIZE + MSIX_VECTOR_CTRL;
    return msix_function_masked(dev) ||
           dev->msix_table_page[offset] & MSIX_VECTOR_MASK;
}

static void msix_handle_mask_update(PCIDevice *dev, int vector)
{
    if (!msix_is_masked(dev, vector) && msix_is_pending(dev, vector)) {
        msix_clr_pending(dev, vector);
        msix_notify(dev, vector);
    }
}

/* Handle MSI-X capability config write. */
void msix_write_config(PCIDevice *dev, uint32_t addr,
                       uint32_t val, int len)
{
    unsigned enable_pos = dev->msix_cap + MSIX_CONTROL_OFFSET;
    int vector;

    if (!range_covers_byte(addr, len, enable_pos)) {
        return;
    }

    if (!msix_enabled(dev)) {
        return;
    }

    pci_device_deassert_intx(dev);

    if (msix_function_masked(dev)) {
        return;
    }

    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
        msix_handle_mask_update(dev, vector);
    }
}

static void msix_mmio_writel(void *opaque, target_phys_addr_t addr,
                             uint32_t val)
{
    PCIDevice *dev = opaque;
    unsigned int offset = addr & (MSIX_PAGE_SIZE - 1) & ~0x3;
    int vector = offset / MSIX_ENTRY_SIZE;
    int was_masked = msix_is_masked(dev, vector);
    pci_set_long(dev->msix_table_page + offset, val);
    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        kvm_msix_update(dev, vector, was_masked, msix_is_masked(dev, vector));
    }
    if (was_masked != msix_is_masked(dev, vector) && dev->msix_mask_notifier) {
        int r = dev->msix_mask_notifier(dev, vector,
                                        msix_is_masked(dev, vector));
        assert(r >= 0);
    }
    msix_handle_mask_update(dev, vector);
}

static void msix_mmio_write_unallowed(void *opaque, target_phys_addr_t addr,
                                      uint32_t val)
{
    fprintf(stderr, "MSI-X: only dword write is allowed!\n");
}

static CPUWriteMemoryFunc * const msix_mmio_write[] = {
    msix_mmio_write_unallowed, msix_mmio_write_unallowed, msix_mmio_writel
};

static CPUReadMemoryFunc * const msix_mmio_read[] = {
    msix_mmio_read_unallowed, msix_mmio_read_unallowed, msix_mmio_readl
};

/* Should be called from device's map method. */
void msix_mmio_map(PCIDevice *d, int region_num,
                   pcibus_t addr, pcibus_t size, int type)
{
    uint8_t *config = d->config + d->msix_cap;
    uint32_t table = pci_get_long(config + MSIX_TABLE_OFFSET);
    uint32_t offset = table & ~(MSIX_PAGE_SIZE - 1);
    /* TODO: for assigned devices, we'll want to make it possible to map
     * pending bits separately in case they are in a separate bar. */
    int table_bir = table & PCI_MSIX_FLAGS_BIRMASK;

    if (table_bir != region_num)
        return;
    if (size <= offset)
        return;
    cpu_register_physical_memory(addr + offset,
                                 MIN(size - offset, MSIX_PAGE_SIZE),
                                 d->msix_mmio_index);
}

static void msix_mask_all(struct PCIDevice *dev, unsigned nentries)
{
    int vector, r;

    for (vector = 0; vector < nentries; ++vector) {
        unsigned offset = vector * MSIX_ENTRY_SIZE + MSIX_VECTOR_CTRL;
        int was_masked = msix_is_masked(dev, vector);
        dev->msix_table_page[offset] |= MSIX_VECTOR_MASK;
        if (was_masked != msix_is_masked(dev, vector) &&
            dev->msix_mask_notifier) {
            r = dev->msix_mask_notifier(dev, vector,
                                        msix_is_masked(dev, vector));
            assert(r >= 0);
        }
    }
}

/* Initialize the MSI-X structures. Note: if MSI-X is supported, the BAR
 * size is modified; it should be retrieved with msix_bar_size. */
int msix_init(struct PCIDevice *dev, unsigned short nentries,
              unsigned bar_nr, unsigned bar_size)
{
    int ret;

    /* Nothing to do if MSI is not supported by interrupt controller */
    if (!msix_supported)
        return -ENOTSUP;

    if (nentries > MSIX_MAX_ENTRIES)
        return -EINVAL;

    dev->msix_mask_notifier = NULL;
    dev->msix_entry_used = qemu_mallocz(MSIX_MAX_ENTRIES *
                                        sizeof *dev->msix_entry_used);

    dev->msix_table_page = qemu_mallocz(MSIX_PAGE_SIZE);
    msix_mask_all(dev, nentries);

    dev->msix_mmio_index = cpu_register_io_memory(msix_mmio_read,
                                                  msix_mmio_write, dev,
                                                  DEVICE_NATIVE_ENDIAN);
    if (dev->msix_mmio_index == -1) {
        ret = -EBUSY;
        goto err_index;
    }

    dev->msix_entries_nr = nentries;
    ret = msix_add_config(dev, nentries, bar_nr, bar_size);
    if (ret)
        goto err_config;

    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        dev->msix_irq_entries = qemu_malloc(nentries *
                                            sizeof *dev->msix_irq_entries);
    }

    dev->cap_present |= QEMU_PCI_CAP_MSIX;
    return 0;

err_config:
    dev->msix_entries_nr = 0;
    cpu_unregister_io_memory(dev->msix_mmio_index);
err_index:
    qemu_free(dev->msix_table_page);
    dev->msix_table_page = NULL;
    qemu_free(dev->msix_entry_used);
    dev->msix_entry_used = NULL;
    return ret;
}
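
/* Illustrative init sequence (a sketch, not code from this file; the device
 * pointer and vector count are hypothetical): a caller typically registers
 * the possibly grown BAR with a map callback that invokes msix_mmio_map():
 *
 *     if (msix_init(&d->pci_dev, 4, 1, 0) >= 0) {
 *         pci_register_bar(&d->pci_dev, 1, msix_bar_size(&d->pci_dev),
 *                          PCI_BASE_ADDRESS_SPACE_MEMORY, msix_mmio_map);
 *     }
 */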

static void msix_free_irq_entries(PCIDevice *dev)
{
    int vector;

    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        kvm_msix_free(dev);
    }

    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
        dev->msix_entry_used[vector] = 0;
        msix_clr_pending(dev, vector);
    }
}

/* Clean up resources for the device. */
int msix_uninit(PCIDevice *dev)
{
    if (!(dev->cap_present & QEMU_PCI_CAP_MSIX))
        return 0;
    pci_del_capability(dev, PCI_CAP_ID_MSIX, MSIX_CAP_LENGTH);
    dev->msix_cap = 0;
    msix_free_irq_entries(dev);
    dev->msix_entries_nr = 0;
    cpu_unregister_io_memory(dev->msix_mmio_index);
    qemu_free(dev->msix_table_page);
    dev->msix_table_page = NULL;
    qemu_free(dev->msix_entry_used);
    dev->msix_entry_used = NULL;
    qemu_free(dev->msix_irq_entries);
    dev->msix_irq_entries = NULL;
    dev->cap_present &= ~QEMU_PCI_CAP_MSIX;
    return 0;
}

void msix_save(PCIDevice *dev, QEMUFile *f)
{
    unsigned n = dev->msix_entries_nr;

    if (!msix_supported) {
        return;
    }

    if (!(dev->cap_present & QEMU_PCI_CAP_MSIX)) {
        return;
    }
    qemu_put_buffer(f, dev->msix_table_page, n * MSIX_ENTRY_SIZE);
    qemu_put_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
}

/* Should be called after restoring the config space. */
void msix_load(PCIDevice *dev, QEMUFile *f)
{
    unsigned n = dev->msix_entries_nr;

    if (!msix_supported)
        return;

    if (!(dev->cap_present & QEMU_PCI_CAP_MSIX)) {
        return;
    }

    msix_free_irq_entries(dev);
    qemu_get_buffer(f, dev->msix_table_page, n * MSIX_ENTRY_SIZE);
    qemu_get_buffer(f, dev->msix_table_page + MSIX_PAGE_PENDING, (n + 7) / 8);
}

/* Does device support MSI-X? */
int msix_present(PCIDevice *dev)
{
    return dev->cap_present & QEMU_PCI_CAP_MSIX;
}

/* Is MSI-X enabled? */
int msix_enabled(PCIDevice *dev)
{
    return (dev->cap_present & QEMU_PCI_CAP_MSIX) &&
        (dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &
         MSIX_ENABLE_MASK);
}

/* Size of bar where MSI-X table resides, or 0 if MSI-X not supported. */
uint32_t msix_bar_size(PCIDevice *dev)
{
    return (dev->cap_present & QEMU_PCI_CAP_MSIX) ?
        dev->msix_bar_size : 0;
}

/* Send an MSI-X message */
void msix_notify(PCIDevice *dev, unsigned vector)
{
    uint8_t *table_entry = dev->msix_table_page + vector * MSIX_ENTRY_SIZE;
    uint64_t address;
    uint32_t data;

    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector])
        return;
    if (msix_is_masked(dev, vector)) {
        msix_set_pending(dev, vector);
        return;
    }

    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        kvm_set_irq(dev->msix_irq_entries[vector].gsi, 1, NULL);
        return;
    }

    address = pci_get_long(table_entry + MSIX_MSG_UPPER_ADDR);
    address = (address << 32) | pci_get_long(table_entry + MSIX_MSG_ADDR);
    data = pci_get_long(table_entry + MSIX_MSG_DATA);
    stl_phys(address, data);
}

void msix_reset(PCIDevice *dev)
{
    if (!(dev->cap_present & QEMU_PCI_CAP_MSIX))
        return;
    msix_free_irq_entries(dev);
    dev->config[dev->msix_cap + MSIX_CONTROL_OFFSET] &=
        ~dev->wmask[dev->msix_cap + MSIX_CONTROL_OFFSET];
    memset(dev->msix_table_page, 0, MSIX_PAGE_SIZE);
    msix_mask_all(dev, dev->msix_entries_nr);
}

/* The PCI spec suggests that devices make it possible for software to
 * configure fewer vectors than supported by the device, but does not
 * specify a standard mechanism for devices to do so.
 *
 * We support this by asking devices to declare vectors software is going to
 * actually use, and checking this on the notification path. Devices that
 * don't want to follow the spec suggestion can declare all vectors as used. */
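
/* Illustrative call sequence (a sketch; "dev" and the vector numbers are
 * hypothetical): a device that only uses two of its vectors would do
 *
 *     msix_vector_use(dev, 0);
 *     msix_vector_use(dev, 1);
 *     ...
 *     msix_notify(dev, 0);           // raise vector 0 on some device event
 *     ...
 *     msix_unuse_all_vectors(dev);   // e.g. on device reset
 *
 * msix_notify() silently ignores vectors that were never marked used. */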

/* Mark vector as used. */
int msix_vector_use(PCIDevice *dev, unsigned vector)
{
    int ret;

    if (vector >= dev->msix_entries_nr)
        return -EINVAL;
    if (kvm_enabled() && kvm_irqchip_in_kernel() &&
        !dev->msix_entry_used[vector]) {
        ret = kvm_msix_vector_add(dev, vector);
        if (ret) {
            return ret;
        }
    }
    ++dev->msix_entry_used[vector];
    return 0;
}

/* Mark vector as unused. */
void msix_vector_unuse(PCIDevice *dev, unsigned vector)
{
    if (vector >= dev->msix_entries_nr || !dev->msix_entry_used[vector]) {
        return;
    }
    if (--dev->msix_entry_used[vector]) {
        return;
    }
    if (kvm_enabled() && kvm_irqchip_in_kernel()) {
        kvm_msix_vector_del(dev, vector);
    }
    msix_clr_pending(dev, vector);
}

void msix_unuse_all_vectors(PCIDevice *dev)
{
    if (!(dev->cap_present & QEMU_PCI_CAP_MSIX))
        return;
    msix_free_irq_entries(dev);
}

/* Invoke the notifier if vector entry is used and unmasked. */
static int msix_notify_if_unmasked(PCIDevice *dev, unsigned vector, int masked)
{
    assert(dev->msix_mask_notifier);
    if (!dev->msix_entry_used[vector] || msix_is_masked(dev, vector)) {
        return 0;
    }
    return dev->msix_mask_notifier(dev, vector, masked);
}

static int msix_set_mask_notifier_for_vector(PCIDevice *dev, unsigned vector)
{
    /* Notifier has been set. Invoke it on unmasked vectors. */
    return msix_notify_if_unmasked(dev, vector, 0);
}

static int msix_unset_mask_notifier_for_vector(PCIDevice *dev, unsigned vector)
{
    /* Notifier will be unset. Invoke it to mask unmasked entries. */
    return msix_notify_if_unmasked(dev, vector, 1);
}

int msix_set_mask_notifier(PCIDevice *dev, msix_mask_notifier_func f)
{
    int r, n;

    assert(!dev->msix_mask_notifier);
    dev->msix_mask_notifier = f;
    for (n = 0; n < dev->msix_entries_nr; ++n) {
        r = msix_set_mask_notifier_for_vector(dev, n);
        if (r < 0) {
            goto undo;
        }
    }
    return 0;

undo:
    while (--n >= 0) {
        msix_unset_mask_notifier_for_vector(dev, n);
    }
    dev->msix_mask_notifier = NULL;
    return r;
}

int msix_unset_mask_notifier(PCIDevice *dev)
{
    int r, n;

    assert(dev->msix_mask_notifier);
    for (n = 0; n < dev->msix_entries_nr; ++n) {
        r = msix_unset_mask_notifier_for_vector(dev, n);
        if (r < 0) {
            goto undo;
        }
    }
    dev->msix_mask_notifier = NULL;
    return 0;

undo:
    while (--n >= 0) {
        msix_set_mask_notifier_for_vector(dev, n);
    }
    return r;
}
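
/* Sketch of a mask notifier (a hypothetical callback shown only for
 * illustration; "my_mask_notifier" is not part of this file). The call
 * sites above pass (dev, vector, masked) and assert a non-negative return:
 *
 *     static int my_mask_notifier(PCIDevice *dev, unsigned vector, int masked)
 *     {
 *         // e.g. enable/disable a fast-path irqfd for this vector
 *         return 0;
 *     }
 *
 *     msix_set_mask_notifier(dev, my_mask_notifier);
 */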