/*
 * MSI-X device support
 *
 * This module includes support for MSI-X in pci devices.
 *
 * Author: Michael S. Tsirkin <mst@redhat.com>
 *
 * Copyright (c) 2009, Red Hat Inc, Michael S. Tsirkin (mst@redhat.com)
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
/* MSI-X capability structure: dword offsets within the capability. */
#define MSIX_TABLE_OFFSET 4
#define MSIX_PBA_OFFSET 8
#define MSIX_CAP_LENGTH 12

/* MSI enable bit and maskall bit are in byte 1 in FLAGS register */
#define MSIX_CONTROL_OFFSET (PCI_MSIX_FLAGS + 1)
#define MSIX_ENABLE_MASK (PCI_MSIX_FLAGS_ENABLE >> 8)
#define MSIX_MASKALL_MASK (PCI_MSIX_FLAGS_MASKALL >> 8)

/* MSI-X table format: byte offsets within one 16-byte table entry. */
#define MSIX_MSG_ADDR 0
#define MSIX_MSG_UPPER_ADDR 4
#define MSIX_MSG_DATA 8
#define MSIX_VECTOR_CTRL 12
#define MSIX_ENTRY_SIZE 16
#define MSIX_VECTOR_MASK 0x1

/* How much space does an MSIX table need.
 * The spec requires giving the table structure
 * a 4K aligned region all by itself. */
#define MSIX_PAGE_SIZE 0x1000
/* Reserve second half of the page for pending bits */
#define MSIX_PAGE_PENDING (MSIX_PAGE_SIZE / 2)
#define MSIX_MAX_ENTRIES 32
/* Flag for interrupt controller to declare MSI-X support */
/* NOTE(review): the declaration below is reconstructed — the mangled source
 * shows only the comment, but msix_save()/msix_load()/msix_init() in this
 * file read `msix_supported`; confirm name/type against the original. */
int msix_supported;

/* KVM specific MSIX helpers */
51 static void kvm_msix_free(PCIDevice
*dev
)
53 int vector
, changed
= 0;
55 for (vector
= 0; vector
< dev
->msix_entries_nr
; ++vector
) {
56 if (dev
->msix_entry_used
[vector
]) {
57 kvm_msi_message_del(&dev
->msix_irq_entries
[vector
]);
62 kvm_commit_irq_routes();
66 static void kvm_msix_message_from_vector(PCIDevice
*dev
, unsigned vector
,
69 uint8_t *table_entry
= dev
->msix_table_page
+ vector
* MSIX_ENTRY_SIZE
;
71 kmm
->addr_lo
= pci_get_long(table_entry
+ MSIX_MSG_ADDR
);
72 kmm
->addr_hi
= pci_get_long(table_entry
+ MSIX_MSG_UPPER_ADDR
);
73 kmm
->data
= pci_get_long(table_entry
+ MSIX_MSG_DATA
);
76 static void kvm_msix_update(PCIDevice
*dev
, int vector
,
77 int was_masked
, int is_masked
)
79 KVMMsiMessage e
= {}, *entry
;
80 int mask_cleared
= was_masked
&& !is_masked
;
81 /* It is only legal to change an entry when it is masked. Therefore, it is
82 * enough to update the routing in kernel when mask is being cleared. */
86 if (!dev
->msix_entry_used
[vector
]) {
89 entry
= dev
->msix_irq_entries
+ vector
;
91 kvm_msix_message_from_vector(dev
, vector
, &e
);
92 if (memcmp(entry
, &e
, sizeof e
) != 0) {
95 r
= kvm_msi_message_update(entry
, &e
);
97 fprintf(stderr
, "%s: kvm_update_msix failed: %s\n", __func__
,
102 r
= kvm_commit_irq_routes();
104 fprintf(stderr
, "%s: kvm_commit_irq_routes failed: %s\n", __func__
,
111 static int kvm_msix_vector_add(PCIDevice
*dev
, unsigned vector
)
113 KVMMsiMessage
*kmm
= dev
->msix_irq_entries
+ vector
;
116 if (!kvm_has_gsi_routing()) {
117 fprintf(stderr
, "Warning: no MSI-X support found. "
118 "At least kernel 2.6.30 is required for MSI-X support.\n"
123 r
= kvm_get_irq_route_gsi();
125 fprintf(stderr
, "%s: kvm_get_irq_route_gsi failed: %s\n", __func__
, strerror(-r
));
129 kvm_msix_message_from_vector(dev
, vector
, kmm
);
130 r
= kvm_msi_message_add(kmm
);
132 fprintf(stderr
, "%s: kvm_add_msix failed: %s\n", __func__
, strerror(-r
));
136 r
= kvm_commit_irq_routes();
138 fprintf(stderr
, "%s: kvm_commit_irq_routes failed: %s\n", __func__
, strerror(-r
));
144 static void kvm_msix_vector_del(PCIDevice
*dev
, unsigned vector
)
146 kvm_msi_message_del(&dev
->msix_irq_entries
[vector
]);
147 kvm_commit_irq_routes();
150 /* Add MSI-X capability to the config space for the device. */
151 /* Given a bar and its size, add MSI-X table on top of it
152 * and fill MSI-X capability in the config space.
153 * Original bar size must be a power of 2 or 0.
154 * New bar size is returned. */
155 static int msix_add_config(struct PCIDevice
*pdev
, unsigned short nentries
,
156 unsigned bar_nr
, unsigned bar_size
)
161 pdev
->msix_bar_size
= bar_size
;
163 config_offset
= pci_find_capability(pdev
, PCI_CAP_ID_MSIX
);
165 if (!config_offset
) {
168 if (nentries
< 1 || nentries
> PCI_MSIX_FLAGS_QSIZE
+ 1)
170 if (bar_size
> 0x80000000)
173 /* Add space for MSI-X structures */
175 new_size
= MSIX_PAGE_SIZE
;
176 } else if (bar_size
< MSIX_PAGE_SIZE
) {
177 bar_size
= MSIX_PAGE_SIZE
;
178 new_size
= MSIX_PAGE_SIZE
* 2;
180 new_size
= bar_size
* 2;
183 pdev
->msix_bar_size
= new_size
;
184 config_offset
= pci_add_capability(pdev
, PCI_CAP_ID_MSIX
,
186 if (config_offset
< 0)
187 return config_offset
;
188 config
= pdev
->config
+ config_offset
;
190 pci_set_word(config
+ PCI_MSIX_FLAGS
, nentries
- 1);
191 /* Table on top of BAR */
192 pci_set_long(config
+ MSIX_TABLE_OFFSET
, bar_size
| bar_nr
);
193 /* Pending bits on top of that */
194 pci_set_long(config
+ MSIX_PBA_OFFSET
, (bar_size
+ MSIX_PAGE_PENDING
) |
197 pdev
->msix_cap
= config_offset
;
198 /* Make flags bit writeable. */
199 pdev
->wmask
[config_offset
+ MSIX_CONTROL_OFFSET
] |= MSIX_ENABLE_MASK
|
204 static uint32_t msix_mmio_readl(void *opaque
, target_phys_addr_t addr
)
206 PCIDevice
*dev
= opaque
;
207 unsigned int offset
= addr
& (MSIX_PAGE_SIZE
- 1) & ~0x3;
208 void *page
= dev
->msix_table_page
;
210 return pci_get_long(page
+ offset
);
213 static uint32_t msix_mmio_read_unallowed(void *opaque
, target_phys_addr_t addr
)
215 fprintf(stderr
, "MSI-X: only dword read is allowed!\n");
/* Bit mask selecting 'vector' within its pending-bits byte. */
static uint8_t msix_pending_mask(int vector)
{
    return 1 << (vector % 8);
}
224 static uint8_t *msix_pending_byte(PCIDevice
*dev
, int vector
)
226 return dev
->msix_table_page
+ MSIX_PAGE_PENDING
+ vector
/ 8;
229 static int msix_is_pending(PCIDevice
*dev
, int vector
)
231 return *msix_pending_byte(dev
, vector
) & msix_pending_mask(vector
);
234 static void msix_set_pending(PCIDevice
*dev
, int vector
)
236 *msix_pending_byte(dev
, vector
) |= msix_pending_mask(vector
);
239 static void msix_clr_pending(PCIDevice
*dev
, int vector
)
241 *msix_pending_byte(dev
, vector
) &= ~msix_pending_mask(vector
);
244 static int msix_function_masked(PCIDevice
*dev
)
246 return dev
->config
[dev
->msix_cap
+ MSIX_CONTROL_OFFSET
] & MSIX_MASKALL_MASK
;
249 static int msix_is_masked(PCIDevice
*dev
, int vector
)
251 unsigned offset
= vector
* MSIX_ENTRY_SIZE
+ MSIX_VECTOR_CTRL
;
252 return msix_function_masked(dev
) ||
253 dev
->msix_table_page
[offset
] & MSIX_VECTOR_MASK
;
256 static void msix_handle_mask_update(PCIDevice
*dev
, int vector
)
258 if (!msix_is_masked(dev
, vector
) && msix_is_pending(dev
, vector
)) {
259 msix_clr_pending(dev
, vector
);
260 msix_notify(dev
, vector
);
264 /* Handle MSI-X capability config write. */
265 void msix_write_config(PCIDevice
*dev
, uint32_t addr
,
266 uint32_t val
, int len
)
268 unsigned enable_pos
= dev
->msix_cap
+ MSIX_CONTROL_OFFSET
;
271 if (!range_covers_byte(addr
, len
, enable_pos
)) {
275 if (!msix_enabled(dev
)) {
279 pci_device_deassert_intx(dev
);
281 if (msix_function_masked(dev
)) {
285 for (vector
= 0; vector
< dev
->msix_entries_nr
; ++vector
) {
286 msix_handle_mask_update(dev
, vector
);
290 static void msix_mmio_writel(void *opaque
, target_phys_addr_t addr
,
293 PCIDevice
*dev
= opaque
;
294 unsigned int offset
= addr
& (MSIX_PAGE_SIZE
- 1) & ~0x3;
295 int vector
= offset
/ MSIX_ENTRY_SIZE
;
296 int was_masked
= msix_is_masked(dev
, vector
);
297 pci_set_long(dev
->msix_table_page
+ offset
, val
);
298 if (kvm_enabled() && kvm_irqchip_in_kernel()) {
299 kvm_msix_update(dev
, vector
, was_masked
, msix_is_masked(dev
, vector
));
301 if (was_masked
!= msix_is_masked(dev
, vector
) && dev
->msix_mask_notifier
) {
302 int r
= dev
->msix_mask_notifier(dev
, vector
,
303 msix_is_masked(dev
, vector
));
306 msix_handle_mask_update(dev
, vector
);
309 static void msix_mmio_write_unallowed(void *opaque
, target_phys_addr_t addr
,
312 fprintf(stderr
, "MSI-X: only dword write is allowed!\n");
315 static CPUWriteMemoryFunc
* const msix_mmio_write
[] = {
316 msix_mmio_write_unallowed
, msix_mmio_write_unallowed
, msix_mmio_writel
319 static CPUReadMemoryFunc
* const msix_mmio_read
[] = {
320 msix_mmio_read_unallowed
, msix_mmio_read_unallowed
, msix_mmio_readl
323 /* Should be called from device's map method. */
324 void msix_mmio_map(PCIDevice
*d
, int region_num
,
325 pcibus_t addr
, pcibus_t size
, int type
)
327 uint8_t *config
= d
->config
+ d
->msix_cap
;
328 uint32_t table
= pci_get_long(config
+ MSIX_TABLE_OFFSET
);
329 uint32_t offset
= table
& ~(MSIX_PAGE_SIZE
- 1);
330 /* TODO: for assigned devices, we'll want to make it possible to map
331 * pending bits separately in case they are in a separate bar. */
332 int table_bir
= table
& PCI_MSIX_FLAGS_BIRMASK
;
334 if (table_bir
!= region_num
)
338 cpu_register_physical_memory(addr
+ offset
,
339 MIN(size
- offset
, MSIX_PAGE_SIZE
),
343 static void msix_mask_all(struct PCIDevice
*dev
, unsigned nentries
)
346 for (vector
= 0; vector
< nentries
; ++vector
) {
347 unsigned offset
= vector
* MSIX_ENTRY_SIZE
+ MSIX_VECTOR_CTRL
;
348 int was_masked
= msix_is_masked(dev
, vector
);
349 dev
->msix_table_page
[offset
] |= MSIX_VECTOR_MASK
;
350 if (was_masked
!= msix_is_masked(dev
, vector
) &&
351 dev
->msix_mask_notifier
) {
352 r
= dev
->msix_mask_notifier(dev
, vector
,
353 msix_is_masked(dev
, vector
));
359 /* Initialize the MSI-X structures. Note: if MSI-X is supported, BAR size is
360 * modified, it should be retrieved with msix_bar_size. */
361 int msix_init(struct PCIDevice
*dev
, unsigned short nentries
,
362 unsigned bar_nr
, unsigned bar_size
)
365 /* Nothing to do if MSI is not supported by interrupt controller */
369 if (nentries
> MSIX_MAX_ENTRIES
)
372 dev
->msix_mask_notifier
= NULL
;
373 dev
->msix_entry_used
= qemu_mallocz(MSIX_MAX_ENTRIES
*
374 sizeof *dev
->msix_entry_used
);
376 dev
->msix_table_page
= qemu_mallocz(MSIX_PAGE_SIZE
);
377 msix_mask_all(dev
, nentries
);
379 dev
->msix_mmio_index
= cpu_register_io_memory(msix_mmio_read
,
380 msix_mmio_write
, dev
,
381 DEVICE_NATIVE_ENDIAN
);
382 if (dev
->msix_mmio_index
== -1) {
387 dev
->msix_entries_nr
= nentries
;
388 ret
= msix_add_config(dev
, nentries
, bar_nr
, bar_size
);
392 if (kvm_enabled() && kvm_irqchip_in_kernel()) {
393 dev
->msix_irq_entries
= qemu_malloc(nentries
*
394 sizeof *dev
->msix_irq_entries
);
397 dev
->cap_present
|= QEMU_PCI_CAP_MSIX
;
401 dev
->msix_entries_nr
= 0;
402 cpu_unregister_io_memory(dev
->msix_mmio_index
);
404 qemu_free(dev
->msix_table_page
);
405 dev
->msix_table_page
= NULL
;
406 qemu_free(dev
->msix_entry_used
);
407 dev
->msix_entry_used
= NULL
;
411 static void msix_free_irq_entries(PCIDevice
*dev
)
415 if (kvm_enabled() && kvm_irqchip_in_kernel()) {
419 for (vector
= 0; vector
< dev
->msix_entries_nr
; ++vector
) {
420 dev
->msix_entry_used
[vector
] = 0;
421 msix_clr_pending(dev
, vector
);
425 /* Clean up resources for the device. */
426 int msix_uninit(PCIDevice
*dev
)
428 if (!(dev
->cap_present
& QEMU_PCI_CAP_MSIX
))
430 pci_del_capability(dev
, PCI_CAP_ID_MSIX
, MSIX_CAP_LENGTH
);
432 msix_free_irq_entries(dev
);
433 dev
->msix_entries_nr
= 0;
434 cpu_unregister_io_memory(dev
->msix_mmio_index
);
435 qemu_free(dev
->msix_table_page
);
436 dev
->msix_table_page
= NULL
;
437 qemu_free(dev
->msix_entry_used
);
438 dev
->msix_entry_used
= NULL
;
439 qemu_free(dev
->msix_irq_entries
);
440 dev
->msix_irq_entries
= NULL
;
441 dev
->cap_present
&= ~QEMU_PCI_CAP_MSIX
;
445 void msix_save(PCIDevice
*dev
, QEMUFile
*f
)
447 unsigned n
= dev
->msix_entries_nr
;
449 if (!msix_supported
) {
453 if (!(dev
->cap_present
& QEMU_PCI_CAP_MSIX
)) {
456 qemu_put_buffer(f
, dev
->msix_table_page
, n
* MSIX_ENTRY_SIZE
);
457 qemu_put_buffer(f
, dev
->msix_table_page
+ MSIX_PAGE_PENDING
, (n
+ 7) / 8);
460 /* Should be called after restoring the config space. */
461 void msix_load(PCIDevice
*dev
, QEMUFile
*f
)
463 unsigned n
= dev
->msix_entries_nr
;
468 if (!(dev
->cap_present
& QEMU_PCI_CAP_MSIX
)) {
472 msix_free_irq_entries(dev
);
473 qemu_get_buffer(f
, dev
->msix_table_page
, n
* MSIX_ENTRY_SIZE
);
474 qemu_get_buffer(f
, dev
->msix_table_page
+ MSIX_PAGE_PENDING
, (n
+ 7) / 8);
477 /* Does device support MSI-X? */
478 int msix_present(PCIDevice
*dev
)
480 return dev
->cap_present
& QEMU_PCI_CAP_MSIX
;
483 /* Is MSI-X enabled? */
484 int msix_enabled(PCIDevice
*dev
)
486 return (dev
->cap_present
& QEMU_PCI_CAP_MSIX
) &&
487 (dev
->config
[dev
->msix_cap
+ MSIX_CONTROL_OFFSET
] &
491 /* Size of bar where MSI-X table resides, or 0 if MSI-X not supported. */
492 uint32_t msix_bar_size(PCIDevice
*dev
)
494 return (dev
->cap_present
& QEMU_PCI_CAP_MSIX
) ?
495 dev
->msix_bar_size
: 0;
498 /* Send an MSI-X message */
499 void msix_notify(PCIDevice
*dev
, unsigned vector
)
501 uint8_t *table_entry
= dev
->msix_table_page
+ vector
* MSIX_ENTRY_SIZE
;
505 if (vector
>= dev
->msix_entries_nr
|| !dev
->msix_entry_used
[vector
])
507 if (msix_is_masked(dev
, vector
)) {
508 msix_set_pending(dev
, vector
);
512 if (kvm_enabled() && kvm_irqchip_in_kernel()) {
513 kvm_set_irq(dev
->msix_irq_entries
[vector
].gsi
, 1, NULL
);
517 address
= pci_get_long(table_entry
+ MSIX_MSG_UPPER_ADDR
);
518 address
= (address
<< 32) | pci_get_long(table_entry
+ MSIX_MSG_ADDR
);
519 data
= pci_get_long(table_entry
+ MSIX_MSG_DATA
);
520 stl_phys(address
, data
);
523 void msix_reset(PCIDevice
*dev
)
525 if (!(dev
->cap_present
& QEMU_PCI_CAP_MSIX
))
527 msix_free_irq_entries(dev
);
528 dev
->config
[dev
->msix_cap
+ MSIX_CONTROL_OFFSET
] &=
529 ~dev
->wmask
[dev
->msix_cap
+ MSIX_CONTROL_OFFSET
];
530 memset(dev
->msix_table_page
, 0, MSIX_PAGE_SIZE
);
531 msix_mask_all(dev
, dev
->msix_entries_nr
);
/* PCI spec suggests that devices make it possible for software to configure
 * less vectors than supported by the device, but does not specify a standard
 * mechanism for devices to do so.
 *
 * We support this by asking devices to declare vectors software is going to
 * actually use, and checking this on the notification path. Devices that
 * don't want to follow the spec suggestion can declare all vectors as used. */
542 /* Mark vector as used. */
543 int msix_vector_use(PCIDevice
*dev
, unsigned vector
)
546 if (vector
>= dev
->msix_entries_nr
)
548 if (kvm_enabled() && kvm_irqchip_in_kernel() &&
549 !dev
->msix_entry_used
[vector
]) {
550 ret
= kvm_msix_vector_add(dev
, vector
);
555 ++dev
->msix_entry_used
[vector
];
559 /* Mark vector as unused. */
560 void msix_vector_unuse(PCIDevice
*dev
, unsigned vector
)
562 if (vector
>= dev
->msix_entries_nr
|| !dev
->msix_entry_used
[vector
]) {
565 if (--dev
->msix_entry_used
[vector
]) {
568 if (kvm_enabled() && kvm_irqchip_in_kernel()) {
569 kvm_msix_vector_del(dev
, vector
);
571 msix_clr_pending(dev
, vector
);
574 void msix_unuse_all_vectors(PCIDevice
*dev
)
576 if (!(dev
->cap_present
& QEMU_PCI_CAP_MSIX
))
578 msix_free_irq_entries(dev
);
581 /* Invoke the notifier if vector entry is used and unmasked. */
582 static int msix_notify_if_unmasked(PCIDevice
*dev
, unsigned vector
, int masked
)
584 assert(dev
->msix_mask_notifier
);
585 if (!dev
->msix_entry_used
[vector
] || msix_is_masked(dev
, vector
)) {
588 return dev
->msix_mask_notifier(dev
, vector
, masked
);
591 static int msix_set_mask_notifier_for_vector(PCIDevice
*dev
, unsigned vector
)
593 /* Notifier has been set. Invoke it on unmasked vectors. */
594 return msix_notify_if_unmasked(dev
, vector
, 0);
597 static int msix_unset_mask_notifier_for_vector(PCIDevice
*dev
, unsigned vector
)
599 /* Notifier will be unset. Invoke it to mask unmasked entries. */
600 return msix_notify_if_unmasked(dev
, vector
, 1);
603 int msix_set_mask_notifier(PCIDevice
*dev
, msix_mask_notifier_func f
)
606 assert(!dev
->msix_mask_notifier
);
607 dev
->msix_mask_notifier
= f
;
608 for (n
= 0; n
< dev
->msix_entries_nr
; ++n
) {
609 r
= msix_set_mask_notifier_for_vector(dev
, n
);
618 msix_unset_mask_notifier_for_vector(dev
, n
);
620 dev
->msix_mask_notifier
= NULL
;
624 int msix_unset_mask_notifier(PCIDevice
*dev
)
627 assert(dev
->msix_mask_notifier
);
628 for (n
= 0; n
< dev
->msix_entries_nr
; ++n
) {
629 r
= msix_unset_mask_notifier_for_vector(dev
, n
);
634 dev
->msix_mask_notifier
= NULL
;
639 msix_set_mask_notifier_for_vector(dev
, n
);