2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2011 NetApp, Inc.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * Copyright 2019 Joyent, Inc.
33 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/kernel.h>
43 #include <sys/module.h>
45 #include <sys/pciio.h>
46 #include <sys/sysctl.h>
48 #include <dev/pci/pcivar.h>
49 #include <dev/pci/pcireg.h>
51 #include <machine/vmm.h>
52 #include <machine/vmm_dev.h>
57 #include <sys/sunddi.h>
59 #include <sys/pci_cap.h>
60 #include <sys/pcie_impl.h>
61 #include <sys/ppt_dev.h>
62 #include <sys/mkdev.h>
63 #include <sys/sysmacros.h>
65 #include "vmm_lapic.h"
70 #define MAX_MSIMSGS 32
73 * If the MSI-X table is located in the middle of a BAR then that MMIO
74 * region gets split into two segments - one segment above the MSI-X table
75 * and the other segment below the MSI-X table - with a hole in place of
76 * the MSI-X table so accesses to it can be trapped and emulated.
78 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
80 #define MAX_MMIOSEGS ((PCIR_MAX_BAR_0 + 1) + 1)
83 struct pptdev
*pptdev
;
98 ddi_acc_handle_t io_handle
;
104 dev_info_t
*pptd_dip
;
105 list_node_t pptd_node
;
106 ddi_acc_handle_t pptd_cfg
;
107 struct pptbar pptd_bars
[PCI_BASE_NUM
];
109 struct pptseg mmio
[MAX_MMIOSEGS
];
111 int num_msgs
; /* guest state */
114 ddi_intr_handle_t
*inth
;
115 struct pptintr_arg arg
[MAX_MSIMSGS
];
122 ddi_intr_handle_t
*inth
;
123 struct pptintr_arg
*arg
;
128 static major_t ppt_major
;
129 static void *ppt_state
;
130 static kmutex_t pptdev_mtx
;
131 static list_t pptdev_list
;
133 #define PPT_MINOR_NAME "ppt"
135 static ddi_device_acc_attr_t ppt_attr
= {
138 DDI_STORECACHING_OK_ACC
,
143 ppt_open(dev_t
*devp
, int flag
, int otyp
, cred_t
*cr
)
145 /* XXX: require extra privs? */
149 #define BAR_TO_IDX(bar) (((bar) - PCI_CONF_BASE0) / PCI_BAR_SZ_32)
150 #define BAR_VALID(b) ( \
151 (b) >= PCI_CONF_BASE0 && \
152 (b) <= PCI_CONF_BASE5 && \
153 ((b) & (PCI_BAR_SZ_32-1)) == 0)
156 ppt_ioctl(dev_t dev
, int cmd
, intptr_t arg
, int md
, cred_t
*cr
, int *rv
)
158 minor_t minor
= getminor(dev
);
160 void *data
= (void *)arg
;
162 if ((ppt
= ddi_get_soft_state(ppt_state
, minor
)) == NULL
) {
168 struct ppt_cfg_io cio
;
169 ddi_acc_handle_t cfg
= ppt
->pptd_cfg
;
171 if (ddi_copyin(data
, &cio
, sizeof (cio
), md
) != 0) {
174 switch (cio
.pci_width
) {
176 cio
.pci_data
= pci_config_get32(cfg
, cio
.pci_off
);
179 cio
.pci_data
= pci_config_get16(cfg
, cio
.pci_off
);
182 cio
.pci_data
= pci_config_get8(cfg
, cio
.pci_off
);
188 if (ddi_copyout(&cio
, data
, sizeof (cio
), md
) != 0) {
193 case PPT_CFG_WRITE
: {
194 struct ppt_cfg_io cio
;
195 ddi_acc_handle_t cfg
= ppt
->pptd_cfg
;
197 if (ddi_copyin(data
, &cio
, sizeof (cio
), md
) != 0) {
200 switch (cio
.pci_width
) {
202 pci_config_put32(cfg
, cio
.pci_off
, cio
.pci_data
);
205 pci_config_put16(cfg
, cio
.pci_off
, cio
.pci_data
);
208 pci_config_put8(cfg
, cio
.pci_off
, cio
.pci_data
);
216 case PPT_BAR_QUERY
: {
217 struct ppt_bar_query barg
;
220 if (ddi_copyin(data
, &barg
, sizeof (barg
), md
) != 0) {
223 if (barg
.pbq_baridx
>= PCI_BASE_NUM
) {
226 pbar
= &ppt
->pptd_bars
[barg
.pbq_baridx
];
228 if (pbar
->base
== 0 || pbar
->size
== 0) {
231 barg
.pbq_type
= pbar
->type
;
232 barg
.pbq_base
= pbar
->base
;
233 barg
.pbq_size
= pbar
->size
;
235 if (ddi_copyout(&barg
, data
, sizeof (barg
), md
) != 0) {
241 struct ppt_bar_io bio
;
245 ddi_acc_handle_t cfg
;
247 if (ddi_copyin(data
, &bio
, sizeof (bio
), md
) != 0) {
251 if (rnum
>= PCI_BASE_NUM
) {
254 pbar
= &ppt
->pptd_bars
[rnum
];
255 if (pbar
->type
!= PCI_ADDR_IO
|| pbar
->io_handle
== NULL
) {
258 addr
= pbar
->io_ptr
+ bio
.pbi_off
;
260 switch (bio
.pbi_width
) {
262 bio
.pbi_data
= ddi_get32(pbar
->io_handle
, addr
);
265 bio
.pbi_data
= ddi_get16(pbar
->io_handle
, addr
);
268 bio
.pbi_data
= ddi_get8(pbar
->io_handle
, addr
);
274 if (ddi_copyout(&bio
, data
, sizeof (bio
), md
) != 0) {
279 case PPT_BAR_WRITE
: {
280 struct ppt_bar_io bio
;
284 ddi_acc_handle_t cfg
;
286 if (ddi_copyin(data
, &bio
, sizeof (bio
), md
) != 0) {
290 if (rnum
>= PCI_BASE_NUM
) {
293 pbar
= &ppt
->pptd_bars
[rnum
];
294 if (pbar
->type
!= PCI_ADDR_IO
|| pbar
->io_handle
== NULL
) {
297 addr
= pbar
->io_ptr
+ bio
.pbi_off
;
299 switch (bio
.pbi_width
) {
301 ddi_put32(pbar
->io_handle
, addr
, bio
.pbi_data
);
304 ddi_put16(pbar
->io_handle
, addr
, bio
.pbi_data
);
307 ddi_put8(pbar
->io_handle
, addr
, bio
.pbi_data
);
324 ppt_find_msix_table_bar(struct pptdev
*ppt
)
329 if (PCI_CAP_LOCATE(ppt
->pptd_cfg
, PCI_CAP_ID_MSI_X
, &base
) !=
333 off
= pci_config_get32(ppt
->pptd_cfg
, base
+ PCI_MSIX_TBL_OFFSET
);
335 if (off
== PCI_EINVAL32
)
338 return (off
& PCI_MSIX_TBL_BIR_MASK
);
342 ppt_devmap(dev_t dev
, devmap_cookie_t dhp
, offset_t off
, size_t len
,
343 size_t *maplen
, uint_t model
)
350 minor
= getminor(dev
);
352 if ((ppt
= ddi_get_soft_state(ppt_state
, minor
)) == NULL
)
355 #ifdef _MULTI_DATAMODEL
356 if (ddi_model_convert_from(model
) != DDI_MODEL_NONE
)
360 if (off
< 0 || off
!= P2ALIGN(off
, PAGESIZE
))
363 if ((bar
= ppt_find_msix_table_bar(ppt
)) == -1)
366 ddireg
= ppt
->pptd_bars
[bar
].ddireg
;
371 err
= devmap_devmem_setup(dhp
, ppt
->pptd_dip
, NULL
, ddireg
, off
, len
,
372 PROT_USER
| PROT_READ
| PROT_WRITE
, IOMEM_DATA_CACHED
, &ppt_attr
);
374 if (err
== DDI_SUCCESS
)
381 ppt_bar_wipe(struct pptdev
*ppt
)
385 for (i
= 0; i
< PCI_BASE_NUM
; i
++) {
386 struct pptbar
*pbar
= &ppt
->pptd_bars
[i
];
387 if (pbar
->type
== PCI_ADDR_IO
&& pbar
->io_handle
!= NULL
) {
388 ddi_regs_map_free(&pbar
->io_handle
);
391 bzero(&ppt
->pptd_bars
, sizeof (ppt
->pptd_bars
));
395 ppt_bar_crawl(struct pptdev
*ppt
)
401 if (ddi_getlongprop(DDI_DEV_T_ANY
, ppt
->pptd_dip
, DDI_PROP_DONTPASS
,
402 "assigned-addresses", (caddr_t
)®s
, &rlen
) != DDI_PROP_SUCCESS
) {
406 VERIFY3S(rlen
, >, 0);
407 rcount
= rlen
/ sizeof (pci_regspec_t
);
408 for (i
= 0; i
< rcount
; i
++) {
409 pci_regspec_t
*reg
= ®s
[i
];
413 DTRACE_PROBE1(ppt__crawl__reg
, pci_regspec_t
*, reg
);
414 bar
= PCI_REG_REG_G(reg
->pci_phys_hi
);
415 if (!BAR_VALID(bar
)) {
419 rnum
= BAR_TO_IDX(bar
);
420 pbar
= &ppt
->pptd_bars
[rnum
];
421 /* is this somehow already populated? */
422 if (pbar
->base
!= 0 || pbar
->size
!= 0) {
428 * Register 0 corresponds to the PCI config space.
429 * The registers which match the assigned-addresses list are
432 pbar
->ddireg
= i
+ 1;
434 pbar
->type
= reg
->pci_phys_hi
& PCI_ADDR_MASK
;
435 pbar
->base
= ((uint64_t)reg
->pci_phys_mid
<< 32) |
436 (uint64_t)reg
->pci_phys_low
;
437 pbar
->size
= ((uint64_t)reg
->pci_size_hi
<< 32) |
438 (uint64_t)reg
->pci_size_low
;
439 if (pbar
->type
== PCI_ADDR_IO
) {
440 err
= ddi_regs_map_setup(ppt
->pptd_dip
, rnum
,
441 &pbar
->io_ptr
, 0, 0, &ppt_attr
, &pbar
->io_handle
);
447 kmem_free(regs
, rlen
);
456 ppt_bar_verify_mmio(struct pptdev
*ppt
, uint64_t base
, uint64_t size
)
458 const uint64_t map_end
= base
+ size
;
460 /* Zero-length or overflow mappings are not valid */
461 if (map_end
<= base
) {
464 /* MMIO bounds should be page-aligned */
465 if ((base
& PAGEOFFSET
) != 0 || (size
& PAGEOFFSET
) != 0) {
469 for (uint_t i
= 0; i
< PCI_BASE_NUM
; i
++) {
470 const struct pptbar
*bar
= &ppt
->pptd_bars
[i
];
471 const uint64_t bar_end
= bar
->base
+ bar
->size
;
473 /* Only memory BARs can be mapped */
474 if (bar
->type
!= PCI_ADDR_MEM32
&&
475 bar
->type
!= PCI_ADDR_MEM64
) {
479 /* Does the mapping fit within this BAR? */
480 if (base
< bar
->base
|| base
>= bar_end
||
481 map_end
< bar
->base
|| map_end
> bar_end
) {
485 /* This BAR satisfies the provided map */
492 ppt_ddi_attach(dev_info_t
*dip
, ddi_attach_cmd_t cmd
)
494 struct pptdev
*ppt
= NULL
;
495 char name
[PPT_MAXNAMELEN
];
498 if (cmd
!= DDI_ATTACH
)
499 return (DDI_FAILURE
);
501 inst
= ddi_get_instance(dip
);
503 if (ddi_soft_state_zalloc(ppt_state
, inst
) != DDI_SUCCESS
) {
506 VERIFY(ppt
= ddi_get_soft_state(ppt_state
, inst
));
508 ddi_set_driver_private(dip
, ppt
);
510 if (pci_config_setup(dip
, &ppt
->pptd_cfg
) != DDI_SUCCESS
) {
513 if (ppt_bar_crawl(ppt
) != 0) {
516 if (ddi_create_minor_node(dip
, PPT_MINOR_NAME
, S_IFCHR
, inst
,
517 DDI_PSEUDO
, 0) != DDI_SUCCESS
) {
521 mutex_enter(&pptdev_mtx
);
522 list_insert_tail(&pptdev_list
, ppt
);
523 mutex_exit(&pptdev_mtx
);
525 return (DDI_SUCCESS
);
529 ddi_remove_minor_node(dip
, NULL
);
530 if (ppt
->pptd_cfg
!= NULL
) {
531 pci_config_teardown(&ppt
->pptd_cfg
);
534 ddi_soft_state_free(ppt_state
, inst
);
536 return (DDI_FAILURE
);
540 ppt_ddi_detach(dev_info_t
*dip
, ddi_detach_cmd_t cmd
)
545 if (cmd
!= DDI_DETACH
)
546 return (DDI_FAILURE
);
548 ppt
= ddi_get_driver_private(dip
);
549 inst
= ddi_get_instance(dip
);
551 ASSERT3P(ddi_get_soft_state(ppt_state
, inst
), ==, ppt
);
553 mutex_enter(&pptdev_mtx
);
554 if (ppt
->vm
!= NULL
) {
555 mutex_exit(&pptdev_mtx
);
556 return (DDI_FAILURE
);
558 list_remove(&pptdev_list
, ppt
);
559 mutex_exit(&pptdev_mtx
);
561 ddi_remove_minor_node(dip
, PPT_MINOR_NAME
);
563 pci_config_teardown(&ppt
->pptd_cfg
);
564 ddi_set_driver_private(dip
, NULL
);
565 ddi_soft_state_free(ppt_state
, inst
);
567 return (DDI_SUCCESS
);
571 ppt_ddi_info(dev_info_t
*dip
, ddi_info_cmd_t cmd
, void *arg
, void **result
)
573 int error
= DDI_FAILURE
;
574 int inst
= getminor((dev_t
)arg
);
577 case DDI_INFO_DEVT2DEVINFO
: {
578 struct pptdev
*ppt
= ddi_get_soft_state(ppt_state
, inst
);
581 *result
= (void *)ppt
->pptd_dip
;
586 case DDI_INFO_DEVT2INSTANCE
: {
587 *result
= (void *)(uintptr_t)inst
;
597 static struct cb_ops ppt_cb_ops
= {
600 nodev
, /* strategy */
606 ppt_devmap
, /* devmap */
612 D_NEW
| D_MP
| D_64BIT
| D_DEVMAP
,
616 static struct dev_ops ppt_ops
= {
620 nulldev
, /* identify */
626 (struct bus_ops
*)NULL
629 static struct modldrv modldrv
= {
631 "bhyve pci pass-thru",
635 static struct modlinkage modlinkage
= {
646 mutex_init(&pptdev_mtx
, NULL
, MUTEX_DRIVER
, NULL
);
647 list_create(&pptdev_list
, sizeof (struct pptdev
),
648 offsetof(struct pptdev
, pptd_node
));
650 error
= ddi_soft_state_init(&ppt_state
, sizeof (struct pptdev
), 0);
655 error
= mod_install(&modlinkage
);
657 ppt_major
= ddi_name_to_major("ppt");
660 ddi_soft_state_fini(&ppt_state
);
670 error
= mod_remove(&modlinkage
);
673 ddi_soft_state_fini(&ppt_state
);
679 _info(struct modinfo
*modinfop
)
681 return (mod_info(&modlinkage
, modinfop
));
685 ppt_wait_for_pending_txn(dev_info_t
*dip
, uint_t max_delay_us
)
687 uint16_t cap_ptr
, devsts
;
688 ddi_acc_handle_t hdl
;
690 if (pci_config_setup(dip
, &hdl
) != DDI_SUCCESS
)
693 if (PCI_CAP_LOCATE(hdl
, PCI_CAP_ID_PCI_E
, &cap_ptr
) != DDI_SUCCESS
) {
694 pci_config_teardown(&hdl
);
698 devsts
= PCI_CAP_GET16(hdl
, 0, cap_ptr
, PCIE_DEVSTS
);
699 while ((devsts
& PCIE_DEVSTS_TRANS_PENDING
) != 0) {
700 if (max_delay_us
== 0) {
701 pci_config_teardown(&hdl
);
705 /* Poll once every 100 milliseconds up to the timeout. */
706 if (max_delay_us
> 100000) {
707 delay(drv_usectohz(100000));
708 max_delay_us
-= 100000;
710 delay(drv_usectohz(max_delay_us
));
713 devsts
= PCI_CAP_GET16(hdl
, 0, cap_ptr
, PCIE_DEVSTS
);
716 pci_config_teardown(&hdl
);
721 ppt_max_completion_tmo_us(dev_info_t
*dip
)
725 ddi_acc_handle_t hdl
;
726 uint_t timo_ranges
[] = { /* timeout ranges */
745 if (pci_config_setup(dip
, &hdl
) != DDI_SUCCESS
)
746 return (50000); /* default 50ms */
748 if (PCI_CAP_LOCATE(hdl
, PCI_CAP_ID_PCI_E
, &cap_ptr
) != DDI_SUCCESS
)
751 if ((PCI_CAP_GET16(hdl
, 0, cap_ptr
, PCIE_PCIECAP
) &
752 PCIE_PCIECAP_VER_MASK
) < PCIE_PCIECAP_VER_2_0
)
755 if ((PCI_CAP_GET32(hdl
, 0, cap_ptr
, PCIE_DEVCAP2
) &
756 PCIE_DEVCTL2_COM_TO_RANGE_MASK
) == 0)
759 timo
= timo_ranges
[PCI_CAP_GET16(hdl
, 0, cap_ptr
, PCIE_DEVCTL2
) &
760 PCIE_DEVCAP2_COM_TO_RANGE_MASK
];
764 timo
= 50000; /* default 50ms */
766 pci_config_teardown(&hdl
);
771 ppt_flr(dev_info_t
*dip
, boolean_t force
)
773 uint16_t cap_ptr
, ctl
, cmd
;
774 ddi_acc_handle_t hdl
;
775 uint_t compl_delay
= 0, max_delay_us
;
777 if (pci_config_setup(dip
, &hdl
) != DDI_SUCCESS
)
780 if (PCI_CAP_LOCATE(hdl
, PCI_CAP_ID_PCI_E
, &cap_ptr
) != DDI_SUCCESS
)
783 if ((PCI_CAP_GET32(hdl
, 0, cap_ptr
, PCIE_DEVCAP
) & PCIE_DEVCAP_FLR
)
787 max_delay_us
= MAX(ppt_max_completion_tmo_us(dip
), 10000);
790 * Disable busmastering to prevent generation of new transactions while
791 * waiting for the device to go idle. If the idle timeout fails, the
792 * command register is restored which will re-enable busmastering.
794 cmd
= pci_config_get16(hdl
, PCI_CONF_COMM
);
795 pci_config_put16(hdl
, PCI_CONF_COMM
, cmd
& ~PCI_COMM_ME
);
796 if (!ppt_wait_for_pending_txn(dip
, max_delay_us
)) {
798 pci_config_put16(hdl
, PCI_CONF_COMM
, cmd
);
801 dev_err(dip
, CE_WARN
,
802 "?Resetting with transactions pending after %u us\n",
806 * Extend the post-FLR delay to cover the maximum Completion
807 * Timeout delay of anything in flight during the FLR delay.
808 * Enforce a minimum delay of at least 10ms.
810 compl_delay
= MAX(10, (ppt_max_completion_tmo_us(dip
) / 1000));
813 /* Initiate the reset. */
814 ctl
= PCI_CAP_GET16(hdl
, 0, cap_ptr
, PCIE_DEVCTL
);
815 (void) PCI_CAP_PUT16(hdl
, 0, cap_ptr
, PCIE_DEVCTL
,
816 ctl
| PCIE_DEVCTL_INITIATE_FLR
);
818 /* Wait for at least 100ms */
819 delay(drv_usectohz((100 + compl_delay
) * 1000));
821 pci_config_teardown(&hdl
);
826 * TODO: If the FLR fails for some reason, we should attempt a reset
827 * using the PCI power management facilities (if possible).
829 pci_config_teardown(&hdl
);
834 ppt_findf(struct vm
*vm
, int fd
, struct pptdev
**pptp
)
836 struct pptdev
*ppt
= NULL
;
841 ASSERT(MUTEX_HELD(&pptdev_mtx
));
843 if ((fp
= getf(fd
)) == NULL
)
846 va
.va_mask
= AT_RDEV
;
847 if (VOP_GETATTR(fp
->f_vnode
, &va
, NO_FOLLOW
, fp
->f_cred
, NULL
) != 0 ||
848 getmajor(va
.va_rdev
) != ppt_major
) {
853 ppt
= ddi_get_soft_state(ppt_state
, getminor(va
.va_rdev
));
874 ppt_unmap_all_mmio(struct vm
*vm
, struct pptdev
*ppt
)
879 for (i
= 0; i
< MAX_MMIOSEGS
; i
++) {
883 (void) vm_unmap_mmio(vm
, seg
->gpa
, seg
->len
);
884 bzero(seg
, sizeof (struct pptseg
));
889 ppt_teardown_msi(struct pptdev
*ppt
)
893 if (ppt
->msi
.num_msgs
== 0)
896 for (i
= 0; i
< ppt
->msi
.num_msgs
; i
++) {
899 (void) ddi_intr_get_cap(ppt
->msi
.inth
[i
], &intr_cap
);
900 if (intr_cap
& DDI_INTR_FLAG_BLOCK
)
901 ddi_intr_block_disable(&ppt
->msi
.inth
[i
], 1);
903 ddi_intr_disable(ppt
->msi
.inth
[i
]);
905 ddi_intr_remove_handler(ppt
->msi
.inth
[i
]);
906 ddi_intr_free(ppt
->msi
.inth
[i
]);
908 ppt
->msi
.inth
[i
] = NULL
;
911 kmem_free(ppt
->msi
.inth
, ppt
->msi
.inth_sz
);
912 ppt
->msi
.inth
= NULL
;
913 ppt
->msi
.inth_sz
= 0;
914 ppt
->msi
.is_fixed
= B_FALSE
;
916 ppt
->msi
.num_msgs
= 0;
920 ppt_teardown_msix_intr(struct pptdev
*ppt
, int idx
)
922 if (ppt
->msix
.inth
!= NULL
&& ppt
->msix
.inth
[idx
] != NULL
) {
925 (void) ddi_intr_get_cap(ppt
->msix
.inth
[idx
], &intr_cap
);
926 if (intr_cap
& DDI_INTR_FLAG_BLOCK
)
927 ddi_intr_block_disable(&ppt
->msix
.inth
[idx
], 1);
929 ddi_intr_disable(ppt
->msix
.inth
[idx
]);
931 ddi_intr_remove_handler(ppt
->msix
.inth
[idx
]);
936 ppt_teardown_msix(struct pptdev
*ppt
)
940 if (ppt
->msix
.num_msgs
== 0)
943 for (i
= 0; i
< ppt
->msix
.num_msgs
; i
++)
944 ppt_teardown_msix_intr(ppt
, i
);
946 if (ppt
->msix
.inth
) {
947 for (i
= 0; i
< ppt
->msix
.num_msgs
; i
++)
948 ddi_intr_free(ppt
->msix
.inth
[i
]);
949 kmem_free(ppt
->msix
.inth
, ppt
->msix
.inth_sz
);
950 ppt
->msix
.inth
= NULL
;
951 ppt
->msix
.inth_sz
= 0;
952 kmem_free(ppt
->msix
.arg
, ppt
->msix
.arg_sz
);
953 ppt
->msix
.arg
= NULL
;
954 ppt
->msix
.arg_sz
= 0;
957 ppt
->msix
.num_msgs
= 0;
961 ppt_assigned_devices(struct vm
*vm
)
966 mutex_enter(&pptdev_mtx
);
967 for (ppt
= list_head(&pptdev_list
); ppt
!= NULL
;
968 ppt
= list_next(&pptdev_list
, ppt
)) {
973 mutex_exit(&pptdev_mtx
);
978 ppt_is_mmio(struct vm
*vm
, vm_paddr_t gpa
)
980 struct pptdev
*ppt
= list_head(&pptdev_list
);
982 /* XXX: this should probably be restructured to avoid the lock */
983 mutex_enter(&pptdev_mtx
);
984 for (ppt
= list_head(&pptdev_list
); ppt
!= NULL
;
985 ppt
= list_next(&pptdev_list
, ppt
)) {
990 for (uint_t i
= 0; i
< MAX_MMIOSEGS
; i
++) {
991 struct pptseg
*seg
= &ppt
->mmio
[i
];
995 if (gpa
>= seg
->gpa
&& gpa
< seg
->gpa
+ seg
->len
) {
996 mutex_exit(&pptdev_mtx
);
1002 mutex_exit(&pptdev_mtx
);
1007 ppt_assign_device(struct vm
*vm
, int pptfd
)
1012 mutex_enter(&pptdev_mtx
);
1013 /* Passing NULL requires the device to be unowned. */
1014 err
= ppt_findf(NULL
, pptfd
, &ppt
);
1016 mutex_exit(&pptdev_mtx
);
1020 if (pci_save_config_regs(ppt
->pptd_dip
) != DDI_SUCCESS
) {
1024 ppt_flr(ppt
->pptd_dip
, B_TRUE
);
1027 * Restore the device state after reset and then perform another save
1028 * so the "pristine" state can be restored when the device is removed
1031 if (pci_restore_config_regs(ppt
->pptd_dip
) != DDI_SUCCESS
||
1032 pci_save_config_regs(ppt
->pptd_dip
) != DDI_SUCCESS
) {
1038 iommu_remove_device(iommu_host_domain(), pci_get_bdf(ppt
->pptd_dip
));
1039 iommu_add_device(vm_iommu_domain(vm
), pci_get_bdf(ppt
->pptd_dip
));
1040 pf_set_passthru(ppt
->pptd_dip
, B_TRUE
);
1044 mutex_exit(&pptdev_mtx
);
1049 ppt_reset_pci_power_state(dev_info_t
*dip
)
1051 ddi_acc_handle_t cfg
;
1054 if (pci_config_setup(dip
, &cfg
) != DDI_SUCCESS
)
1057 if (PCI_CAP_LOCATE(cfg
, PCI_CAP_ID_PM
, &cap_ptr
) == DDI_SUCCESS
) {
1060 val
= PCI_CAP_GET16(cfg
, 0, cap_ptr
, PCI_PMCSR
);
1061 if ((val
& PCI_PMCSR_STATE_MASK
) != PCI_PMCSR_D0
) {
1062 val
= (val
& ~PCI_PMCSR_STATE_MASK
) | PCI_PMCSR_D0
;
1063 (void) PCI_CAP_PUT16(cfg
, 0, cap_ptr
, PCI_PMCSR
,
1068 pci_config_teardown(&cfg
);
1072 ppt_do_unassign(struct pptdev
*ppt
)
1074 struct vm
*vm
= ppt
->vm
;
1076 ASSERT3P(vm
, !=, NULL
);
1077 ASSERT(MUTEX_HELD(&pptdev_mtx
));
1080 ppt_flr(ppt
->pptd_dip
, B_TRUE
);
1083 * Restore from the state saved during device assignment.
1084 * If the device power state has been altered, that must be remedied
1085 * first, as it will reset register state during the transition.
1087 ppt_reset_pci_power_state(ppt
->pptd_dip
);
1088 (void) pci_restore_config_regs(ppt
->pptd_dip
);
1090 pf_set_passthru(ppt
->pptd_dip
, B_FALSE
);
1092 ppt_unmap_all_mmio(vm
, ppt
);
1093 ppt_teardown_msi(ppt
);
1094 ppt_teardown_msix(ppt
);
1095 iommu_remove_device(vm_iommu_domain(vm
), pci_get_bdf(ppt
->pptd_dip
));
1096 iommu_add_device(iommu_host_domain(), pci_get_bdf(ppt
->pptd_dip
));
1101 ppt_unassign_device(struct vm
*vm
, int pptfd
)
1106 mutex_enter(&pptdev_mtx
);
1107 err
= ppt_findf(vm
, pptfd
, &ppt
);
1109 mutex_exit(&pptdev_mtx
);
1113 ppt_do_unassign(ppt
);
1116 mutex_exit(&pptdev_mtx
);
1121 ppt_unassign_all(struct vm
*vm
)
1125 mutex_enter(&pptdev_mtx
);
1126 for (ppt
= list_head(&pptdev_list
); ppt
!= NULL
;
1127 ppt
= list_next(&pptdev_list
, ppt
)) {
1128 if (ppt
->vm
== vm
) {
1129 ppt_do_unassign(ppt
);
1132 mutex_exit(&pptdev_mtx
);
1136 ppt_map_mmio(struct vm
*vm
, int pptfd
, vm_paddr_t gpa
, size_t len
,
1142 if ((len
& PAGEOFFSET
) != 0 || len
== 0 || (gpa
& PAGEOFFSET
) != 0 ||
1143 (hpa
& PAGEOFFSET
) != 0 || gpa
+ len
< gpa
|| hpa
+ len
< hpa
) {
1147 mutex_enter(&pptdev_mtx
);
1148 err
= ppt_findf(vm
, pptfd
, &ppt
);
1150 mutex_exit(&pptdev_mtx
);
1155 * Ensure that the host-physical range of the requested mapping fits
1156 * within one of the MMIO BARs of the device.
1158 if (!ppt_bar_verify_mmio(ppt
, hpa
, len
)) {
1163 for (uint_t i
= 0; i
< MAX_MMIOSEGS
; i
++) {
1164 struct pptseg
*seg
= &ppt
->mmio
[i
];
1166 if (seg
->len
== 0) {
1167 err
= vm_map_mmio(vm
, gpa
, len
, hpa
);
1179 mutex_exit(&pptdev_mtx
);
1184 ppt_unmap_mmio(struct vm
*vm
, int pptfd
, vm_paddr_t gpa
, size_t len
)
1190 mutex_enter(&pptdev_mtx
);
1191 err
= ppt_findf(vm
, pptfd
, &ppt
);
1193 mutex_exit(&pptdev_mtx
);
1197 for (i
= 0; i
< MAX_MMIOSEGS
; i
++) {
1198 struct pptseg
*seg
= &ppt
->mmio
[i
];
1200 if (seg
->gpa
== gpa
&& seg
->len
== len
) {
1201 err
= vm_unmap_mmio(vm
, seg
->gpa
, seg
->len
);
1212 mutex_exit(&pptdev_mtx
);
1217 pptintr(caddr_t arg
, caddr_t unused
)
1219 struct pptintr_arg
*pptarg
= (struct pptintr_arg
*)arg
;
1220 struct pptdev
*ppt
= pptarg
->pptdev
;
1222 if (ppt
->vm
!= NULL
) {
1223 lapic_intr_msi(ppt
->vm
, pptarg
->addr
, pptarg
->msg_data
);
1227 * This is not expected to happen - panic?
1232 * For legacy interrupts give other filters a chance in case
1233 * the interrupt was not generated by the passthrough device.
1235 return (ppt
->msi
.is_fixed
? DDI_INTR_UNCLAIMED
: DDI_INTR_CLAIMED
);
1239 ppt_setup_msi(struct vm
*vm
, int vcpu
, int pptfd
, uint64_t addr
, uint64_t msg
,
1242 int i
, msi_count
, intr_type
;
1246 if (numvec
< 0 || numvec
> MAX_MSIMSGS
)
1249 mutex_enter(&pptdev_mtx
);
1250 err
= ppt_findf(vm
, pptfd
, &ppt
);
1252 mutex_exit(&pptdev_mtx
);
1256 /* Reject attempts to enable MSI while MSI-X is active. */
1257 if (ppt
->msix
.num_msgs
!= 0 && numvec
!= 0) {
1262 /* Free any allocated resources */
1263 ppt_teardown_msi(ppt
);
1266 /* nothing more to do */
1270 if (ddi_intr_get_navail(ppt
->pptd_dip
, DDI_INTR_TYPE_MSI
,
1271 &msi_count
) != DDI_SUCCESS
) {
1272 if (ddi_intr_get_navail(ppt
->pptd_dip
, DDI_INTR_TYPE_FIXED
,
1273 &msi_count
) != DDI_SUCCESS
) {
1278 intr_type
= DDI_INTR_TYPE_FIXED
;
1279 ppt
->msi
.is_fixed
= B_TRUE
;
1281 intr_type
= DDI_INTR_TYPE_MSI
;
1285 * The device must be capable of supporting the number of vectors
1286 * the guest wants to allocate.
1288 if (numvec
> msi_count
) {
1293 ppt
->msi
.inth_sz
= numvec
* sizeof (ddi_intr_handle_t
);
1294 ppt
->msi
.inth
= kmem_zalloc(ppt
->msi
.inth_sz
, KM_SLEEP
);
1295 if (ddi_intr_alloc(ppt
->pptd_dip
, ppt
->msi
.inth
, intr_type
, 0,
1296 numvec
, &msi_count
, 0) != DDI_SUCCESS
) {
1297 kmem_free(ppt
->msi
.inth
, ppt
->msi
.inth_sz
);
1302 /* Verify that we got as many vectors as the guest requested */
1303 if (numvec
!= msi_count
) {
1304 ppt_teardown_msi(ppt
);
1309 /* Set up & enable interrupt handler for each vector. */
1310 for (i
= 0; i
< numvec
; i
++) {
1311 int res
, intr_cap
= 0;
1313 ppt
->msi
.num_msgs
= i
+ 1;
1314 ppt
->msi
.arg
[i
].pptdev
= ppt
;
1315 ppt
->msi
.arg
[i
].addr
= addr
;
1316 ppt
->msi
.arg
[i
].msg_data
= msg
+ i
;
1318 if (ddi_intr_add_handler(ppt
->msi
.inth
[i
], pptintr
,
1319 &ppt
->msi
.arg
[i
], NULL
) != DDI_SUCCESS
)
1322 (void) ddi_intr_get_cap(ppt
->msi
.inth
[i
], &intr_cap
);
1323 if (intr_cap
& DDI_INTR_FLAG_BLOCK
)
1324 res
= ddi_intr_block_enable(&ppt
->msi
.inth
[i
], 1);
1326 res
= ddi_intr_enable(ppt
->msi
.inth
[i
]);
1328 if (res
!= DDI_SUCCESS
)
1332 ppt_teardown_msi(ppt
);
1338 mutex_exit(&pptdev_mtx
);
1343 ppt_setup_msix(struct vm
*vm
, int vcpu
, int pptfd
, int idx
, uint64_t addr
,
1344 uint64_t msg
, uint32_t vector_control
)
1347 int numvec
, alloced
;
1350 mutex_enter(&pptdev_mtx
);
1351 err
= ppt_findf(vm
, pptfd
, &ppt
);
1353 mutex_exit(&pptdev_mtx
);
1357 /* Reject attempts to enable MSI-X while MSI is active. */
1358 if (ppt
->msi
.num_msgs
!= 0) {
1364 * First-time configuration:
1365 * Allocate the MSI-X table
1366 * Allocate the IRQ resources
1367 * Set up some variables in ppt->msix
1369 if (ppt
->msix
.num_msgs
== 0) {
1370 dev_info_t
*dip
= ppt
->pptd_dip
;
1372 if (ddi_intr_get_navail(dip
, DDI_INTR_TYPE_MSIX
,
1373 &numvec
) != DDI_SUCCESS
) {
1378 ppt
->msix
.num_msgs
= numvec
;
1380 ppt
->msix
.arg_sz
= numvec
* sizeof (ppt
->msix
.arg
[0]);
1381 ppt
->msix
.arg
= kmem_zalloc(ppt
->msix
.arg_sz
, KM_SLEEP
);
1382 ppt
->msix
.inth_sz
= numvec
* sizeof (ddi_intr_handle_t
);
1383 ppt
->msix
.inth
= kmem_zalloc(ppt
->msix
.inth_sz
, KM_SLEEP
);
1385 if (ddi_intr_alloc(dip
, ppt
->msix
.inth
, DDI_INTR_TYPE_MSIX
, 0,
1386 numvec
, &alloced
, 0) != DDI_SUCCESS
) {
1387 kmem_free(ppt
->msix
.arg
, ppt
->msix
.arg_sz
);
1388 kmem_free(ppt
->msix
.inth
, ppt
->msix
.inth_sz
);
1389 ppt
->msix
.arg
= NULL
;
1390 ppt
->msix
.inth
= NULL
;
1391 ppt
->msix
.arg_sz
= ppt
->msix
.inth_sz
= 0;
1396 if (numvec
!= alloced
) {
1397 ppt_teardown_msix(ppt
);
1403 if (idx
>= ppt
->msix
.num_msgs
) {
1408 if ((vector_control
& PCIM_MSIX_VCTRL_MASK
) == 0) {
1411 /* Tear down the IRQ if it's already set up */
1412 ppt_teardown_msix_intr(ppt
, idx
);
1414 ppt
->msix
.arg
[idx
].pptdev
= ppt
;
1415 ppt
->msix
.arg
[idx
].addr
= addr
;
1416 ppt
->msix
.arg
[idx
].msg_data
= msg
;
1418 /* Setup the MSI-X interrupt */
1419 if (ddi_intr_add_handler(ppt
->msix
.inth
[idx
], pptintr
,
1420 &ppt
->msix
.arg
[idx
], NULL
) != DDI_SUCCESS
) {
1425 (void) ddi_intr_get_cap(ppt
->msix
.inth
[idx
], &intr_cap
);
1426 if (intr_cap
& DDI_INTR_FLAG_BLOCK
)
1427 res
= ddi_intr_block_enable(&ppt
->msix
.inth
[idx
], 1);
1429 res
= ddi_intr_enable(ppt
->msix
.inth
[idx
]);
1431 if (res
!= DDI_SUCCESS
) {
1432 ddi_intr_remove_handler(ppt
->msix
.inth
[idx
]);
1437 /* Masked, tear it down if it's already been set up */
1438 ppt_teardown_msix_intr(ppt
, idx
);
1443 mutex_exit(&pptdev_mtx
);
1448 ppt_get_limits(struct vm
*vm
, int pptfd
, int *msilimit
, int *msixlimit
)
1453 mutex_enter(&pptdev_mtx
);
1454 err
= ppt_findf(vm
, pptfd
, &ppt
);
1456 mutex_exit(&pptdev_mtx
);
1460 if (ddi_intr_get_navail(ppt
->pptd_dip
, DDI_INTR_TYPE_MSI
,
1461 msilimit
) != DDI_SUCCESS
) {
1464 if (ddi_intr_get_navail(ppt
->pptd_dip
, DDI_INTR_TYPE_MSIX
,
1465 msixlimit
) != DDI_SUCCESS
) {
1470 mutex_exit(&pptdev_mtx
);
1475 ppt_disable_msix(struct vm
*vm
, int pptfd
)
1480 mutex_enter(&pptdev_mtx
);
1481 err
= ppt_findf(vm
, pptfd
, &ppt
);
1483 mutex_exit(&pptdev_mtx
);
1487 ppt_teardown_msix(ppt
);
1490 mutex_exit(&pptdev_mtx
);