/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */
/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
 */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/pciio.h>
#include <sys/sysctl.h>

#include <dev/pci/pcivar.h>
#include <dev/pci/pcireg.h>

#include <machine/vmm.h>
#include <machine/vmm_dev.h>

#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/stat.h>
#include <sys/sunddi.h>
#include <sys/pci.h>
#include <sys/pci_cap.h>
#include <sys/pcie_impl.h>
#include <sys/ppt_dev.h>
#include <sys/mkdev.h>
#include <sys/sysmacros.h>

#include "vmm_lapic.h"

#include "iommu.h"
#include "ppt.h"
#define	MAX_MSIMSGS	32

/*
 * If the MSI-X table is located in the middle of a BAR then that MMIO
 * region gets split into two segments - one segment above the MSI-X table
 * and the other segment below the MSI-X table - with a hole in place of
 * the MSI-X table so accesses to it can be trapped and emulated.
 *
 * So, allocate a MMIO segment for each BAR register + 1 additional segment.
 */
#define	MAX_MMIOSEGS	((PCIR_MAX_BAR_0 + 1) + 1)
struct pptintr_arg {
	struct pptdev	*pptdev;
	uint64_t	addr;
	uint64_t	msg_data;
};

struct pptseg {
	vm_paddr_t	gpa;
	size_t		len;
	int		wired;
};

struct pptbar {
	uint64_t	base;
	uint64_t	size;
	uint_t		type;
	ddi_acc_handle_t io_handle;
	caddr_t		io_ptr;
	uint_t		ddireg;
};

struct pptdev {
	dev_info_t		*pptd_dip;
	list_node_t		pptd_node;
	ddi_acc_handle_t	pptd_cfg;
	struct pptbar		pptd_bars[PCI_BASE_NUM];
	struct vm		*vm;
	struct pptseg		mmio[MAX_MMIOSEGS];
	struct {
		int		num_msgs;	/* guest state */
		boolean_t	is_fixed;
		size_t		inth_sz;
		ddi_intr_handle_t *inth;
		struct pptintr_arg arg[MAX_MSIMSGS];
	} msi;

	struct {
		int		num_msgs;
		size_t		inth_sz;
		size_t		arg_sz;
		ddi_intr_handle_t *inth;
		struct pptintr_arg *arg;
	} msix;
};
static major_t		ppt_major;
static void		*ppt_state;
static kmutex_t		pptdev_mtx;
static list_t		pptdev_list;

#define	PPT_MINOR_NAME	"ppt"

static ddi_device_acc_attr_t ppt_attr = {
	DDI_DEVICE_ATTR_V0,
	DDI_NEVERSWAP_ACC,
	DDI_STORECACHING_OK_ACC,
	DDI_DEFAULT_ACC
};
static int
ppt_open(dev_t *devp, int flag, int otyp, cred_t *cr)
{
	/* XXX: require extra privs? */
	return (0);
}
#define	BAR_TO_IDX(bar)	(((bar) - PCI_CONF_BASE0) / PCI_BAR_SZ_32)
#define	BAR_VALID(b)	(		\
	(b) >= PCI_CONF_BASE0 &&	\
	(b) <= PCI_CONF_BASE5 &&	\
	((b) & (PCI_BAR_SZ_32-1)) == 0)
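
/*
 * Handle config-space and I/O-space BAR access requests made by the
 * userspace consumer of a passthrough device.  Only 1-, 2-, and 4-byte
 * access widths are accepted; offset validation is left to the underlying
 * pci_config_*() and ddi_get/put routines.
 */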
static int
ppt_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
	minor_t minor = getminor(dev);
	struct pptdev *ppt;
	void *data = (void *)arg;

	if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL) {
		return (ENOENT);
	}

	switch (cmd) {
	case PPT_CFG_READ: {
		struct ppt_cfg_io cio;
		ddi_acc_handle_t cfg = ppt->pptd_cfg;

		if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) {
			return (EFAULT);
		}
		switch (cio.pci_width) {
		case 4:
			cio.pci_data = pci_config_get32(cfg, cio.pci_off);
			break;
		case 2:
			cio.pci_data = pci_config_get16(cfg, cio.pci_off);
			break;
		case 1:
			cio.pci_data = pci_config_get8(cfg, cio.pci_off);
			break;
		default:
			return (EINVAL);
		}

		if (ddi_copyout(&cio, data, sizeof (cio), md) != 0) {
			return (EFAULT);
		}
		return (0);
	}
	case PPT_CFG_WRITE: {
		struct ppt_cfg_io cio;
		ddi_acc_handle_t cfg = ppt->pptd_cfg;

		if (ddi_copyin(data, &cio, sizeof (cio), md) != 0) {
			return (EFAULT);
		}
		switch (cio.pci_width) {
		case 4:
			pci_config_put32(cfg, cio.pci_off, cio.pci_data);
			break;
		case 2:
			pci_config_put16(cfg, cio.pci_off, cio.pci_data);
			break;
		case 1:
			pci_config_put8(cfg, cio.pci_off, cio.pci_data);
			break;
		default:
			return (EINVAL);
		}

		return (0);
	}
	case PPT_BAR_QUERY: {
		struct ppt_bar_query barg;
		struct pptbar *pbar;

		if (ddi_copyin(data, &barg, sizeof (barg), md) != 0) {
			return (EFAULT);
		}
		if (barg.pbq_baridx >= PCI_BASE_NUM) {
			return (EINVAL);
		}
		pbar = &ppt->pptd_bars[barg.pbq_baridx];

		if (pbar->base == 0 || pbar->size == 0) {
			return (ENOENT);
		}
		barg.pbq_type = pbar->type;
		barg.pbq_base = pbar->base;
		barg.pbq_size = pbar->size;

		if (ddi_copyout(&barg, data, sizeof (barg), md) != 0) {
			return (EFAULT);
		}
		return (0);
	}
	case PPT_BAR_READ: {
		struct ppt_bar_io bio;
		struct pptbar *pbar;
		void *addr;
		uint_t rnum;

		if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) {
			return (EFAULT);
		}
		rnum = bio.pbi_bar;
		if (rnum >= PCI_BASE_NUM) {
			return (EINVAL);
		}
		pbar = &ppt->pptd_bars[rnum];
		if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) {
			return (EINVAL);
		}
		addr = pbar->io_ptr + bio.pbi_off;

		switch (bio.pbi_width) {
		case 4:
			bio.pbi_data = ddi_get32(pbar->io_handle, addr);
			break;
		case 2:
			bio.pbi_data = ddi_get16(pbar->io_handle, addr);
			break;
		case 1:
			bio.pbi_data = ddi_get8(pbar->io_handle, addr);
			break;
		default:
			return (EINVAL);
		}

		if (ddi_copyout(&bio, data, sizeof (bio), md) != 0) {
			return (EFAULT);
		}
		return (0);
	}
	case PPT_BAR_WRITE: {
		struct ppt_bar_io bio;
		struct pptbar *pbar;
		void *addr;
		uint_t rnum;

		if (ddi_copyin(data, &bio, sizeof (bio), md) != 0) {
			return (EFAULT);
		}
		rnum = bio.pbi_bar;
		if (rnum >= PCI_BASE_NUM) {
			return (EINVAL);
		}
		pbar = &ppt->pptd_bars[rnum];
		if (pbar->type != PCI_ADDR_IO || pbar->io_handle == NULL) {
			return (EINVAL);
		}
		addr = pbar->io_ptr + bio.pbi_off;

		switch (bio.pbi_width) {
		case 4:
			ddi_put32(pbar->io_handle, addr, bio.pbi_data);
			break;
		case 2:
			ddi_put16(pbar->io_handle, addr, bio.pbi_data);
			break;
		case 1:
			ddi_put8(pbar->io_handle, addr, bio.pbi_data);
			break;
		default:
			return (EINVAL);
		}

		return (0);
	}

	default:
		return (ENOTTY);
	}

	return (0);
}
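
/*
 * Locate the BAR holding the MSI-X table, returning its index (the BIR from
 * the MSI-X capability) or -1 if the device lacks an MSI-X capability.
 */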
static int
ppt_find_msix_table_bar(struct pptdev *ppt)
{
	uint16_t base;
	uint32_t off;

	if (PCI_CAP_LOCATE(ppt->pptd_cfg, PCI_CAP_ID_MSI_X, &base) !=
	    DDI_SUCCESS)
		return (-1);

	off = pci_config_get32(ppt->pptd_cfg, base + PCI_MSIX_TBL_OFFSET);

	if (off == PCI_EINVAL32)
		return (-1);

	return (off & PCI_MSIX_TBL_BIR_MASK);
}
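
/*
 * Only the BAR containing the MSI-X table may be mapped via devmap(9E),
 * allowing the rest of the device MMIO to be passed through while accesses
 * to the table itself are trapped and emulated.
 */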
static int
ppt_devmap(dev_t dev, devmap_cookie_t dhp, offset_t off, size_t len,
    size_t *maplen, uint_t model)
{
	minor_t minor;
	struct pptdev *ppt;
	int err, bar;
	uint_t ddireg;

	minor = getminor(dev);

	if ((ppt = ddi_get_soft_state(ppt_state, minor)) == NULL)
		return (ENXIO);

#ifdef _MULTI_DATAMODEL
	if (ddi_model_convert_from(model) != DDI_MODEL_NONE)
		return (ENXIO);
#endif

	if (off < 0 || off != P2ALIGN(off, PAGESIZE))
		return (EINVAL);

	if ((bar = ppt_find_msix_table_bar(ppt)) == -1)
		return (EINVAL);

	ddireg = ppt->pptd_bars[bar].ddireg;

	if (ddireg == 0)
		return (EINVAL);

	err = devmap_devmem_setup(dhp, ppt->pptd_dip, NULL, ddireg, off, len,
	    PROT_USER | PROT_READ | PROT_WRITE, IOMEM_DATA_CACHED, &ppt_attr);

	if (err == DDI_SUCCESS)
		*maplen = len;

	return (err);
}
static void
ppt_bar_wipe(struct pptdev *ppt)
{
	uint_t i;

	for (i = 0; i < PCI_BASE_NUM; i++) {
		struct pptbar *pbar = &ppt->pptd_bars[i];
		if (pbar->type == PCI_ADDR_IO && pbar->io_handle != NULL) {
			ddi_regs_map_free(&pbar->io_handle);
		}
	}

	bzero(&ppt->pptd_bars, sizeof (ppt->pptd_bars));
}
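
/*
 * Walk the "assigned-addresses" property to populate pptd_bars[] with the
 * type, base, and size of each BAR, mapping any I/O-space BARs so they can
 * be accessed on behalf of the guest.
 */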
static int
ppt_bar_crawl(struct pptdev *ppt)
{
	pci_regspec_t *regs;
	uint_t rcount, i;
	int err = 0, rlen;

	if (ddi_getlongprop(DDI_DEV_T_ANY, ppt->pptd_dip, DDI_PROP_DONTPASS,
	    "assigned-addresses", (caddr_t)&regs, &rlen) != DDI_PROP_SUCCESS) {
		return (EIO);
	}

	VERIFY3S(rlen, >, 0);
	rcount = rlen / sizeof (pci_regspec_t);
	for (i = 0; i < rcount; i++) {
		pci_regspec_t *reg = &regs[i];
		struct pptbar *pbar;
		uint_t bar, rnum;

		DTRACE_PROBE1(ppt__crawl__reg, pci_regspec_t *, reg);
		bar = PCI_REG_REG_G(reg->pci_phys_hi);
		if (!BAR_VALID(bar)) {
			continue;
		}

		rnum = BAR_TO_IDX(bar);
		pbar = &ppt->pptd_bars[rnum];
		/* is this somehow already populated? */
		if (pbar->base != 0 || pbar->size != 0) {
			err = EEXIST;
			break;
		}

		/*
		 * Register 0 corresponds to the PCI config space.
		 * The registers which match the assigned-addresses list are
		 * offset by 1.
		 */
		pbar->ddireg = i + 1;

		pbar->type = reg->pci_phys_hi & PCI_ADDR_MASK;
		pbar->base = ((uint64_t)reg->pci_phys_mid << 32) |
		    (uint64_t)reg->pci_phys_low;
		pbar->size = ((uint64_t)reg->pci_size_hi << 32) |
		    (uint64_t)reg->pci_size_low;
		if (pbar->type == PCI_ADDR_IO) {
			err = ddi_regs_map_setup(ppt->pptd_dip, rnum,
			    &pbar->io_ptr, 0, 0, &ppt_attr, &pbar->io_handle);
			if (err != 0) {
				break;
			}
		}
	}
	kmem_free(regs, rlen);

	if (err != 0) {
		ppt_bar_wipe(ppt);
	}
	return (err);
}
static boolean_t
ppt_bar_verify_mmio(struct pptdev *ppt, uint64_t base, uint64_t size)
{
	const uint64_t map_end = base + size;

	/* Zero-length or overflow mappings are not valid */
	if (map_end <= base) {
		return (B_FALSE);
	}
	/* MMIO bounds should be page-aligned */
	if ((base & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) {
		return (B_FALSE);
	}

	for (uint_t i = 0; i < PCI_BASE_NUM; i++) {
		const struct pptbar *bar = &ppt->pptd_bars[i];
		const uint64_t bar_end = bar->base + bar->size;

		/* Only memory BARs can be mapped */
		if (bar->type != PCI_ADDR_MEM32 &&
		    bar->type != PCI_ADDR_MEM64) {
			continue;
		}

		/* Does the mapping fit within this BAR? */
		if (base < bar->base || base >= bar_end ||
		    map_end < bar->base || map_end > bar_end) {
			continue;
		}

		/* This BAR satisfies the provided map */
		return (B_TRUE);
	}
	return (B_FALSE);
}
static int
ppt_ddi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	struct pptdev *ppt = NULL;
	int inst;

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	inst = ddi_get_instance(dip);

	if (ddi_soft_state_zalloc(ppt_state, inst) != DDI_SUCCESS) {
		goto fail;
	}
	VERIFY(ppt = ddi_get_soft_state(ppt_state, inst));
	ppt->pptd_dip = dip;
	ddi_set_driver_private(dip, ppt);

	if (pci_config_setup(dip, &ppt->pptd_cfg) != DDI_SUCCESS) {
		goto fail;
	}
	if (ppt_bar_crawl(ppt) != 0) {
		goto fail;
	}
	if (ddi_create_minor_node(dip, PPT_MINOR_NAME, S_IFCHR, inst,
	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
		goto fail;
	}

	mutex_enter(&pptdev_mtx);
	list_insert_tail(&pptdev_list, ppt);
	mutex_exit(&pptdev_mtx);

	return (DDI_SUCCESS);

fail:
	if (ppt != NULL) {
		ddi_remove_minor_node(dip, NULL);
		if (ppt->pptd_cfg != NULL) {
			pci_config_teardown(&ppt->pptd_cfg);
		}
		ppt_bar_wipe(ppt);
		ddi_soft_state_free(ppt_state, inst);
	}
	return (DDI_FAILURE);
}
static int
ppt_ddi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	struct pptdev *ppt;
	int inst;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ppt = ddi_get_driver_private(dip);
	inst = ddi_get_instance(dip);

	ASSERT3P(ddi_get_soft_state(ppt_state, inst), ==, ppt);

	mutex_enter(&pptdev_mtx);
	if (ppt->vm != NULL) {
		mutex_exit(&pptdev_mtx);
		return (DDI_FAILURE);
	}
	list_remove(&pptdev_list, ppt);
	mutex_exit(&pptdev_mtx);

	ddi_remove_minor_node(dip, PPT_MINOR_NAME);
	ppt_bar_wipe(ppt);
	pci_config_teardown(&ppt->pptd_cfg);
	ddi_set_driver_private(dip, NULL);
	ddi_soft_state_free(ppt_state, inst);

	return (DDI_SUCCESS);
}
static int
ppt_ddi_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
{
	int error = DDI_FAILURE;
	int inst = getminor((dev_t)arg);

	switch (cmd) {
	case DDI_INFO_DEVT2DEVINFO: {
		struct pptdev *ppt = ddi_get_soft_state(ppt_state, inst);

		if (ppt != NULL) {
			*result = (void *)ppt->pptd_dip;
			error = DDI_SUCCESS;
		}
		break;
	}
	case DDI_INFO_DEVT2INSTANCE: {
		*result = (void *)(uintptr_t)inst;
		error = DDI_SUCCESS;
		break;
	}
	default:
		break;
	}
	return (error);
}
static struct cb_ops ppt_cb_ops = {
	ppt_open,
	nulldev,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	ppt_ioctl,
	ppt_devmap,	/* devmap */
	NULL,		/* mmap */
	NULL,		/* segmap */
	nochpoll,	/* poll */
	ddi_prop_op,
	NULL,
	D_NEW | D_MP | D_64BIT | D_DEVMAP,
	CB_REV
};

static struct dev_ops ppt_ops = {
	DEVO_REV,
	0,
	ppt_ddi_info,
	nulldev,	/* identify */
	nulldev,	/* probe */
	ppt_ddi_attach,
	ppt_ddi_detach,
	nodev,		/* reset */
	&ppt_cb_ops,
	(struct bus_ops *)NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"bhyve pci pass-thru",
	&ppt_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};
int
_init(void)
{
	int error;

	mutex_init(&pptdev_mtx, NULL, MUTEX_DRIVER, NULL);
	list_create(&pptdev_list, sizeof (struct pptdev),
	    offsetof(struct pptdev, pptd_node));

	error = ddi_soft_state_init(&ppt_state, sizeof (struct pptdev), 0);
	if (error) {
		goto fail;
	}

	error = mod_install(&modlinkage);

	ppt_major = ddi_name_to_major("ppt");
fail:
	if (error) {
		ddi_soft_state_fini(&ppt_state);
	}
	return (error);
}
int
_fini(void)
{
	int error;

	error = mod_remove(&modlinkage);
	if (error)
		return (error);
	ddi_soft_state_fini(&ppt_state);

	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
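
/*
 * Poll the PCIe Device Status register until no transactions are pending,
 * giving up once max_delay_us has elapsed.
 */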
static boolean_t
ppt_wait_for_pending_txn(dev_info_t *dip, uint_t max_delay_us)
{
	uint16_t cap_ptr, devsts;
	ddi_acc_handle_t hdl;

	if (pci_config_setup(dip, &hdl) != DDI_SUCCESS)
		return (B_FALSE);

	if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS) {
		pci_config_teardown(&hdl);
		return (B_FALSE);
	}

	devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS);
	while ((devsts & PCIE_DEVSTS_TRANS_PENDING) != 0) {
		if (max_delay_us == 0) {
			pci_config_teardown(&hdl);
			return (B_FALSE);
		}

		/* Poll once every 100 milliseconds up to the timeout. */
		if (max_delay_us > 100000) {
			delay(drv_usectohz(100000));
			max_delay_us -= 100000;
		} else {
			delay(drv_usectohz(max_delay_us));
			max_delay_us = 0;
		}
		devsts = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVSTS);
	}

	pci_config_teardown(&hdl);
	return (B_TRUE);
}
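
/*
 * Determine the device's maximum completion timeout, in microseconds, from
 * the Completion Timeout Value encoding in the PCIe Device Control 2
 * register, falling back to the 50ms default whenever it cannot be read.
 */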
static uint_t
ppt_max_completion_tmo_us(dev_info_t *dip)
{
	uint_t timo = 0;
	uint16_t cap_ptr;
	ddi_acc_handle_t hdl;
	uint_t timo_ranges[] = {	/* timeout ranges */
		50000,		/* 50ms */
		100,		/* 100us */
		10000,		/* 10ms */
		0,		/* reserved */
		0,		/* reserved */
		55000,		/* 55ms */
		210000,		/* 210ms */
		0,		/* reserved */
		0,		/* reserved */
		900000,		/* 900ms */
		3500000,	/* 3.5s */
		0,		/* reserved */
		0,		/* reserved */
		13000000,	/* 13s */
		64000000,	/* 64s */
		0		/* reserved */
	};

	if (pci_config_setup(dip, &hdl) != DDI_SUCCESS)
		return (50000); /* default 50ms */

	if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS)
		goto out;

	if ((PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_PCIECAP) &
	    PCIE_PCIECAP_VER_MASK) < PCIE_PCIECAP_VER_2_0)
		goto out;

	if ((PCI_CAP_GET32(hdl, 0, cap_ptr, PCIE_DEVCAP2) &
	    PCIE_DEVCTL2_COM_TO_RANGE_MASK) == 0)
		goto out;

	timo = timo_ranges[PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL2) &
	    PCIE_DEVCAP2_COM_TO_RANGE_MASK];

out:
	if (timo == 0)
		timo = 50000; /* default 50ms */

	pci_config_teardown(&hdl);
	return (timo);
}
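
/*
 * Perform a Function Level Reset on the device: quiesce outstanding
 * transactions (or, when 'force' is set, proceed despite them), initiate
 * the FLR, and wait out the mandatory post-reset delay.
 */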
static boolean_t
ppt_flr(dev_info_t *dip, boolean_t force)
{
	uint16_t cap_ptr, ctl, cmd;
	ddi_acc_handle_t hdl;
	uint_t compl_delay = 0, max_delay_us;

	if (pci_config_setup(dip, &hdl) != DDI_SUCCESS)
		return (B_FALSE);

	if (PCI_CAP_LOCATE(hdl, PCI_CAP_ID_PCI_E, &cap_ptr) != DDI_SUCCESS)
		goto fail;

	if ((PCI_CAP_GET32(hdl, 0, cap_ptr, PCIE_DEVCAP) & PCIE_DEVCAP_FLR)
	    == 0)
		goto fail;

	max_delay_us = MAX(ppt_max_completion_tmo_us(dip), 10000);

	/*
	 * Disable busmastering to prevent generation of new transactions
	 * while waiting for the device to go idle.  If the idle timeout
	 * fails, the command register is restored which will re-enable
	 * busmastering.
	 */
	cmd = pci_config_get16(hdl, PCI_CONF_COMM);
	pci_config_put16(hdl, PCI_CONF_COMM, cmd & ~PCI_COMM_ME);
	if (!ppt_wait_for_pending_txn(dip, max_delay_us)) {
		if (!force) {
			pci_config_put16(hdl, PCI_CONF_COMM, cmd);
			goto fail;
		}
		dev_err(dip, CE_WARN,
		    "?Resetting with transactions pending after %u us\n",
		    max_delay_us);

		/*
		 * Extend the post-FLR delay to cover the maximum Completion
		 * Timeout delay of anything in flight during the FLR delay.
		 * Enforce a minimum delay of at least 10ms.
		 */
		compl_delay = MAX(10, (ppt_max_completion_tmo_us(dip) / 1000));
	}

	/* Initiate the reset. */
	ctl = PCI_CAP_GET16(hdl, 0, cap_ptr, PCIE_DEVCTL);
	(void) PCI_CAP_PUT16(hdl, 0, cap_ptr, PCIE_DEVCTL,
	    ctl | PCIE_DEVCTL_INITIATE_FLR);

	/* Wait for at least 100ms */
	delay(drv_usectohz((100 + compl_delay) * 1000));

	pci_config_teardown(&hdl);
	return (B_TRUE);

fail:
	/*
	 * TODO: If the FLR fails for some reason, we should attempt a reset
	 * using the PCI power management facilities (if possible).
	 */
	pci_config_teardown(&hdl);
	return (B_FALSE);
}
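
/*
 * Look up a passthrough device by the file descriptor handed in from
 * userspace, verifying that it refers to a ppt minor owned by the given VM
 * (or unowned, when vm is NULL).  On success the file reference is held;
 * the caller must drop it with releasef().
 */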
static int
ppt_findf(struct vm *vm, int fd, struct pptdev **pptp)
{
	struct pptdev *ppt = NULL;
	file_t *fp;
	vattr_t va;
	int err = 0;

	ASSERT(MUTEX_HELD(&pptdev_mtx));

	if ((fp = getf(fd)) == NULL)
		return (EBADF);

	va.va_mask = AT_RDEV;
	if (VOP_GETATTR(fp->f_vnode, &va, NO_FOLLOW, fp->f_cred, NULL) != 0 ||
	    getmajor(va.va_rdev) != ppt_major) {
		err = EBADF;
		goto fail;
	}

	ppt = ddi_get_soft_state(ppt_state, getminor(va.va_rdev));

	if (ppt == NULL) {
		err = EBADF;
		goto fail;
	}

	if (ppt->vm != vm) {
		err = EBUSY;
		goto fail;
	}

	*pptp = ppt;
	return (0);

fail:
	releasef(fd);
	return (err);
}
static void
ppt_unmap_all_mmio(struct vm *vm, struct pptdev *ppt)
{
	int i;
	struct pptseg *seg;

	for (i = 0; i < MAX_MMIOSEGS; i++) {
		seg = &ppt->mmio[i];
		if (seg->len == 0)
			continue;
		(void) vm_unmap_mmio(vm, seg->gpa, seg->len);
		bzero(seg, sizeof (struct pptseg));
	}
}
static void
ppt_teardown_msi(struct pptdev *ppt)
{
	int i;

	if (ppt->msi.num_msgs == 0)
		return;

	for (i = 0; i < ppt->msi.num_msgs; i++) {
		int intr_cap;

		(void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap);
		if (intr_cap & DDI_INTR_FLAG_BLOCK)
			ddi_intr_block_disable(&ppt->msi.inth[i], 1);
		else
			ddi_intr_disable(ppt->msi.inth[i]);

		ddi_intr_remove_handler(ppt->msi.inth[i]);
		ddi_intr_free(ppt->msi.inth[i]);

		ppt->msi.inth[i] = NULL;
	}

	kmem_free(ppt->msi.inth, ppt->msi.inth_sz);
	ppt->msi.inth = NULL;
	ppt->msi.inth_sz = 0;
	ppt->msi.is_fixed = B_FALSE;

	ppt->msi.num_msgs = 0;
}
static void
ppt_teardown_msix_intr(struct pptdev *ppt, int idx)
{
	if (ppt->msix.inth != NULL && ppt->msix.inth[idx] != NULL) {
		int intr_cap;

		(void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap);
		if (intr_cap & DDI_INTR_FLAG_BLOCK)
			ddi_intr_block_disable(&ppt->msix.inth[idx], 1);
		else
			ddi_intr_disable(ppt->msix.inth[idx]);

		ddi_intr_remove_handler(ppt->msix.inth[idx]);
	}
}
static void
ppt_teardown_msix(struct pptdev *ppt)
{
	uint_t i;

	if (ppt->msix.num_msgs == 0)
		return;

	for (i = 0; i < ppt->msix.num_msgs; i++)
		ppt_teardown_msix_intr(ppt, i);

	if (ppt->msix.inth) {
		for (i = 0; i < ppt->msix.num_msgs; i++)
			ddi_intr_free(ppt->msix.inth[i]);
		kmem_free(ppt->msix.inth, ppt->msix.inth_sz);
		ppt->msix.inth = NULL;
		ppt->msix.inth_sz = 0;
		kmem_free(ppt->msix.arg, ppt->msix.arg_sz);
		ppt->msix.arg = NULL;
		ppt->msix.arg_sz = 0;
	}

	ppt->msix.num_msgs = 0;
}
uint_t
ppt_assigned_devices(struct vm *vm)
{
	struct pptdev *ppt;
	uint_t num = 0;

	mutex_enter(&pptdev_mtx);
	for (ppt = list_head(&pptdev_list); ppt != NULL;
	    ppt = list_next(&pptdev_list, ppt)) {
		if (ppt->vm == vm) {
			num++;
		}
	}
	mutex_exit(&pptdev_mtx);
	return (num);
}
boolean_t
ppt_is_mmio(struct vm *vm, vm_paddr_t gpa)
{
	struct pptdev *ppt;

	/* XXX: this should probably be restructured to avoid the lock */
	mutex_enter(&pptdev_mtx);
	for (ppt = list_head(&pptdev_list); ppt != NULL;
	    ppt = list_next(&pptdev_list, ppt)) {
		if (ppt->vm != vm) {
			continue;
		}

		for (uint_t i = 0; i < MAX_MMIOSEGS; i++) {
			struct pptseg *seg = &ppt->mmio[i];

			if (seg->len == 0)
				continue;
			if (gpa >= seg->gpa && gpa < seg->gpa + seg->len) {
				mutex_exit(&pptdev_mtx);
				return (B_TRUE);
			}
		}
	}

	mutex_exit(&pptdev_mtx);
	return (B_FALSE);
}
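
/*
 * Assign a passthrough device to a VM: reset it to a clean state, save that
 * state for later restoration, and move it from the host IOMMU domain into
 * the VM's domain.
 */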
int
ppt_assign_device(struct vm *vm, int pptfd)
{
	struct pptdev *ppt;
	int err = 0;

	mutex_enter(&pptdev_mtx);
	/* Passing NULL requires the device to be unowned. */
	err = ppt_findf(NULL, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	if (pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) {
		err = EIO;
		goto done;
	}
	(void) ppt_flr(ppt->pptd_dip, B_TRUE);

	/*
	 * Restore the device state after reset and then perform another save
	 * so the "pristine" state can be restored when the device is removed
	 * from the guest.
	 */
	if (pci_restore_config_regs(ppt->pptd_dip) != DDI_SUCCESS ||
	    pci_save_config_regs(ppt->pptd_dip) != DDI_SUCCESS) {
		err = EIO;
		goto done;
	}

	ppt->vm = vm;
	iommu_remove_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip));
	iommu_add_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip));
	pf_set_passthru(ppt->pptd_dip, B_TRUE);

done:
	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
static void
ppt_reset_pci_power_state(dev_info_t *dip)
{
	ddi_acc_handle_t cfg;
	uint16_t cap_ptr;

	if (pci_config_setup(dip, &cfg) != DDI_SUCCESS)
		return;

	if (PCI_CAP_LOCATE(cfg, PCI_CAP_ID_PM, &cap_ptr) == DDI_SUCCESS) {
		uint16_t val;

		val = PCI_CAP_GET16(cfg, 0, cap_ptr, PCI_PMCSR);
		if ((val & PCI_PMCSR_STATE_MASK) != PCI_PMCSR_D0) {
			val = (val & ~PCI_PMCSR_STATE_MASK) | PCI_PMCSR_D0;
			(void) PCI_CAP_PUT16(cfg, 0, cap_ptr, PCI_PMCSR,
			    val);
		}
	}

	pci_config_teardown(&cfg);
}
static void
ppt_do_unassign(struct pptdev *ppt)
{
	struct vm *vm = ppt->vm;

	ASSERT3P(vm, !=, NULL);
	ASSERT(MUTEX_HELD(&pptdev_mtx));

	(void) ppt_flr(ppt->pptd_dip, B_TRUE);

	/*
	 * Restore from the state saved during device assignment.
	 * If the device power state has been altered, that must be remedied
	 * first, as it will reset register state during the transition.
	 */
	ppt_reset_pci_power_state(ppt->pptd_dip);
	(void) pci_restore_config_regs(ppt->pptd_dip);

	pf_set_passthru(ppt->pptd_dip, B_FALSE);

	ppt_unmap_all_mmio(vm, ppt);
	ppt_teardown_msi(ppt);
	ppt_teardown_msix(ppt);
	iommu_remove_device(vm_iommu_domain(vm), pci_get_bdf(ppt->pptd_dip));
	iommu_add_device(iommu_host_domain(), pci_get_bdf(ppt->pptd_dip));
	ppt->vm = NULL;
}
int
ppt_unassign_device(struct vm *vm, int pptfd)
{
	struct pptdev *ppt;
	int err = 0;

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	ppt_do_unassign(ppt);

	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
void
ppt_unassign_all(struct vm *vm)
{
	struct pptdev *ppt;

	mutex_enter(&pptdev_mtx);
	for (ppt = list_head(&pptdev_list); ppt != NULL;
	    ppt = list_next(&pptdev_list, ppt)) {
		if (ppt->vm == vm) {
			ppt_do_unassign(ppt);
		}
	}
	mutex_exit(&pptdev_mtx);
}
int
ppt_map_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len,
    vm_paddr_t hpa)
{
	struct pptdev *ppt;
	int err = 0;

	if ((len & PAGEOFFSET) != 0 || len == 0 || (gpa & PAGEOFFSET) != 0 ||
	    (hpa & PAGEOFFSET) != 0 || gpa + len < gpa || hpa + len < hpa) {
		return (EINVAL);
	}

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	/*
	 * Ensure that the host-physical range of the requested mapping fits
	 * within one of the MMIO BARs of the device.
	 */
	if (!ppt_bar_verify_mmio(ppt, hpa, len)) {
		err = EINVAL;
		goto done;
	}

	for (uint_t i = 0; i < MAX_MMIOSEGS; i++) {
		struct pptseg *seg = &ppt->mmio[i];

		if (seg->len == 0) {
			err = vm_map_mmio(vm, gpa, len, hpa);
			if (err == 0) {
				seg->gpa = gpa;
				seg->len = len;
			}
			goto done;
		}
	}
	err = ENOSPC;

done:
	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
int
ppt_unmap_mmio(struct vm *vm, int pptfd, vm_paddr_t gpa, size_t len)
{
	struct pptdev *ppt;
	int err = 0;
	uint_t i;

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	for (i = 0; i < MAX_MMIOSEGS; i++) {
		struct pptseg *seg = &ppt->mmio[i];

		if (seg->gpa == gpa && seg->len == len) {
			err = vm_unmap_mmio(vm, seg->gpa, seg->len);
			if (err == 0) {
				seg->gpa = 0;
				seg->len = 0;
			}
			goto out;
		}
	}
	err = ENOENT;
out:
	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
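
/*
 * Interrupt handler for a passthrough device: forward the guest-programmed
 * MSI address/data to the VM's local APIC emulation.
 */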
static uint_t
pptintr(caddr_t arg, caddr_t unused)
{
	struct pptintr_arg *pptarg = (struct pptintr_arg *)arg;
	struct pptdev *ppt = pptarg->pptdev;

	if (ppt->vm != NULL) {
		lapic_intr_msi(ppt->vm, pptarg->addr, pptarg->msg_data);
	} else {
		/*
		 * XXX
		 * This is not expected to happen - panic?
		 */
	}

	/*
	 * For legacy interrupts give other filters a chance in case
	 * the interrupt was not generated by the passthrough device.
	 */
	return (ppt->msi.is_fixed ? DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);
}
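
/*
 * Configure MSI delivery (falling back to fixed interrupts when the device
 * lacks MSI support) by allocating and enabling 'numvec' vectors which
 * inject the guest-programmed address/data via pptintr().  A numvec of 0
 * tears down any existing MSI state.
 */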
int
ppt_setup_msi(struct vm *vm, int vcpu, int pptfd, uint64_t addr, uint64_t msg,
    int numvec)
{
	int i, msi_count, intr_type;
	struct pptdev *ppt;
	int err = 0;

	if (numvec < 0 || numvec > MAX_MSIMSGS)
		return (EINVAL);

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	/* Reject attempts to enable MSI while MSI-X is active. */
	if (ppt->msix.num_msgs != 0 && numvec != 0) {
		err = EBUSY;
		goto done;
	}

	/* Free any allocated resources */
	ppt_teardown_msi(ppt);

	if (numvec == 0) {
		/* nothing more to do */
		goto done;
	}

	if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI,
	    &msi_count) != DDI_SUCCESS) {
		if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_FIXED,
		    &msi_count) != DDI_SUCCESS) {
			err = EINVAL;
			goto done;
		}

		intr_type = DDI_INTR_TYPE_FIXED;
		ppt->msi.is_fixed = B_TRUE;
	} else {
		intr_type = DDI_INTR_TYPE_MSI;
	}

	/*
	 * The device must be capable of supporting the number of vectors
	 * the guest wants to allocate.
	 */
	if (numvec > msi_count) {
		err = EINVAL;
		goto done;
	}

	ppt->msi.inth_sz = numvec * sizeof (ddi_intr_handle_t);
	ppt->msi.inth = kmem_zalloc(ppt->msi.inth_sz, KM_SLEEP);
	if (ddi_intr_alloc(ppt->pptd_dip, ppt->msi.inth, intr_type, 0,
	    numvec, &msi_count, 0) != DDI_SUCCESS) {
		kmem_free(ppt->msi.inth, ppt->msi.inth_sz);
		err = EINVAL;
		goto done;
	}

	/* Verify that we got as many vectors as the guest requested */
	if (numvec != msi_count) {
		ppt_teardown_msi(ppt);
		err = EINVAL;
		goto done;
	}

	/* Set up & enable interrupt handler for each vector. */
	for (i = 0; i < numvec; i++) {
		int res, intr_cap = 0;

		ppt->msi.num_msgs = i + 1;
		ppt->msi.arg[i].pptdev = ppt;
		ppt->msi.arg[i].addr = addr;
		ppt->msi.arg[i].msg_data = msg + i;

		if (ddi_intr_add_handler(ppt->msi.inth[i], pptintr,
		    &ppt->msi.arg[i], NULL) != DDI_SUCCESS)
			break;

		(void) ddi_intr_get_cap(ppt->msi.inth[i], &intr_cap);
		if (intr_cap & DDI_INTR_FLAG_BLOCK)
			res = ddi_intr_block_enable(&ppt->msi.inth[i], 1);
		else
			res = ddi_intr_enable(ppt->msi.inth[i]);

		if (res != DDI_SUCCESS)
			break;
	}
	if (i < numvec) {
		ppt_teardown_msi(ppt);
		err = ENXIO;
	}

done:
	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
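
/*
 * Configure delivery of a single MSI-X vector 'idx', allocating the device's
 * full complement of vectors on first use.  A masked vector_control instead
 * tears the vector down.
 */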
int
ppt_setup_msix(struct vm *vm, int vcpu, int pptfd, int idx, uint64_t addr,
    uint64_t msg, uint32_t vector_control)
{
	struct pptdev *ppt;
	int numvec, alloced;
	int err = 0;

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	/* Reject attempts to enable MSI-X while MSI is active. */
	if (ppt->msi.num_msgs != 0) {
		err = EBUSY;
		goto done;
	}

	/*
	 * First-time configuration:
	 *	Allocate the MSI-X table
	 *	Allocate the IRQ resources
	 *	Set up some variables in ppt->msix
	 */
	if (ppt->msix.num_msgs == 0) {
		dev_info_t *dip = ppt->pptd_dip;

		if (ddi_intr_get_navail(dip, DDI_INTR_TYPE_MSIX,
		    &numvec) != DDI_SUCCESS) {
			err = EINVAL;
			goto done;
		}

		ppt->msix.num_msgs = numvec;

		ppt->msix.arg_sz = numvec * sizeof (ppt->msix.arg[0]);
		ppt->msix.arg = kmem_zalloc(ppt->msix.arg_sz, KM_SLEEP);
		ppt->msix.inth_sz = numvec * sizeof (ddi_intr_handle_t);
		ppt->msix.inth = kmem_zalloc(ppt->msix.inth_sz, KM_SLEEP);

		if (ddi_intr_alloc(dip, ppt->msix.inth, DDI_INTR_TYPE_MSIX, 0,
		    numvec, &alloced, 0) != DDI_SUCCESS) {
			kmem_free(ppt->msix.arg, ppt->msix.arg_sz);
			kmem_free(ppt->msix.inth, ppt->msix.inth_sz);
			ppt->msix.arg = NULL;
			ppt->msix.inth = NULL;
			ppt->msix.arg_sz = ppt->msix.inth_sz = 0;
			ppt->msix.num_msgs = 0;
			err = EINVAL;
			goto done;
		}

		if (numvec != alloced) {
			ppt_teardown_msix(ppt);
			err = EINVAL;
			goto done;
		}
	}

	if (idx >= ppt->msix.num_msgs) {
		err = EINVAL;
		goto done;
	}

	if ((vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
		int intr_cap, res;

		/* Tear down the IRQ if it's already set up */
		ppt_teardown_msix_intr(ppt, idx);

		ppt->msix.arg[idx].pptdev = ppt;
		ppt->msix.arg[idx].addr = addr;
		ppt->msix.arg[idx].msg_data = msg;

		/* Setup the MSI-X interrupt */
		if (ddi_intr_add_handler(ppt->msix.inth[idx], pptintr,
		    &ppt->msix.arg[idx], NULL) != DDI_SUCCESS) {
			err = ENXIO;
			goto done;
		}

		(void) ddi_intr_get_cap(ppt->msix.inth[idx], &intr_cap);
		if (intr_cap & DDI_INTR_FLAG_BLOCK)
			res = ddi_intr_block_enable(&ppt->msix.inth[idx], 1);
		else
			res = ddi_intr_enable(ppt->msix.inth[idx]);

		if (res != DDI_SUCCESS) {
			ddi_intr_remove_handler(ppt->msix.inth[idx]);
			err = ENXIO;
			goto done;
		}
	} else {
		/* Masked, tear it down if it's already been set up */
		ppt_teardown_msix_intr(ppt, idx);
	}

done:
	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
int
ppt_get_limits(struct vm *vm, int pptfd, int *msilimit, int *msixlimit)
{
	struct pptdev *ppt;
	int err = 0;

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSI,
	    msilimit) != DDI_SUCCESS) {
		*msilimit = -1;
	}
	if (ddi_intr_get_navail(ppt->pptd_dip, DDI_INTR_TYPE_MSIX,
	    msixlimit) != DDI_SUCCESS) {
		*msixlimit = -1;
	}

	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}
int
ppt_disable_msix(struct vm *vm, int pptfd)
{
	struct pptdev *ppt;
	int err = 0;

	mutex_enter(&pptdev_mtx);
	err = ppt_findf(vm, pptfd, &ppt);
	if (err != 0) {
		mutex_exit(&pptdev_mtx);
		return (err);
	}

	ppt_teardown_msix(ppt);

	releasef(pptfd);
	mutex_exit(&pptdev_mtx);
	return (err);
}