/*
 * QEMU paravirtual RDMA
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "hw/hw.h"
#include "hw/pci/pci.h"
#include "hw/pci/pci_ids.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#include "cpu.h"
#include "trace.h"

#include "../rdma_rm.h"
#include "../rdma_backend.h"
#include "../rdma_utils.h"

#include <infiniband/verbs.h>
#include "pvrdma.h"
#include "standard-headers/rdma/vmw_pvrdma-abi.h"
#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h"
#include "pvrdma_qp_ops.h"

static Property pvrdma_dev_properties[] = {
    DEFINE_PROP_STRING("netdev", PVRDMADev, backend_eth_device_name),
    DEFINE_PROP_STRING("ibdev", PVRDMADev, backend_device_name),
    DEFINE_PROP_UINT8("ibport", PVRDMADev, backend_port_num, 1),
    DEFINE_PROP_UINT64("dev-caps-max-mr-size", PVRDMADev, dev_attr.max_mr_size,
                       MAX_MR_SIZE),
    DEFINE_PROP_INT32("dev-caps-max-qp", PVRDMADev, dev_attr.max_qp, MAX_QP),
    DEFINE_PROP_INT32("dev-caps-max-sge", PVRDMADev, dev_attr.max_sge,
                      MAX_SGE),
    DEFINE_PROP_INT32("dev-caps-max-cq", PVRDMADev, dev_attr.max_cq, MAX_CQ),
    DEFINE_PROP_INT32("dev-caps-max-mr", PVRDMADev, dev_attr.max_mr, MAX_MR),
    DEFINE_PROP_INT32("dev-caps-max-pd", PVRDMADev, dev_attr.max_pd, MAX_PD),
    DEFINE_PROP_INT32("dev-caps-qp-rd-atom", PVRDMADev,
                      dev_attr.max_qp_rd_atom, MAX_QP_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
                      dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
    DEFINE_PROP_CHR("mad-chardev", PVRDMADev, mad_chr),
    DEFINE_PROP_END_OF_LIST(),
};

static void free_dev_ring(PCIDevice *pci_dev, PvrdmaRing *ring,
                          void *ring_state)
{
    pvrdma_ring_free(ring);
    rdma_pci_dma_unmap(pci_dev, ring_state, TARGET_PAGE_SIZE);
}

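/*
 * Map a device-level ring that the guest describes with a one-page
 * directory: dir[0] points to a page table, tbl[0] holds the ring-state
 * page and tbl[1..num_pages - 1] hold the ring data pages.
 */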
static int init_dev_ring(PvrdmaRing *ring, struct pvrdma_ring **ring_state,
                         const char *name, PCIDevice *pci_dev,
                         dma_addr_t dir_addr, uint32_t num_pages)
{
    uint64_t *dir, *tbl;
    int rc = 0;

    pr_dbg("Initializing device ring %s\n", name);
    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)dir_addr);
    pr_dbg("num_pages=%d\n", num_pages);
    dir = rdma_pci_dma_map(pci_dev, dir_addr, TARGET_PAGE_SIZE);
    if (!dir) {
        pr_err("Failed to map to page directory\n");
        rc = -ENOMEM;
        goto out;
    }
    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
    if (!tbl) {
        pr_err("Failed to map to page table\n");
        rc = -ENOMEM;
        goto out_free_dir;
    }

    *ring_state = rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
    if (!*ring_state) {
        pr_err("Failed to map to ring state\n");
        rc = -ENOMEM;
        goto out_free_tbl;
    }
    /* RX ring is the second */
    (*ring_state)++;
    rc = pvrdma_ring_init(ring, name, pci_dev,
                          (struct pvrdma_ring *)*ring_state,
                          (num_pages - 1) * TARGET_PAGE_SIZE /
                          sizeof(struct pvrdma_cqne),
                          sizeof(struct pvrdma_cqne),
                          (dma_addr_t *)&tbl[1], (dma_addr_t)num_pages - 1);
    if (rc) {
        pr_err("Failed to initialize ring\n");
        rc = -ENOMEM;
        goto out_free_ring_state;
    }

    goto out;

out_free_ring_state:
    rdma_pci_dma_unmap(pci_dev, *ring_state, TARGET_PAGE_SIZE);

out_free_tbl:
    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);

out_free_dir:
    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);

out:
    return rc;
}

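/*
 * The Device Shared Region (DSR) is a guest-allocated page through which
 * the driver and device exchange the command/response slots and the CQ and
 * async-event notification rings. free_dsr() drops every mapping taken by
 * load_dsr() below.
 */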
static void free_dsr(PVRDMADev *dev)
{
    PCIDevice *pci_dev = PCI_DEVICE(dev);

    if (!dev->dsr_info.dsr) {
        return;
    }

    free_dev_ring(pci_dev, &dev->dsr_info.async,
                  dev->dsr_info.async_ring_state);

    free_dev_ring(pci_dev, &dev->dsr_info.cq, dev->dsr_info.cq_ring_state);

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.req,
                       sizeof(union pvrdma_cmd_req));

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.rsp,
                       sizeof(union pvrdma_cmd_resp));

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.dsr,
                       sizeof(struct pvrdma_device_shared_region));

    dev->dsr_info.dsr = NULL;
}

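/*
 * Called once the guest has programmed the DSR address via
 * PVRDMA_REG_DSRHIGH: map the DSR itself, then the command and response
 * slots and the two device-level rings whose DMA addresses it advertises.
 */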
static int load_dsr(PVRDMADev *dev)
{
    int rc = 0;
    PCIDevice *pci_dev = PCI_DEVICE(dev);
    DSRInfo *dsr_info;
    struct pvrdma_device_shared_region *dsr;

    free_dsr(dev);

    /* Map to DSR */
    pr_dbg("dsr_dma=0x%llx\n", (long long unsigned int)dev->dsr_info.dma);
    dev->dsr_info.dsr = rdma_pci_dma_map(pci_dev, dev->dsr_info.dma,
                                sizeof(struct pvrdma_device_shared_region));
    if (!dev->dsr_info.dsr) {
        pr_err("Failed to map to DSR\n");
        rc = -ENOMEM;
        goto out;
    }

    /* Shortcuts */
    dsr_info = &dev->dsr_info;
    dsr = dsr_info->dsr;

    /* Map to command slot */
    pr_dbg("cmd_dma=0x%llx\n", (long long unsigned int)dsr->cmd_slot_dma);
    dsr_info->req = rdma_pci_dma_map(pci_dev, dsr->cmd_slot_dma,
                                     sizeof(union pvrdma_cmd_req));
    if (!dsr_info->req) {
        pr_err("Failed to map to command slot address\n");
        rc = -ENOMEM;
        goto out_free_dsr;
    }

    /* Map to response slot */
    pr_dbg("rsp_dma=0x%llx\n", (long long unsigned int)dsr->resp_slot_dma);
    dsr_info->rsp = rdma_pci_dma_map(pci_dev, dsr->resp_slot_dma,
                                     sizeof(union pvrdma_cmd_resp));
    if (!dsr_info->rsp) {
        pr_err("Failed to map to response slot address\n");
        rc = -ENOMEM;
        goto out_free_req;
    }

    /* Map to CQ notification ring */
    rc = init_dev_ring(&dsr_info->cq, &dsr_info->cq_ring_state, "dev_cq",
                       pci_dev, dsr->cq_ring_pages.pdir_dma,
                       dsr->cq_ring_pages.num_pages);
    if (rc) {
        pr_err("Failed to initialize CQ ring\n");
        rc = -ENOMEM;
        goto out_free_rsp;
    }

    /* Map to event notification ring */
    rc = init_dev_ring(&dsr_info->async, &dsr_info->async_ring_state,
                       "dev_async", pci_dev, dsr->async_ring_pages.pdir_dma,
                       dsr->async_ring_pages.num_pages);
    if (rc) {
        pr_err("Failed to initialize event ring\n");
        rc = -ENOMEM;
        goto out_free_rsp;
    }

    goto out;

out_free_rsp:
    rdma_pci_dma_unmap(pci_dev, dsr_info->rsp, sizeof(union pvrdma_cmd_resp));

out_free_req:
    rdma_pci_dma_unmap(pci_dev, dsr_info->req, sizeof(union pvrdma_cmd_req));

out_free_dsr:
    rdma_pci_dma_unmap(pci_dev, dsr_info->dsr,
                       sizeof(struct pvrdma_device_shared_region));
    dsr_info->dsr = NULL;

out:
    return rc;
}

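/* Publish the device capabilities to the guest driver through the DSR */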
static void init_dsr_dev_caps(PVRDMADev *dev)
{
    struct pvrdma_device_shared_region *dsr;

    if (dev->dsr_info.dsr == NULL) {
        pr_err("Can't initialize DSR\n");
        return;
    }

    dsr = dev->dsr_info.dsr;

    dsr->caps.fw_ver = PVRDMA_FW_VERSION;
    pr_dbg("fw_ver=0x%" PRIx64 "\n", dsr->caps.fw_ver);

    dsr->caps.mode = PVRDMA_DEVICE_MODE_ROCE;
    pr_dbg("mode=%d\n", dsr->caps.mode);

    dsr->caps.gid_types |= PVRDMA_GID_TYPE_FLAG_ROCE_V1;
    pr_dbg("gid_types=0x%x\n", dsr->caps.gid_types);

    dsr->caps.max_uar = RDMA_BAR2_UAR_SIZE;
    pr_dbg("max_uar=%d\n", dsr->caps.max_uar);

    dsr->caps.max_mr_size = dev->dev_attr.max_mr_size;
    dsr->caps.max_qp = dev->dev_attr.max_qp;
    dsr->caps.max_qp_wr = dev->dev_attr.max_qp_wr;
    dsr->caps.max_sge = dev->dev_attr.max_sge;
    dsr->caps.max_cq = dev->dev_attr.max_cq;
    dsr->caps.max_cqe = dev->dev_attr.max_cqe;
    dsr->caps.max_mr = dev->dev_attr.max_mr;
    dsr->caps.max_pd = dev->dev_attr.max_pd;
    dsr->caps.max_ah = dev->dev_attr.max_ah;

    dsr->caps.gid_tbl_len = MAX_GIDS;
    pr_dbg("gid_tbl_len=%d\n", dsr->caps.gid_tbl_len);

    dsr->caps.sys_image_guid = 0;
    pr_dbg("sys_image_guid=%" PRIx64 "\n", dsr->caps.sys_image_guid);

    dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
    pr_dbg("node_guid=%" PRIx64 "\n", be64_to_cpu(dsr->caps.node_guid));

    dsr->caps.phys_port_cnt = MAX_PORTS;
    pr_dbg("phys_port_cnt=%d\n", dsr->caps.phys_port_cnt);

    dsr->caps.max_pkeys = MAX_PKEYS;
    pr_dbg("max_pkeys=%d\n", dsr->caps.max_pkeys);

    pr_dbg("Initialized\n");
}

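/*
 * MSI-X setup/teardown. All RDMA_MAX_INTRS vectors are marked as used up
 * front; on a partial failure init_msix() rolls back only the vectors it
 * already took.
 */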
static void uninit_msix(PCIDevice *pdev, int used_vectors)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    int i;

    for (i = 0; i < used_vectors; i++) {
        msix_vector_unuse(pdev, i);
    }

    msix_uninit(pdev, &dev->msix, &dev->msix);
}

static int init_msix(PCIDevice *pdev, Error **errp)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    int i;
    int rc;

    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
                   RDMA_MSIX_PBA, 0, NULL);
    if (rc < 0) {
        error_setg(errp, "Failed to initialize MSI-X");
        return rc;
    }

    for (i = 0; i < RDMA_MAX_INTRS; i++) {
        rc = msix_vector_use(PCI_DEVICE(dev), i);
        if (rc < 0) {
            error_setg(errp, "Failed to use MSI-X vector %d", i);
            uninit_msix(pdev, i);
            return rc;
        }
    }

    return 0;
}

static void pvrdma_fini(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
           PCI_FUNC(pdev->devfn));

    pvrdma_qp_ops_fini();

    rdma_rm_fini(&dev->rdma_dev_res, &dev->backend_dev,
                 dev->backend_eth_device_name);

    rdma_backend_fini(&dev->backend_dev);

    free_dsr(dev);

    if (msix_enabled(pdev)) {
        uninit_msix(pdev, RDMA_MAX_INTRS);
    }
}

static void pvrdma_stop(PVRDMADev *dev)
{
    rdma_backend_stop(&dev->backend_dev);
}

static void pvrdma_start(PVRDMADev *dev)
{
    rdma_backend_start(&dev->backend_dev);
}

static void activate_device(PVRDMADev *dev)
{
    pvrdma_start(dev);
    set_reg_val(dev, PVRDMA_REG_ERR, 0);
    pr_dbg("Device activated\n");
}

static int unquiesce_device(PVRDMADev *dev)
{
    pr_dbg("Device unquiesced\n");
    return 0;
}

static void reset_device(PVRDMADev *dev)
{
    pvrdma_stop(dev);
    pr_dbg("Device reset complete\n");
}

static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
{
    PVRDMADev *dev = opaque;
    uint32_t val;

    /* pr_dbg("addr=0x%lx, size=%d\n", addr, size); */

    if (get_reg_val(dev, addr, &val)) {
        pr_dbg("Error trying to read REG value from address 0x%x\n",
               (uint32_t)addr);
        return -EINVAL;
    }

    trace_pvrdma_regs_read(addr, val);

    return val;
}

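/*
 * BAR 1 write handler. The guest programs the 64-bit DSR address with two
 * 32-bit writes (PVRDMA_REG_DSRLOW then PVRDMA_REG_DSRHIGH); the second
 * write triggers the DSR load and capability setup.
 */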
static void regs_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    PVRDMADev *dev = opaque;

    /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */

    if (set_reg_val(dev, addr, val)) {
        pr_err("Failed to set REG value, addr=0x%" PRIx64 ", val=0x%" PRIx64
               "\n", addr, val);
        return;
    }

    trace_pvrdma_regs_write(addr, val);

    switch (addr) {
    case PVRDMA_REG_DSRLOW:
        dev->dsr_info.dma = val;
        break;
    case PVRDMA_REG_DSRHIGH:
        dev->dsr_info.dma |= val << 32;
        load_dsr(dev);
        init_dsr_dev_caps(dev);
        break;
    case PVRDMA_REG_CTL:
        switch (val) {
        case PVRDMA_DEVICE_CTL_ACTIVATE:
            activate_device(dev);
            break;
        case PVRDMA_DEVICE_CTL_UNQUIESCE:
            unquiesce_device(dev);
            break;
        case PVRDMA_DEVICE_CTL_RESET:
            reset_device(dev);
            break;
        }
        break;
    case PVRDMA_REG_IMR:
        pr_dbg("Interrupt mask=0x%" PRIx64 "\n", val);
        dev->interrupt_mask = val;
        break;
    case PVRDMA_REG_REQUEST:
        if (val == 0) {
            execute_command(dev);
        }
        break;
    default:
        break;
    }
}

static const MemoryRegionOps regs_ops = {
    .read = regs_read,
    .write = regs_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = sizeof(uint32_t),
        .max_access_size = sizeof(uint32_t),
    },
};

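/*
 * BAR 2 (UAR) write handler: doorbells for QP send/recv and CQ arm/poll,
 * with the object handle encoded in the low bits of the written value.
 */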
static void uar_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    PVRDMADev *dev = opaque;

    /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */

    switch (addr & 0xFFF) { /* Mask with 0xFFF as each UC (user context) gets a page */
    case PVRDMA_UAR_QP_OFFSET:
        pr_dbg("UAR QP command, addr=0x%" PRIx64 ", val=0x%" PRIx64 "\n",
               (uint64_t)addr, val);
        if (val & PVRDMA_UAR_QP_SEND) {
            pvrdma_qp_send(dev, val & PVRDMA_UAR_HANDLE_MASK);
        }
        if (val & PVRDMA_UAR_QP_RECV) {
            pvrdma_qp_recv(dev, val & PVRDMA_UAR_HANDLE_MASK);
        }
        break;
    case PVRDMA_UAR_CQ_OFFSET:
        /* pr_dbg("UAR CQ cmd, addr=0x%x, val=0x%lx\n", (uint32_t)addr, val); */
        if (val & PVRDMA_UAR_CQ_ARM) {
            rdma_rm_req_notify_cq(&dev->rdma_dev_res,
                                  val & PVRDMA_UAR_HANDLE_MASK,
                                  !!(val & PVRDMA_UAR_CQ_ARM_SOL));
        }
        if (val & PVRDMA_UAR_CQ_ARM_SOL) {
            pr_dbg("UAR_CQ_ARM_SOL (%" PRIx64 ")\n",
                   val & PVRDMA_UAR_HANDLE_MASK);
        }
        if (val & PVRDMA_UAR_CQ_POLL) {
            pr_dbg("UAR_CQ_POLL (%" PRIx64 ")\n", val & PVRDMA_UAR_HANDLE_MASK);
            pvrdma_cq_poll(&dev->rdma_dev_res, val & PVRDMA_UAR_HANDLE_MASK);
        }
        break;
    default:
        pr_err("Unsupported command, addr=0x%" PRIx64 ", val=0x%" PRIx64 "\n",
               addr, val);
        break;
    }
}

static const MemoryRegionOps uar_ops = {
    .write = uar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = sizeof(uint32_t),
        .max_access_size = sizeof(uint32_t),
    },
};

static void init_pci_config(PCIDevice *pdev)
{
    pdev->config[PCI_INTERRUPT_PIN] = 1;
}

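/* BAR 0 carries the MSI-X table/PBA, BAR 1 the registers, BAR 2 the UAR */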
static void init_bars(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    /* BAR 0 - MSI-X */
    memory_region_init(&dev->msix, OBJECT(dev), "pvrdma-msix",
                       RDMA_BAR0_MSIX_SIZE);
    pci_register_bar(pdev, RDMA_MSIX_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->msix);

    /* BAR 1 - Registers */
    memset(&dev->regs_data, 0, sizeof(dev->regs_data));
    memory_region_init_io(&dev->regs, OBJECT(dev), &regs_ops, dev,
                          "pvrdma-regs", sizeof(dev->regs_data));
    pci_register_bar(pdev, RDMA_REG_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->regs);

    /* BAR 2 - UAR */
    memset(&dev->uar_data, 0, sizeof(dev->uar_data));
    memory_region_init_io(&dev->uar, OBJECT(dev), &uar_ops, dev, "rdma-uar",
                          sizeof(dev->uar_data));
    pci_register_bar(pdev, RDMA_UAR_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->uar);
}

static void init_regs(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    set_reg_val(dev, PVRDMA_REG_VERSION, PVRDMA_HW_VERSION);
    set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
}

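/*
 * Derive the WQE/CQE limits from the page-directory layout: a one-page
 * table of 64-bit entries addresses TARGET_PAGE_SIZE / sizeof(uint64_t)
 * data pages, and the first page of each ring is reserved for ring state.
 */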
static void init_dev_caps(PVRDMADev *dev)
{
    size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
                          (TARGET_PAGE_SIZE / sizeof(uint64_t));
    size_t wr_sz = MAX(sizeof(struct pvrdma_sq_wqe_hdr),
                       sizeof(struct pvrdma_rq_wqe_hdr));

    dev->dev_attr.max_qp_wr = pg_tbl_bytes /
                              (wr_sz + sizeof(struct pvrdma_sge) * MAX_SGE) -
                              TARGET_PAGE_SIZE; /* First page is ring state */
    pr_dbg("max_qp_wr=%d\n", dev->dev_attr.max_qp_wr);

    dev->dev_attr.max_cqe = pg_tbl_bytes / sizeof(struct pvrdma_cqe) -
                            TARGET_PAGE_SIZE; /* First page is ring state */
    pr_dbg("max_cqe=%d\n", dev->dev_attr.max_cqe);
}

static int pvrdma_check_ram_shared(Object *obj, void *opaque)
{
    bool *shared = opaque;

    if (object_dynamic_cast(obj, "memory-backend-ram")) {
        *shared = object_property_get_bool(obj, "share", NULL);
    }

    return 0;
}

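/*
 * Guest RAM must be backed by shared memory (e.g. a memory backend with
 * share=on) so that the host RDMA backend can map and register it.
 */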
static void pvrdma_realize(PCIDevice *pdev, Error **errp)
{
    int rc;
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    Object *memdev_root;
    bool ram_shared = false;

    pr_dbg("Initializing device %s %x.%x\n", pdev->name,
           PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));

    if (TARGET_PAGE_SIZE != getpagesize()) {
        error_setg(errp, "Target page size must be the same as host page size");
        return;
    }

    memdev_root = object_resolve_path("/objects", NULL);
    if (memdev_root) {
        object_child_foreach(memdev_root, pvrdma_check_ram_shared, &ram_shared);
    }
    if (!ram_shared) {
        error_setg(errp, "Only shared memory backed ram is supported");
        return;
    }

    dev->dsr_info.dsr = NULL;

    init_pci_config(pdev);

    init_bars(pdev);

    init_regs(pdev);

    init_dev_caps(dev);

    rc = init_msix(pdev, errp);
    if (rc) {
        goto out;
    }

    rc = rdma_backend_init(&dev->backend_dev, pdev, &dev->rdma_dev_res,
                           dev->backend_device_name, dev->backend_port_num,
                           &dev->dev_attr, &dev->mad_chr, errp);
    if (rc) {
        goto out;
    }

    rc = rdma_rm_init(&dev->rdma_dev_res, &dev->dev_attr, errp);
    if (rc) {
        goto out;
    }

    rc = pvrdma_qp_ops_init();
    if (rc) {
        goto out;
    }

out:
    if (rc) {
        error_append_hint(errp, "Device failed to load\n");
    }
}

static void pvrdma_exit(PCIDevice *pdev)
{
    pvrdma_fini(pdev);
}

static void pvrdma_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->realize = pvrdma_realize;
    k->exit = pvrdma_exit;
    k->vendor_id = PCI_VENDOR_ID_VMWARE;
    k->device_id = PCI_DEVICE_ID_VMWARE_PVRDMA;
    k->revision = 0x00;
    k->class_id = PCI_CLASS_NETWORK_OTHER;

    dc->desc = "RDMA Device";
    dc->props = pvrdma_dev_properties;
    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
}

static const TypeInfo pvrdma_info = {
    .name = PVRDMA_HW_NAME,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(PVRDMADev),
    .class_init = pvrdma_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};

static void register_types(void)
{
    type_register_static(&pvrdma_info);
}

type_init(register_types)