/*
 * QEMU paravirtual RDMA
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Yuval Shaia <yuval.shaia@oracle.com>
 * Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include "qapi/error.h"
#include "cpu.h"   /* TARGET_PAGE_SIZE */
#include "hw/pci/pci.h"
#include "hw/pci/pci_ids.h"
#include "hw/pci/msi.h"
#include "hw/pci/msix.h"
#include "hw/qdev-core.h"
#include "hw/qdev-properties.h"
#include "trace.h" /* trace_pvrdma_regs_read/write */

#include "../rdma_rm.h"
#include "../rdma_backend.h"
#include "../rdma_utils.h"

#include <infiniband/verbs.h>
#include "pvrdma.h" /* PVRDMADev, register and BAR definitions */
#include "standard-headers/rdma/vmw_pvrdma-abi.h"
#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h"
#include "pvrdma_qp_ops.h"
static Property pvrdma_dev_properties[] = {
    DEFINE_PROP_STRING("backend-dev", PVRDMADev, backend_device_name),
    DEFINE_PROP_UINT8("backend-port", PVRDMADev, backend_port_num, 1),
    DEFINE_PROP_UINT8("backend-gid-idx", PVRDMADev, backend_gid_idx, 0),
    DEFINE_PROP_UINT64("dev-caps-max-mr-size", PVRDMADev, dev_attr.max_mr_size,
                       MAX_MR_SIZE),
    DEFINE_PROP_INT32("dev-caps-max-qp", PVRDMADev, dev_attr.max_qp, MAX_QP),
    DEFINE_PROP_INT32("dev-caps-max-sge", PVRDMADev, dev_attr.max_sge, MAX_SGE),
    DEFINE_PROP_INT32("dev-caps-max-cq", PVRDMADev, dev_attr.max_cq, MAX_CQ),
    DEFINE_PROP_INT32("dev-caps-max-mr", PVRDMADev, dev_attr.max_mr, MAX_MR),
    DEFINE_PROP_INT32("dev-caps-max-pd", PVRDMADev, dev_attr.max_pd, MAX_PD),
    DEFINE_PROP_INT32("dev-caps-qp-rd-atom", PVRDMADev, dev_attr.max_qp_rd_atom,
                      MAX_QP_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-qp-init-rd-atom", PVRDMADev,
                      dev_attr.max_qp_init_rd_atom, MAX_QP_INIT_RD_ATOM),
    DEFINE_PROP_INT32("dev-caps-max-ah", PVRDMADev, dev_attr.max_ah, MAX_AH),
    DEFINE_PROP_END_OF_LIST(),
};
static void free_dev_ring(PCIDevice *pci_dev, PvrdmaRing *ring,
                          void *ring_state)
{
    pvrdma_ring_free(ring);
    rdma_pci_dma_unmap(pci_dev, ring_state, TARGET_PAGE_SIZE);
}
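
/*
 * Device rings (the asynchronous-event ring and the CQ notification ring)
 * are described by a guest-physical page directory: dir[0] points to the
 * first page table, tbl[0] holds the ring-state page and the remaining
 * num_pages - 1 entries hold the ring pages themselves. Each level is
 * DMA-mapped before the PvrdmaRing object is initialized over it.
 */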
static int init_dev_ring(PvrdmaRing *ring, struct pvrdma_ring **ring_state,
                         const char *name, PCIDevice *pci_dev,
                         dma_addr_t dir_addr, uint32_t num_pages)
{
    uint64_t *dir, *tbl;
    int rc = 0;

    pr_dbg("Initializing device ring %s\n", name);
    pr_dbg("pdir_dma=0x%llx\n", (long long unsigned int)dir_addr);
    pr_dbg("num_pages=%d\n", num_pages);
    dir = rdma_pci_dma_map(pci_dev, dir_addr, TARGET_PAGE_SIZE);
    if (!dir) {
        pr_err("Failed to map to page directory\n");
        rc = -ENOMEM;
        goto out;
    }
    tbl = rdma_pci_dma_map(pci_dev, dir[0], TARGET_PAGE_SIZE);
    if (!tbl) {
        pr_err("Failed to map to page table\n");
        rc = -ENOMEM;
        goto out_free_dir;
    }

    *ring_state = rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE);
    if (!*ring_state) {
        pr_err("Failed to map to ring state\n");
        rc = -ENOMEM;
        goto out_free_tbl;
    }
    /* RX ring is the second */
    (*ring_state)++;
    rc = pvrdma_ring_init(ring, name, pci_dev,
                          (struct pvrdma_ring *)*ring_state,
                          (num_pages - 1) * TARGET_PAGE_SIZE /
                          sizeof(struct pvrdma_cqne),
                          sizeof(struct pvrdma_cqne),
                          (dma_addr_t *)&tbl[1], (dma_addr_t)num_pages - 1);
    if (rc) {
        pr_err("Failed to initialize ring\n");
        rc = -ENOMEM;
        goto out_free_ring_state;
    }

    goto out;

out_free_ring_state:
    rdma_pci_dma_unmap(pci_dev, *ring_state, TARGET_PAGE_SIZE);

out_free_tbl:
    rdma_pci_dma_unmap(pci_dev, tbl, TARGET_PAGE_SIZE);

out_free_dir:
    rdma_pci_dma_unmap(pci_dev, dir, TARGET_PAGE_SIZE);

out:
    return rc;
}
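
/*
 * The Device Shared Region (DSR) is a page the guest driver allocates and
 * hands to the device. It carries the command/response slot addresses, the
 * page directories of the CQ and asynchronous-event rings, and the
 * capability block the driver reads back.
 */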
static void free_dsr(PVRDMADev *dev)
{
    PCIDevice *pci_dev = PCI_DEVICE(dev);

    if (!dev->dsr_info.dsr) {
        return;
    }

    free_dev_ring(pci_dev, &dev->dsr_info.async,
                  dev->dsr_info.async_ring_state);

    free_dev_ring(pci_dev, &dev->dsr_info.cq, dev->dsr_info.cq_ring_state);

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.req,
                       sizeof(union pvrdma_cmd_req));

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.rsp,
                       sizeof(union pvrdma_cmd_resp));

    rdma_pci_dma_unmap(pci_dev, dev->dsr_info.dsr,
                       sizeof(struct pvrdma_device_shared_region));

    dev->dsr_info.dsr = NULL;
}
static int load_dsr(PVRDMADev *dev)
{
    int rc = 0;
    PCIDevice *pci_dev = PCI_DEVICE(dev);
    DSRInfo *dsr_info;
    struct pvrdma_device_shared_region *dsr;

    free_dsr(dev);

    /* Map to DSR */
    pr_dbg("dsr_dma=0x%llx\n", (long long unsigned int)dev->dsr_info.dma);
    dev->dsr_info.dsr = rdma_pci_dma_map(pci_dev, dev->dsr_info.dma,
                              sizeof(struct pvrdma_device_shared_region));
    if (!dev->dsr_info.dsr) {
        pr_err("Failed to map to DSR\n");
        rc = -ENOMEM;
        goto out;
    }

    /* Shortcuts */
    dsr_info = &dev->dsr_info;
    dsr = dsr_info->dsr;

    /* Map to command slot */
    pr_dbg("cmd_dma=0x%llx\n", (long long unsigned int)dsr->cmd_slot_dma);
    dsr_info->req = rdma_pci_dma_map(pci_dev, dsr->cmd_slot_dma,
                                     sizeof(union pvrdma_cmd_req));
    if (!dsr_info->req) {
        pr_err("Failed to map to command slot address\n");
        rc = -ENOMEM;
        goto out_free_dsr;
    }

    /* Map to response slot */
    pr_dbg("rsp_dma=0x%llx\n", (long long unsigned int)dsr->resp_slot_dma);
    dsr_info->rsp = rdma_pci_dma_map(pci_dev, dsr->resp_slot_dma,
                                     sizeof(union pvrdma_cmd_resp));
    if (!dsr_info->rsp) {
        pr_err("Failed to map to response slot address\n");
        rc = -ENOMEM;
        goto out_free_req;
    }

    /* Map to CQ notification ring */
    rc = init_dev_ring(&dsr_info->cq, &dsr_info->cq_ring_state, "dev_cq",
                       pci_dev, dsr->cq_ring_pages.pdir_dma,
                       dsr->cq_ring_pages.num_pages);
    if (rc) {
        pr_err("Failed to map to initialize CQ ring\n");
        rc = -ENOMEM;
        goto out_free_rsp;
    }

    /* Map to event notification ring */
    rc = init_dev_ring(&dsr_info->async, &dsr_info->async_ring_state,
                       "dev_async", pci_dev, dsr->async_ring_pages.pdir_dma,
                       dsr->async_ring_pages.num_pages);
    if (rc) {
        pr_err("Failed to map to initialize event ring\n");
        rc = -ENOMEM;
        goto out_free_rsp;
    }

    goto out;

out_free_rsp:
    rdma_pci_dma_unmap(pci_dev, dsr_info->rsp, sizeof(union pvrdma_cmd_resp));

out_free_req:
    rdma_pci_dma_unmap(pci_dev, dsr_info->req, sizeof(union pvrdma_cmd_req));

out_free_dsr:
    rdma_pci_dma_unmap(pci_dev, dsr_info->dsr,
                       sizeof(struct pvrdma_device_shared_region));
    dsr_info->dsr = NULL;

out:
    return rc;
}
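
/*
 * Publish the device capabilities into the DSR so the guest driver can read
 * them back. The limits come from dev_attr, which is seeded by the
 * dev-caps-* properties and by init_dev_caps() during realize.
 */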
static void init_dsr_dev_caps(PVRDMADev *dev)
{
    struct pvrdma_device_shared_region *dsr;

    if (dev->dsr_info.dsr == NULL) {
        pr_err("Can't initialize DSR\n");
        return;
    }

    dsr = dev->dsr_info.dsr;

    dsr->caps.fw_ver = PVRDMA_FW_VERSION;
    pr_dbg("fw_ver=0x%lx\n", dsr->caps.fw_ver);

    dsr->caps.mode = PVRDMA_DEVICE_MODE_ROCE;
    pr_dbg("mode=%d\n", dsr->caps.mode);

    dsr->caps.gid_types |= PVRDMA_GID_TYPE_FLAG_ROCE_V1;
    pr_dbg("gid_types=0x%x\n", dsr->caps.gid_types);

    dsr->caps.max_uar = RDMA_BAR2_UAR_SIZE;
    pr_dbg("max_uar=%d\n", dsr->caps.max_uar);

    dsr->caps.max_mr_size = dev->dev_attr.max_mr_size;
    dsr->caps.max_qp = dev->dev_attr.max_qp;
    dsr->caps.max_qp_wr = dev->dev_attr.max_qp_wr;
    dsr->caps.max_sge = dev->dev_attr.max_sge;
    dsr->caps.max_cq = dev->dev_attr.max_cq;
    dsr->caps.max_cqe = dev->dev_attr.max_cqe;
    dsr->caps.max_mr = dev->dev_attr.max_mr;
    dsr->caps.max_pd = dev->dev_attr.max_pd;
    dsr->caps.max_ah = dev->dev_attr.max_ah;

    dsr->caps.gid_tbl_len = MAX_GIDS;
    pr_dbg("gid_tbl_len=%d\n", dsr->caps.gid_tbl_len);

    dsr->caps.sys_image_guid = 0;
    pr_dbg("sys_image_guid=%lx\n", dsr->caps.sys_image_guid);

    dsr->caps.node_guid = cpu_to_be64(dev->node_guid);
    pr_dbg("node_guid=%llx\n",
           (long long unsigned int)be64_to_cpu(dsr->caps.node_guid));

    dsr->caps.phys_port_cnt = MAX_PORTS;
    pr_dbg("phys_port_cnt=%d\n", dsr->caps.phys_port_cnt);

    dsr->caps.max_pkeys = MAX_PKEYS;
    pr_dbg("max_pkeys=%d\n", dsr->caps.max_pkeys);

    pr_dbg("Initialized\n");
}
static void free_ports(PVRDMADev *dev)
{
    int i;

    for (i = 0; i < MAX_PORTS; i++) {
        g_free(dev->rdma_dev_res.ports[i].gid_tbl);
    }
}
static void init_ports(PVRDMADev *dev, Error **errp)
{
    int i;

    memset(dev->rdma_dev_res.ports, 0, sizeof(dev->rdma_dev_res.ports));

    for (i = 0; i < MAX_PORTS; i++) {
        dev->rdma_dev_res.ports[i].state = IBV_PORT_DOWN;

        dev->rdma_dev_res.ports[i].pkey_tbl =
            g_malloc0(sizeof(*dev->rdma_dev_res.ports[i].pkey_tbl) *
                      MAX_PORT_PKEYS);
    }
}
static void activate_device(PVRDMADev *dev)
{
    set_reg_val(dev, PVRDMA_REG_ERR, 0);
    pr_dbg("Device activated\n");
}

static int unquiesce_device(PVRDMADev *dev)
{
    pr_dbg("Device unquiesced\n");
    return 0;
}

static int reset_device(PVRDMADev *dev)
{
    pr_dbg("Device reset complete\n");
    return 0;
}
static uint64_t regs_read(void *opaque, hwaddr addr, unsigned size)
{
    PVRDMADev *dev = opaque;
    uint32_t val;

    /* pr_dbg("addr=0x%lx, size=%d\n", addr, size); */

    if (get_reg_val(dev, addr, &val)) {
        pr_dbg("Error trying to read REG value from address 0x%x\n",
               (uint32_t)addr);
        return -EINVAL;
    }

    trace_pvrdma_regs_read(addr, val);

    return val;
}
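
/*
 * BAR 1 register writes drive the device state machine: the guest programs
 * the 64-bit DSR address with two writes (DSRLOW then DSRHIGH, at which
 * point the DSR is mapped and the capabilities are published), toggles
 * activate/unquiesce/reset through the control register, sets the interrupt
 * mask, and kicks admin command execution through the request register.
 */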
static void regs_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    PVRDMADev *dev = opaque;

    /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */

    if (set_reg_val(dev, addr, val)) {
        pr_err("Error trying to set REG value, addr=0x%lx, val=0x%lx\n",
               (uint64_t)addr, val);
        return;
    }

    trace_pvrdma_regs_write(addr, val);

    switch (addr) {
    case PVRDMA_REG_DSRLOW:
        dev->dsr_info.dma = val;
        break;
    case PVRDMA_REG_DSRHIGH:
        dev->dsr_info.dma |= val << 32;
        load_dsr(dev);
        init_dsr_dev_caps(dev);
        break;
    case PVRDMA_REG_CTL:
        switch (val) {
        case PVRDMA_DEVICE_CTL_ACTIVATE:
            activate_device(dev);
            break;
        case PVRDMA_DEVICE_CTL_UNQUIESCE:
            unquiesce_device(dev);
            break;
        case PVRDMA_DEVICE_CTL_RESET:
            reset_device(dev);
            break;
        }
        break;
    case PVRDMA_REG_IMR:
        pr_dbg("Interrupt mask=0x%lx\n", val);
        dev->interrupt_mask = val;
        break;
    case PVRDMA_REG_REQUEST:
        if (val == 0) {
            execute_command(dev);
        }
        break;
    default:
        break;
    }
}
static const MemoryRegionOps regs_ops = {
    .read = regs_read,
    .write = regs_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = sizeof(uint32_t),
        .max_access_size = sizeof(uint32_t),
    },
};
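
/*
 * BAR 2 is the User Access Region (doorbell pages). Each consumer gets its
 * own page, so only the low 12 bits of the offset select the doorbell type;
 * the written value carries the QP/CQ handle plus flag bits requesting a
 * send/receive ring scan or a CQ arm/poll.
 */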
static void uar_write(void *opaque, hwaddr addr, uint64_t val, unsigned size)
{
    PVRDMADev *dev = opaque;

    /* pr_dbg("addr=0x%lx, val=0x%x, size=%d\n", addr, (uint32_t)val, size); */

    switch (addr & 0xFFF) { /* Mask with 0xFFF as each UC gets page */
    case PVRDMA_UAR_QP_OFFSET:
        pr_dbg("UAR QP command, addr=0x%x, val=0x%lx\n", (uint32_t)addr, val);
        if (val & PVRDMA_UAR_QP_SEND) {
            pvrdma_qp_send(dev, val & PVRDMA_UAR_HANDLE_MASK);
        }
        if (val & PVRDMA_UAR_QP_RECV) {
            pvrdma_qp_recv(dev, val & PVRDMA_UAR_HANDLE_MASK);
        }
        break;
    case PVRDMA_UAR_CQ_OFFSET:
        /* pr_dbg("UAR CQ cmd, addr=0x%x, val=0x%lx\n", (uint32_t)addr, val); */
        if (val & PVRDMA_UAR_CQ_ARM) {
            rdma_rm_req_notify_cq(&dev->rdma_dev_res,
                                  val & PVRDMA_UAR_HANDLE_MASK,
                                  !!(val & PVRDMA_UAR_CQ_ARM_SOL));
        }
        if (val & PVRDMA_UAR_CQ_ARM_SOL) {
            pr_dbg("UAR_CQ_ARM_SOL (%ld)\n", val & PVRDMA_UAR_HANDLE_MASK);
        }
        if (val & PVRDMA_UAR_CQ_POLL) {
            pr_dbg("UAR_CQ_POLL (%ld)\n", val & PVRDMA_UAR_HANDLE_MASK);
            pvrdma_cq_poll(&dev->rdma_dev_res, val & PVRDMA_UAR_HANDLE_MASK);
        }
        break;
    default:
        pr_err("Unsupported command, addr=0x%lx, val=0x%lx\n",
               (uint64_t)addr, val);
        break;
    }
}
static const MemoryRegionOps uar_ops = {
    .write = uar_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = sizeof(uint32_t),
        .max_access_size = sizeof(uint32_t),
    },
};
static void init_pci_config(PCIDevice *pdev)
{
    pdev->config[PCI_INTERRUPT_PIN] = 1;
}
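
/*
 * The device exposes three BARs: BAR 0 holds the MSI-X table and PBA,
 * BAR 1 the device registers backed by regs_ops, and BAR 2 the UAR
 * doorbell pages backed by uar_ops.
 */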
static void init_bars(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    /* BAR 0 - MSI-X */
    memory_region_init(&dev->msix, OBJECT(dev), "pvrdma-msix",
                       RDMA_BAR0_MSIX_SIZE);
    pci_register_bar(pdev, RDMA_MSIX_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->msix);

    /* BAR 1 - Registers */
    memset(&dev->regs_data, 0, sizeof(dev->regs_data));
    memory_region_init_io(&dev->regs, OBJECT(dev), &regs_ops, dev,
                          "pvrdma-regs", RDMA_BAR1_REGS_SIZE);
    pci_register_bar(pdev, RDMA_REG_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->regs);

    /* BAR 2 - UAR */
    memset(&dev->uar_data, 0, sizeof(dev->uar_data));
    memory_region_init_io(&dev->uar, OBJECT(dev), &uar_ops, dev, "rdma-uar",
                          RDMA_BAR2_UAR_SIZE);
    pci_register_bar(pdev, RDMA_UAR_BAR_IDX, PCI_BASE_ADDRESS_SPACE_MEMORY,
                     &dev->uar);
}
static void init_regs(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    set_reg_val(dev, PVRDMA_REG_VERSION, PVRDMA_HW_VERSION);
    set_reg_val(dev, PVRDMA_REG_ERR, 0xFFFF);
}
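
/*
 * All RDMA_MAX_INTRS MSI-X vectors are marked as used up front; on any
 * failure the vectors acquired so far are released again via uninit_msix().
 */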
static void uninit_msix(PCIDevice *pdev, int used_vectors)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    int i;

    for (i = 0; i < used_vectors; i++) {
        msix_vector_unuse(pdev, i);
    }

    msix_uninit(pdev, &dev->msix, &dev->msix);
}
static int init_msix(PCIDevice *pdev, Error **errp)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    int i;
    int rc;

    rc = msix_init(pdev, RDMA_MAX_INTRS, &dev->msix, RDMA_MSIX_BAR_IDX,
                   RDMA_MSIX_TABLE, &dev->msix, RDMA_MSIX_BAR_IDX,
                   RDMA_MSIX_PBA, 0, NULL);
    if (rc < 0) {
        error_setg(errp, "Failed to initialize MSI-X");
        return rc;
    }

    for (i = 0; i < RDMA_MAX_INTRS; i++) {
        rc = msix_vector_use(PCI_DEVICE(dev), i);
        if (rc < 0) {
            error_setg(errp, "Failed to mark MSI-X vector %d as used", i);
            uninit_msix(pdev, i);
            return rc;
        }
    }

    return 0;
}
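
/*
 * Derive the work-request and CQE limits from the capacity of one ring page
 * table: TARGET_PAGE_SIZE / sizeof(uint64_t) page pointers, each naming a
 * TARGET_PAGE_SIZE byte page, with the first page reserved for ring state.
 */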
static void init_dev_caps(PVRDMADev *dev)
{
    size_t pg_tbl_bytes = TARGET_PAGE_SIZE *
                          (TARGET_PAGE_SIZE / sizeof(uint64_t));
    size_t wr_sz = MAX(sizeof(struct pvrdma_sq_wqe_hdr),
                       sizeof(struct pvrdma_rq_wqe_hdr));

    dev->dev_attr.max_qp_wr = pg_tbl_bytes /
                              (wr_sz + sizeof(struct pvrdma_sge) * MAX_SGE) -
                              TARGET_PAGE_SIZE; /* First page is ring state */
    pr_dbg("max_qp_wr=%d\n", dev->dev_attr.max_qp_wr);

    dev->dev_attr.max_cqe = pg_tbl_bytes / sizeof(struct pvrdma_cqe) -
                            TARGET_PAGE_SIZE; /* First page is ring state */
    pr_dbg("max_cqe=%d\n", dev->dev_attr.max_cqe);
}
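
/*
 * realize() requires that every memory-backend-ram object is created with
 * share=on and that the target page size matches the host's;
 * pvrdma_check_ram_shared walks /objects to verify the former.
 */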
static int pvrdma_check_ram_shared(Object *obj, void *opaque)
{
    bool *shared = opaque;

    if (object_dynamic_cast(obj, "memory-backend-ram")) {
        *shared = object_property_get_bool(obj, "share", NULL);
    }

    return 0;
}
static void pvrdma_realize(PCIDevice *pdev, Error **errp)
{
    int rc;
    PVRDMADev *dev = PVRDMA_DEV(pdev);
    Object *memdev_root;
    bool ram_shared = false;

    pr_dbg("Initializing device %s %x.%x\n", pdev->name,
           PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn));

    if (TARGET_PAGE_SIZE != getpagesize()) {
        error_setg(errp, "Target page size must be the same as host page size");
        return;
    }

    memdev_root = object_resolve_path("/objects", NULL);
    if (memdev_root) {
        object_child_foreach(memdev_root, pvrdma_check_ram_shared, &ram_shared);
    }
    if (!ram_shared) {
        error_setg(errp, "Only shared memory backed ram is supported");
        return;
    }

    dev->dsr_info.dsr = NULL;

    init_pci_config(pdev);

    init_bars(pdev);

    init_regs(pdev);

    init_dev_caps(dev);

    rc = init_msix(pdev, errp);
    if (rc) {
        goto out;
    }

    rc = rdma_backend_init(&dev->backend_dev, &dev->rdma_dev_res,
                           dev->backend_device_name, dev->backend_port_num,
                           dev->backend_gid_idx, &dev->dev_attr, errp);
    if (rc) {
        goto out;
    }

    rc = rdma_rm_init(&dev->rdma_dev_res, &dev->dev_attr, errp);
    if (rc) {
        goto out;
    }

    init_ports(dev, errp);

    rc = pvrdma_qp_ops_init();
    if (rc) {
        goto out;
    }

out:
    if (rc) {
        error_append_hint(errp, "Device failed to load\n");
    }
}
static void pvrdma_exit(PCIDevice *pdev)
{
    PVRDMADev *dev = PVRDMA_DEV(pdev);

    pr_dbg("Closing device %s %x.%x\n", pdev->name, PCI_SLOT(pdev->devfn),
           PCI_FUNC(pdev->devfn));

    pvrdma_qp_ops_fini();

    free_ports(dev);

    rdma_rm_fini(&dev->rdma_dev_res);

    rdma_backend_fini(&dev->backend_dev);

    free_dsr(dev);

    if (msix_enabled(pdev)) {
        uninit_msix(pdev, RDMA_MAX_INTRS);
    }
}
static void pvrdma_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(klass);
    PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);

    k->realize = pvrdma_realize;
    k->exit = pvrdma_exit;
    k->vendor_id = PCI_VENDOR_ID_VMWARE;
    k->device_id = PCI_DEVICE_ID_VMWARE_PVRDMA;
    k->class_id = PCI_CLASS_NETWORK_OTHER;

    dc->desc = "RDMA Device";
    dc->props = pvrdma_dev_properties;
    set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
}
static const TypeInfo pvrdma_info = {
    .name = PVRDMA_HW_NAME,
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(PVRDMADev),
    .class_init = pvrdma_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_CONVENTIONAL_PCI_DEVICE },
        { },
    },
};
static void register_types(void)
{
    type_register_static(&pvrdma_info);
}

type_init(register_types)