2 * QEMU paravirtual RDMA - Resource Manager Implementation
4 * Copyright (C) 2018 Oracle
5 * Copyright (C) 2018 Red Hat Inc
8 * Yuval Shaia <yuval.shaia@oracle.com>
9 * Marcel Apfelbaum <marcel@redhat.com>
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
16 #include "qemu/osdep.h"
17 #include "qapi/error.h"
19 #include "monitor/monitor.h"
22 #include "rdma_utils.h"
23 #include "rdma_backend.h"
/*
 * Page directory and page tables: number of __u64-sized entries that fit in
 * one target page.  The expansions are parenthesised so the macros can be
 * used inside larger expressions (array sizes, loop bounds, ...).
 */
#define PG_DIR_SZ (TARGET_PAGE_SIZE / sizeof(__u64))
#define PG_TBL_SZ (TARGET_PAGE_SIZE / sizeof(__u64))
30 void rdma_dump_device_counters(Monitor
*mon
, RdmaDeviceResources
*dev_res
)
32 monitor_printf(mon
, "\ttx : %" PRId64
"\n",
34 monitor_printf(mon
, "\ttx_len : %" PRId64
"\n",
35 dev_res
->stats
.tx_len
);
36 monitor_printf(mon
, "\ttx_err : %" PRId64
"\n",
37 dev_res
->stats
.tx_err
);
38 monitor_printf(mon
, "\trx_bufs : %" PRId64
"\n",
39 dev_res
->stats
.rx_bufs
);
40 monitor_printf(mon
, "\trx_bufs_len : %" PRId64
"\n",
41 dev_res
->stats
.rx_bufs_len
);
42 monitor_printf(mon
, "\trx_bufs_err : %" PRId64
"\n",
43 dev_res
->stats
.rx_bufs_err
);
44 monitor_printf(mon
, "\tcomps : %" PRId64
"\n",
45 dev_res
->stats
.completions
);
46 monitor_printf(mon
, "\tmissing_comps : %" PRId32
"\n",
47 dev_res
->stats
.missing_cqe
);
48 monitor_printf(mon
, "\tpoll_cq (bk) : %" PRId64
"\n",
49 dev_res
->stats
.poll_cq_from_bk
);
50 monitor_printf(mon
, "\tpoll_cq_ppoll_to : %" PRId64
"\n",
51 dev_res
->stats
.poll_cq_ppoll_to
);
52 monitor_printf(mon
, "\tpoll_cq (fe) : %" PRId64
"\n",
53 dev_res
->stats
.poll_cq_from_guest
);
54 monitor_printf(mon
, "\tpoll_cq_empty : %" PRId64
"\n",
55 dev_res
->stats
.poll_cq_from_guest_empty
);
56 monitor_printf(mon
, "\tmad_tx : %" PRId64
"\n",
57 dev_res
->stats
.mad_tx
);
58 monitor_printf(mon
, "\tmad_tx_err : %" PRId64
"\n",
59 dev_res
->stats
.mad_tx_err
);
60 monitor_printf(mon
, "\tmad_rx : %" PRId64
"\n",
61 dev_res
->stats
.mad_rx
);
62 monitor_printf(mon
, "\tmad_rx_err : %" PRId64
"\n",
63 dev_res
->stats
.mad_rx_err
);
64 monitor_printf(mon
, "\tmad_rx_bufs : %" PRId64
"\n",
65 dev_res
->stats
.mad_rx_bufs
);
66 monitor_printf(mon
, "\tmad_rx_bufs_err : %" PRId64
"\n",
67 dev_res
->stats
.mad_rx_bufs_err
);
68 monitor_printf(mon
, "\tPDs : %" PRId32
"\n",
69 dev_res
->pd_tbl
.used
);
70 monitor_printf(mon
, "\tMRs : %" PRId32
"\n",
71 dev_res
->mr_tbl
.used
);
72 monitor_printf(mon
, "\tUCs : %" PRId32
"\n",
73 dev_res
->uc_tbl
.used
);
74 monitor_printf(mon
, "\tQPs : %" PRId32
"\n",
75 dev_res
->qp_tbl
.used
);
76 monitor_printf(mon
, "\tCQs : %" PRId32
"\n",
77 dev_res
->cq_tbl
.used
);
78 monitor_printf(mon
, "\tCEQ_CTXs : %" PRId32
"\n",
79 dev_res
->cqe_ctx_tbl
.used
);
82 static inline void res_tbl_init(const char *name
, RdmaRmResTbl
*tbl
,
83 uint32_t tbl_sz
, uint32_t res_sz
)
85 tbl
->tbl
= g_malloc(tbl_sz
* res_sz
);
87 strncpy(tbl
->name
, name
, MAX_RM_TBL_NAME
);
88 tbl
->name
[MAX_RM_TBL_NAME
- 1] = 0;
90 tbl
->bitmap
= bitmap_new(tbl_sz
);
94 qemu_mutex_init(&tbl
->lock
);
/*
 * Tear down a resource table set up by res_tbl_init(): destroys the table
 * mutex.
 * NOTE(review): only the qemu_mutex_destroy() call is visible in this
 * extract; confirm that tbl->tbl and tbl->bitmap (allocated by
 * res_tbl_init()) are released on the lines that are not shown.
 */
97 static inline void res_tbl_free(RdmaRmResTbl
*tbl
)
102 qemu_mutex_destroy(&tbl
->lock
);
107 static inline void *rdma_res_tbl_get(RdmaRmResTbl
*tbl
, uint32_t handle
)
109 trace_rdma_res_tbl_get(tbl
->name
, handle
);
111 if ((handle
< tbl
->tbl_sz
) && (test_bit(handle
, tbl
->bitmap
))) {
112 return tbl
->tbl
+ handle
* tbl
->res_sz
;
114 rdma_error_report("Table %s, invalid handle %d", tbl
->name
, handle
);
119 static inline void *rdma_res_tbl_alloc(RdmaRmResTbl
*tbl
, uint32_t *handle
)
121 qemu_mutex_lock(&tbl
->lock
);
123 *handle
= find_first_zero_bit(tbl
->bitmap
, tbl
->tbl_sz
);
124 if (*handle
> tbl
->tbl_sz
) {
125 rdma_error_report("Table %s, failed to allocate, bitmap is full",
127 qemu_mutex_unlock(&tbl
->lock
);
131 set_bit(*handle
, tbl
->bitmap
);
135 qemu_mutex_unlock(&tbl
->lock
);
137 memset(tbl
->tbl
+ *handle
* tbl
->res_sz
, 0, tbl
->res_sz
);
139 trace_rdma_res_tbl_alloc(tbl
->name
, *handle
);
141 return tbl
->tbl
+ *handle
* tbl
->res_sz
;
144 static inline void rdma_res_tbl_dealloc(RdmaRmResTbl
*tbl
, uint32_t handle
)
146 trace_rdma_res_tbl_dealloc(tbl
->name
, handle
);
148 qemu_mutex_lock(&tbl
->lock
);
150 if (handle
< tbl
->tbl_sz
) {
151 clear_bit(handle
, tbl
->bitmap
);
155 qemu_mutex_unlock(&tbl
->lock
);
/*
 * Allocate a protection domain (PD).
 *
 * Takes a free entry from pd_tbl (its handle is written to *pd_handle by the
 * table allocator), creates the backend PD for it and records ctx_handle in
 * the entry.  The out_tbl_dealloc path returns the entry to pd_tbl.
 *
 * NOTE(review): local declarations, the failure checks and the returned
 * error codes are on lines missing from this extract.
 */
158 int rdma_rm_alloc_pd(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
159 uint32_t *pd_handle
, uint32_t ctx_handle
)
164 pd
= rdma_res_tbl_alloc(&dev_res
->pd_tbl
, pd_handle
);
169 ret
= rdma_backend_create_pd(backend_dev
, &pd
->backend_pd
);
172 goto out_tbl_dealloc
;
175 pd
->ctx_handle
= ctx_handle
;
180 rdma_res_tbl_dealloc(&dev_res
->pd_tbl
, *pd_handle
);
186 RdmaRmPD
*rdma_rm_get_pd(RdmaDeviceResources
*dev_res
, uint32_t pd_handle
)
188 return rdma_res_tbl_get(&dev_res
->pd_tbl
, pd_handle
);
191 void rdma_rm_dealloc_pd(RdmaDeviceResources
*dev_res
, uint32_t pd_handle
)
193 RdmaRmPD
*pd
= rdma_rm_get_pd(dev_res
, pd_handle
);
196 rdma_backend_destroy_pd(&pd
->backend_pd
);
197 rdma_res_tbl_dealloc(&dev_res
->pd_tbl
, pd_handle
);
/*
 * Allocate a memory region (MR) that belongs to PD pd_handle.
 *
 * Looks up the PD, takes a free entry from mr_tbl (handle written to
 * *mr_handle), records host_virt / guest_start / guest_length in the entry,
 * advances mr->virt by the offset of guest_start inside its target page and
 * registers the region with the backend using access_flags.  The owning
 * pd_handle is stored in the entry.  A cleanup step returns the entry to
 * mr_tbl.
 *
 * NOTE(review): the failure checks, the returned error codes and the
 * assignments to *lkey / *rkey are on lines missing from this extract.
 */
201 int rdma_rm_alloc_mr(RdmaDeviceResources
*dev_res
, uint32_t pd_handle
,
202 uint64_t guest_start
, uint64_t guest_length
,
203 void *host_virt
, int access_flags
, uint32_t *mr_handle
,
204 uint32_t *lkey
, uint32_t *rkey
)
210 pd
= rdma_rm_get_pd(dev_res
, pd_handle
);
215 mr
= rdma_res_tbl_alloc(&dev_res
->mr_tbl
, mr_handle
);
219 trace_rdma_rm_alloc_mr(*mr_handle
, host_virt
, guest_start
, guest_length
,
223 mr
->virt
= host_virt
;
224 mr
->start
= guest_start
;
225 mr
->length
= guest_length
;
226 mr
->virt
+= (mr
->start
& (TARGET_PAGE_SIZE
- 1));
228 ret
= rdma_backend_create_mr(&mr
->backend_mr
, &pd
->backend_pd
, mr
->virt
,
229 mr
->length
, access_flags
);
236 /* We keep mr_handle in lkey so send and recv can get the mr ptr */
240 mr
->pd_handle
= pd_handle
;
245 rdma_res_tbl_dealloc(&dev_res
->mr_tbl
, *mr_handle
);
250 RdmaRmMR
*rdma_rm_get_mr(RdmaDeviceResources
*dev_res
, uint32_t mr_handle
)
252 return rdma_res_tbl_get(&dev_res
->mr_tbl
, mr_handle
);
/*
 * Release the MR behind mr_handle.
 *
 * Destroys the backend MR, removes from mr->virt the in-page offset of
 * mr->start (the same "start & (TARGET_PAGE_SIZE - 1)" expression that
 * rdma_rm_alloc_mr() adds), munmap()s mr->length bytes at the resulting
 * address and returns the entry to mr_tbl.
 *
 * NOTE(review): any guards around these steps (e.g. for a failed lookup)
 * are on lines missing from this extract.
 */
255 void rdma_rm_dealloc_mr(RdmaDeviceResources
*dev_res
, uint32_t mr_handle
)
257 RdmaRmMR
*mr
= rdma_rm_get_mr(dev_res
, mr_handle
);
260 rdma_backend_destroy_mr(&mr
->backend_mr
);
261 trace_rdma_rm_dealloc_mr(mr_handle
, mr
->start
);
263 mr
->virt
-= (mr
->start
& (TARGET_PAGE_SIZE
- 1));
264 munmap(mr
->virt
, mr
->length
);
266 rdma_res_tbl_dealloc(&dev_res
->mr_tbl
, mr_handle
);
/*
 * Allocate a UC entry from uc_tbl; the new handle is written through
 * uc_handle by the table allocator.
 *
 * NOTE(review): the end of the parameter list, the end of the TODO comment
 * (which appears to wrap a disabled pfn range check) and the return
 * statements are on lines missing from this extract.
 */
270 int rdma_rm_alloc_uc(RdmaDeviceResources
*dev_res
, uint32_t pfn
,
275 /* TODO: Need to make sure pfn is between bar start address and
276 * bsd+RDMA_BAR2_UAR_SIZE
277 if (pfn > RDMA_BAR2_UAR_SIZE) {
278 rdma_error_report("pfn out of range (%d > %d)", pfn,
284 uc
= rdma_res_tbl_alloc(&dev_res
->uc_tbl
, uc_handle
);
292 RdmaRmUC
*rdma_rm_get_uc(RdmaDeviceResources
*dev_res
, uint32_t uc_handle
)
294 return rdma_res_tbl_get(&dev_res
->uc_tbl
, uc_handle
);
297 void rdma_rm_dealloc_uc(RdmaDeviceResources
*dev_res
, uint32_t uc_handle
)
299 RdmaRmUC
*uc
= rdma_rm_get_uc(dev_res
, uc_handle
);
302 rdma_res_tbl_dealloc(&dev_res
->uc_tbl
, uc_handle
);
306 RdmaRmCQ
*rdma_rm_get_cq(RdmaDeviceResources
*dev_res
, uint32_t cq_handle
)
308 return rdma_res_tbl_get(&dev_res
->cq_tbl
, cq_handle
);
/*
 * Allocate a completion queue (CQ).
 *
 * Takes a free entry from cq_tbl (handle written to *cq_handle), sets its
 * notify state to CNT_CLEAR and creates the backend CQ with cqe entries.
 * A cleanup step releases the CQ through rdma_rm_dealloc_cq().
 *
 * NOTE(review): the failure checks, the returned error codes and any use of
 * opaque are on lines missing from this extract.  rdma_rm_dealloc_cq() also
 * calls rdma_backend_destroy_cq(); confirm that is safe when it is reached
 * after a failed rdma_backend_create_cq().
 */
311 int rdma_rm_alloc_cq(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
312 uint32_t cqe
, uint32_t *cq_handle
, void *opaque
)
317 cq
= rdma_res_tbl_alloc(&dev_res
->cq_tbl
, cq_handle
);
323 cq
->notify
= CNT_CLEAR
;
325 rc
= rdma_backend_create_cq(backend_dev
, &cq
->backend_cq
, cqe
);
334 rdma_rm_dealloc_cq(dev_res
, *cq_handle
);
/*
 * Update the notification state of the CQ behind cq_handle.
 *
 * Unless the CQ is already in state CNT_SET, its notify field becomes
 * CNT_ARM when notify is true and CNT_CLEAR otherwise.  A CQ in state
 * CNT_SET is left untouched.
 *
 * NOTE(review): the end of the parameter list (the notify parameter) and
 * the handling of a failed lookup are on lines missing from this extract.
 */
339 void rdma_rm_req_notify_cq(RdmaDeviceResources
*dev_res
, uint32_t cq_handle
,
344 cq
= rdma_rm_get_cq(dev_res
, cq_handle
);
349 if (cq
->notify
!= CNT_SET
) {
350 cq
->notify
= notify
? CNT_ARM
: CNT_CLEAR
;
354 void rdma_rm_dealloc_cq(RdmaDeviceResources
*dev_res
, uint32_t cq_handle
)
358 cq
= rdma_rm_get_cq(dev_res
, cq_handle
);
363 rdma_backend_destroy_cq(&cq
->backend_cq
);
365 rdma_res_tbl_dealloc(&dev_res
->cq_tbl
, cq_handle
);
368 RdmaRmQP
*rdma_rm_get_qp(RdmaDeviceResources
*dev_res
, uint32_t qpn
)
370 GBytes
*key
= g_bytes_new(&qpn
, sizeof(qpn
));
372 RdmaRmQP
*qp
= g_hash_table_lookup(dev_res
->qp_hash
, key
);
377 rdma_error_report("Invalid QP handle %d", qpn
);
/*
 * Allocate a queue pair (QP).
 *
 * Looks up the PD and the send/receive CQs and reports "Invalid send_cqn or
 * recv_cqn" for bad CQ handles.  For IBV_QPT_GSI both CQs are switched to
 * notify state CNT_SET.  A free entry is then taken from qp_tbl (table
 * handle kept in rm_qpn), initialised with state IBV_QPS_RESET, the QP type
 * and both CQ handles, and the backend QP is created with the requested
 * WR/SGE limits.  The backend QP number is returned through *qpn and the
 * entry is inserted into qp_hash under a GBytes key built from *qpn.
 * A cleanup step returns the entry to qp_tbl using qp->qpn.
 *
 * NOTE(review): the failure checks, the returned error codes, the
 * assignment of qp->qpn and any use of opaque are on lines missing from
 * this extract.
 */
383 int rdma_rm_alloc_qp(RdmaDeviceResources
*dev_res
, uint32_t pd_handle
,
384 uint8_t qp_type
, uint32_t max_send_wr
,
385 uint32_t max_send_sge
, uint32_t send_cq_handle
,
386 uint32_t max_recv_wr
, uint32_t max_recv_sge
,
387 uint32_t recv_cq_handle
, void *opaque
, uint32_t *qpn
)
395 pd
= rdma_rm_get_pd(dev_res
, pd_handle
);
400 scq
= rdma_rm_get_cq(dev_res
, send_cq_handle
);
401 rcq
= rdma_rm_get_cq(dev_res
, recv_cq_handle
);
404 rdma_error_report("Invalid send_cqn or recv_cqn (%d, %d)",
405 send_cq_handle
, recv_cq_handle
);
409 if (qp_type
== IBV_QPT_GSI
) {
410 scq
->notify
= CNT_SET
;
411 rcq
->notify
= CNT_SET
;
414 qp
= rdma_res_tbl_alloc(&dev_res
->qp_tbl
, &rm_qpn
);
420 qp
->qp_state
= IBV_QPS_RESET
;
421 qp
->qp_type
= qp_type
;
422 qp
->send_cq_handle
= send_cq_handle
;
423 qp
->recv_cq_handle
= recv_cq_handle
;
426 rc
= rdma_backend_create_qp(&qp
->backend_qp
, qp_type
, &pd
->backend_pd
,
427 &scq
->backend_cq
, &rcq
->backend_cq
, max_send_wr
,
428 max_recv_wr
, max_send_sge
, max_recv_sge
);
434 *qpn
= rdma_backend_qpn(&qp
->backend_qp
);
435 trace_rdma_rm_alloc_qp(rm_qpn
, *qpn
, qp_type
);
436 g_hash_table_insert(dev_res
->qp_hash
, g_bytes_new(qpn
, sizeof(*qpn
)), qp
);
441 rdma_res_tbl_dealloc(&dev_res
->qp_tbl
, qp
->qpn
);
/*
 * Apply a modify-QP request to the QP behind qp_handle.
 *
 * QP0 (IBV_QPT_SMI) is rejected with an error report; IBV_QPT_GSI takes a
 * separate branch whose body is not visible here.  When attr_mask contains
 * IBV_QP_STATE the new qp_state is stored in the entry and the matching
 * backend transition is performed for IBV_QPS_INIT, IBV_QPS_RTR and
 * IBV_QPS_RTS.  For RTR the guest sgid_idx is first translated to the
 * backend GID index.
 *
 * NOTE(review): sgid_idx is a uint8_t, while
 * rdma_rm_get_backend_gid_index() returns an int that can be the -1
 * "unset" marker stored by init_ports().  Stored in a uint8_t that value
 * becomes 255, so the "<= 0" test below can only ever catch 0 - confirm
 * whether a wider signed local is needed.
 * NOTE(review): several argument lines, the failure checks and the returned
 * error codes are on lines missing from this extract.
 */
446 int rdma_rm_modify_qp(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
447 uint32_t qp_handle
, uint32_t attr_mask
, uint8_t sgid_idx
,
448 union ibv_gid
*dgid
, uint32_t dqpn
,
449 enum ibv_qp_state qp_state
, uint32_t qkey
,
450 uint32_t rq_psn
, uint32_t sq_psn
)
455 qp
= rdma_rm_get_qp(dev_res
, qp_handle
);
460 if (qp
->qp_type
== IBV_QPT_SMI
) {
461 rdma_error_report("Got QP0 request");
463 } else if (qp
->qp_type
== IBV_QPT_GSI
) {
467 trace_rdma_rm_modify_qp(qp_handle
, attr_mask
, qp_state
, sgid_idx
);
469 if (attr_mask
& IBV_QP_STATE
) {
470 qp
->qp_state
= qp_state
;
472 if (qp
->qp_state
== IBV_QPS_INIT
) {
473 ret
= rdma_backend_qp_state_init(backend_dev
, &qp
->backend_qp
,
480 if (qp
->qp_state
== IBV_QPS_RTR
) {
481 /* Get backend gid index */
482 sgid_idx
= rdma_rm_get_backend_gid_index(dev_res
, backend_dev
,
484 if (sgid_idx
<= 0) { /* TODO check also less than bk.max_sgid */
485 rdma_error_report("Failed to get bk sgid_idx for sgid_idx %d",
490 ret
= rdma_backend_qp_state_rtr(backend_dev
, &qp
->backend_qp
,
491 qp
->qp_type
, sgid_idx
, dgid
, dqpn
,
493 attr_mask
& IBV_QP_QKEY
);
499 if (qp
->qp_state
== IBV_QPS_RTS
) {
500 ret
= rdma_backend_qp_state_rts(&qp
->backend_qp
, qp
->qp_type
,
502 attr_mask
& IBV_QP_QKEY
);
/*
 * Query the attributes of the QP behind qp_handle.
 *
 * Looks the QP up and forwards attr, attr_mask and init_attr to
 * rdma_backend_query_qp(), whose result is returned.
 *
 * NOTE(review): the handling of a failed lookup is on lines missing from
 * this extract; backend_dev is not used on the visible lines.
 */
512 int rdma_rm_query_qp(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
513 uint32_t qp_handle
, struct ibv_qp_attr
*attr
,
514 int attr_mask
, struct ibv_qp_init_attr
*init_attr
)
518 qp
= rdma_rm_get_qp(dev_res
, qp_handle
);
523 return rdma_backend_query_qp(&qp
->backend_qp
, attr
, attr_mask
, init_attr
);
526 void rdma_rm_dealloc_qp(RdmaDeviceResources
*dev_res
, uint32_t qp_handle
)
531 key
= g_bytes_new(&qp_handle
, sizeof(qp_handle
));
532 qp
= g_hash_table_lookup(dev_res
->qp_hash
, key
);
533 g_hash_table_remove(dev_res
->qp_hash
, key
);
540 rdma_backend_destroy_qp(&qp
->backend_qp
, dev_res
);
542 rdma_res_tbl_dealloc(&dev_res
->qp_tbl
, qp
->qpn
);
/*
 * Look up the CQE context stored under cqe_ctx_id in cqe_ctx_tbl.
 *
 * NOTE(review): only the table lookup is visible in this extract; how the
 * table entry is turned into the returned pointer, and the handling of a
 * failed lookup, are on lines that are not shown.
 */
545 void *rdma_rm_get_cqe_ctx(RdmaDeviceResources
*dev_res
, uint32_t cqe_ctx_id
)
549 cqe_ctx
= rdma_res_tbl_get(&dev_res
->cqe_ctx_tbl
, cqe_ctx_id
);
/*
 * Allocate a CQE context slot from cqe_ctx_tbl; the slot id is written
 * through cqe_ctx_id by the table allocator.
 *
 * NOTE(review): the end of the parameter list, what is stored in the slot
 * and the return values are on lines missing from this extract.
 */
557 int rdma_rm_alloc_cqe_ctx(RdmaDeviceResources
*dev_res
, uint32_t *cqe_ctx_id
,
562 cqe_ctx
= rdma_res_tbl_alloc(&dev_res
->cqe_ctx_tbl
, cqe_ctx_id
);
572 void rdma_rm_dealloc_cqe_ctx(RdmaDeviceResources
*dev_res
, uint32_t cqe_ctx_id
)
574 rdma_res_tbl_dealloc(&dev_res
->cqe_ctx_tbl
, cqe_ctx_id
);
/*
 * Add a GID to the port's GID table.
 *
 * The GID is first registered with the backend for interface ifname and
 * then copied into port.gid_tbl[gid_idx].
 *
 * NOTE(review): the check of the backend result and the return values are
 * on lines missing from this extract.  No range check of gid_idx against
 * the size of gid_tbl is visible - confirm callers guarantee it.
 */
577 int rdma_rm_add_gid(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
578 const char *ifname
, union ibv_gid
*gid
, int gid_idx
)
582 rc
= rdma_backend_add_gid(backend_dev
, ifname
, gid
);
587 memcpy(&dev_res
->port
.gid_tbl
[gid_idx
].gid
, gid
, sizeof(*gid
));
/*
 * Remove the GID stored at port.gid_tbl[gid_idx].
 *
 * Entries whose global.interface_id is zero are treated specially (the
 * body of that branch is not visible here).  Otherwise the GID is removed
 * from the backend for interface ifname, the table entry is zeroed and its
 * backend_gid_index is reset to -1.
 *
 * NOTE(review): the check of the backend result and the return values are
 * on lines missing from this extract.  No range check of gid_idx against
 * the size of gid_tbl is visible - confirm callers guarantee it.
 */
592 int rdma_rm_del_gid(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
593 const char *ifname
, int gid_idx
)
597 if (!dev_res
->port
.gid_tbl
[gid_idx
].gid
.global
.interface_id
) {
601 rc
= rdma_backend_del_gid(backend_dev
, ifname
,
602 &dev_res
->port
.gid_tbl
[gid_idx
].gid
);
607 memset(dev_res
->port
.gid_tbl
[gid_idx
].gid
.raw
, 0,
608 sizeof(dev_res
->port
.gid_tbl
[gid_idx
].gid
));
609 dev_res
->port
.gid_tbl
[gid_idx
].backend_gid_index
= -1;
/*
 * Translate a guest GID index into the backend's GID index.
 *
 * An sgid_idx outside [0, MAX_PORT_GIDS) is reported as invalid.  The
 * backend index is resolved lazily: while the cached backend_gid_index of
 * the entry is still -1 it is fetched from the backend and stored, and the
 * cached value is what gets returned.
 *
 * NOTE(review): the value returned for an invalid sgid_idx is on a line
 * missing from this extract.
 */
614 int rdma_rm_get_backend_gid_index(RdmaDeviceResources
*dev_res
,
615 RdmaBackendDev
*backend_dev
, int sgid_idx
)
617 if (unlikely(sgid_idx
< 0 || sgid_idx
>= MAX_PORT_GIDS
)) {
618 rdma_error_report("Got invalid sgid_idx %d", sgid_idx
);
622 if (unlikely(dev_res
->port
.gid_tbl
[sgid_idx
].backend_gid_index
== -1)) {
623 dev_res
->port
.gid_tbl
[sgid_idx
].backend_gid_index
=
624 rdma_backend_get_gid_index(backend_dev
,
625 &dev_res
->port
.gid_tbl
[sgid_idx
].gid
);
628 return dev_res
->port
.gid_tbl
[sgid_idx
].backend_gid_index
;
/*
 * Key destructor registered for qp_hash by rdma_rm_init().
 * NOTE(review): the body is on lines missing from this extract; the keys
 * inserted into qp_hash are GBytes, so it presumably drops that reference.
 */
631 static void destroy_qp_hash_key(gpointer data
)
636 static void init_ports(RdmaDeviceResources
*dev_res
)
640 memset(&dev_res
->port
, 0, sizeof(dev_res
->port
));
642 dev_res
->port
.state
= IBV_PORT_DOWN
;
643 for (i
= 0; i
< MAX_PORT_GIDS
; i
++) {
644 dev_res
->port
.gid_tbl
[i
].backend_gid_index
= -1;
648 static void fini_ports(RdmaDeviceResources
*dev_res
,
649 RdmaBackendDev
*backend_dev
, const char *ifname
)
653 dev_res
->port
.state
= IBV_PORT_DOWN
;
654 for (i
= 0; i
< MAX_PORT_GIDS
; i
++) {
655 rdma_rm_del_gid(dev_res
, backend_dev
, ifname
, i
);
/*
 * Initialise the resource manager state in dev_res.
 *
 * Creates qp_hash (GBytes keys hashed/compared with g_bytes_hash and
 * g_bytes_equal, keys destroyed by destroy_qp_hash_key, no value
 * destructor), then sets up the resource tables: PD, CQ, MR and QP are
 * sized from the matching dev_attr limits, CQE_CTX holds
 * max_qp * max_qp_wr pointer-sized slots and UC holds MAX_UCS entries.
 * Finally dev_res->lock is initialised and the statistics are zeroed.
 *
 * NOTE(review): the return values and any port initialisation are on lines
 * missing from this extract.  The max_qp * max_qp_wr product is computed
 * before it reaches res_tbl_init() - confirm it cannot overflow for the
 * limits the device advertises.
 */
659 int rdma_rm_init(RdmaDeviceResources
*dev_res
, struct ibv_device_attr
*dev_attr
)
661 dev_res
->qp_hash
= g_hash_table_new_full(g_bytes_hash
, g_bytes_equal
,
662 destroy_qp_hash_key
, NULL
);
663 if (!dev_res
->qp_hash
) {
667 res_tbl_init("PD", &dev_res
->pd_tbl
, dev_attr
->max_pd
, sizeof(RdmaRmPD
));
668 res_tbl_init("CQ", &dev_res
->cq_tbl
, dev_attr
->max_cq
, sizeof(RdmaRmCQ
));
669 res_tbl_init("MR", &dev_res
->mr_tbl
, dev_attr
->max_mr
, sizeof(RdmaRmMR
));
670 res_tbl_init("QP", &dev_res
->qp_tbl
, dev_attr
->max_qp
, sizeof(RdmaRmQP
));
671 res_tbl_init("CQE_CTX", &dev_res
->cqe_ctx_tbl
, dev_attr
->max_qp
*
672 dev_attr
->max_qp_wr
, sizeof(void *));
673 res_tbl_init("UC", &dev_res
->uc_tbl
, MAX_UCS
, sizeof(RdmaRmUC
));
677 qemu_mutex_init(&dev_res
->lock
);
679 memset(&dev_res
->stats
, 0, sizeof(dev_res
->stats
));
680 atomic_set(&dev_res
->stats
.missing_cqe
, 0);
/*
 * Tear down the resource manager state in dev_res.
 *
 * Destroys dev_res->lock, shuts the port down via fini_ports(), frees the
 * resource tables (UC, CQE_CTX, QP, MR, CQ, PD - in that order) and
 * destroys qp_hash if it was created.
 *
 * NOTE(review): the end of the parameter list (ifname) is on a line missing
 * from this extract.  The lock is destroyed before fini_ports() runs -
 * confirm nothing on that path still takes dev_res->lock.
 */
685 void rdma_rm_fini(RdmaDeviceResources
*dev_res
, RdmaBackendDev
*backend_dev
,
688 qemu_mutex_destroy(&dev_res
->lock
);
690 fini_ports(dev_res
, backend_dev
, ifname
);
692 res_tbl_free(&dev_res
->uc_tbl
);
693 res_tbl_free(&dev_res
->cqe_ctx_tbl
);
694 res_tbl_free(&dev_res
->qp_tbl
);
695 res_tbl_free(&dev_res
->mr_tbl
);
696 res_tbl_free(&dev_res
->cq_tbl
);
697 res_tbl_free(&dev_res
->pd_tbl
);
699 if (dev_res
->qp_hash
) {
700 g_hash_table_destroy(dev_res
->qp_hash
);