/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */
#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "qapi/error.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qnum.h"
#include "qapi/qapi-events-rdma.h"

#include <infiniband/verbs.h>
#include <infiniband/umad_types.h>
#include <infiniband/umad.h>
#include <rdma/rdma_user_cm.h>

#include "contrib/rdmacm-mux/rdmacm-mux.h"
#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
#include "rdma_backend.h"
#define VENDOR_ERR_FAIL_BACKEND     0x201
#define VENDOR_ERR_TOO_MANY_SGES    0x202
#define VENDOR_ERR_NOMEM            0x203
#define VENDOR_ERR_QP0              0x204
#define VENDOR_ERR_INV_NUM_SGE      0x205
#define VENDOR_ERR_MAD_SEND         0x206
#define VENDOR_ERR_INVLKEY          0x207
#define VENDOR_ERR_MR_SMALL         0x208
#define VENDOR_ERR_INV_MAD_BUFF     0x209

#define THR_NAME_LEN 16
#define THR_POLL_TO 5000

#define MAD_HDR_SIZE sizeof(struct ibv_grh)
typedef struct BackendCtx {
    void *up_ctx;
    bool is_tx_req;
    struct ibv_sge sge; /* Used to save MAD recv buffer */
} BackendCtx;

struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};
static void (*comp_handler)(void *ctx, struct ibv_wc *wc);
static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
    pr_err("No completion handler is registered\n");
}
static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
                                 void *ctx)
{
    struct ibv_wc wc = {0};

    wc.status = status;
    wc.vendor_err = vendor_err;

    comp_handler(ctx, &wc);
}
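/*
 * Drain all available completions from a host CQ and report each one to the
 * registered completion handler. The wr_id of every work request is the id
 * of a BackendCtx saved in the device resources, so the guest context can be
 * recovered here and released once the CQE has been delivered.
 */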
static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne;
    BackendCtx *bctx;
    struct ibv_wc wc[2];

    pr_dbg("Entering poll_cq loop on cq %p\n", ibcq);
    do {
        ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

        pr_dbg("Got %d completion(s) from cq %p\n", ne, ibcq);

        for (i = 0; i < ne; i++) {
            pr_dbg("wr_id=0x%" PRIx64 "\n", wc[i].wr_id);
            pr_dbg("status=%d\n", wc[i].status);

            bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            if (unlikely(!bctx)) {
                pr_dbg("Error: Failed to find ctx for req %" PRId64 "\n",
                       wc[i].wr_id);
                continue;
            }
            pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");

            comp_handler(bctx->up_ctx, &wc[i]);

            rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            g_free(bctx);
        }
    } while (ne > 0);

    if (ne < 0) {
        pr_dbg("Got error %d from ibv_poll_cq\n", ne);
    }
}
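/*
 * Completion-channel polling thread. It waits (with a timeout, so the thread
 * can be stopped) for events on the ibv completion channel, re-arms the CQ
 * notification and then polls the CQ that generated the event.
 */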
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        pr_dbg("Fail to change to non-blocking mode\n");
        return NULL;
    }

    pr_dbg("Starting\n");

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            pr_dbg("ibv_get_cq_event=%d\n", rc);
            if (unlikely(rc)) {
                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
            }

            poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    pr_dbg("Going down\n");

    /* TODO: Post cqe for all remaining buffs that were posted */

    backend_dev->comp_thread.is_running = false;

    return NULL;
}
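/*
 * Flow control toward the rdmacm-mux chardev: while a synchronous request is
 * in flight, can_receive is forced to 0 so that no asynchronous MAD message
 * is read in the middle of the request/response exchange.
 */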
static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}

static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}

static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return atomic_read(&backend_dev->rdmacm_mux.can_receive);
}
static int check_mux_op_status(CharBackend *mad_chr_be)
{
    RdmaCmMuxMsg msg = {0};
    int ret;

    pr_dbg("Reading response\n");
    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    if (ret != sizeof(msg)) {
        pr_dbg("Invalid message size %d, expecting %ld\n", ret, sizeof(msg));
        return -EIO;
    }

    pr_dbg("msg_type=%d\n", msg.hdr.msg_type);
    pr_dbg("op_code=%d\n", msg.hdr.op_code);
    pr_dbg("err_code=%d\n", msg.hdr.err_code);

    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
        pr_dbg("Invalid message type %d\n", msg.hdr.msg_type);
        return -EIO;
    }

    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
        pr_dbg("Operation failed in mux, error code %d\n", msg.hdr.err_code);
        return -EIO;
    }

    return 0;
}
static int exec_rdmacm_mux_req(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
{
    int rc;

    pr_dbg("Executing request %d\n", msg->hdr.op_code);

    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    disable_rdmacm_mux_async(backend_dev);
    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
                           (const uint8_t *)msg, sizeof(*msg));
    if (rc != sizeof(*msg)) {
        enable_rdmacm_mux_async(backend_dev);
        pr_dbg("Fail to send request to rdmacm_mux (rc=%d)\n", rc);
        return -EIO;
    }

    rc = check_mux_op_status(backend_dev->rdmacm_mux.chr_be);
    if (rc) {
        pr_dbg("Fail to execute rdmacm_mux request %d (rc=%d)\n",
               msg->hdr.op_code, rc);
    }

    enable_rdmacm_mux_async(backend_dev);

    return 0;
}
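/*
 * Thread lifetime helpers: 'run' asks the completion thread to exit while
 * 'is_running' reflects its actual state, so stop_backend_thread() waits
 * until the poll loop has really terminated before returning.
 */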
static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;
    while (thread->is_running) {
        pr_dbg("Waiting for thread to complete\n");
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}
static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    char thread_name[THR_NAME_LEN] = {0};

    stop_backend_thread(&backend_dev->comp_thread);

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}
void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                         struct ibv_wc *wc))
{
    comp_handler = handler;
}

void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}
int rdma_backend_query_port(RdmaBackendDev *backend_dev,
                            struct ibv_port_attr *port_attr)
{
    int rc;

    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    if (rc) {
        pr_dbg("Error %d from ibv_query_port\n", rc);
        return -EIO;
    }

    return 0;
}
void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
{
    poll_cq(rdma_dev_res, cq->ibcq);
}
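/*
 * Address-handle cache: UD sends need an ibv_ah per destination GID.
 * Created handles are cached in a GHashTable keyed by the destination GID
 * and are destroyed together with the table in rdma_backend_fini().
 */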
static GHashTable *ah_hash;

static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
                                uint8_t sgid_idx, union ibv_gid *dgid)
{
    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);

    if (ah) {
        trace_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
                                  be64_to_cpu(dgid->global.interface_id));
        g_bytes_unref(ah_key);
    } else {
        struct ibv_ah_attr ah_attr = {
            .is_global     = 1,
            .port_num      = backend_dev->port_num,
            .grh.hop_limit = 1,
        };

        ah_attr.grh.dgid = *dgid;
        ah_attr.grh.sgid_index = sgid_idx;

        ah = ibv_create_ah(pd, &ah_attr);
        if (ah) {
            g_hash_table_insert(ah_hash, ah_key, ah);
        } else {
            g_bytes_unref(ah_key);
            pr_dbg("Fail to create AH for gid <0x%" PRIx64 ", 0x%" PRIx64 ">\n",
                   be64_to_cpu(dgid->global.subnet_prefix),
                   be64_to_cpu(dgid->global.interface_id));
        }

        trace_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
                                   be64_to_cpu(dgid->global.interface_id));
    }

    return ah;
}

static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}

static void destroy_ah_hast_data(gpointer data)
{
    struct ibv_ah *ah = data;

    ibv_destroy_ah(ah);
}

static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hast_data);
}
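/*
 * Translate a guest SGE list into a host SGE list: the guest lkey selects the
 * registered MR, the guest address is rebased onto the host mapping of that
 * MR and the host MR lkey is used instead of the guest one.
 */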
static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                struct ibv_sge *dsge, struct ibv_sge *ssge,
                                uint8_t num_sge)
{
    RdmaRmMR *mr;
    int ssge_idx;

    pr_dbg("num_sge=%d\n", num_sge);

    for (ssge_idx = 0; ssge_idx < num_sge; ssge_idx++) {
        mr = rdma_rm_get_mr(rdma_dev_res, ssge[ssge_idx].lkey);
        if (unlikely(!mr)) {
            pr_dbg("Invalid lkey 0x%x\n", ssge[ssge_idx].lkey);
            return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey;
        }

        dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start;
        dsge->length = ssge[ssge_idx].length;
        dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr);

        pr_dbg("ssge->addr=0x%" PRIx64 "\n", ssge[ssge_idx].addr);
        pr_dbg("dsge->addr=0x%" PRIx64 "\n", dsge->addr);
        pr_dbg("dsge->length=%d\n", dsge->length);
        pr_dbg("dsge->lkey=0x%x\n", dsge->lkey);

        dsge++;
    }

    return 0;
}
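/*
 * QP1 (GSI) sends are not posted to the host HCA; the MAD (header and data
 * SGEs) is copied into an RdmaCmMuxMsg and forwarded to the external
 * rdmacm-mux process over the chardev.
 */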
static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
{
    RdmaCmMuxMsg msg = {0};
    char *hdr, *data;
    int ret;

    pr_dbg("num_sge=%d\n", num_sge);

    if (num_sge != 2) {
        return -EINVAL;
    }

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));

    msg.umad_len = sge[0].length + sge[1].length;
    pr_dbg("umad_len=%d\n", msg.umad_len);

    if (msg.umad_len > sizeof(msg.umad.mad)) {
        return -ENOMEM;
    }

    msg.umad.hdr.addr.qpn = htobe32(1);
    msg.umad.hdr.addr.grh_present = 1;
    pr_dbg("sgid_idx=%d\n", sgid_idx);
    pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
    msg.umad.hdr.addr.gid_index = sgid_idx;
    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    msg.umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        pr_dbg("Fail to map to sge[0]\n");
        return -ENOMEM;
    }
    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!data) {
        pr_dbg("Fail to map to sge[1]\n");
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    pr_dbg_buf("mad_hdr", hdr, sge[0].length);
    pr_dbg_buf("mad_data", data, sge[1].length);

    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);

    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to send MAD to rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    return 0;
}
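/*
 * Post a send WQE. QP0 is rejected, QP1 is redirected to mad_send() with a
 * synthetic completion; for real QPs the SGE list is translated to host
 * addresses and the work request is posted with ibv_post_send(), using the
 * allocated cqe_ctx id as wr_id.
 */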
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
                            uint8_t sgid_idx, union ibv_gid *sgid,
                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
                            void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_send_wr wr = {0}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            pr_dbg("QP0 unsupported\n");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
            } else {
                complete_work(IBV_WC_SUCCESS, 0, ctx);
            }
        }
        return;
    }

    pr_dbg("num_sge=%d\n", num_sge);
    if (!num_sge || num_sge > MAX_SGE) {
        pr_dbg("invalid num_sge=%d\n", num_sge);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_NUM_SGE, ctx);
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->is_tx_req = 1;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        pr_dbg("Failed to allocate cqe_ctx\n");
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto out_free_bctx;
    }

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge);
    if (rc) {
        pr_dbg("Error: Failed to build host SGE array\n");
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto out_dealloc_cqe_ctx;
    }

    if (qp_type == IBV_QPT_UD) {
        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
        if (!wr.wr.ud.ah) {
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
            goto out_dealloc_cqe_ctx;
        }
        wr.wr.ud.remote_qpn = dqpn;
        wr.wr.ud.remote_qkey = dqkey;
    }

    wr.num_sge = num_sge;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    pr_dbg("ibv_post_send=%d\n", rc);
    if (rc) {
        pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
               qp->ibqp->qp_num);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto out_dealloc_cqe_ctx;
    }

    return;

out_dealloc_cqe_ctx:
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

out_free_bctx:
    g_free(bctx);
}
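/*
 * QP1 receive buffers are not posted to the hardware. The single SGE is
 * saved in a BackendCtx and its id is queued on recv_mads_list until an
 * incoming MAD from the rdmacm-mux consumes it.
 */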
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    BackendCtx *bctx;
    int rc;
    uint32_t bctx_id;

    if (num_sge != 1) {
        pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
        pr_dbg("Too small buffer for MAD\n");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    pr_dbg("addr=0x%" PRIx64 "\n", sge[0].addr);
    pr_dbg("length=%d\n", sge[0].length);
    pr_dbg("lkey=%d\n", sge[0].lkey);

    bctx = g_malloc0(sizeof(*bctx));

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        g_free(bctx);
        pr_dbg("Fail to allocate cqe_ctx\n");
        return VENDOR_ERR_NOMEM;
    }

    pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx);

    bctx->up_ctx = ctx;
    bctx->sge = *sge;

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    qlist_append_int(backend_dev->recv_mads_list.list, bctx_id);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);

    return 0;
}
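/*
 * Post a receive WQE; mirrors rdma_backend_post_send() for the receive side,
 * with QP1 buffers diverted to save_mad_recv_buffer().
 */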
void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaDeviceResources *rdma_dev_res,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {0}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            pr_dbg("QP0 unsupported\n");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        }
        if (qp_type == IBV_QPT_GSI) {
            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
            }
        }
        return;
    }

    pr_dbg("num_sge=%d\n", num_sge);
    if (!num_sge || num_sge > MAX_SGE) {
        pr_dbg("invalid num_sge=%d\n", num_sge);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_NUM_SGE, ctx);
        return;
    }

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;

    rc = rdma_rm_alloc_cqe_ctx(rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        pr_dbg("Failed to allocate cqe_ctx\n");
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto out_free_bctx;
    }

    rc = build_host_sge_array(rdma_dev_res, new_sge, sge, num_sge);
    if (rc) {
        pr_dbg("Error: Failed to build host SGE array\n");
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto out_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    pr_dbg("ibv_post_recv=%d\n", rc);
    if (rc) {
        pr_dbg("Fail (%d, %d) to post recv WQE to qpn %d\n", rc, errno,
               qp->ibqp->qp_num);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto out_dealloc_cqe_ctx;
    }

    return;

out_dealloc_cqe_ctx:
    rdma_rm_dealloc_cqe_ctx(rdma_dev_res, bctx_id);

out_free_bctx:
    g_free(bctx);
}
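/*
 * Thin wrappers around libibverbs object creation and destruction
 * (PD, MR, CQ, QP).
 */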
int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
{
    pd->ibpd = ibv_alloc_pd(backend_dev->context);

    return pd->ibpd ? 0 : -EIO;
}

void rdma_backend_destroy_pd(RdmaBackendPD *pd)
{
    if (pd->ibpd) {
        ibv_dealloc_pd(pd->ibpd);
    }
}
int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, int access)
{
    pr_dbg("addr=0x%p\n", addr);
    pr_dbg("len=%zu\n", length);
    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
    if (mr->ibmr) {
        pr_dbg("lkey=0x%x\n", mr->ibmr->lkey);
        pr_dbg("rkey=0x%x\n", mr->ibmr->rkey);
        mr->ibpd = pd->ibpd;
    }

    return mr->ibmr ? 0 : -EIO;
}

void rdma_backend_destroy_mr(RdmaBackendMR *mr)
{
    if (mr->ibmr) {
        ibv_dereg_mr(mr->ibmr);
    }
}
int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                           int cqe)
{
    int rc;

    pr_dbg("cqe=%d\n", cqe);

    pr_dbg("dev->channel=%p\n", backend_dev->channel);
    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
                             backend_dev->channel, 0);

    if (cq->ibcq) {
        rc = ibv_req_notify_cq(cq->ibcq, 0);
        if (rc) {
            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
        }

        cq->backend_dev = backend_dev;
    }

    return cq->ibcq ? 0 : -EIO;
}

void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
{
    if (cq->ibcq) {
        ibv_destroy_cq(cq->ibcq);
    }
}
int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
                           RdmaBackendCQ *rcq, uint32_t max_send_wr,
                           uint32_t max_recv_wr, uint32_t max_send_sge,
                           uint32_t max_recv_sge)
{
    struct ibv_qp_init_attr attr = {0};

    pr_dbg("qp_type=%d\n", qp_type);

    switch (qp_type) {
    case IBV_QPT_GSI:
        return 0;

    case IBV_QPT_RC:
        /* fall through */
    case IBV_QPT_UD:
        break;

    default:
        pr_dbg("Unsupported QP type %d\n", qp_type);
        return -EIO;
    }

    attr.qp_type = qp_type;
    attr.send_cq = scq->ibcq;
    attr.recv_cq = rcq->ibcq;
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_recv_wr = max_recv_wr;
    attr.cap.max_send_sge = max_send_sge;
    attr.cap.max_recv_sge = max_recv_sge;

    pr_dbg("max_send_wr=%d\n", max_send_wr);
    pr_dbg("max_recv_wr=%d\n", max_recv_wr);
    pr_dbg("max_send_sge=%d\n", max_send_sge);
    pr_dbg("max_recv_sge=%d\n", max_recv_sge);

    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    if (unlikely(!qp->ibqp)) {
        pr_dbg("Error from ibv_create_qp\n");
        return -EIO;
    }

    qp->ibpd = pd->ibpd;

    /* TODO: Query QP to get max_inline_data and save it to be used in send */

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);

    return 0;
}
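/*
 * QP state transitions (RESET->INIT->RTR->RTS) driven by the guest's
 * modify_qp requests; the attribute mask depends on the QP type (RC vs UD).
 */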
int rdma_backend_qp_state_init(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                               uint8_t qp_type, uint32_t qkey)
{
    struct ibv_qp_attr attr = {0};
    int rc, attr_mask;

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);
    pr_dbg("sport_num=%d\n", backend_dev->port_num);

    attr_mask = IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT;
    attr.qp_state = IBV_QPS_INIT;
    attr.pkey_index = 0;
    attr.port_num = backend_dev->port_num;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr_mask |= IBV_QP_ACCESS_FLAGS;
        break;

    case IBV_QPT_UD:
        attr.qkey = qkey;
        attr_mask |= IBV_QP_QKEY;
        break;

    default:
        pr_dbg("Unsupported QP type %d\n", qp_type);
        return -EIO;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}
int rdma_backend_qp_state_rtr(RdmaBackendDev *backend_dev, RdmaBackendQP *qp,
                              uint8_t qp_type, uint8_t sgid_idx,
                              union ibv_gid *dgid, uint32_t dqpn,
                              uint32_t rq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {0};
    union ibv_gid ibv_gid = {
        .global.interface_id = dgid->global.interface_id,
        .global.subnet_prefix = dgid->global.subnet_prefix
    };
    int rc, attr_mask;

    attr.qp_state = IBV_QPS_RTR;
    attr_mask = IBV_QP_STATE;

    qp->sgid_idx = sgid_idx;

    switch (qp_type) {
    case IBV_QPT_RC:
        pr_dbg("dgid=0x%" PRIx64 ",%" PRIx64 "\n",
               be64_to_cpu(ibv_gid.global.subnet_prefix),
               be64_to_cpu(ibv_gid.global.interface_id));
        pr_dbg("dqpn=0x%x\n", dqpn);
        pr_dbg("sgid_idx=%d\n", qp->sgid_idx);
        pr_dbg("sport_num=%d\n", backend_dev->port_num);
        pr_dbg("rq_psn=0x%x\n", rq_psn);

        attr.path_mtu = IBV_MTU_1024;
        attr.dest_qp_num = dqpn;
        attr.max_dest_rd_atomic = 1;
        attr.min_rnr_timer = 12;
        attr.ah_attr.port_num = backend_dev->port_num;
        attr.ah_attr.is_global = 1;
        attr.ah_attr.grh.hop_limit = 1;
        attr.ah_attr.grh.dgid = ibv_gid;
        attr.ah_attr.grh.sgid_index = qp->sgid_idx;
        attr.rq_psn = rq_psn;

        attr_mask |= IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN |
                     IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC |
                     IBV_QP_MIN_RNR_TIMER;
        break;

    case IBV_QPT_UD:
        pr_dbg("qkey=0x%x\n", qkey);
        if (use_qkey) {
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}
int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {0};
    int rc, attr_mask;

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);
    pr_dbg("sq_psn=0x%x\n", sq_psn);

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout = 14;
        attr.retry_cnt = 7;
        attr.rnr_retry = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            pr_dbg("qkey=0x%x\n", qkey);
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}
int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) { /* QP0 and QP1 */
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
}
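/*
 * Clamp every capability requested by the device model to what the host HCA
 * actually supports, warning when a requested value has to be reduced.
 */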
#define CHK_ATTR(req, dev, member, fmt) ({ \
    pr_dbg("%s="fmt","fmt"\n", #member, dev.member, req->member); \
    if (req->member > dev.member) { \
        warn_report("%s = "fmt" is higher than host device capability "fmt, \
                    #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
    pr_dbg("%s="fmt"\n", #member, req->member); })

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    if (ibv_query_device(backend_dev->context, &backend_dev->dev_attr)) {
        return -EIO;
    }

    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_wr, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_cqe, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, backend_dev->dev_attr, max_ah, "%d");

    return 0;
}
static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);
    grh->sgid = *sgid;
    grh->dgid = *my_gid;

    pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
    pr_dbg("dgid=0x%llx\n", my_gid->global.interface_id);
    pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
}
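/*
 * Handle a MAD delivered by the rdmacm-mux: take one saved QP1 receive
 * buffer off recv_mads_list, copy GRH plus MAD payload into the guest buffer
 * and complete the corresponding receive work request.
 */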
static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
                                     RdmaCmMuxMsg *msg)
{
    QObject *o_ctx_id;
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;

    pr_dbg("umad_len=%d\n", msg->umad_len);

    struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad;
    pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
           hdr->base_version, hdr->mgmt_class, hdr->class_version,
           hdr->method, hdr->status, be64toh(hdr->tid),
           hdr->attr_id, hdr->attr_mod);

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
    if (!o_ctx_id) {
        pr_dbg("No more free MADs buffers, waiting for a while\n");
        sleep(THR_POLL_TO);
        return;
    }

    cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
        return;
    }

    pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                      bctx->up_ctx);
    } else {
        struct ibv_wc wc = {0};
        pr_dbg_buf("mad", msg->umad.mad, msg->umad_len);
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
                      msg->umad_len);
        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        wc.byte_len = msg->umad_len;
        wc.status = IBV_WC_SUCCESS;
        wc.wc_flags = IBV_WC_GRH;
        comp_handler(bctx->up_ctx, &wc);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}
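/* Chardev callbacks used to receive asynchronous messages from the rdmacm-mux. */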
static inline int rdmacm_mux_can_receive(void *opaque)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;

    return rdmacm_mux_can_process_async(backend_dev);
}

static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;

    pr_dbg("Got %d bytes\n", size);
    pr_dbg("msg_type=%d\n", msg->hdr.msg_type);
    pr_dbg("op_code=%d\n", msg->hdr.op_code);

    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ ||
        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
        pr_dbg("Error: Not a MAD request, skipping\n");
        return;
    }

    process_incoming_mad_req(backend_dev, msg);
}
static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
{
    int ret;

    backend_dev->rdmacm_mux.chr_be = mad_chr_be;

    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
    if (!ret) {
        pr_dbg("Missing chardev for MAD multiplexer\n");
        return -EIO;
    }

    qemu_mutex_init(&backend_dev->recv_mads_list.lock);
    backend_dev->recv_mads_list.list = qlist_new();

    enable_rdmacm_mux_async(backend_dev);

    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
                             NULL, backend_dev, NULL, true);

    return 0;
}

static void mad_fini(RdmaBackendDev *backend_dev)
{
    pr_dbg("Stopping MAD\n");
    disable_rdmacm_mux_async(backend_dev);
    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
    qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list));
    qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
}
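/*
 * GID handling: the index of a GID is resolved by scanning the host port's
 * GID table, and GID add/delete requests are forwarded to the rdmacm-mux so
 * it can bind the MAD agent to the corresponding network interface.
 */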
int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
                               union ibv_gid *gid)
{
    union ibv_gid sgid;
    int ret;
    int i = 0;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    do {
        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
                            &sgid);
        i++;
    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));

    pr_dbg("gid_index=%d\n", i - 1);

    return ret ? ret : i - 1;
}
int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {0};
    int ret;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to register GID to rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, true,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}

int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {0};
    int ret;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to unregister GID from rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, false,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return 0;
}
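/*
 * Open the backend IB device (either the one named by the user or the first
 * one found), create its completion channel, negotiate device capabilities
 * and initialize the MAD path. Resources are released in reverse order on
 * the error paths below.
 */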
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be,
                      Error **errp)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        error_setg(errp, "Failed to get IB devices list");
        return -EIO;
    }
    if (num_ibv_devices == 0) {
        error_setg(errp, "No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            error_setg(errp, "Failed to find IB device %s",
                       backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    pr_dbg("Using backend device %s, port %d\n",
           ibv_get_device_name(backend_dev->ib_dev), backend_dev->port_num);
    pr_dbg("uverb device %s\n", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        error_setg(errp, "Failed to open IB device");
        ret = -EIO;
        goto out_free_dev_list;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        error_setg(errp, "Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }
    pr_dbg("dev->backend_dev.channel=%p\n", backend_dev->channel);

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        error_setg(errp, "Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        error_setg(errp, "Fail to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

    return ret;
}

void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    pr_dbg("Starting rdma_backend\n");
    start_comp_thread(backend_dev);
}

void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    pr_dbg("Stopping rdma_backend\n");
    stop_backend_thread(&backend_dev->comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    rdma_backend_stop(backend_dev);
    mad_fini(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
}