/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
16 #include "qemu/osdep.h"
17 #include "qemu/error-report.h"
18 #include "sysemu/sysemu.h"
19 #include "qapi/error.h"
20 #include "qapi/qmp/qlist.h"
21 #include "qapi/qmp/qnum.h"
22 #include "qapi/qapi-events-rdma.h"
24 #include <infiniband/verbs.h>
25 #include <infiniband/umad_types.h>
26 #include <infiniband/umad.h>
27 #include <rdma/rdma_user_cm.h>
29 #include "contrib/rdmacm-mux/rdmacm-mux.h"
31 #include "rdma_utils.h"
33 #include "rdma_backend.h"
36 #define VENDOR_ERR_FAIL_BACKEND 0x201
37 #define VENDOR_ERR_TOO_MANY_SGES 0x202
38 #define VENDOR_ERR_NOMEM 0x203
39 #define VENDOR_ERR_QP0 0x204
40 #define VENDOR_ERR_NO_SGE 0x205
41 #define VENDOR_ERR_MAD_SEND 0x206
42 #define VENDOR_ERR_INVLKEY 0x207
43 #define VENDOR_ERR_MR_SMALL 0x208
44 #define VENDOR_ERR_INV_MAD_BUFF 0x209
45 #define VENDOR_ERR_INV_NUM_SGE 0x210
47 #define THR_NAME_LEN 16
48 #define THR_POLL_TO 5000
50 #define MAD_HDR_SIZE sizeof(struct ibv_grh)
52 typedef struct BackendCtx
{
55 struct ibv_sge sge
; /* Used to save MAD recv buffer */
59 struct ib_user_mad hdr
;
60 char mad
[RDMA_MAX_PRIVATE_DATA
];
63 static void (*comp_handler
)(void *ctx
, struct ibv_wc
*wc
);
65 static void dummy_comp_handler(void *ctx
, struct ibv_wc
*wc
)
67 pr_err("No completion handler is registered\n");
70 static inline void complete_work(enum ibv_wc_status status
, uint32_t vendor_err
,
73 struct ibv_wc wc
= {0};
76 wc
.vendor_err
= vendor_err
;
78 comp_handler(ctx
, &wc
);
81 static void poll_cq(RdmaDeviceResources
*rdma_dev_res
, struct ibv_cq
*ibcq
)
87 pr_dbg("Entering poll_cq loop on cq %p\n", ibcq
);
89 ne
= ibv_poll_cq(ibcq
, ARRAY_SIZE(wc
), wc
);
91 pr_dbg("Got %d completion(s) from cq %p\n", ne
, ibcq
);
93 for (i
= 0; i
< ne
; i
++) {
94 pr_dbg("wr_id=0x%" PRIx64
"\n", wc
[i
].wr_id
);
95 pr_dbg("status=%d\n", wc
[i
].status
);
97 bctx
= rdma_rm_get_cqe_ctx(rdma_dev_res
, wc
[i
].wr_id
);
98 if (unlikely(!bctx
)) {
99 pr_dbg("Error: Failed to find ctx for req %" PRId64
"\n",
103 pr_dbg("Processing %s CQE\n", bctx
->is_tx_req
? "send" : "recv");
105 comp_handler(bctx
->up_ctx
, &wc
[i
]);
107 rdma_rm_dealloc_cqe_ctx(rdma_dev_res
, wc
[i
].wr_id
);
113 pr_dbg("Got error %d from ibv_poll_cq\n", ne
);
117 static void *comp_handler_thread(void *arg
)
119 RdmaBackendDev
*backend_dev
= (RdmaBackendDev
*)arg
;
121 struct ibv_cq
*ev_cq
;
126 /* Change to non-blocking mode */
127 flags
= fcntl(backend_dev
->channel
->fd
, F_GETFL
);
128 rc
= fcntl(backend_dev
->channel
->fd
, F_SETFL
, flags
| O_NONBLOCK
);
130 pr_dbg("Fail to change to non-blocking mode\n");
134 pr_dbg("Starting\n");
136 pfds
[0].fd
= backend_dev
->channel
->fd
;
137 pfds
[0].events
= G_IO_IN
| G_IO_HUP
| G_IO_ERR
;
139 backend_dev
->comp_thread
.is_running
= true;
141 while (backend_dev
->comp_thread
.run
) {
143 rc
= qemu_poll_ns(pfds
, 1, THR_POLL_TO
* (int64_t)SCALE_MS
);
144 } while (!rc
&& backend_dev
->comp_thread
.run
);
146 if (backend_dev
->comp_thread
.run
) {
147 pr_dbg("Waiting for completion on channel %p\n", backend_dev
->channel
);
148 rc
= ibv_get_cq_event(backend_dev
->channel
, &ev_cq
, &ev_ctx
);
149 pr_dbg("ibv_get_cq_event=%d\n", rc
);
151 pr_dbg("---> ibv_get_cq_event (%d)\n", rc
);
155 rc
= ibv_req_notify_cq(ev_cq
, 0);
157 pr_dbg("Error %d from ibv_req_notify_cq\n", rc
);
160 poll_cq(backend_dev
->rdma_dev_res
, ev_cq
);
162 ibv_ack_cq_events(ev_cq
, 1);
166 pr_dbg("Going down\n");
168 /* TODO: Post cqe for all remaining buffs that were posted */
170 backend_dev
->comp_thread
.is_running
= false;
177 static inline void disable_rdmacm_mux_async(RdmaBackendDev
*backend_dev
)
179 atomic_set(&backend_dev
->rdmacm_mux
.can_receive
, 0);
182 static inline void enable_rdmacm_mux_async(RdmaBackendDev
*backend_dev
)
184 atomic_set(&backend_dev
->rdmacm_mux
.can_receive
, sizeof(RdmaCmMuxMsg
));
187 static inline int rdmacm_mux_can_process_async(RdmaBackendDev
*backend_dev
)
189 return atomic_read(&backend_dev
->rdmacm_mux
.can_receive
);
192 static int check_mux_op_status(CharBackend
*mad_chr_be
)
194 RdmaCmMuxMsg msg
= {0};
197 pr_dbg("Reading response\n");
198 ret
= qemu_chr_fe_read_all(mad_chr_be
, (uint8_t *)&msg
, sizeof(msg
));
199 if (ret
!= sizeof(msg
)) {
200 pr_dbg("Invalid message size %d, expecting %ld\n", ret
, sizeof(msg
));
204 pr_dbg("msg_type=%d\n", msg
.hdr
.msg_type
);
205 pr_dbg("op_code=%d\n", msg
.hdr
.op_code
);
206 pr_dbg("err_code=%d\n", msg
.hdr
.err_code
);
208 if (msg
.hdr
.msg_type
!= RDMACM_MUX_MSG_TYPE_RESP
) {
209 pr_dbg("Invalid message type %d\n", msg
.hdr
.msg_type
);
213 if (msg
.hdr
.err_code
!= RDMACM_MUX_ERR_CODE_OK
) {
214 pr_dbg("Operation failed in mux, error code %d\n", msg
.hdr
.err_code
);
221 static int exec_rdmacm_mux_req(RdmaBackendDev
*backend_dev
, RdmaCmMuxMsg
*msg
)
225 pr_dbg("Executing request %d\n", msg
->hdr
.op_code
);
227 msg
->hdr
.msg_type
= RDMACM_MUX_MSG_TYPE_REQ
;
228 disable_rdmacm_mux_async(backend_dev
);
229 rc
= qemu_chr_fe_write(backend_dev
->rdmacm_mux
.chr_be
,
230 (const uint8_t *)msg
, sizeof(*msg
));
231 if (rc
!= sizeof(*msg
)) {
232 enable_rdmacm_mux_async(backend_dev
);
233 pr_dbg("Fail to send request to rdmacm_mux (rc=%d)\n", rc
);
237 rc
= check_mux_op_status(backend_dev
->rdmacm_mux
.chr_be
);
239 pr_dbg("Fail to execute rdmacm_mux request %d (rc=%d)\n",
240 msg
->hdr
.op_code
, rc
);
243 enable_rdmacm_mux_async(backend_dev
);
248 static void stop_backend_thread(RdmaBackendThread
*thread
)
251 while (thread
->is_running
) {
252 pr_dbg("Waiting for thread to complete\n");
253 sleep(THR_POLL_TO
/ SCALE_US
/ 2);
257 static void start_comp_thread(RdmaBackendDev
*backend_dev
)
259 char thread_name
[THR_NAME_LEN
] = {0};
261 stop_backend_thread(&backend_dev
->comp_thread
);
263 snprintf(thread_name
, sizeof(thread_name
), "rdma_comp_%s",
264 ibv_get_device_name(backend_dev
->ib_dev
));
265 backend_dev
->comp_thread
.run
= true;
266 qemu_thread_create(&backend_dev
->comp_thread
.thread
, thread_name
,
267 comp_handler_thread
, backend_dev
, QEMU_THREAD_DETACHED
);
270 void rdma_backend_register_comp_handler(void (*handler
)(void *ctx
,
273 comp_handler
= handler
;
276 void rdma_backend_unregister_comp_handler(void)
278 rdma_backend_register_comp_handler(dummy_comp_handler
);
281 int rdma_backend_query_port(RdmaBackendDev
*backend_dev
,
282 struct ibv_port_attr
*port_attr
)
286 rc
= ibv_query_port(backend_dev
->context
, backend_dev
->port_num
, port_attr
);
288 pr_dbg("Error %d from ibv_query_port\n", rc
);
295 void rdma_backend_poll_cq(RdmaDeviceResources
*rdma_dev_res
, RdmaBackendCQ
*cq
)
297 poll_cq(rdma_dev_res
, cq
->ibcq
);
300 static GHashTable
*ah_hash
;
302 static struct ibv_ah
*create_ah(RdmaBackendDev
*backend_dev
, struct ibv_pd
*pd
,
303 uint8_t sgid_idx
, union ibv_gid
*dgid
)
305 GBytes
*ah_key
= g_bytes_new(dgid
, sizeof(*dgid
));
306 struct ibv_ah
*ah
= g_hash_table_lookup(ah_hash
, ah_key
);
309 trace_create_ah_cache_hit(be64_to_cpu(dgid
->global
.subnet_prefix
),
310 be64_to_cpu(dgid
->global
.interface_id
));
311 g_bytes_unref(ah_key
);
313 struct ibv_ah_attr ah_attr
= {
315 .port_num
= backend_dev
->port_num
,
319 ah_attr
.grh
.dgid
= *dgid
;
320 ah_attr
.grh
.sgid_index
= sgid_idx
;
322 ah
= ibv_create_ah(pd
, &ah_attr
);
324 g_hash_table_insert(ah_hash
, ah_key
, ah
);
326 g_bytes_unref(ah_key
);
327 pr_dbg("Fail to create AH for gid <0x%" PRIx64
", 0x%" PRIx64
">\n",
328 be64_to_cpu(dgid
->global
.subnet_prefix
),
329 be64_to_cpu(dgid
->global
.interface_id
));
332 trace_create_ah_cache_miss(be64_to_cpu(dgid
->global
.subnet_prefix
),
333 be64_to_cpu(dgid
->global
.interface_id
));
339 static void destroy_ah_hash_key(gpointer data
)
344 static void destroy_ah_hast_data(gpointer data
)
346 struct ibv_ah
*ah
= data
;
351 static void ah_cache_init(void)
353 ah_hash
= g_hash_table_new_full(g_bytes_hash
, g_bytes_equal
,
354 destroy_ah_hash_key
, destroy_ah_hast_data
);
357 static int build_host_sge_array(RdmaDeviceResources
*rdma_dev_res
,
358 struct ibv_sge
*dsge
, struct ibv_sge
*ssge
,
364 pr_dbg("num_sge=%d\n", num_sge
);
366 for (ssge_idx
= 0; ssge_idx
< num_sge
; ssge_idx
++) {
367 mr
= rdma_rm_get_mr(rdma_dev_res
, ssge
[ssge_idx
].lkey
);
369 pr_dbg("Invalid lkey 0x%x\n", ssge
[ssge_idx
].lkey
);
370 return VENDOR_ERR_INVLKEY
| ssge
[ssge_idx
].lkey
;
373 dsge
->addr
= (uintptr_t)mr
->virt
+ ssge
[ssge_idx
].addr
- mr
->start
;
374 dsge
->length
= ssge
[ssge_idx
].length
;
375 dsge
->lkey
= rdma_backend_mr_lkey(&mr
->backend_mr
);
377 pr_dbg("ssge->addr=0x%" PRIx64
"\n", ssge
[ssge_idx
].addr
);
378 pr_dbg("dsge->addr=0x%" PRIx64
"\n", dsge
->addr
);
379 pr_dbg("dsge->length=%d\n", dsge
->length
);
380 pr_dbg("dsge->lkey=0x%x\n", dsge
->lkey
);
388 static int mad_send(RdmaBackendDev
*backend_dev
, uint8_t sgid_idx
,
389 union ibv_gid
*sgid
, struct ibv_sge
*sge
, uint32_t num_sge
)
391 RdmaCmMuxMsg msg
= {0};
395 pr_dbg("num_sge=%d\n", num_sge
);
401 msg
.hdr
.op_code
= RDMACM_MUX_OP_CODE_MAD
;
402 memcpy(msg
.hdr
.sgid
.raw
, sgid
->raw
, sizeof(msg
.hdr
.sgid
));
404 msg
.umad_len
= sge
[0].length
+ sge
[1].length
;
405 pr_dbg("umad_len=%d\n", msg
.umad_len
);
407 if (msg
.umad_len
> sizeof(msg
.umad
.mad
)) {
411 msg
.umad
.hdr
.addr
.qpn
= htobe32(1);
412 msg
.umad
.hdr
.addr
.grh_present
= 1;
413 pr_dbg("sgid_idx=%d\n", sgid_idx
);
414 pr_dbg("sgid=0x%llx\n", sgid
->global
.interface_id
);
415 msg
.umad
.hdr
.addr
.gid_index
= sgid_idx
;
416 memcpy(msg
.umad
.hdr
.addr
.gid
, sgid
->raw
, sizeof(msg
.umad
.hdr
.addr
.gid
));
417 msg
.umad
.hdr
.addr
.hop_limit
= 0xFF;
419 hdr
= rdma_pci_dma_map(backend_dev
->dev
, sge
[0].addr
, sge
[0].length
);
421 pr_dbg("Fail to map to sge[0]\n");
424 data
= rdma_pci_dma_map(backend_dev
->dev
, sge
[1].addr
, sge
[1].length
);
426 pr_dbg("Fail to map to sge[1]\n");
427 rdma_pci_dma_unmap(backend_dev
->dev
, hdr
, sge
[0].length
);
431 pr_dbg_buf("mad_hdr", hdr
, sge
[0].length
);
432 pr_dbg_buf("mad_data", data
, sge
[1].length
);
434 memcpy(&msg
.umad
.mad
[0], hdr
, sge
[0].length
);
435 memcpy(&msg
.umad
.mad
[sge
[0].length
], data
, sge
[1].length
);
437 rdma_pci_dma_unmap(backend_dev
->dev
, data
, sge
[1].length
);
438 rdma_pci_dma_unmap(backend_dev
->dev
, hdr
, sge
[0].length
);
440 ret
= exec_rdmacm_mux_req(backend_dev
, &msg
);
442 pr_dbg("Fail to send MAD to rdma_umadmux (%d)\n", ret
);
449 void rdma_backend_post_send(RdmaBackendDev
*backend_dev
,
450 RdmaBackendQP
*qp
, uint8_t qp_type
,
451 struct ibv_sge
*sge
, uint32_t num_sge
,
452 uint8_t sgid_idx
, union ibv_gid
*sgid
,
453 union ibv_gid
*dgid
, uint32_t dqpn
, uint32_t dqkey
,
457 struct ibv_sge new_sge
[MAX_SGE
];
460 struct ibv_send_wr wr
= {0}, *bad_wr
;
462 if (!qp
->ibqp
) { /* This field does not get initialized for QP0 and QP1 */
463 if (qp_type
== IBV_QPT_SMI
) {
464 pr_dbg("QP0 unsupported\n");
465 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_QP0
, ctx
);
466 } else if (qp_type
== IBV_QPT_GSI
) {
468 rc
= mad_send(backend_dev
, sgid_idx
, sgid
, sge
, num_sge
);
470 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_MAD_SEND
, ctx
);
472 complete_work(IBV_WC_SUCCESS
, 0, ctx
);
478 pr_dbg("num_sge=%d\n", num_sge
);
479 if (!num_sge
|| num_sge
> MAX_SGE
) {
480 pr_dbg("invalid num_sge=%d\n", num_sge
);
481 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_INV_NUM_SGE
, ctx
);
485 bctx
= g_malloc0(sizeof(*bctx
));
489 rc
= rdma_rm_alloc_cqe_ctx(backend_dev
->rdma_dev_res
, &bctx_id
, bctx
);
491 pr_dbg("Failed to allocate cqe_ctx\n");
492 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_NOMEM
, ctx
);
496 rc
= build_host_sge_array(backend_dev
->rdma_dev_res
, new_sge
, sge
, num_sge
);
498 pr_dbg("Error: Failed to build host SGE array\n");
499 complete_work(IBV_WC_GENERAL_ERR
, rc
, ctx
);
500 goto out_dealloc_cqe_ctx
;
503 if (qp_type
== IBV_QPT_UD
) {
504 wr
.wr
.ud
.ah
= create_ah(backend_dev
, qp
->ibpd
, sgid_idx
, dgid
);
506 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_FAIL_BACKEND
, ctx
);
507 goto out_dealloc_cqe_ctx
;
509 wr
.wr
.ud
.remote_qpn
= dqpn
;
510 wr
.wr
.ud
.remote_qkey
= dqkey
;
513 wr
.num_sge
= num_sge
;
514 wr
.opcode
= IBV_WR_SEND
;
515 wr
.send_flags
= IBV_SEND_SIGNALED
;
516 wr
.sg_list
= new_sge
;
519 rc
= ibv_post_send(qp
->ibqp
, &wr
, &bad_wr
);
520 pr_dbg("ibv_post_send=%d\n", rc
);
522 pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc
, errno
,
524 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_FAIL_BACKEND
, ctx
);
525 goto out_dealloc_cqe_ctx
;
531 rdma_rm_dealloc_cqe_ctx(backend_dev
->rdma_dev_res
, bctx_id
);
537 static unsigned int save_mad_recv_buffer(RdmaBackendDev
*backend_dev
,
538 struct ibv_sge
*sge
, uint32_t num_sge
,
546 pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge
);
547 return VENDOR_ERR_INV_NUM_SGE
;
550 if (sge
[0].length
< RDMA_MAX_PRIVATE_DATA
+ sizeof(struct ibv_grh
)) {
551 pr_dbg("Too small buffer for MAD\n");
552 return VENDOR_ERR_INV_MAD_BUFF
;
555 pr_dbg("addr=0x%" PRIx64
"\n", sge
[0].addr
);
556 pr_dbg("length=%d\n", sge
[0].length
);
557 pr_dbg("lkey=%d\n", sge
[0].lkey
);
559 bctx
= g_malloc0(sizeof(*bctx
));
561 rc
= rdma_rm_alloc_cqe_ctx(backend_dev
->rdma_dev_res
, &bctx_id
, bctx
);
564 pr_dbg("Fail to allocate cqe_ctx\n");
565 return VENDOR_ERR_NOMEM
;
568 pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id
, bctx
, ctx
);
572 qemu_mutex_lock(&backend_dev
->recv_mads_list
.lock
);
573 qlist_append_int(backend_dev
->recv_mads_list
.list
, bctx_id
);
574 qemu_mutex_unlock(&backend_dev
->recv_mads_list
.lock
);
579 void rdma_backend_post_recv(RdmaBackendDev
*backend_dev
,
580 RdmaDeviceResources
*rdma_dev_res
,
581 RdmaBackendQP
*qp
, uint8_t qp_type
,
582 struct ibv_sge
*sge
, uint32_t num_sge
, void *ctx
)
585 struct ibv_sge new_sge
[MAX_SGE
];
588 struct ibv_recv_wr wr
= {0}, *bad_wr
;
590 if (!qp
->ibqp
) { /* This field does not get initialized for QP0 and QP1 */
591 if (qp_type
== IBV_QPT_SMI
) {
592 pr_dbg("QP0 unsupported\n");
593 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_QP0
, ctx
);
595 if (qp_type
== IBV_QPT_GSI
) {
597 rc
= save_mad_recv_buffer(backend_dev
, sge
, num_sge
, ctx
);
599 complete_work(IBV_WC_GENERAL_ERR
, rc
, ctx
);
605 pr_dbg("num_sge=%d\n", num_sge
);
606 if (!num_sge
|| num_sge
> MAX_SGE
) {
607 pr_dbg("invalid num_sge=%d\n", num_sge
);
608 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_INV_NUM_SGE
, ctx
);
612 bctx
= g_malloc0(sizeof(*bctx
));
616 rc
= rdma_rm_alloc_cqe_ctx(rdma_dev_res
, &bctx_id
, bctx
);
618 pr_dbg("Failed to allocate cqe_ctx\n");
619 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_NOMEM
, ctx
);
623 rc
= build_host_sge_array(rdma_dev_res
, new_sge
, sge
, num_sge
);
625 pr_dbg("Error: Failed to build host SGE array\n");
626 complete_work(IBV_WC_GENERAL_ERR
, rc
, ctx
);
627 goto out_dealloc_cqe_ctx
;
630 wr
.num_sge
= num_sge
;
631 wr
.sg_list
= new_sge
;
633 rc
= ibv_post_recv(qp
->ibqp
, &wr
, &bad_wr
);
634 pr_dbg("ibv_post_recv=%d\n", rc
);
636 pr_dbg("Fail (%d, %d) to post recv WQE to qpn %d\n", rc
, errno
,
638 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_FAIL_BACKEND
, ctx
);
639 goto out_dealloc_cqe_ctx
;
645 rdma_rm_dealloc_cqe_ctx(rdma_dev_res
, bctx_id
);
651 int rdma_backend_create_pd(RdmaBackendDev
*backend_dev
, RdmaBackendPD
*pd
)
653 pd
->ibpd
= ibv_alloc_pd(backend_dev
->context
);
655 return pd
->ibpd
? 0 : -EIO
;
658 void rdma_backend_destroy_pd(RdmaBackendPD
*pd
)
661 ibv_dealloc_pd(pd
->ibpd
);
665 int rdma_backend_create_mr(RdmaBackendMR
*mr
, RdmaBackendPD
*pd
, void *addr
,
666 size_t length
, int access
)
668 pr_dbg("addr=0x%p\n", addr
);
669 pr_dbg("len=%zu\n", length
);
670 mr
->ibmr
= ibv_reg_mr(pd
->ibpd
, addr
, length
, access
);
672 pr_dbg("lkey=0x%x\n", mr
->ibmr
->lkey
);
673 pr_dbg("rkey=0x%x\n", mr
->ibmr
->rkey
);
677 return mr
->ibmr
? 0 : -EIO
;
680 void rdma_backend_destroy_mr(RdmaBackendMR
*mr
)
683 ibv_dereg_mr(mr
->ibmr
);
687 int rdma_backend_create_cq(RdmaBackendDev
*backend_dev
, RdmaBackendCQ
*cq
,
692 pr_dbg("cqe=%d\n", cqe
);
694 pr_dbg("dev->channel=%p\n", backend_dev
->channel
);
695 cq
->ibcq
= ibv_create_cq(backend_dev
->context
, cqe
+ 1, NULL
,
696 backend_dev
->channel
, 0);
699 rc
= ibv_req_notify_cq(cq
->ibcq
, 0);
701 pr_dbg("Error %d from ibv_req_notify_cq\n", rc
);
703 cq
->backend_dev
= backend_dev
;
706 return cq
->ibcq
? 0 : -EIO
;
709 void rdma_backend_destroy_cq(RdmaBackendCQ
*cq
)
712 ibv_destroy_cq(cq
->ibcq
);
716 int rdma_backend_create_qp(RdmaBackendQP
*qp
, uint8_t qp_type
,
717 RdmaBackendPD
*pd
, RdmaBackendCQ
*scq
,
718 RdmaBackendCQ
*rcq
, uint32_t max_send_wr
,
719 uint32_t max_recv_wr
, uint32_t max_send_sge
,
720 uint32_t max_recv_sge
)
722 struct ibv_qp_init_attr attr
= {0};
725 pr_dbg("qp_type=%d\n", qp_type
);
738 pr_dbg("Unsupported QP type %d\n", qp_type
);
742 attr
.qp_type
= qp_type
;
743 attr
.send_cq
= scq
->ibcq
;
744 attr
.recv_cq
= rcq
->ibcq
;
745 attr
.cap
.max_send_wr
= max_send_wr
;
746 attr
.cap
.max_recv_wr
= max_recv_wr
;
747 attr
.cap
.max_send_sge
= max_send_sge
;
748 attr
.cap
.max_recv_sge
= max_recv_sge
;
750 pr_dbg("max_send_wr=%d\n", max_send_wr
);
751 pr_dbg("max_recv_wr=%d\n", max_recv_wr
);
752 pr_dbg("max_send_sge=%d\n", max_send_sge
);
753 pr_dbg("max_recv_sge=%d\n", max_recv_sge
);
755 qp
->ibqp
= ibv_create_qp(pd
->ibpd
, &attr
);
756 if (likely(!qp
->ibqp
)) {
757 pr_dbg("Error from ibv_create_qp\n");
763 /* TODO: Query QP to get max_inline_data and save it to be used in send */
765 pr_dbg("qpn=0x%x\n", qp
->ibqp
->qp_num
);
770 int rdma_backend_qp_state_init(RdmaBackendDev
*backend_dev
, RdmaBackendQP
*qp
,
771 uint8_t qp_type
, uint32_t qkey
)
773 struct ibv_qp_attr attr
= {0};
776 pr_dbg("qpn=0x%x\n", qp
->ibqp
->qp_num
);
777 pr_dbg("sport_num=%d\n", backend_dev
->port_num
);
779 attr_mask
= IBV_QP_STATE
| IBV_QP_PKEY_INDEX
| IBV_QP_PORT
;
780 attr
.qp_state
= IBV_QPS_INIT
;
782 attr
.port_num
= backend_dev
->port_num
;
786 attr_mask
|= IBV_QP_ACCESS_FLAGS
;
791 attr_mask
|= IBV_QP_QKEY
;
795 pr_dbg("Unsupported QP type %d\n", qp_type
);
799 rc
= ibv_modify_qp(qp
->ibqp
, &attr
, attr_mask
);
801 pr_dbg("Error %d from ibv_modify_qp\n", rc
);
808 int rdma_backend_qp_state_rtr(RdmaBackendDev
*backend_dev
, RdmaBackendQP
*qp
,
809 uint8_t qp_type
, uint8_t sgid_idx
,
810 union ibv_gid
*dgid
, uint32_t dqpn
,
811 uint32_t rq_psn
, uint32_t qkey
, bool use_qkey
)
813 struct ibv_qp_attr attr
= {0};
814 union ibv_gid ibv_gid
= {
815 .global
.interface_id
= dgid
->global
.interface_id
,
816 .global
.subnet_prefix
= dgid
->global
.subnet_prefix
820 attr
.qp_state
= IBV_QPS_RTR
;
821 attr_mask
= IBV_QP_STATE
;
823 qp
->sgid_idx
= sgid_idx
;
827 pr_dbg("dgid=0x%" PRIx64
",%" PRIx64
"\n",
828 be64_to_cpu(ibv_gid
.global
.subnet_prefix
),
829 be64_to_cpu(ibv_gid
.global
.interface_id
));
830 pr_dbg("dqpn=0x%x\n", dqpn
);
831 pr_dbg("sgid_idx=%d\n", qp
->sgid_idx
);
832 pr_dbg("sport_num=%d\n", backend_dev
->port_num
);
833 pr_dbg("rq_psn=0x%x\n", rq_psn
);
835 attr
.path_mtu
= IBV_MTU_1024
;
836 attr
.dest_qp_num
= dqpn
;
837 attr
.max_dest_rd_atomic
= 1;
838 attr
.min_rnr_timer
= 12;
839 attr
.ah_attr
.port_num
= backend_dev
->port_num
;
840 attr
.ah_attr
.is_global
= 1;
841 attr
.ah_attr
.grh
.hop_limit
= 1;
842 attr
.ah_attr
.grh
.dgid
= ibv_gid
;
843 attr
.ah_attr
.grh
.sgid_index
= qp
->sgid_idx
;
844 attr
.rq_psn
= rq_psn
;
846 attr_mask
|= IBV_QP_AV
| IBV_QP_PATH_MTU
| IBV_QP_DEST_QPN
|
847 IBV_QP_RQ_PSN
| IBV_QP_MAX_DEST_RD_ATOMIC
|
848 IBV_QP_MIN_RNR_TIMER
;
852 pr_dbg("qkey=0x%x\n", qkey
);
855 attr_mask
|= IBV_QP_QKEY
;
860 rc
= ibv_modify_qp(qp
->ibqp
, &attr
, attr_mask
);
862 pr_dbg("Error %d from ibv_modify_qp\n", rc
);
869 int rdma_backend_qp_state_rts(RdmaBackendQP
*qp
, uint8_t qp_type
,
870 uint32_t sq_psn
, uint32_t qkey
, bool use_qkey
)
872 struct ibv_qp_attr attr
= {0};
875 pr_dbg("qpn=0x%x\n", qp
->ibqp
->qp_num
);
876 pr_dbg("sq_psn=0x%x\n", sq_psn
);
878 attr
.qp_state
= IBV_QPS_RTS
;
879 attr
.sq_psn
= sq_psn
;
880 attr_mask
= IBV_QP_STATE
| IBV_QP_SQ_PSN
;
887 attr
.max_rd_atomic
= 1;
889 attr_mask
|= IBV_QP_TIMEOUT
| IBV_QP_RETRY_CNT
| IBV_QP_RNR_RETRY
|
890 IBV_QP_MAX_QP_RD_ATOMIC
;
895 pr_dbg("qkey=0x%x\n", qkey
);
897 attr_mask
|= IBV_QP_QKEY
;
902 rc
= ibv_modify_qp(qp
->ibqp
, &attr
, attr_mask
);
904 pr_dbg("Error %d from ibv_modify_qp\n", rc
);
911 int rdma_backend_query_qp(RdmaBackendQP
*qp
, struct ibv_qp_attr
*attr
,
912 int attr_mask
, struct ibv_qp_init_attr
*init_attr
)
916 attr
->qp_state
= IBV_QPS_RTS
;
920 return ibv_query_qp(qp
->ibqp
, attr
, attr_mask
, init_attr
);
923 void rdma_backend_destroy_qp(RdmaBackendQP
*qp
)
926 ibv_destroy_qp(qp
->ibqp
);
930 #define CHK_ATTR(req, dev, member, fmt) ({ \
931 pr_dbg("%s="fmt","fmt"\n", #member, dev.member, req->member); \
932 if (req->member > dev.member) { \
933 warn_report("%s = "fmt" is higher than host device capability "fmt, \
934 #member, req->member, dev.member); \
935 req->member = dev.member; \
937 pr_dbg("%s="fmt"\n", #member, req->member); })
939 static int init_device_caps(RdmaBackendDev
*backend_dev
,
940 struct ibv_device_attr
*dev_attr
)
942 if (ibv_query_device(backend_dev
->context
, &backend_dev
->dev_attr
)) {
946 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_mr_size
, "%" PRId64
);
947 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_qp
, "%d");
948 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_sge
, "%d");
949 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_qp_wr
, "%d");
950 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_cq
, "%d");
951 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_cqe
, "%d");
952 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_mr
, "%d");
953 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_pd
, "%d");
954 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_qp_rd_atom
, "%d");
955 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_qp_init_rd_atom
, "%d");
956 CHK_ATTR(dev_attr
, backend_dev
->dev_attr
, max_ah
, "%d");
961 static inline void build_mad_hdr(struct ibv_grh
*grh
, union ibv_gid
*sgid
,
962 union ibv_gid
*my_gid
, int paylen
)
964 grh
->paylen
= htons(paylen
);
968 pr_dbg("paylen=%d (net=0x%x)\n", paylen
, grh
->paylen
);
969 pr_dbg("dgid=0x%llx\n", my_gid
->global
.interface_id
);
970 pr_dbg("sgid=0x%llx\n", sgid
->global
.interface_id
);
973 static void process_incoming_mad_req(RdmaBackendDev
*backend_dev
,
977 unsigned long cqe_ctx_id
;
981 pr_dbg("umad_len=%d\n", msg
->umad_len
);
984 struct umad_hdr
*hdr
= (struct umad_hdr
*)&msg
->umad
.mad
;
985 pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64
" at %x atm %x\n",
986 hdr
->base_version
, hdr
->mgmt_class
, hdr
->class_version
,
987 hdr
->method
, hdr
->status
, be64toh(hdr
->tid
),
988 hdr
->attr_id
, hdr
->attr_mod
);
991 qemu_mutex_lock(&backend_dev
->recv_mads_list
.lock
);
992 o_ctx_id
= qlist_pop(backend_dev
->recv_mads_list
.list
);
993 qemu_mutex_unlock(&backend_dev
->recv_mads_list
.lock
);
995 pr_dbg("No more free MADs buffers, waiting for a while\n");
1000 cqe_ctx_id
= qnum_get_uint(qobject_to(QNum
, o_ctx_id
));
1001 bctx
= rdma_rm_get_cqe_ctx(backend_dev
->rdma_dev_res
, cqe_ctx_id
);
1002 if (unlikely(!bctx
)) {
1003 pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id
);
1007 pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id
, bctx
, bctx
->up_ctx
);
1009 mad
= rdma_pci_dma_map(backend_dev
->dev
, bctx
->sge
.addr
,
1011 if (!mad
|| bctx
->sge
.length
< msg
->umad_len
+ MAD_HDR_SIZE
) {
1012 complete_work(IBV_WC_GENERAL_ERR
, VENDOR_ERR_INV_MAD_BUFF
,
1015 struct ibv_wc wc
= {0};
1016 pr_dbg_buf("mad", msg
->umad
.mad
, msg
->umad_len
);
1017 memset(mad
, 0, bctx
->sge
.length
);
1018 build_mad_hdr((struct ibv_grh
*)mad
,
1019 (union ibv_gid
*)&msg
->umad
.hdr
.addr
.gid
, &msg
->hdr
.sgid
,
1021 memcpy(&mad
[MAD_HDR_SIZE
], msg
->umad
.mad
, msg
->umad_len
);
1022 rdma_pci_dma_unmap(backend_dev
->dev
, mad
, bctx
->sge
.length
);
1024 wc
.byte_len
= msg
->umad_len
;
1025 wc
.status
= IBV_WC_SUCCESS
;
1026 wc
.wc_flags
= IBV_WC_GRH
;
1027 comp_handler(bctx
->up_ctx
, &wc
);
1031 rdma_rm_dealloc_cqe_ctx(backend_dev
->rdma_dev_res
, cqe_ctx_id
);
1034 static inline int rdmacm_mux_can_receive(void *opaque
)
1036 RdmaBackendDev
*backend_dev
= (RdmaBackendDev
*)opaque
;
1038 return rdmacm_mux_can_process_async(backend_dev
);
1041 static void rdmacm_mux_read(void *opaque
, const uint8_t *buf
, int size
)
1043 RdmaBackendDev
*backend_dev
= (RdmaBackendDev
*)opaque
;
1044 RdmaCmMuxMsg
*msg
= (RdmaCmMuxMsg
*)buf
;
1046 pr_dbg("Got %d bytes\n", size
);
1047 pr_dbg("msg_type=%d\n", msg
->hdr
.msg_type
);
1048 pr_dbg("op_code=%d\n", msg
->hdr
.op_code
);
1050 if (msg
->hdr
.msg_type
!= RDMACM_MUX_MSG_TYPE_REQ
&&
1051 msg
->hdr
.op_code
!= RDMACM_MUX_OP_CODE_MAD
) {
1052 pr_dbg("Error: Not a MAD request, skipping\n");
1055 process_incoming_mad_req(backend_dev
, msg
);
1058 static int mad_init(RdmaBackendDev
*backend_dev
, CharBackend
*mad_chr_be
)
1062 backend_dev
->rdmacm_mux
.chr_be
= mad_chr_be
;
1064 ret
= qemu_chr_fe_backend_connected(backend_dev
->rdmacm_mux
.chr_be
);
1066 pr_dbg("Missing chardev for MAD multiplexer\n");
1070 qemu_mutex_init(&backend_dev
->recv_mads_list
.lock
);
1071 backend_dev
->recv_mads_list
.list
= qlist_new();
1073 enable_rdmacm_mux_async(backend_dev
);
1075 qemu_chr_fe_set_handlers(backend_dev
->rdmacm_mux
.chr_be
,
1076 rdmacm_mux_can_receive
, rdmacm_mux_read
, NULL
,
1077 NULL
, backend_dev
, NULL
, true);
1082 static void mad_fini(RdmaBackendDev
*backend_dev
)
1084 pr_dbg("Stopping MAD\n");
1085 disable_rdmacm_mux_async(backend_dev
);
1086 qemu_chr_fe_disconnect(backend_dev
->rdmacm_mux
.chr_be
);
1087 qlist_destroy_obj(QOBJECT(backend_dev
->recv_mads_list
.list
));
1088 qemu_mutex_destroy(&backend_dev
->recv_mads_list
.lock
);
1091 int rdma_backend_get_gid_index(RdmaBackendDev
*backend_dev
,
1098 pr_dbg("0x%llx, 0x%llx\n",
1099 (long long unsigned int)be64_to_cpu(gid
->global
.subnet_prefix
),
1100 (long long unsigned int)be64_to_cpu(gid
->global
.interface_id
));
1103 ret
= ibv_query_gid(backend_dev
->context
, backend_dev
->port_num
, i
,
1106 } while (!ret
&& (memcmp(&sgid
, gid
, sizeof(*gid
))));
1108 pr_dbg("gid_index=%d\n", i
- 1);
1110 return ret
? ret
: i
- 1;
1113 int rdma_backend_add_gid(RdmaBackendDev
*backend_dev
, const char *ifname
,
1116 RdmaCmMuxMsg msg
= {0};
1119 pr_dbg("0x%llx, 0x%llx\n",
1120 (long long unsigned int)be64_to_cpu(gid
->global
.subnet_prefix
),
1121 (long long unsigned int)be64_to_cpu(gid
->global
.interface_id
));
1123 msg
.hdr
.op_code
= RDMACM_MUX_OP_CODE_REG
;
1124 memcpy(msg
.hdr
.sgid
.raw
, gid
->raw
, sizeof(msg
.hdr
.sgid
));
1126 ret
= exec_rdmacm_mux_req(backend_dev
, &msg
);
1128 pr_dbg("Fail to register GID to rdma_umadmux (%d)\n", ret
);
1132 qapi_event_send_rdma_gid_status_changed(ifname
, true,
1133 gid
->global
.subnet_prefix
,
1134 gid
->global
.interface_id
);
1139 int rdma_backend_del_gid(RdmaBackendDev
*backend_dev
, const char *ifname
,
1142 RdmaCmMuxMsg msg
= {0};
1145 pr_dbg("0x%llx, 0x%llx\n",
1146 (long long unsigned int)be64_to_cpu(gid
->global
.subnet_prefix
),
1147 (long long unsigned int)be64_to_cpu(gid
->global
.interface_id
));
1149 msg
.hdr
.op_code
= RDMACM_MUX_OP_CODE_UNREG
;
1150 memcpy(msg
.hdr
.sgid
.raw
, gid
->raw
, sizeof(msg
.hdr
.sgid
));
1152 ret
= exec_rdmacm_mux_req(backend_dev
, &msg
);
1154 pr_dbg("Fail to unregister GID from rdma_umadmux (%d)\n", ret
);
1158 qapi_event_send_rdma_gid_status_changed(ifname
, false,
1159 gid
->global
.subnet_prefix
,
1160 gid
->global
.interface_id
);
1165 int rdma_backend_init(RdmaBackendDev
*backend_dev
, PCIDevice
*pdev
,
1166 RdmaDeviceResources
*rdma_dev_res
,
1167 const char *backend_device_name
, uint8_t port_num
,
1168 struct ibv_device_attr
*dev_attr
, CharBackend
*mad_chr_be
,
1173 int num_ibv_devices
;
1174 struct ibv_device
**dev_list
;
1176 memset(backend_dev
, 0, sizeof(*backend_dev
));
1178 backend_dev
->dev
= pdev
;
1179 backend_dev
->port_num
= port_num
;
1180 backend_dev
->rdma_dev_res
= rdma_dev_res
;
1182 rdma_backend_register_comp_handler(dummy_comp_handler
);
1184 dev_list
= ibv_get_device_list(&num_ibv_devices
);
1186 error_setg(errp
, "Failed to get IB devices list");
1190 if (num_ibv_devices
== 0) {
1191 error_setg(errp
, "No IB devices were found");
1193 goto out_free_dev_list
;
1196 if (backend_device_name
) {
1197 for (i
= 0; dev_list
[i
]; ++i
) {
1198 if (!strcmp(ibv_get_device_name(dev_list
[i
]),
1199 backend_device_name
)) {
1204 backend_dev
->ib_dev
= dev_list
[i
];
1205 if (!backend_dev
->ib_dev
) {
1206 error_setg(errp
, "Failed to find IB device %s",
1207 backend_device_name
);
1209 goto out_free_dev_list
;
1212 backend_dev
->ib_dev
= *dev_list
;
1215 pr_dbg("Using backend device %s, port %d\n",
1216 ibv_get_device_name(backend_dev
->ib_dev
), backend_dev
->port_num
);
1217 pr_dbg("uverb device %s\n", backend_dev
->ib_dev
->dev_name
);
1219 backend_dev
->context
= ibv_open_device(backend_dev
->ib_dev
);
1220 if (!backend_dev
->context
) {
1221 error_setg(errp
, "Failed to open IB device");
1226 backend_dev
->channel
= ibv_create_comp_channel(backend_dev
->context
);
1227 if (!backend_dev
->channel
) {
1228 error_setg(errp
, "Failed to create IB communication channel");
1230 goto out_close_device
;
1232 pr_dbg("dev->backend_dev.channel=%p\n", backend_dev
->channel
);
1234 ret
= init_device_caps(backend_dev
, dev_attr
);
1236 error_setg(errp
, "Failed to initialize device capabilities");
1238 goto out_destroy_comm_channel
;
1242 ret
= mad_init(backend_dev
, mad_chr_be
);
1244 error_setg(errp
, "Fail to initialize mad");
1246 goto out_destroy_comm_channel
;
1249 backend_dev
->comp_thread
.run
= false;
1250 backend_dev
->comp_thread
.is_running
= false;
1254 goto out_free_dev_list
;
1256 out_destroy_comm_channel
:
1257 ibv_destroy_comp_channel(backend_dev
->channel
);
1260 ibv_close_device(backend_dev
->context
);
1263 ibv_free_device_list(dev_list
);
1270 void rdma_backend_start(RdmaBackendDev
*backend_dev
)
1272 pr_dbg("Starting rdma_backend\n");
1273 start_comp_thread(backend_dev
);
1276 void rdma_backend_stop(RdmaBackendDev
*backend_dev
)
1278 pr_dbg("Stopping rdma_backend\n");
1279 stop_backend_thread(&backend_dev
->comp_thread
);
1282 void rdma_backend_fini(RdmaBackendDev
*backend_dev
)
1284 rdma_backend_stop(backend_dev
);
1285 mad_fini(backend_dev
);
1286 g_hash_table_destroy(ah_hash
);
1287 ibv_destroy_comp_channel(backend_dev
->channel
);
1288 ibv_close_device(backend_dev
->context
);