/*
 * QEMU paravirtual RDMA - Generic RDMA backend
 *
 * Copyright (C) 2018 Oracle
 * Copyright (C) 2018 Red Hat Inc
 *
 * Authors:
 *     Yuval Shaia <yuval.shaia@oracle.com>
 *     Marcel Apfelbaum <marcel@redhat.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 *
 */

#include "qemu/osdep.h"
#include "qemu/error-report.h"
#include "sysemu/sysemu.h"
#include "qapi/error.h"
#include "qapi/qmp/qlist.h"
#include "qapi/qmp/qnum.h"
#include "qapi/qapi-events-rdma.h"

#include <infiniband/verbs.h>
#include <infiniband/umad_types.h>
#include <infiniband/umad.h>
#include <rdma/rdma_user_cm.h>

#include "contrib/rdmacm-mux/rdmacm-mux.h"
#include "trace.h"
#include "rdma_utils.h"
#include "rdma_rm.h"
#include "rdma_backend.h"

#define THR_NAME_LEN 16
#define THR_POLL_TO 5000

#define MAD_HDR_SIZE sizeof(struct ibv_grh)

typedef struct BackendCtx {
    void *up_ctx;
    bool is_tx_req;
    struct ibv_sge sge; /* Used to save MAD recv buffer */
} BackendCtx;

struct backend_umad {
    struct ib_user_mad hdr;
    char mad[RDMA_MAX_PRIVATE_DATA];
};

static void (*comp_handler)(void *ctx, struct ibv_wc *wc);

static void dummy_comp_handler(void *ctx, struct ibv_wc *wc)
{
    pr_err("No completion handler is registered\n");
}

static inline void complete_work(enum ibv_wc_status status, uint32_t vendor_err,
                                 void *ctx)
{
    struct ibv_wc wc = {0};

    wc.status = status;
    wc.vendor_err = vendor_err;

    comp_handler(ctx, &wc);
}

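/*
 * Drain all available completions from the host CQ and hand each one to the
 * registered completion handler. The wr_id of every work request is a
 * cqe_ctx id that maps back to the BackendCtx allocated at post time, which
 * carries the guest's completion cookie (up_ctx).
 */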
static void poll_cq(RdmaDeviceResources *rdma_dev_res, struct ibv_cq *ibcq)
{
    int i, ne;
    BackendCtx *bctx;
    struct ibv_wc wc[2];

    pr_dbg("Entering poll_cq loop on cq %p\n", ibcq);
    do {
        ne = ibv_poll_cq(ibcq, ARRAY_SIZE(wc), wc);

        pr_dbg("Got %d completion(s) from cq %p\n", ne, ibcq);

        for (i = 0; i < ne; i++) {
            pr_dbg("wr_id=0x%" PRIx64 "\n", wc[i].wr_id);
            pr_dbg("status=%d\n", wc[i].status);

            bctx = rdma_rm_get_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            if (unlikely(!bctx)) {
                pr_dbg("Error: Failed to find ctx for req %" PRId64 "\n",
                       wc[i].wr_id);
                continue;
            }
            pr_dbg("Processing %s CQE\n", bctx->is_tx_req ? "send" : "recv");

            comp_handler(bctx->up_ctx, &wc[i]);

            rdma_rm_dealloc_cqe_ctx(rdma_dev_res, wc[i].wr_id);
            g_free(bctx);
        }
    } while (ne > 0);

    if (ne < 0) {
        pr_dbg("Got error %d from ibv_poll_cq\n", ne);
    }
}

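/*
 * Per-device completion thread. The completion channel fd is switched to
 * non-blocking mode and polled with a timeout (THR_POLL_TO) so that the
 * thread notices a stop request (comp_thread.run == false) even when the
 * device is idle. Each CQ event is re-armed with ibv_req_notify_cq before
 * the CQ is polled, then acknowledged.
 */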
static void *comp_handler_thread(void *arg)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)arg;
    int rc;
    struct ibv_cq *ev_cq;
    void *ev_ctx;
    int flags;
    GPollFD pfds[1];

    /* Change to non-blocking mode */
    flags = fcntl(backend_dev->channel->fd, F_GETFL);
    rc = fcntl(backend_dev->channel->fd, F_SETFL, flags | O_NONBLOCK);
    if (rc < 0) {
        pr_dbg("Fail to change to non-blocking mode\n");
        return NULL;
    }

    pr_dbg("Starting\n");

    pfds[0].fd = backend_dev->channel->fd;
    pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;

    backend_dev->comp_thread.is_running = true;

    while (backend_dev->comp_thread.run) {
        do {
            rc = qemu_poll_ns(pfds, 1, THR_POLL_TO * (int64_t)SCALE_MS);
        } while (!rc && backend_dev->comp_thread.run);

        if (backend_dev->comp_thread.run) {
            pr_dbg("Waiting for completion on channel %p\n", backend_dev->channel);
            rc = ibv_get_cq_event(backend_dev->channel, &ev_cq, &ev_ctx);
            pr_dbg("ibv_get_cq_event=%d\n", rc);
            if (unlikely(rc)) {
                pr_dbg("---> ibv_get_cq_event (%d)\n", rc);
                continue;
            }

            rc = ibv_req_notify_cq(ev_cq, 0);
            if (unlikely(rc)) {
                pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
            }

            poll_cq(backend_dev->rdma_dev_res, ev_cq);

            ibv_ack_cq_events(ev_cq, 1);
        }
    }

    pr_dbg("Going down\n");

    /* TODO: Post cqe for all remaining buffs that were posted */

    backend_dev->comp_thread.is_running = false;

    qemu_thread_exit(0);

    return NULL;
}

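/*
 * can_receive acts as a gate on the rdmacm-mux chardev: while a request is
 * in flight (see exec_rdmacm_mux_req) it is set to 0 so the response is not
 * consumed by the asynchronous read handler but by check_mux_op_status
 * instead.
 */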
static inline void disable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, 0);
}

static inline void enable_rdmacm_mux_async(RdmaBackendDev *backend_dev)
{
    atomic_set(&backend_dev->rdmacm_mux.can_receive, sizeof(RdmaCmMuxMsg));
}

static inline int rdmacm_mux_can_process_async(RdmaBackendDev *backend_dev)
{
    return atomic_read(&backend_dev->rdmacm_mux.can_receive);
}

static int check_mux_op_status(CharBackend *mad_chr_be)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    pr_dbg("Reading response\n");
    ret = qemu_chr_fe_read_all(mad_chr_be, (uint8_t *)&msg, sizeof(msg));
    if (ret != sizeof(msg)) {
        pr_dbg("Invalid message size %d, expecting %zu\n", ret, sizeof(msg));
        return -EIO;
    }

    pr_dbg("msg_type=%d\n", msg.hdr.msg_type);
    pr_dbg("op_code=%d\n", msg.hdr.op_code);
    pr_dbg("err_code=%d\n", msg.hdr.err_code);

    if (msg.hdr.msg_type != RDMACM_MUX_MSG_TYPE_RESP) {
        pr_dbg("Invalid message type %d\n", msg.hdr.msg_type);
        return -EIO;
    }

    if (msg.hdr.err_code != RDMACM_MUX_ERR_CODE_OK) {
        pr_dbg("Operation failed in mux, error code %d\n", msg.hdr.err_code);
        return -EIO;
    }

    return 0;
}

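/*
 * Send a request to the external rdmacm-mux service and synchronously wait
 * for its status response. Async delivery is disabled for the duration of
 * the exchange so the response cannot be stolen by rdmacm_mux_read.
 */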
static int exec_rdmacm_mux_req(RdmaBackendDev *backend_dev, RdmaCmMuxMsg *msg)
{
    int rc = 0;

    pr_dbg("Executing request %d\n", msg->hdr.op_code);

    msg->hdr.msg_type = RDMACM_MUX_MSG_TYPE_REQ;
    disable_rdmacm_mux_async(backend_dev);
    rc = qemu_chr_fe_write(backend_dev->rdmacm_mux.chr_be,
                           (const uint8_t *)msg, sizeof(*msg));
    if (rc != sizeof(*msg)) {
        enable_rdmacm_mux_async(backend_dev);
        pr_dbg("Fail to send request to rdmacm_mux (rc=%d)\n", rc);
        return -EIO;
    }

    rc = check_mux_op_status(backend_dev->rdmacm_mux.chr_be);
    if (rc) {
        pr_dbg("Fail to execute rdmacm_mux request %d (rc=%d)\n",
               msg->hdr.op_code, rc);
    }

    enable_rdmacm_mux_async(backend_dev);

    return rc;
}

static void stop_backend_thread(RdmaBackendThread *thread)
{
    thread->run = false;
    while (thread->is_running) {
        pr_dbg("Waiting for thread to complete\n");
        sleep(THR_POLL_TO / SCALE_US / 2);
    }
}

static void start_comp_thread(RdmaBackendDev *backend_dev)
{
    char thread_name[THR_NAME_LEN] = {0};

    stop_backend_thread(&backend_dev->comp_thread);

    snprintf(thread_name, sizeof(thread_name), "rdma_comp_%s",
             ibv_get_device_name(backend_dev->ib_dev));
    backend_dev->comp_thread.run = true;
    qemu_thread_create(&backend_dev->comp_thread.thread, thread_name,
                       comp_handler_thread, backend_dev, QEMU_THREAD_DETACHED);
}

void rdma_backend_register_comp_handler(void (*handler)(void *ctx,
                                         struct ibv_wc *wc))
{
    comp_handler = handler;
}

void rdma_backend_unregister_comp_handler(void)
{
    rdma_backend_register_comp_handler(dummy_comp_handler);
}

int rdma_backend_query_port(RdmaBackendDev *backend_dev,
                            struct ibv_port_attr *port_attr)
{
    int rc;

    rc = ibv_query_port(backend_dev->context, backend_dev->port_num, port_attr);
    if (rc) {
        pr_dbg("Error %d from ibv_query_port\n", rc);
        return -EIO;
    }

    return 0;
}

void rdma_backend_poll_cq(RdmaDeviceResources *rdma_dev_res, RdmaBackendCQ *cq)
{
    poll_cq(rdma_dev_res, cq->ibcq);
}

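/*
 * Address handles are cached per destination GID: creating an AH on every
 * UD send would be needlessly expensive, so the handle is looked up in (and
 * inserted into) ah_hash, keyed by the raw dgid bytes. Entries live until
 * rdma_backend_fini destroys the table.
 */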
static GHashTable *ah_hash;

static struct ibv_ah *create_ah(RdmaBackendDev *backend_dev, struct ibv_pd *pd,
                                uint8_t sgid_idx, union ibv_gid *dgid)
{
    GBytes *ah_key = g_bytes_new(dgid, sizeof(*dgid));
    struct ibv_ah *ah = g_hash_table_lookup(ah_hash, ah_key);

    if (ah) {
        trace_create_ah_cache_hit(be64_to_cpu(dgid->global.subnet_prefix),
                                  be64_to_cpu(dgid->global.interface_id));
        g_bytes_unref(ah_key);
    } else {
        struct ibv_ah_attr ah_attr = {
            .is_global     = 1,
            .port_num      = backend_dev->port_num,
            .grh.hop_limit = 1,
        };

        ah_attr.grh.dgid = *dgid;
        ah_attr.grh.sgid_index = sgid_idx;

        ah = ibv_create_ah(pd, &ah_attr);
        if (ah) {
            g_hash_table_insert(ah_hash, ah_key, ah);
        } else {
            g_bytes_unref(ah_key);
            pr_dbg("Fail to create AH for gid <0x%" PRIx64 ", 0x%" PRIx64 ">\n",
                   be64_to_cpu(dgid->global.subnet_prefix),
                   be64_to_cpu(dgid->global.interface_id));
        }

        trace_create_ah_cache_miss(be64_to_cpu(dgid->global.subnet_prefix),
                                   be64_to_cpu(dgid->global.interface_id));
    }

    return ah;
}

static void destroy_ah_hash_key(gpointer data)
{
    g_bytes_unref(data);
}

static void destroy_ah_hash_data(gpointer data)
{
    struct ibv_ah *ah = data;

    ibv_destroy_ah(ah);
}

static void ah_cache_init(void)
{
    ah_hash = g_hash_table_new_full(g_bytes_hash, g_bytes_equal,
                                    destroy_ah_hash_key, destroy_ah_hash_data);
}

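/*
 * Translate a guest SGE list into a host SGE list: for each entry the lkey
 * selects the registered MR, the guest address is rebased onto the MR's
 * host virtual address and the host MR's lkey is substituted.
 */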
static int build_host_sge_array(RdmaDeviceResources *rdma_dev_res,
                                struct ibv_sge *dsge, struct ibv_sge *ssge,
                                uint8_t num_sge)
{
    RdmaRmMR *mr;
    int ssge_idx;

    pr_dbg("num_sge=%d\n", num_sge);

    for (ssge_idx = 0; ssge_idx < num_sge; ssge_idx++) {
        mr = rdma_rm_get_mr(rdma_dev_res, ssge[ssge_idx].lkey);
        if (unlikely(!mr)) {
            pr_dbg("Invalid lkey 0x%x\n", ssge[ssge_idx].lkey);
            return VENDOR_ERR_INVLKEY | ssge[ssge_idx].lkey;
        }

        dsge->addr = (uintptr_t)mr->virt + ssge[ssge_idx].addr - mr->start;
        dsge->length = ssge[ssge_idx].length;
        dsge->lkey = rdma_backend_mr_lkey(&mr->backend_mr);

        pr_dbg("ssge->addr=0x%" PRIx64 "\n", ssge[ssge_idx].addr);
        pr_dbg("dsge->addr=0x%" PRIx64 "\n", dsge->addr);
        pr_dbg("dsge->length=%d\n", dsge->length);
        pr_dbg("dsge->lkey=0x%x\n", dsge->lkey);

        dsge++;
    }

    return 0;
}

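/*
 * QP1 (GSI) traffic is not posted through ibv_post_send; instead the MAD is
 * wrapped in a RdmaCmMuxMsg and forwarded to the external rdmacm-mux
 * service (see contrib/rdmacm-mux), which multiplexes MAD access to the
 * device among consumers. Exactly two SGEs are expected: the MAD header
 * followed by the payload.
 */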
static int mad_send(RdmaBackendDev *backend_dev, uint8_t sgid_idx,
                    union ibv_gid *sgid, struct ibv_sge *sge, uint32_t num_sge)
{
    RdmaCmMuxMsg msg = {};
    char *hdr, *data;
    int ret;

    pr_dbg("num_sge=%d\n", num_sge);

    if (num_sge != 2) {
        return -EINVAL;
    }

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_MAD;
    memcpy(msg.hdr.sgid.raw, sgid->raw, sizeof(msg.hdr.sgid));

    msg.umad_len = sge[0].length + sge[1].length;
    pr_dbg("umad_len=%d\n", msg.umad_len);

    if (msg.umad_len > sizeof(msg.umad.mad)) {
        return -ENOMEM;
    }

    msg.umad.hdr.addr.qpn = htobe32(1);
    msg.umad.hdr.addr.grh_present = 1;
    pr_dbg("sgid_idx=%d\n", sgid_idx);
    pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
    msg.umad.hdr.addr.gid_index = sgid_idx;
    memcpy(msg.umad.hdr.addr.gid, sgid->raw, sizeof(msg.umad.hdr.addr.gid));
    msg.umad.hdr.addr.hop_limit = 0xFF;

    hdr = rdma_pci_dma_map(backend_dev->dev, sge[0].addr, sge[0].length);
    if (!hdr) {
        pr_dbg("Fail to map to sge[0]\n");
        return -ENOMEM;
    }
    data = rdma_pci_dma_map(backend_dev->dev, sge[1].addr, sge[1].length);
    if (!data) {
        pr_dbg("Fail to map to sge[1]\n");
        rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);
        return -ENOMEM;
    }

    pr_dbg_buf("mad_hdr", hdr, sge[0].length);
    pr_dbg_buf("mad_data", data, sge[1].length);

    memcpy(&msg.umad.mad[0], hdr, sge[0].length);
    memcpy(&msg.umad.mad[sge[0].length], data, sge[1].length);

    rdma_pci_dma_unmap(backend_dev->dev, data, sge[1].length);
    rdma_pci_dma_unmap(backend_dev->dev, hdr, sge[0].length);

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to send MAD to rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    return 0;
}

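/*
 * The general send flow: allocate a BackendCtx keyed by a cqe_ctx id
 * (stored in wr_id so poll_cq can find it again), translate the guest SGE
 * list to host addresses, and for UD QPs attach an AH and the destination
 * qpn/qkey before posting. QP0 is rejected and QP1 is diverted to mad_send
 * since neither is backed by a real ibqp.
 */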
void rdma_backend_post_send(RdmaBackendDev *backend_dev,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge,
                            uint8_t sgid_idx, union ibv_gid *sgid,
                            union ibv_gid *dgid, uint32_t dqpn, uint32_t dqkey,
                            void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_send_wr wr = {0}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            pr_dbg("QP0 unsupported\n");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        } else if (qp_type == IBV_QPT_GSI) {
            pr_dbg("QP1\n");
            rc = mad_send(backend_dev, sgid_idx, sgid, sge, num_sge);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_MAD_SEND, ctx);
            } else {
                complete_work(IBV_WC_SUCCESS, 0, ctx);
            }
        }
        return;
    }

    pr_dbg("num_sge=%d\n", num_sge);

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->is_tx_req = 1;

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        pr_dbg("Failed to allocate cqe_ctx\n");
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto out_free_bctx;
    }

    rc = build_host_sge_array(backend_dev->rdma_dev_res, new_sge, sge, num_sge);
    if (rc) {
        pr_dbg("Error: Failed to build host SGE array\n");
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto out_dealloc_cqe_ctx;
    }

    if (qp_type == IBV_QPT_UD) {
        wr.wr.ud.ah = create_ah(backend_dev, qp->ibpd, sgid_idx, dgid);
        if (!wr.wr.ud.ah) {
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
            goto out_dealloc_cqe_ctx;
        }
        wr.wr.ud.remote_qpn = dqpn;
        wr.wr.ud.remote_qkey = dqkey;
    }

    wr.num_sge = num_sge;
    wr.opcode = IBV_WR_SEND;
    wr.send_flags = IBV_SEND_SIGNALED;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_send(qp->ibqp, &wr, &bad_wr);
    pr_dbg("ibv_post_send=%d\n", rc);
    if (rc) {
        pr_dbg("Fail (%d, %d) to post send WQE to qpn %d\n", rc, errno,
               qp->ibqp->qp_num);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto out_dealloc_cqe_ctx;
    }

    return;

out_dealloc_cqe_ctx:
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, bctx_id);

out_free_bctx:
    g_free(bctx);
}

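/*
 * Receive buffers posted on QP1 cannot be handed to the device; they are
 * parked on recv_mads_list until a MAD arrives from the rdmacm-mux, at
 * which point process_incoming_mad_req copies the message into one of them.
 * The single SGE must be large enough for a GRH plus the maximum payload.
 */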
static unsigned int save_mad_recv_buffer(RdmaBackendDev *backend_dev,
                                         struct ibv_sge *sge, uint32_t num_sge,
                                         void *ctx)
{
    BackendCtx *bctx;
    int rc;
    uint32_t bctx_id;

    if (num_sge != 1) {
        pr_dbg("Invalid num_sge (%d), expecting 1\n", num_sge);
        return VENDOR_ERR_INV_NUM_SGE;
    }

    if (sge[0].length < RDMA_MAX_PRIVATE_DATA + sizeof(struct ibv_grh)) {
        pr_dbg("Too small buffer for MAD\n");
        return VENDOR_ERR_INV_MAD_BUFF;
    }

    pr_dbg("addr=0x%" PRIx64 "\n", sge[0].addr);
    pr_dbg("length=%d\n", sge[0].length);
    pr_dbg("lkey=%d\n", sge[0].lkey);

    bctx = g_malloc0(sizeof(*bctx));

    rc = rdma_rm_alloc_cqe_ctx(backend_dev->rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        g_free(bctx);
        pr_dbg("Fail to allocate cqe_ctx\n");
        return VENDOR_ERR_NOMEM;
    }

    pr_dbg("bctx_id %d, bctx %p, ctx %p\n", bctx_id, bctx, ctx);
    bctx->up_ctx = ctx;
    bctx->sge = *sge;

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    qlist_append_int(backend_dev->recv_mads_list.list, bctx_id);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);

    return 0;
}

void rdma_backend_post_recv(RdmaBackendDev *backend_dev,
                            RdmaDeviceResources *rdma_dev_res,
                            RdmaBackendQP *qp, uint8_t qp_type,
                            struct ibv_sge *sge, uint32_t num_sge, void *ctx)
{
    BackendCtx *bctx;
    struct ibv_sge new_sge[MAX_SGE];
    uint32_t bctx_id;
    int rc;
    struct ibv_recv_wr wr = {0}, *bad_wr;

    if (!qp->ibqp) { /* This field does not get initialized for QP0 and QP1 */
        if (qp_type == IBV_QPT_SMI) {
            pr_dbg("QP0 unsupported\n");
            complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_QP0, ctx);
        }
        if (qp_type == IBV_QPT_GSI) {
            pr_dbg("QP1\n");
            rc = save_mad_recv_buffer(backend_dev, sge, num_sge, ctx);
            if (rc) {
                complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
            }
        }
        return;
    }

    pr_dbg("num_sge=%d\n", num_sge);

    bctx = g_malloc0(sizeof(*bctx));
    bctx->up_ctx = ctx;
    bctx->is_tx_req = 0;

    rc = rdma_rm_alloc_cqe_ctx(rdma_dev_res, &bctx_id, bctx);
    if (unlikely(rc)) {
        pr_dbg("Failed to allocate cqe_ctx\n");
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_NOMEM, ctx);
        goto out_free_bctx;
    }

    rc = build_host_sge_array(rdma_dev_res, new_sge, sge, num_sge);
    if (rc) {
        pr_dbg("Error: Failed to build host SGE array\n");
        complete_work(IBV_WC_GENERAL_ERR, rc, ctx);
        goto out_dealloc_cqe_ctx;
    }

    wr.num_sge = num_sge;
    wr.sg_list = new_sge;
    wr.wr_id = bctx_id;

    rc = ibv_post_recv(qp->ibqp, &wr, &bad_wr);
    pr_dbg("ibv_post_recv=%d\n", rc);
    if (rc) {
        pr_dbg("Fail (%d, %d) to post recv WQE to qpn %d\n", rc, errno,
               qp->ibqp->qp_num);
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_FAIL_BACKEND, ctx);
        goto out_dealloc_cqe_ctx;
    }

    return;

out_dealloc_cqe_ctx:
    rdma_rm_dealloc_cqe_ctx(rdma_dev_res, bctx_id);

out_free_bctx:
    g_free(bctx);
}

int rdma_backend_create_pd(RdmaBackendDev *backend_dev, RdmaBackendPD *pd)
{
    pd->ibpd = ibv_alloc_pd(backend_dev->context);

    return pd->ibpd ? 0 : -EIO;
}

void rdma_backend_destroy_pd(RdmaBackendPD *pd)
{
    if (pd->ibpd) {
        ibv_dealloc_pd(pd->ibpd);
    }
}

int rdma_backend_create_mr(RdmaBackendMR *mr, RdmaBackendPD *pd, void *addr,
                           size_t length, int access)
{
    pr_dbg("addr=0x%p\n", addr);
    pr_dbg("len=%zu\n", length);
    mr->ibmr = ibv_reg_mr(pd->ibpd, addr, length, access);
    if (mr->ibmr) {
        pr_dbg("lkey=0x%x\n", mr->ibmr->lkey);
        pr_dbg("rkey=0x%x\n", mr->ibmr->rkey);
        mr->ibpd = pd->ibpd;
    }

    return mr->ibmr ? 0 : -EIO;
}

void rdma_backend_destroy_mr(RdmaBackendMR *mr)
{
    if (mr->ibmr) {
        ibv_dereg_mr(mr->ibmr);
    }
}

int rdma_backend_create_cq(RdmaBackendDev *backend_dev, RdmaBackendCQ *cq,
                           int cqe)
{
    int rc;

    pr_dbg("cqe=%d\n", cqe);

    pr_dbg("dev->channel=%p\n", backend_dev->channel);
    cq->ibcq = ibv_create_cq(backend_dev->context, cqe + 1, NULL,
                             backend_dev->channel, 0);

    if (cq->ibcq) {
        rc = ibv_req_notify_cq(cq->ibcq, 0);
        if (rc) {
            pr_dbg("Error %d from ibv_req_notify_cq\n", rc);
        }

        cq->backend_dev = backend_dev;
    }

    return cq->ibcq ? 0 : -EIO;
}

void rdma_backend_destroy_cq(RdmaBackendCQ *cq)
{
    if (cq->ibcq) {
        ibv_destroy_cq(cq->ibcq);
    }
}

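/*
 * GSI QPs are emulated (no host QP is created for them), so a return of 0
 * with qp->ibqp left unset is the expected result for IBV_QPT_GSI; the
 * post_send/post_recv paths test for that. Only RC and UD types are passed
 * through to ibv_create_qp.
 */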
int rdma_backend_create_qp(RdmaBackendQP *qp, uint8_t qp_type,
                           RdmaBackendPD *pd, RdmaBackendCQ *scq,
                           RdmaBackendCQ *rcq, uint32_t max_send_wr,
                           uint32_t max_recv_wr, uint32_t max_send_sge,
                           uint32_t max_recv_sge)
{
    struct ibv_qp_init_attr attr = {0};

    qp->ibqp = 0;
    pr_dbg("qp_type=%d\n", qp_type);

    switch (qp_type) {
    case IBV_QPT_GSI:
        return 0;

    case IBV_QPT_RC:
        /* fall through */
    case IBV_QPT_UD:
        qp->ibpd = pd->ibpd;
        break;

    default:
        pr_dbg("Unsupported QP type %d\n", qp_type);
        return -EIO;
    }

    attr.qp_type = qp_type;
    attr.send_cq = scq->ibcq;
    attr.recv_cq = rcq->ibcq;
    attr.cap.max_send_wr = max_send_wr;
    attr.cap.max_recv_wr = max_recv_wr;
    attr.cap.max_send_sge = max_send_sge;
    attr.cap.max_recv_sge = max_recv_sge;

    pr_dbg("max_send_wr=%d\n", max_send_wr);
    pr_dbg("max_recv_wr=%d\n", max_recv_wr);
    pr_dbg("max_send_sge=%d\n", max_send_sge);
    pr_dbg("max_recv_sge=%d\n", max_recv_sge);

    qp->ibqp = ibv_create_qp(pd->ibpd, &attr);
    if (unlikely(!qp->ibqp)) {
        pr_dbg("Error from ibv_create_qp\n");
        return -EIO;
    }

    qp->ibpd = pd->ibpd;

    /* TODO: Query QP to get max_inline_data and save it to be used in send */

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);

    return 0;
}

*backend_dev
, RdmaBackendQP
*qp
,
749 uint8_t qp_type
, uint32_t qkey
)
751 struct ibv_qp_attr attr
= {0};
754 pr_dbg("qpn=0x%x\n", qp
->ibqp
->qp_num
);
755 pr_dbg("sport_num=%d\n", backend_dev
->port_num
);
757 attr_mask
= IBV_QP_STATE
| IBV_QP_PKEY_INDEX
| IBV_QP_PORT
;
758 attr
.qp_state
= IBV_QPS_INIT
;
760 attr
.port_num
= backend_dev
->port_num
;
764 attr_mask
|= IBV_QP_ACCESS_FLAGS
;
769 attr_mask
|= IBV_QP_QKEY
;
773 pr_dbg("Unsupported QP type %d\n", qp_type
);
777 rc
= ibv_modify_qp(qp
->ibqp
, &attr
, attr_mask
);
779 pr_dbg("Error %d from ibv_modify_qp\n", rc
);
786 int rdma_backend_qp_state_rtr(RdmaBackendDev
*backend_dev
, RdmaBackendQP
*qp
,
787 uint8_t qp_type
, uint8_t sgid_idx
,
788 union ibv_gid
*dgid
, uint32_t dqpn
,
789 uint32_t rq_psn
, uint32_t qkey
, bool use_qkey
)
791 struct ibv_qp_attr attr
= {0};
792 union ibv_gid ibv_gid
= {
793 .global
.interface_id
= dgid
->global
.interface_id
,
794 .global
.subnet_prefix
= dgid
->global
.subnet_prefix
798 attr
.qp_state
= IBV_QPS_RTR
;
799 attr_mask
= IBV_QP_STATE
;
801 qp
->sgid_idx
= sgid_idx
;
805 pr_dbg("dgid=0x%" PRIx64
",%" PRIx64
"\n",
806 be64_to_cpu(ibv_gid
.global
.subnet_prefix
),
807 be64_to_cpu(ibv_gid
.global
.interface_id
));
808 pr_dbg("dqpn=0x%x\n", dqpn
);
809 pr_dbg("sgid_idx=%d\n", qp
->sgid_idx
);
810 pr_dbg("sport_num=%d\n", backend_dev
->port_num
);
811 pr_dbg("rq_psn=0x%x\n", rq_psn
);
813 attr
.path_mtu
= IBV_MTU_1024
;
814 attr
.dest_qp_num
= dqpn
;
815 attr
.max_dest_rd_atomic
= 1;
816 attr
.min_rnr_timer
= 12;
817 attr
.ah_attr
.port_num
= backend_dev
->port_num
;
818 attr
.ah_attr
.is_global
= 1;
819 attr
.ah_attr
.grh
.hop_limit
= 1;
820 attr
.ah_attr
.grh
.dgid
= ibv_gid
;
821 attr
.ah_attr
.grh
.sgid_index
= qp
->sgid_idx
;
822 attr
.rq_psn
= rq_psn
;
824 attr_mask
|= IBV_QP_AV
| IBV_QP_PATH_MTU
| IBV_QP_DEST_QPN
|
825 IBV_QP_RQ_PSN
| IBV_QP_MAX_DEST_RD_ATOMIC
|
826 IBV_QP_MIN_RNR_TIMER
;
830 pr_dbg("qkey=0x%x\n", qkey
);
833 attr_mask
|= IBV_QP_QKEY
;
838 rc
= ibv_modify_qp(qp
->ibqp
, &attr
, attr_mask
);
840 pr_dbg("Error %d from ibv_modify_qp\n", rc
);
int rdma_backend_qp_state_rts(RdmaBackendQP *qp, uint8_t qp_type,
                              uint32_t sq_psn, uint32_t qkey, bool use_qkey)
{
    struct ibv_qp_attr attr = {0};
    int rc, attr_mask;

    pr_dbg("qpn=0x%x\n", qp->ibqp->qp_num);
    pr_dbg("sq_psn=0x%x\n", sq_psn);

    attr.qp_state = IBV_QPS_RTS;
    attr.sq_psn = sq_psn;
    attr_mask = IBV_QP_STATE | IBV_QP_SQ_PSN;

    switch (qp_type) {
    case IBV_QPT_RC:
        attr.timeout = 14;
        attr.retry_cnt = 7;
        attr.rnr_retry = 7;
        attr.max_rd_atomic = 1;

        attr_mask |= IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY |
                     IBV_QP_MAX_QP_RD_ATOMIC;
        break;

    case IBV_QPT_UD:
        if (use_qkey) {
            pr_dbg("qkey=0x%x\n", qkey);
            attr.qkey = qkey;
            attr_mask |= IBV_QP_QKEY;
        }
        break;
    }

    rc = ibv_modify_qp(qp->ibqp, &attr, attr_mask);
    if (rc) {
        pr_dbg("Error %d from ibv_modify_qp\n", rc);
        return -EIO;
    }

    return 0;
}

int rdma_backend_query_qp(RdmaBackendQP *qp, struct ibv_qp_attr *attr,
                          int attr_mask, struct ibv_qp_init_attr *init_attr)
{
    if (!qp->ibqp) {
        pr_dbg("QP1\n");
        attr->qp_state = IBV_QPS_RTS;
        return 0;
    }

    return ibv_query_qp(qp->ibqp, attr, attr_mask, init_attr);
}

void rdma_backend_destroy_qp(RdmaBackendQP *qp)
{
    if (qp->ibqp) {
        ibv_destroy_qp(qp->ibqp);
    }
}

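/*
 * Clamp a device attribute requested by the guest-visible device to what
 * the host HCA actually supports, warning when the requested value had to
 * be lowered.
 */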
#define CHK_ATTR(req, dev, member, fmt) ({ \
    pr_dbg("%s="fmt","fmt"\n", #member, dev.member, req->member); \
    if (req->member > dev.member) { \
        warn_report("%s = "fmt" is higher than host device capability "fmt, \
                    #member, req->member, dev.member); \
        req->member = dev.member; \
    } \
    pr_dbg("%s="fmt"\n", #member, req->member); })

static int init_device_caps(RdmaBackendDev *backend_dev,
                            struct ibv_device_attr *dev_attr)
{
    struct ibv_device_attr bk_dev_attr;

    if (ibv_query_device(backend_dev->context, &bk_dev_attr)) {
        return -EIO;
    }

    dev_attr->max_sge = MAX_SGE;

    CHK_ATTR(dev_attr, bk_dev_attr, max_mr_size, "%" PRId64);
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_sge, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_wr, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_cq, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_cqe, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_mr, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_pd, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_qp_init_rd_atom, "%d");
    CHK_ATTR(dev_attr, bk_dev_attr, max_ah, "%d");

    return 0;
}

static inline void build_mad_hdr(struct ibv_grh *grh, union ibv_gid *sgid,
                                 union ibv_gid *my_gid, int paylen)
{
    grh->paylen = htons(paylen);
    grh->sgid = *sgid;
    grh->dgid = *my_gid;

    pr_dbg("paylen=%d (net=0x%x)\n", paylen, grh->paylen);
    pr_dbg("dgid=0x%llx\n", my_gid->global.interface_id);
    pr_dbg("sgid=0x%llx\n", sgid->global.interface_id);
}

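/*
 * A MAD arrived from the rdmacm-mux: pop one of the receive buffers parked
 * by save_mad_recv_buffer, synthesize a GRH in front of the payload (the
 * umad layer does not deliver one) and complete the guest's receive work
 * request with IBV_WC_GRH set.
 */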
static void process_incoming_mad_req(RdmaBackendDev *backend_dev,
                                     RdmaCmMuxMsg *msg)
{
    QObject *o_ctx_id;
    unsigned long cqe_ctx_id;
    BackendCtx *bctx;
    char *mad;

    pr_dbg("umad_len=%d\n", msg->umad_len);

#ifdef PVRDMA_DEBUG
    struct umad_hdr *hdr = (struct umad_hdr *)&msg->umad.mad;
    pr_dbg("bv %x cls %x cv %x mtd %x st %d tid %" PRIx64 " at %x atm %x\n",
           hdr->base_version, hdr->mgmt_class, hdr->class_version,
           hdr->method, hdr->status, be64toh(hdr->tid),
           hdr->attr_id, hdr->attr_mod);
#endif

    qemu_mutex_lock(&backend_dev->recv_mads_list.lock);
    o_ctx_id = qlist_pop(backend_dev->recv_mads_list.list);
    qemu_mutex_unlock(&backend_dev->recv_mads_list.lock);
    if (!o_ctx_id) {
        pr_dbg("No more free MADs buffers, waiting for a while\n");
        sleep(THR_POLL_TO);
        return;
    }

    cqe_ctx_id = qnum_get_uint(qobject_to(QNum, o_ctx_id));
    bctx = rdma_rm_get_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
    if (unlikely(!bctx)) {
        pr_dbg("Error: Fail to find ctx for %ld\n", cqe_ctx_id);
        return;
    }

    pr_dbg("id %ld, bctx %p, ctx %p\n", cqe_ctx_id, bctx, bctx->up_ctx);

    mad = rdma_pci_dma_map(backend_dev->dev, bctx->sge.addr,
                           bctx->sge.length);
    if (!mad || bctx->sge.length < msg->umad_len + MAD_HDR_SIZE) {
        complete_work(IBV_WC_GENERAL_ERR, VENDOR_ERR_INV_MAD_BUFF,
                      bctx->up_ctx);
    } else {
        struct ibv_wc wc = {0};
        pr_dbg_buf("mad", msg->umad.mad, msg->umad_len);
        memset(mad, 0, bctx->sge.length);
        build_mad_hdr((struct ibv_grh *)mad,
                      (union ibv_gid *)&msg->umad.hdr.addr.gid, &msg->hdr.sgid,
                      msg->umad_len);
        memcpy(&mad[MAD_HDR_SIZE], msg->umad.mad, msg->umad_len);
        rdma_pci_dma_unmap(backend_dev->dev, mad, bctx->sge.length);

        wc.byte_len = msg->umad_len;
        wc.status = IBV_WC_SUCCESS;
        wc.wc_flags = IBV_WC_GRH;
        comp_handler(bctx->up_ctx, &wc);
    }

    g_free(bctx);
    rdma_rm_dealloc_cqe_ctx(backend_dev->rdma_dev_res, cqe_ctx_id);
}

static inline int rdmacm_mux_can_receive(void *opaque)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;

    return rdmacm_mux_can_process_async(backend_dev);
}

static void rdmacm_mux_read(void *opaque, const uint8_t *buf, int size)
{
    RdmaBackendDev *backend_dev = (RdmaBackendDev *)opaque;
    RdmaCmMuxMsg *msg = (RdmaCmMuxMsg *)buf;

    pr_dbg("Got %d bytes\n", size);
    pr_dbg("msg_type=%d\n", msg->hdr.msg_type);
    pr_dbg("op_code=%d\n", msg->hdr.op_code);

    if (msg->hdr.msg_type != RDMACM_MUX_MSG_TYPE_REQ ||
        msg->hdr.op_code != RDMACM_MUX_OP_CODE_MAD) {
        pr_dbg("Error: Not a MAD request, skipping\n");
        return;
    }

    process_incoming_mad_req(backend_dev, msg);
}

static int mad_init(RdmaBackendDev *backend_dev, CharBackend *mad_chr_be)
{
    int ret;

    backend_dev->rdmacm_mux.chr_be = mad_chr_be;

    ret = qemu_chr_fe_backend_connected(backend_dev->rdmacm_mux.chr_be);
    if (!ret) {
        pr_dbg("Missing chardev for MAD multiplexer\n");
        return -EIO;
    }

    qemu_mutex_init(&backend_dev->recv_mads_list.lock);
    backend_dev->recv_mads_list.list = qlist_new();

    enable_rdmacm_mux_async(backend_dev);

    qemu_chr_fe_set_handlers(backend_dev->rdmacm_mux.chr_be,
                             rdmacm_mux_can_receive, rdmacm_mux_read, NULL,
                             NULL, backend_dev, NULL, true);

    return 0;
}

static void mad_fini(RdmaBackendDev *backend_dev)
{
    pr_dbg("Stopping MAD\n");
    disable_rdmacm_mux_async(backend_dev);
    qemu_chr_fe_disconnect(backend_dev->rdmacm_mux.chr_be);
    if (backend_dev->recv_mads_list.list) {
        qlist_destroy_obj(QOBJECT(backend_dev->recv_mads_list.list));
        qemu_mutex_destroy(&backend_dev->recv_mads_list.lock);
    }
}

int rdma_backend_get_gid_index(RdmaBackendDev *backend_dev,
                               union ibv_gid *gid)
{
    union ibv_gid sgid;
    int ret;
    int i = 0;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    do {
        ret = ibv_query_gid(backend_dev->context, backend_dev->port_num, i,
                            &sgid);
        i++;
    } while (!ret && (memcmp(&sgid, gid, sizeof(*gid))));

    pr_dbg("gid_index=%d\n", i - 1);

    return ret ? ret : i - 1;
}

int rdma_backend_add_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_REG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to register GID to rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, true,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return ret;
}

int rdma_backend_del_gid(RdmaBackendDev *backend_dev, const char *ifname,
                         union ibv_gid *gid)
{
    RdmaCmMuxMsg msg = {};
    int ret;

    pr_dbg("0x%llx, 0x%llx\n",
           (long long unsigned int)be64_to_cpu(gid->global.subnet_prefix),
           (long long unsigned int)be64_to_cpu(gid->global.interface_id));

    msg.hdr.op_code = RDMACM_MUX_OP_CODE_UNREG;
    memcpy(msg.hdr.sgid.raw, gid->raw, sizeof(msg.hdr.sgid));

    ret = exec_rdmacm_mux_req(backend_dev, &msg);
    if (ret) {
        pr_dbg("Fail to unregister GID from rdma_umadmux (%d)\n", ret);
        return -EIO;
    }

    qapi_event_send_rdma_gid_status_changed(ifname, false,
                                            gid->global.subnet_prefix,
                                            gid->global.interface_id);

    return ret;
}

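/*
 * Open the backend device and bring up everything the device emulation
 * needs: the verbs context and completion channel, host capability
 * clamping, the MAD multiplexer channel and the AH cache. The completion
 * thread itself is started later, from rdma_backend_start.
 */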
int rdma_backend_init(RdmaBackendDev *backend_dev, PCIDevice *pdev,
                      RdmaDeviceResources *rdma_dev_res,
                      const char *backend_device_name, uint8_t port_num,
                      struct ibv_device_attr *dev_attr, CharBackend *mad_chr_be,
                      Error **errp)
{
    int i;
    int ret = 0;
    int num_ibv_devices;
    struct ibv_device **dev_list;

    memset(backend_dev, 0, sizeof(*backend_dev));

    backend_dev->dev = pdev;
    backend_dev->port_num = port_num;
    backend_dev->rdma_dev_res = rdma_dev_res;

    rdma_backend_register_comp_handler(dummy_comp_handler);

    dev_list = ibv_get_device_list(&num_ibv_devices);
    if (!dev_list) {
        error_setg(errp, "Failed to get IB devices list");
        return -EIO;
    }

    if (num_ibv_devices == 0) {
        error_setg(errp, "No IB devices were found");
        ret = -ENXIO;
        goto out_free_dev_list;
    }

    if (backend_device_name) {
        for (i = 0; dev_list[i]; ++i) {
            if (!strcmp(ibv_get_device_name(dev_list[i]),
                        backend_device_name)) {
                break;
            }
        }

        backend_dev->ib_dev = dev_list[i];
        if (!backend_dev->ib_dev) {
            error_setg(errp, "Failed to find IB device %s",
                       backend_device_name);
            ret = -EIO;
            goto out_free_dev_list;
        }
    } else {
        backend_dev->ib_dev = *dev_list;
    }

    pr_dbg("Using backend device %s, port %d\n",
           ibv_get_device_name(backend_dev->ib_dev), backend_dev->port_num);
    pr_dbg("uverb device %s\n", backend_dev->ib_dev->dev_name);

    backend_dev->context = ibv_open_device(backend_dev->ib_dev);
    if (!backend_dev->context) {
        error_setg(errp, "Failed to open IB device");
        ret = -EIO;
        goto out_free_dev_list;
    }

    backend_dev->channel = ibv_create_comp_channel(backend_dev->context);
    if (!backend_dev->channel) {
        error_setg(errp, "Failed to create IB communication channel");
        ret = -EIO;
        goto out_close_device;
    }
    pr_dbg("dev->backend_dev.channel=%p\n", backend_dev->channel);

    ret = init_device_caps(backend_dev, dev_attr);
    if (ret) {
        error_setg(errp, "Failed to initialize device capabilities");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    ret = mad_init(backend_dev, mad_chr_be);
    if (ret) {
        error_setg(errp, "Fail to initialize mad");
        ret = -EIO;
        goto out_destroy_comm_channel;
    }

    backend_dev->comp_thread.run = false;
    backend_dev->comp_thread.is_running = false;

    ah_cache_init();

    goto out_free_dev_list;

out_destroy_comm_channel:
    ibv_destroy_comp_channel(backend_dev->channel);

out_close_device:
    ibv_close_device(backend_dev->context);

out_free_dev_list:
    ibv_free_device_list(dev_list);

    return ret;
}

void rdma_backend_start(RdmaBackendDev *backend_dev)
{
    pr_dbg("Starting rdma_backend\n");
    start_comp_thread(backend_dev);
}

void rdma_backend_stop(RdmaBackendDev *backend_dev)
{
    pr_dbg("Stopping rdma_backend\n");
    stop_backend_thread(&backend_dev->comp_thread);
}

void rdma_backend_fini(RdmaBackendDev *backend_dev)
{
    rdma_backend_stop(backend_dev);
    mad_fini(backend_dev);
    g_hash_table_destroy(ah_hash);
    ibv_destroy_comp_channel(backend_dev->channel);
    ibv_close_device(backend_dev->context);
}