RPC/RDMA: fix connection IRD/ORD setting
[linux-2.6/verdex.git] / net / sunrpc / xprtrdma / verbs.c
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 * verbs.c
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
50 #include <linux/pci.h> /* for Tavor hack below */
52 #include "xprt_rdma.h"
55 * Globals/Macros
58 #ifdef RPC_DEBUG
59 # define RPCDBG_FACILITY RPCDBG_TRANS
60 #endif
63 * internal functions
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
72 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73 static LIST_HEAD(rpcrdma_tasklets_g);
75 static void
76 rpcrdma_run_tasklet(unsigned long data)
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
82 data = data; /* tasklet data argument is unused */
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
104 static inline void
105 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107 unsigned long flags;
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
115 static void
116 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118 struct rpcrdma_ep *ep = context;
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
129 static void
130 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132 struct rpcrdma_ep *ep = context;
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
143 static inline
144 void rpcrdma_event_process(struct ib_wc *wc)
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170 /* Keep (only) the most recent credits, after checking validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
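/*
 * Credit-handling sketch (assuming the standard RPC/RDMA header layout of
 * xid, version, credits, proc): the rr_len >= 16 test above simply makes
 * sure the fixed header words are present before rm_credit is read.  The
 * advertised value is then clamped to a usable range:
 *
 *	credits == 0			-> treat as 1 (never deadlock)
 *	credits >  rb_max_requests	-> treat as rb_max_requests
 *	otherwise			-> use as-is
 *
 * e.g. with rb_max_requests = 32, a header advertising 128 credits is
 * recorded as 32, and one advertising 0 is recorded as 1.
 */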
200 static inline int
201 rpcrdma_cq_poll(struct ib_cq *cq)
203 struct ib_wc wc;
204 int rc;
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
213 if (rc == 0)
214 break;
216 rpcrdma_event_process(&wc);
219 return 0;
223 * rpcrdma_cq_event_upcall
225 * This upcall handles recv, send, bind and unbind events.
226 * It is reentrant but processes single events in order to maintain
227 * the ordering of receives, which preserves the server credit accounting.
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
235 * Note that send events are suppressed and do not result in an upcall.
237 static void
238 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
240 int rc;
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
253 rpcrdma_cq_poll(cq);
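/*
 * Poll/arm/poll note: the second rpcrdma_cq_poll() above closes the race
 * in which a completion arrives after the first poll drained the CQ but
 * before ib_req_notify_cq() re-armed it.  Roughly:
 *
 *	poll until empty
 *	arm notification
 *	poll once more, for anything that slipped in meanwhile
 *
 * Without the final poll, such a completion would sit unprocessed in the
 * CQ until some later completion triggered another upcall.
 */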
256 #ifdef RPC_DEBUG
257 static const char * const conn[] = {
258 "address resolved",
259 "address error",
260 "route resolved",
261 "route error",
262 "connect request",
263 "connect response",
264 "connect error",
265 "unreachable",
266 "rejected",
267 "established",
268 "disconnected",
269 "device removal"
271 #endif
273 static int
274 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
276 struct rpcrdma_xprt *xprt = id->context;
277 struct rpcrdma_ia *ia = &xprt->rx_ia;
278 struct rpcrdma_ep *ep = &xprt->rx_ep;
279 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
280 struct ib_qp_attr attr;
281 struct ib_qp_init_attr iattr;
282 int connstate = 0;
284 switch (event->event) {
285 case RDMA_CM_EVENT_ADDR_RESOLVED:
286 case RDMA_CM_EVENT_ROUTE_RESOLVED:
287 complete(&ia->ri_done);
288 break;
289 case RDMA_CM_EVENT_ADDR_ERROR:
290 ia->ri_async_rc = -EHOSTUNREACH;
291 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
292 __func__, ep);
293 complete(&ia->ri_done);
294 break;
295 case RDMA_CM_EVENT_ROUTE_ERROR:
296 ia->ri_async_rc = -ENETUNREACH;
297 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
298 __func__, ep);
299 complete(&ia->ri_done);
300 break;
301 case RDMA_CM_EVENT_ESTABLISHED:
302 connstate = 1;
303 ib_query_qp(ia->ri_id->qp, &attr,
304 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
305 &iattr);
306 dprintk("RPC: %s: %d responder resources"
307 " (%d initiator)\n",
308 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
309 goto connected;
310 case RDMA_CM_EVENT_CONNECT_ERROR:
311 connstate = -ENOTCONN;
312 goto connected;
313 case RDMA_CM_EVENT_UNREACHABLE:
314 connstate = -ENETDOWN;
315 goto connected;
316 case RDMA_CM_EVENT_REJECTED:
317 connstate = -ECONNREFUSED;
318 goto connected;
319 case RDMA_CM_EVENT_DISCONNECTED:
320 connstate = -ECONNABORTED;
321 goto connected;
322 case RDMA_CM_EVENT_DEVICE_REMOVAL:
323 connstate = -ENODEV;
324 connected:
325 dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
326 " (ep 0x%p event 0x%x)\n",
327 __func__,
328 (event->event <= 11) ? conn[event->event] :
329 "unknown connection error",
330 NIPQUAD(addr->sin_addr.s_addr),
331 ntohs(addr->sin_port),
332 ep, event->event);
333 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
334 dprintk("RPC: %s: %sconnected\n",
335 __func__, connstate > 0 ? "" : "dis");
336 ep->rep_connected = connstate;
337 ep->rep_func(ep);
338 wake_up_all(&ep->rep_connect_wait);
339 break;
340 default:
341 ia->ri_async_rc = -EINVAL;
342 dprintk("RPC: %s: unexpected CM event %X\n",
343 __func__, event->event);
344 complete(&ia->ri_done);
345 break;
348 return 0;
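/*
 * On RDMA_CM_EVENT_ESTABLISHED above, ib_query_qp() reads back the
 * negotiated RDMA Read limits: max_dest_rd_atomic is the responder
 * resources (IRD) actually granted to the peer, and max_rd_atomic is the
 * initiator depth (ORD) this side may use.  They are only reported via
 * dprintk here; the values offered at connect time are chosen in
 * rpcrdma_ep_create() and adjusted, if necessary, in rpcrdma_ep_connect()
 * below.
 */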
351 static struct rdma_cm_id *
352 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
353 struct rpcrdma_ia *ia, struct sockaddr *addr)
355 struct rdma_cm_id *id;
356 int rc;
358 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359 if (IS_ERR(id)) {
360 rc = PTR_ERR(id);
361 dprintk("RPC: %s: rdma_create_id() failed %i\n",
362 __func__, rc);
363 return id;
366 ia->ri_async_rc = 0;
367 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368 if (rc) {
369 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
370 __func__, rc);
371 goto out;
373 wait_for_completion(&ia->ri_done);
374 rc = ia->ri_async_rc;
375 if (rc)
376 goto out;
378 ia->ri_async_rc = 0;
379 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380 if (rc) {
381 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
382 __func__, rc);
383 goto out;
385 wait_for_completion(&ia->ri_done);
386 rc = ia->ri_async_rc;
387 if (rc)
388 goto out;
390 return id;
392 out:
393 rdma_destroy_id(id);
394 return ERR_PTR(rc);
398 * Drain any cq, prior to teardown.
400 static void
401 rpcrdma_clean_cq(struct ib_cq *cq)
403 struct ib_wc wc;
404 int count = 0;
406 while (1 == ib_poll_cq(cq, 1, &wc))
407 ++count;
409 if (count)
410 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
411 __func__, count, wc.opcode);
415 * Exported functions.
419 * Open and initialize an Interface Adapter.
420 * o initializes fields of struct rpcrdma_ia, including
421 * interface and provider attributes and protection zone.
423 int
424 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
426 int rc, mem_priv;
427 struct ib_device_attr devattr;
428 struct rpcrdma_ia *ia = &xprt->rx_ia;
430 init_completion(&ia->ri_done);
432 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
433 if (IS_ERR(ia->ri_id)) {
434 rc = PTR_ERR(ia->ri_id);
435 goto out1;
438 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
439 if (IS_ERR(ia->ri_pd)) {
440 rc = PTR_ERR(ia->ri_pd);
441 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
442 __func__, rc);
443 goto out2;
447 * Query the device to determine if the requested memory
448 * registration strategy is supported. If it isn't, set the
449 * strategy to a globally supported model.
451 rc = ib_query_device(ia->ri_id->device, &devattr);
452 if (rc) {
453 dprintk("RPC: %s: ib_query_device failed %d\n",
454 __func__, rc);
455 goto out2;
458 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
459 ia->ri_have_dma_lkey = 1;
460 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
463 switch (memreg) {
464 case RPCRDMA_MEMWINDOWS:
465 case RPCRDMA_MEMWINDOWS_ASYNC:
466 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
467 dprintk("RPC: %s: MEMWINDOWS registration "
468 "specified but not supported by adapter, "
469 "using slower RPCRDMA_REGISTER\n",
470 __func__);
471 memreg = RPCRDMA_REGISTER;
473 break;
474 case RPCRDMA_MTHCAFMR:
475 if (!ia->ri_id->device->alloc_fmr) {
476 #if RPCRDMA_PERSISTENT_REGISTRATION
477 dprintk("RPC: %s: MTHCAFMR registration "
478 "specified but not supported by adapter, "
479 "using riskier RPCRDMA_ALLPHYSICAL\n",
480 __func__);
481 memreg = RPCRDMA_ALLPHYSICAL;
482 #else
483 dprintk("RPC: %s: MTHCAFMR registration "
484 "specified but not supported by adapter, "
485 "using slower RPCRDMA_REGISTER\n",
486 __func__);
487 memreg = RPCRDMA_REGISTER;
488 #endif
490 break;
491 case RPCRDMA_FRMR:
492 /* Requires both frmr reg and local dma lkey */
493 if ((devattr.device_cap_flags &
494 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
495 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
496 #if RPCRDMA_PERSISTENT_REGISTRATION
497 dprintk("RPC: %s: FRMR registration "
498 "specified but not supported by adapter, "
499 "using riskier RPCRDMA_ALLPHYSICAL\n",
500 __func__);
501 memreg = RPCRDMA_ALLPHYSICAL;
502 #else
503 dprintk("RPC: %s: FRMR registration "
504 "specified but not supported by adapter, "
505 "using slower RPCRDMA_REGISTER\n",
506 __func__);
507 memreg = RPCRDMA_REGISTER;
508 #endif
510 break;
514 * Optionally obtain an underlying physical identity mapping in
515 * order to do a memory window-based bind. This base registration
516 * is protected from remote access - that is enabled only by binding
517 * for the specific bytes targeted during each RPC operation, and
518 * revoked after the corresponding completion similar to a storage
519 * adapter.
521 switch (memreg) {
522 case RPCRDMA_BOUNCEBUFFERS:
523 case RPCRDMA_REGISTER:
524 case RPCRDMA_FRMR:
525 break;
526 #if RPCRDMA_PERSISTENT_REGISTRATION
527 case RPCRDMA_ALLPHYSICAL:
528 mem_priv = IB_ACCESS_LOCAL_WRITE |
529 IB_ACCESS_REMOTE_WRITE |
530 IB_ACCESS_REMOTE_READ;
531 goto register_setup;
532 #endif
533 case RPCRDMA_MEMWINDOWS_ASYNC:
534 case RPCRDMA_MEMWINDOWS:
535 mem_priv = IB_ACCESS_LOCAL_WRITE |
536 IB_ACCESS_MW_BIND;
537 goto register_setup;
538 case RPCRDMA_MTHCAFMR:
539 if (ia->ri_have_dma_lkey)
540 break;
541 mem_priv = IB_ACCESS_LOCAL_WRITE;
542 register_setup:
543 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
544 if (IS_ERR(ia->ri_bind_mem)) {
545 printk(KERN_ALERT "%s: ib_get_dma_mr for "
546 "phys register failed with %lX\n\t"
547 "Will continue with degraded performance\n",
548 __func__, PTR_ERR(ia->ri_bind_mem));
549 memreg = RPCRDMA_REGISTER;
550 ia->ri_bind_mem = NULL;
552 break;
553 default:
554 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
555 __func__, memreg);
556 rc = -EINVAL;
557 goto out2;
559 dprintk("RPC: %s: memory registration strategy is %d\n",
560 __func__, memreg);
562 /* Else will do memory reg/dereg for each chunk */
563 ia->ri_memreg_strategy = memreg;
565 return 0;
566 out2:
567 rdma_destroy_id(ia->ri_id);
568 out1:
569 return rc;
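/*
 * Summary of the capability checks above:
 *
 *	RPCRDMA_MEMWINDOWS[_ASYNC]  needs IB_DEVICE_MEM_WINDOW
 *	RPCRDMA_MTHCAFMR            needs a device ->alloc_fmr method
 *	RPCRDMA_FRMR                needs IB_DEVICE_MEM_MGT_EXTENSIONS
 *	                            and IB_DEVICE_LOCAL_DMA_LKEY
 *
 * When the requested mode is unsupported, the FMR/FRMR cases fall back to
 * RPCRDMA_ALLPHYSICAL if RPCRDMA_PERSISTENT_REGISTRATION is configured and
 * to RPCRDMA_REGISTER otherwise; the memory-window case falls back to
 * RPCRDMA_REGISTER, as does a failed ib_get_dma_mr().
 */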
573 * Clean up/close an IA.
574 * o if event handles and PD have been initialized, free them.
575 * o close the IA
577 void
578 rpcrdma_ia_close(struct rpcrdma_ia *ia)
580 int rc;
582 dprintk("RPC: %s: entering\n", __func__);
583 if (ia->ri_bind_mem != NULL) {
584 rc = ib_dereg_mr(ia->ri_bind_mem);
585 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
586 __func__, rc);
588 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
589 rdma_destroy_qp(ia->ri_id);
590 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
591 rc = ib_dealloc_pd(ia->ri_pd);
592 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
593 __func__, rc);
595 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
596 rdma_destroy_id(ia->ri_id);
600 * Create unconnected endpoint.
602 int
603 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
604 struct rpcrdma_create_data_internal *cdata)
606 struct ib_device_attr devattr;
607 int rc, err;
609 rc = ib_query_device(ia->ri_id->device, &devattr);
610 if (rc) {
611 dprintk("RPC: %s: ib_query_device failed %d\n",
612 __func__, rc);
613 return rc;
616 /* check provider's send/recv wr limits */
617 if (cdata->max_requests > devattr.max_qp_wr)
618 cdata->max_requests = devattr.max_qp_wr;
620 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
621 ep->rep_attr.qp_context = ep;
622 /* send_cq and recv_cq initialized below */
623 ep->rep_attr.srq = NULL;
624 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
625 switch (ia->ri_memreg_strategy) {
626 case RPCRDMA_FRMR:
627 /* Add room for frmr register and invalidate WRs */
628 ep->rep_attr.cap.max_send_wr *= 3;
629 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
630 return -EINVAL;
631 break;
632 case RPCRDMA_MEMWINDOWS_ASYNC:
633 case RPCRDMA_MEMWINDOWS:
634 /* Add room for mw_binds+unbinds - overkill! */
635 ep->rep_attr.cap.max_send_wr++;
636 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
637 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
638 return -EINVAL;
639 break;
640 default:
641 break;
643 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
644 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
645 ep->rep_attr.cap.max_recv_sge = 1;
646 ep->rep_attr.cap.max_inline_data = 0;
647 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
648 ep->rep_attr.qp_type = IB_QPT_RC;
649 ep->rep_attr.port_num = ~0;
651 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
652 "iovs: send %d recv %d\n",
653 __func__,
654 ep->rep_attr.cap.max_send_wr,
655 ep->rep_attr.cap.max_recv_wr,
656 ep->rep_attr.cap.max_send_sge,
657 ep->rep_attr.cap.max_recv_sge);
659 /* set trigger for requesting send completion */
660 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
661 switch (ia->ri_memreg_strategy) {
662 case RPCRDMA_MEMWINDOWS_ASYNC:
663 case RPCRDMA_MEMWINDOWS:
664 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
665 break;
666 default:
667 break;
669 if (ep->rep_cqinit <= 2)
670 ep->rep_cqinit = 0;
671 INIT_CQCOUNT(ep);
672 ep->rep_ia = ia;
673 init_waitqueue_head(&ep->rep_connect_wait);
676 * Create a single cq for receive dto and mw_bind (only ever
677 * care about unbind, really). Send completions are suppressed.
678 * Use single threaded tasklet upcalls to maintain ordering.
680 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
681 rpcrdma_cq_async_error_upcall, NULL,
682 ep->rep_attr.cap.max_recv_wr +
683 ep->rep_attr.cap.max_send_wr + 1, 0);
684 if (IS_ERR(ep->rep_cq)) {
685 rc = PTR_ERR(ep->rep_cq);
686 dprintk("RPC: %s: ib_create_cq failed: %i\n",
687 __func__, rc);
688 goto out1;
691 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
692 if (rc) {
693 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
694 __func__, rc);
695 goto out2;
698 ep->rep_attr.send_cq = ep->rep_cq;
699 ep->rep_attr.recv_cq = ep->rep_cq;
701 /* Initialize cma parameters */
703 /* RPC/RDMA does not use private data */
704 ep->rep_remote_cma.private_data = NULL;
705 ep->rep_remote_cma.private_data_len = 0;
707 /* Client offers RDMA Read but does not initiate */
708 ep->rep_remote_cma.initiator_depth = 0;
709 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
710 ep->rep_remote_cma.responder_resources = 0;
711 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
712 ep->rep_remote_cma.responder_resources = 32;
713 else
714 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
716 ep->rep_remote_cma.retry_count = 7;
717 ep->rep_remote_cma.flow_control = 0;
718 ep->rep_remote_cma.rnr_retry_count = 0;
720 return 0;
722 out2:
723 err = ib_destroy_cq(ep->rep_cq);
724 if (err)
725 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
726 __func__, err);
727 out1:
728 return rc;
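/*
 * Two sizing decisions above are worth spelling out:
 *
 * - The single CQ is sized for max_recv_wr + max_send_wr + 1 entries, so
 *   every receive and every (possibly signaled) send or bind has a slot
 *   even though most send completions are suppressed.
 *
 * - initiator_depth is 0 and responder_resources is min(max_qp_rd_atom, 32)
 *   (0 for bounce buffers): the client never initiates RDMA Read, but
 *   offers the server up to 32 concurrent Reads of client memory, the 32
 *   being the arbitrary cap noted in the comment above.  rpcrdma_ep_connect()
 *   may later raise initiator_depth to match responder_resources when a
 *   peer insists on equal nonzero IRD/ORD.
 */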
732 * rpcrdma_ep_destroy
734 * Disconnect and destroy endpoint. After this, the only
735 * valid operations on the ep are to free it (if dynamically
736 * allocated) or re-create it.
738 * The caller's error handling must be sure to not leak the endpoint
739 * if this function fails.
741 int
742 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
744 int rc;
746 dprintk("RPC: %s: entering, connected is %d\n",
747 __func__, ep->rep_connected);
749 if (ia->ri_id->qp) {
750 rc = rpcrdma_ep_disconnect(ep, ia);
751 if (rc)
752 dprintk("RPC: %s: rpcrdma_ep_disconnect"
753 " returned %i\n", __func__, rc);
756 ep->rep_func = NULL;
758 /* padding - could be done in rpcrdma_buffer_destroy... */
759 if (ep->rep_pad_mr) {
760 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
761 ep->rep_pad_mr = NULL;
764 if (ia->ri_id->qp) {
765 rdma_destroy_qp(ia->ri_id);
766 ia->ri_id->qp = NULL;
769 rpcrdma_clean_cq(ep->rep_cq);
770 rc = ib_destroy_cq(ep->rep_cq);
771 if (rc)
772 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
773 __func__, rc);
775 return rc;
779 * Connect unconnected endpoint.
781 int
782 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
784 struct rdma_cm_id *id;
785 int rc = 0;
786 int retry_count = 0;
787 int reconnect = (ep->rep_connected != 0);
789 if (reconnect) {
790 struct rpcrdma_xprt *xprt;
791 retry:
792 rc = rpcrdma_ep_disconnect(ep, ia);
793 if (rc && rc != -ENOTCONN)
794 dprintk("RPC: %s: rpcrdma_ep_disconnect"
795 " status %i\n", __func__, rc);
796 rpcrdma_clean_cq(ep->rep_cq);
798 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
799 id = rpcrdma_create_id(xprt, ia,
800 (struct sockaddr *)&xprt->rx_data.addr);
801 if (IS_ERR(id)) {
802 rc = PTR_ERR(id);
803 goto out;
805 /* TEMP TEMP TEMP - fail if new device:
806 * Deregister/remarshal *all* requests!
807 * Close and recreate adapter, pd, etc!
808 * Re-determine all attributes still sane!
809 * More stuff I haven't thought of!
810 * Rrrgh!
812 if (ia->ri_id->device != id->device) {
813 printk("RPC: %s: can't reconnect on "
814 "different device!\n", __func__);
815 rdma_destroy_id(id);
816 rc = -ENETDOWN;
817 goto out;
819 /* END TEMP */
820 rdma_destroy_id(ia->ri_id);
821 ia->ri_id = id;
824 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
825 if (rc) {
826 dprintk("RPC: %s: rdma_create_qp failed %i\n",
827 __func__, rc);
828 goto out;
831 /* XXX Tavor device performs badly with 2K MTU! */
832 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
833 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
834 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
835 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
836 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
837 struct ib_qp_attr attr = {
838 .path_mtu = IB_MTU_1024
840 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
844 ep->rep_connected = 0;
846 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
847 if (rc) {
848 dprintk("RPC: %s: rdma_connect() failed with %i\n",
849 __func__, rc);
850 goto out;
853 if (reconnect)
854 return 0;
856 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
859 * Check state. A non-peer reject indicates no listener
860 * (ECONNREFUSED), which may be a transient state. All
861 * others indicate a transport condition which has already
862 * undergone a best-effort connection attempt.
864 if (ep->rep_connected == -ECONNREFUSED
865 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
866 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
867 goto retry;
869 if (ep->rep_connected <= 0) {
870 /* Sometimes, the only way to reliably connect to remote
871 * CMs is to use the same nonzero values for ORD and IRD. */
872 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
873 (ep->rep_remote_cma.responder_resources == 0 ||
874 ep->rep_remote_cma.initiator_depth !=
875 ep->rep_remote_cma.responder_resources)) {
876 if (ep->rep_remote_cma.responder_resources == 0)
877 ep->rep_remote_cma.responder_resources = 1;
878 ep->rep_remote_cma.initiator_depth =
879 ep->rep_remote_cma.responder_resources;
880 goto retry;
882 rc = ep->rep_connected;
883 } else {
884 dprintk("RPC: %s: connected\n", __func__);
887 out:
888 if (rc)
889 ep->rep_connected = rc;
890 return rc;
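/*
 * Connect retry sketch (the IRD/ORD setting named in the patch subject):
 * an ECONNREFUSED reject simply retries with the same parameters, since
 * the listener may not be up yet.  Any other failure takes the second
 * path above.  Suppose the first rdma_connect() offered initiator_depth
 * = 0 and responder_resources = 32 and the peer's CM only accepts equal,
 * nonzero IRD/ORD; the failure path then effectively does:
 *
 *	responder_resources = max(responder_resources, 1);
 *	initiator_depth     = responder_resources;
 *	goto retry;
 *
 * so the next attempt offers matching nonzero values (32/32 here, or 1/1
 * when the first offer was 0/0).  At most about RDMA_CONNECT_RETRY_MAX + 1
 * such retries are attempted before the connect error is returned.
 */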
894 * rpcrdma_ep_disconnect
896 * This is separate from destroy to facilitate the ability
897 * to reconnect without recreating the endpoint.
899 * This call is not reentrant, and must not be made in parallel
900 * on the same endpoint.
902 int
903 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
905 int rc;
907 rpcrdma_clean_cq(ep->rep_cq);
908 rc = rdma_disconnect(ia->ri_id);
909 if (!rc) {
910 /* returns without wait if not connected */
911 wait_event_interruptible(ep->rep_connect_wait,
912 ep->rep_connected != 1);
913 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
914 (ep->rep_connected == 1) ? "still " : "dis");
915 } else {
916 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
917 ep->rep_connected = rc;
919 return rc;
923 * Initialize buffer memory
925 int
926 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
927 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
929 char *p;
930 size_t len;
931 int i, rc;
932 struct rpcrdma_mw *r;
934 buf->rb_max_requests = cdata->max_requests;
935 spin_lock_init(&buf->rb_lock);
936 atomic_set(&buf->rb_credits, 1);
938 /* Need to allocate:
939 * 1. arrays for send and recv pointers
940 * 2. arrays of struct rpcrdma_req to fill in pointers
941 * 3. array of struct rpcrdma_rep for replies
942 * 4. padding, if any
943 * 5. mw's, fmr's or frmr's, if any
944 * Send/recv buffers in req/rep need to be registered
947 len = buf->rb_max_requests *
948 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
949 len += cdata->padding;
950 switch (ia->ri_memreg_strategy) {
951 case RPCRDMA_FRMR:
952 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
953 sizeof(struct rpcrdma_mw);
954 break;
955 case RPCRDMA_MTHCAFMR:
956 /* TBD we are perhaps overallocating here */
957 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
958 sizeof(struct rpcrdma_mw);
959 break;
960 case RPCRDMA_MEMWINDOWS_ASYNC:
961 case RPCRDMA_MEMWINDOWS:
962 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
963 sizeof(struct rpcrdma_mw);
964 break;
965 default:
966 break;
969 /* allocate 1, 4 and 5 in one shot */
970 p = kzalloc(len, GFP_KERNEL);
971 if (p == NULL) {
972 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
973 __func__, len);
974 rc = -ENOMEM;
975 goto out;
977 buf->rb_pool = p; /* for freeing it later */
979 buf->rb_send_bufs = (struct rpcrdma_req **) p;
980 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
981 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
982 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
985 * Register the zeroed pad buffer, if any.
987 if (cdata->padding) {
988 rc = rpcrdma_register_internal(ia, p, cdata->padding,
989 &ep->rep_pad_mr, &ep->rep_pad);
990 if (rc)
991 goto out;
993 p += cdata->padding;
996 * Allocate the fmr's, or mw's for mw_bind chunk registration.
997 * We "cycle" the mw's in order to minimize rkey reuse,
998 * and also reduce unbind-to-bind collision.
1000 INIT_LIST_HEAD(&buf->rb_mws);
1001 r = (struct rpcrdma_mw *)p;
1002 switch (ia->ri_memreg_strategy) {
1003 case RPCRDMA_FRMR:
1004 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1005 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1006 RPCRDMA_MAX_SEGS);
1007 if (IS_ERR(r->r.frmr.fr_mr)) {
1008 rc = PTR_ERR(r->r.frmr.fr_mr);
1009 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1010 " failed %i\n", __func__, rc);
1011 goto out;
1013 r->r.frmr.fr_pgl =
1014 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1015 RPCRDMA_MAX_SEGS);
1016 if (IS_ERR(r->r.frmr.fr_pgl)) {
1017 rc = PTR_ERR(r->r.frmr.fr_pgl);
1018 dprintk("RPC: %s: "
1019 "ib_alloc_fast_reg_page_list "
1020 "failed %i\n", __func__, rc);
1021 goto out;
1023 list_add(&r->mw_list, &buf->rb_mws);
1024 ++r;
1026 break;
1027 case RPCRDMA_MTHCAFMR:
1028 /* TBD we are perhaps overallocating here */
1029 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1030 static struct ib_fmr_attr fa =
1031 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1032 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1033 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1034 &fa);
1035 if (IS_ERR(r->r.fmr)) {
1036 rc = PTR_ERR(r->r.fmr);
1037 dprintk("RPC: %s: ib_alloc_fmr"
1038 " failed %i\n", __func__, rc);
1039 goto out;
1041 list_add(&r->mw_list, &buf->rb_mws);
1042 ++r;
1044 break;
1045 case RPCRDMA_MEMWINDOWS_ASYNC:
1046 case RPCRDMA_MEMWINDOWS:
1047 /* Allocate one extra request's worth, for full cycling */
1048 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1049 r->r.mw = ib_alloc_mw(ia->ri_pd);
1050 if (IS_ERR(r->r.mw)) {
1051 rc = PTR_ERR(r->r.mw);
1052 dprintk("RPC: %s: ib_alloc_mw"
1053 " failed %i\n", __func__, rc);
1054 goto out;
1056 list_add(&r->mw_list, &buf->rb_mws);
1057 ++r;
1059 break;
1060 default:
1061 break;
1065 * Allocate/init the request/reply buffers. Doing this
1066 * using kmalloc for now -- one for each buf.
1068 for (i = 0; i < buf->rb_max_requests; i++) {
1069 struct rpcrdma_req *req;
1070 struct rpcrdma_rep *rep;
1072 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1073 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1074 /* Typical ~2400b, so rounding up saves work later */
1075 if (len < 4096)
1076 len = 4096;
1077 req = kmalloc(len, GFP_KERNEL);
1078 if (req == NULL) {
1079 dprintk("RPC: %s: request buffer %d alloc"
1080 " failed\n", __func__, i);
1081 rc = -ENOMEM;
1082 goto out;
1084 memset(req, 0, sizeof(struct rpcrdma_req));
1085 buf->rb_send_bufs[i] = req;
1086 buf->rb_send_bufs[i]->rl_buffer = buf;
1088 rc = rpcrdma_register_internal(ia, req->rl_base,
1089 len - offsetof(struct rpcrdma_req, rl_base),
1090 &buf->rb_send_bufs[i]->rl_handle,
1091 &buf->rb_send_bufs[i]->rl_iov);
1092 if (rc)
1093 goto out;
1095 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1097 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1098 rep = kmalloc(len, GFP_KERNEL);
1099 if (rep == NULL) {
1100 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1101 __func__, i);
1102 rc = -ENOMEM;
1103 goto out;
1105 memset(rep, 0, sizeof(struct rpcrdma_rep));
1106 buf->rb_recv_bufs[i] = rep;
1107 buf->rb_recv_bufs[i]->rr_buffer = buf;
1108 init_waitqueue_head(&rep->rr_unbind);
1110 rc = rpcrdma_register_internal(ia, rep->rr_base,
1111 len - offsetof(struct rpcrdma_rep, rr_base),
1112 &buf->rb_recv_bufs[i]->rr_handle,
1113 &buf->rb_recv_bufs[i]->rr_iov);
1114 if (rc)
1115 goto out;
1118 dprintk("RPC: %s: max_requests %d\n",
1119 __func__, buf->rb_max_requests);
1120 /* done */
1121 return 0;
1122 out:
1123 rpcrdma_buffer_destroy(buf);
1124 return rc;
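/*
 * Layout of the single rb_pool allocation carved up above, assuming a
 * registration mode that uses the rb_mws pool:
 *
 *	[ rpcrdma_req * array ][ rpcrdma_rep * array ][ pad buffer ]
 *	[ struct rpcrdma_mw pool ... ]
 *
 * The req and rep buffers themselves are kmalloc'ed individually in the
 * loop above and registered with rpcrdma_register_internal(), so teardown
 * (rpcrdma_buffer_destroy) must deregister each of them before it can
 * kfree(rb_pool).
 */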
1128 * Unregister and destroy buffer memory. Need to deal with
1129 * partial initialization, so it's callable from failed create.
1130 * Must be called before destroying endpoint, as registrations
1131 * reference it.
1133 void
1134 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1136 int rc, i;
1137 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1138 struct rpcrdma_mw *r;
1140 /* clean up in reverse order from create
1141 * 1. recv mr memory (mr free, then kfree)
1142 * 1a. bind mw memory
1143 * 2. send mr memory (mr free, then kfree)
1144 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1145 * 4. arrays
1147 dprintk("RPC: %s: entering\n", __func__);
1149 for (i = 0; i < buf->rb_max_requests; i++) {
1150 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1151 rpcrdma_deregister_internal(ia,
1152 buf->rb_recv_bufs[i]->rr_handle,
1153 &buf->rb_recv_bufs[i]->rr_iov);
1154 kfree(buf->rb_recv_bufs[i]);
1156 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1157 while (!list_empty(&buf->rb_mws)) {
1158 r = list_entry(buf->rb_mws.next,
1159 struct rpcrdma_mw, mw_list);
1160 list_del(&r->mw_list);
1161 switch (ia->ri_memreg_strategy) {
1162 case RPCRDMA_FRMR:
1163 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1164 if (rc)
1165 dprintk("RPC: %s:"
1166 " ib_dereg_mr"
1167 " failed %i\n",
1168 __func__, rc);
1169 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1170 break;
1171 case RPCRDMA_MTHCAFMR:
1172 rc = ib_dealloc_fmr(r->r.fmr);
1173 if (rc)
1174 dprintk("RPC: %s:"
1175 " ib_dealloc_fmr"
1176 " failed %i\n",
1177 __func__, rc);
1178 break;
1179 case RPCRDMA_MEMWINDOWS_ASYNC:
1180 case RPCRDMA_MEMWINDOWS:
1181 rc = ib_dealloc_mw(r->r.mw);
1182 if (rc)
1183 dprintk("RPC: %s:"
1184 " ib_dealloc_mw"
1185 " failed %i\n",
1186 __func__, rc);
1187 break;
1188 default:
1189 break;
1192 rpcrdma_deregister_internal(ia,
1193 buf->rb_send_bufs[i]->rl_handle,
1194 &buf->rb_send_bufs[i]->rl_iov);
1195 kfree(buf->rb_send_bufs[i]);
1199 kfree(buf->rb_pool);
1203 * Get a set of request/reply buffers.
1205 * Reply buffer (if needed) is attached to send buffer upon return.
1206 * Rule:
1207 * rb_send_index and rb_recv_index MUST always be pointing to the
1208 * *next* available buffer (non-NULL). They are incremented after
1209 * removing buffers, and decremented *before* returning them.
1211 struct rpcrdma_req *
1212 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1214 struct rpcrdma_req *req;
1215 unsigned long flags;
1216 int i;
1217 struct rpcrdma_mw *r;
1219 spin_lock_irqsave(&buffers->rb_lock, flags);
1220 if (buffers->rb_send_index == buffers->rb_max_requests) {
1221 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1222 dprintk("RPC: %s: out of request buffers\n", __func__);
1223 return NULL;
1226 req = buffers->rb_send_bufs[buffers->rb_send_index];
1227 if (buffers->rb_send_index < buffers->rb_recv_index) {
1228 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1229 __func__,
1230 buffers->rb_recv_index - buffers->rb_send_index);
1231 req->rl_reply = NULL;
1232 } else {
1233 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1234 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1236 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1237 if (!list_empty(&buffers->rb_mws)) {
1238 i = RPCRDMA_MAX_SEGS - 1;
1239 do {
1240 r = list_entry(buffers->rb_mws.next,
1241 struct rpcrdma_mw, mw_list);
1242 list_del(&r->mw_list);
1243 req->rl_segments[i].mr_chunk.rl_mw = r;
1244 } while (--i >= 0);
1246 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1247 return req;
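/*
 * Note on the MW handout above: when a strategy that uses rb_mws is
 * active, each rpcrdma_buffer_get() detaches RPCRDMA_MAX_SEGS mw
 * structures from the head of the list and parks them in
 * req->rl_segments[RPCRDMA_MAX_SEGS-1 .. 0].  rpcrdma_buffer_put() below
 * returns them to the tail, which is what produces the "cycling" that
 * spreads rkey reuse out over time.
 */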
1251 * Put request/reply buffers back into pool.
1252 * Pre-decrement counter/array index.
1254 void
1255 rpcrdma_buffer_put(struct rpcrdma_req *req)
1257 struct rpcrdma_buffer *buffers = req->rl_buffer;
1258 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1259 int i;
1260 unsigned long flags;
1262 BUG_ON(req->rl_nchunks != 0);
1263 spin_lock_irqsave(&buffers->rb_lock, flags);
1264 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1265 req->rl_niovs = 0;
1266 if (req->rl_reply) {
1267 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1268 init_waitqueue_head(&req->rl_reply->rr_unbind);
1269 req->rl_reply->rr_func = NULL;
1270 req->rl_reply = NULL;
1272 switch (ia->ri_memreg_strategy) {
1273 case RPCRDMA_FRMR:
1274 case RPCRDMA_MTHCAFMR:
1275 case RPCRDMA_MEMWINDOWS_ASYNC:
1276 case RPCRDMA_MEMWINDOWS:
1278 * Cycle mw's back in reverse order, and "spin" them.
1279 * This delays and scrambles reuse as much as possible.
1281 i = 1;
1282 do {
1283 struct rpcrdma_mw **mw;
1284 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1285 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1286 *mw = NULL;
1287 } while (++i < RPCRDMA_MAX_SEGS);
1288 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1289 &buffers->rb_mws);
1290 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1291 break;
1292 default:
1293 break;
1295 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1299 * Recover reply buffers from pool.
1300 * This happens when recovering from error conditions.
1301 * Post-increment counter/array index.
1303 void
1304 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1306 struct rpcrdma_buffer *buffers = req->rl_buffer;
1307 unsigned long flags;
1309 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1310 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1311 spin_lock_irqsave(&buffers->rb_lock, flags);
1312 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1313 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1314 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1316 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1320 * Put reply buffers back into pool when not attached to
1321 * request. This happens in error conditions, and when
1322 * aborting unbinds. Pre-decrement counter/array index.
1324 void
1325 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1327 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1328 unsigned long flags;
1330 rep->rr_func = NULL;
1331 spin_lock_irqsave(&buffers->rb_lock, flags);
1332 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1333 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1337 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1340 int
1341 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1342 struct ib_mr **mrp, struct ib_sge *iov)
1344 struct ib_phys_buf ipb;
1345 struct ib_mr *mr;
1346 int rc;
1349 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1351 iov->addr = ib_dma_map_single(ia->ri_id->device,
1352 va, len, DMA_BIDIRECTIONAL);
1353 iov->length = len;
1355 if (ia->ri_have_dma_lkey) {
1356 *mrp = NULL;
1357 iov->lkey = ia->ri_dma_lkey;
1358 return 0;
1359 } else if (ia->ri_bind_mem != NULL) {
1360 *mrp = NULL;
1361 iov->lkey = ia->ri_bind_mem->lkey;
1362 return 0;
1365 ipb.addr = iov->addr;
1366 ipb.size = iov->length;
1367 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1368 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1370 dprintk("RPC: %s: phys convert: 0x%llx "
1371 "registered 0x%llx length %d\n",
1372 __func__, (unsigned long long)ipb.addr,
1373 (unsigned long long)iov->addr, len);
1375 if (IS_ERR(mr)) {
1376 *mrp = NULL;
1377 rc = PTR_ERR(mr);
1378 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1379 } else {
1380 *mrp = mr;
1381 iov->lkey = mr->lkey;
1382 rc = 0;
1385 return rc;
1388 int
1389 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1390 struct ib_mr *mr, struct ib_sge *iov)
1392 int rc;
1394 ib_dma_unmap_single(ia->ri_id->device,
1395 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1397 if (NULL == mr)
1398 return 0;
1400 rc = ib_dereg_mr(mr);
1401 if (rc)
1402 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1403 return rc;
1407 * Wrappers for chunk registration, shared by read/write chunk code.
1410 static void
1411 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1413 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1414 seg->mr_dmalen = seg->mr_len;
1415 if (seg->mr_page)
1416 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1417 seg->mr_page, offset_in_page(seg->mr_offset),
1418 seg->mr_dmalen, seg->mr_dir);
1419 else
1420 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1421 seg->mr_offset,
1422 seg->mr_dmalen, seg->mr_dir);
1425 static void
1426 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1428 if (seg->mr_page)
1429 ib_dma_unmap_page(ia->ri_id->device,
1430 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1431 else
1432 ib_dma_unmap_single(ia->ri_id->device,
1433 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1436 static int
1437 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1438 int *nsegs, int writing, struct rpcrdma_ia *ia,
1439 struct rpcrdma_xprt *r_xprt)
1441 struct rpcrdma_mr_seg *seg1 = seg;
1442 struct ib_send_wr frmr_wr, *bad_wr;
1443 u8 key;
1444 int len, pageoff;
1445 int i, rc;
1447 pageoff = offset_in_page(seg1->mr_offset);
1448 seg1->mr_offset -= pageoff; /* start of page */
1449 seg1->mr_len += pageoff;
1450 len = -pageoff;
1451 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1452 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1453 for (i = 0; i < *nsegs;) {
1454 rpcrdma_map_one(ia, seg, writing);
1455 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1456 len += seg->mr_len;
1457 ++seg;
1458 ++i;
1459 /* Check for holes */
1460 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1461 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1462 break;
1464 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1465 __func__, seg1->mr_chunk.rl_mw, i);
1467 /* Bump the key */
1468 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1469 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1471 /* Prepare FRMR WR */
1472 memset(&frmr_wr, 0, sizeof frmr_wr);
1473 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1474 frmr_wr.send_flags = 0; /* unsignaled */
1475 frmr_wr.wr.fast_reg.iova_start = (unsigned long)seg1->mr_dma;
1476 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1477 frmr_wr.wr.fast_reg.page_list_len = i;
1478 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1479 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1480 frmr_wr.wr.fast_reg.access_flags = (writing ?
1481 IB_ACCESS_REMOTE_WRITE : IB_ACCESS_REMOTE_READ);
1482 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1483 DECR_CQCOUNT(&r_xprt->rx_ep);
1485 rc = ib_post_send(ia->ri_id->qp, &frmr_wr, &bad_wr);
1487 if (rc) {
1488 dprintk("RPC: %s: failed ib_post_send for register,"
1489 " status %i\n", __func__, rc);
1490 while (i--)
1491 rpcrdma_unmap_one(ia, --seg);
1492 } else {
1493 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1494 seg1->mr_base = seg1->mr_dma + pageoff;
1495 seg1->mr_nsegs = i;
1496 seg1->mr_len = len;
1498 *nsegs = i;
1499 return rc;
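/*
 * FRMR registration sketch: the loop above coalesces page-aligned,
 * contiguous segments into a single fast-register request, and the hole
 * check stops coalescing as soon as a segment starts or ends off a page
 * boundary, e.g.
 *
 *	seg[0]: 4096 bytes, page aligned      -> coalesced
 *	seg[1]: 4096 bytes, page aligned      -> coalesced
 *	seg[2]: 1500 bytes (ends mid-page)    -> included, then stop
 *
 * The resulting IB_WR_FAST_REG_MR work request maps those pages under a
 * freshly bumped rkey and is posted unsignaled; only the DECR_CQCOUNT()
 * accounting takes note of it.
 */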
1502 static int
1503 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1504 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1506 struct rpcrdma_mr_seg *seg1 = seg;
1507 struct ib_send_wr invalidate_wr, *bad_wr;
1508 int rc;
1510 while (seg1->mr_nsegs--)
1511 rpcrdma_unmap_one(ia, seg++);
1513 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1514 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1515 invalidate_wr.send_flags = 0; /* unsignaled */
1516 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1517 DECR_CQCOUNT(&r_xprt->rx_ep);
1519 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1520 if (rc)
1521 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1522 " status %i\n", __func__, rc);
1523 return rc;
1526 static int
1527 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1528 int *nsegs, int writing, struct rpcrdma_ia *ia)
1530 struct rpcrdma_mr_seg *seg1 = seg;
1531 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1532 int len, pageoff, i, rc;
1534 pageoff = offset_in_page(seg1->mr_offset);
1535 seg1->mr_offset -= pageoff; /* start of page */
1536 seg1->mr_len += pageoff;
1537 len = -pageoff;
1538 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1539 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1540 for (i = 0; i < *nsegs;) {
1541 rpcrdma_map_one(ia, seg, writing);
1542 physaddrs[i] = seg->mr_dma;
1543 len += seg->mr_len;
1544 ++seg;
1545 ++i;
1546 /* Check for holes */
1547 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1548 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1549 break;
1551 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1552 physaddrs, i, seg1->mr_dma);
1553 if (rc) {
1554 dprintk("RPC: %s: failed ib_map_phys_fmr "
1555 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1556 len, (unsigned long long)seg1->mr_dma,
1557 pageoff, i, rc);
1558 while (i--)
1559 rpcrdma_unmap_one(ia, --seg);
1560 } else {
1561 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1562 seg1->mr_base = seg1->mr_dma + pageoff;
1563 seg1->mr_nsegs = i;
1564 seg1->mr_len = len;
1566 *nsegs = i;
1567 return rc;
1570 static int
1571 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1572 struct rpcrdma_ia *ia)
1574 struct rpcrdma_mr_seg *seg1 = seg;
1575 LIST_HEAD(l);
1576 int rc;
1578 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1579 rc = ib_unmap_fmr(&l);
1580 while (seg1->mr_nsegs--)
1581 rpcrdma_unmap_one(ia, seg++);
1582 if (rc)
1583 dprintk("RPC: %s: failed ib_unmap_fmr,"
1584 " status %i\n", __func__, rc);
1585 return rc;
1588 static int
1589 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1590 int *nsegs, int writing, struct rpcrdma_ia *ia,
1591 struct rpcrdma_xprt *r_xprt)
1593 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1594 IB_ACCESS_REMOTE_READ);
1595 struct ib_mw_bind param;
1596 int rc;
1598 *nsegs = 1;
1599 rpcrdma_map_one(ia, seg, writing);
1600 param.mr = ia->ri_bind_mem;
1601 param.wr_id = 0ULL; /* no send cookie */
1602 param.addr = seg->mr_dma;
1603 param.length = seg->mr_len;
1604 param.send_flags = 0;
1605 param.mw_access_flags = mem_priv;
1607 DECR_CQCOUNT(&r_xprt->rx_ep);
1608 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1609 if (rc) {
1610 dprintk("RPC: %s: failed ib_bind_mw "
1611 "%u@0x%llx status %i\n",
1612 __func__, seg->mr_len,
1613 (unsigned long long)seg->mr_dma, rc);
1614 rpcrdma_unmap_one(ia, seg);
1615 } else {
1616 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1617 seg->mr_base = param.addr;
1618 seg->mr_nsegs = 1;
1620 return rc;
1623 static int
1624 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1625 struct rpcrdma_ia *ia,
1626 struct rpcrdma_xprt *r_xprt, void **r)
1628 struct ib_mw_bind param;
1629 LIST_HEAD(l);
1630 int rc;
1632 BUG_ON(seg->mr_nsegs != 1);
1633 param.mr = ia->ri_bind_mem;
1634 param.addr = 0ULL; /* unbind */
1635 param.length = 0;
1636 param.mw_access_flags = 0;
1637 if (*r) {
1638 param.wr_id = (u64) (unsigned long) *r;
1639 param.send_flags = IB_SEND_SIGNALED;
1640 INIT_CQCOUNT(&r_xprt->rx_ep);
1641 } else {
1642 param.wr_id = 0ULL;
1643 param.send_flags = 0;
1644 DECR_CQCOUNT(&r_xprt->rx_ep);
1646 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1647 rpcrdma_unmap_one(ia, seg);
1648 if (rc)
1649 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1650 " status %i\n", __func__, rc);
1651 else
1652 *r = NULL; /* will upcall on completion */
1653 return rc;
1656 static int
1657 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1658 int *nsegs, int writing, struct rpcrdma_ia *ia)
1660 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1661 IB_ACCESS_REMOTE_READ);
1662 struct rpcrdma_mr_seg *seg1 = seg;
1663 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1664 int len, i, rc = 0;
1666 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1667 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1668 for (len = 0, i = 0; i < *nsegs;) {
1669 rpcrdma_map_one(ia, seg, writing);
1670 ipb[i].addr = seg->mr_dma;
1671 ipb[i].size = seg->mr_len;
1672 len += seg->mr_len;
1673 ++seg;
1674 ++i;
1675 /* Check for holes */
1676 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1677 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1678 break;
1680 seg1->mr_base = seg1->mr_dma;
1681 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1682 ipb, i, mem_priv, &seg1->mr_base);
1683 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1684 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1685 dprintk("RPC: %s: failed ib_reg_phys_mr "
1686 "%u@0x%llx (%d)... status %i\n",
1687 __func__, len,
1688 (unsigned long long)seg1->mr_dma, i, rc);
1689 while (i--)
1690 rpcrdma_unmap_one(ia, --seg);
1691 } else {
1692 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1693 seg1->mr_nsegs = i;
1694 seg1->mr_len = len;
1696 *nsegs = i;
1697 return rc;
1700 static int
1701 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1702 struct rpcrdma_ia *ia)
1704 struct rpcrdma_mr_seg *seg1 = seg;
1705 int rc;
1707 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1708 seg1->mr_chunk.rl_mr = NULL;
1709 while (seg1->mr_nsegs--)
1710 rpcrdma_unmap_one(ia, seg++);
1711 if (rc)
1712 dprintk("RPC: %s: failed ib_dereg_mr,"
1713 " status %i\n", __func__, rc);
1714 return rc;
1717 int
1718 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1719 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1721 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1722 int rc = 0;
1724 switch (ia->ri_memreg_strategy) {
1726 #if RPCRDMA_PERSISTENT_REGISTRATION
1727 case RPCRDMA_ALLPHYSICAL:
1728 rpcrdma_map_one(ia, seg, writing);
1729 seg->mr_rkey = ia->ri_bind_mem->rkey;
1730 seg->mr_base = seg->mr_dma;
1731 seg->mr_nsegs = 1;
1732 nsegs = 1;
1733 break;
1734 #endif
1736 /* Registration using frmr registration */
1737 case RPCRDMA_FRMR:
1738 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1739 break;
1741 /* Registration using fmr memory registration */
1742 case RPCRDMA_MTHCAFMR:
1743 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1744 break;
1746 /* Registration using memory windows */
1747 case RPCRDMA_MEMWINDOWS_ASYNC:
1748 case RPCRDMA_MEMWINDOWS:
1749 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1750 break;
1752 /* Default registration each time */
1753 default:
1754 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1755 break;
1757 if (rc)
1758 return -1;
1760 return nsegs;
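/*
 * Typical caller pattern (a sketch only; the real caller is the chunk
 * marshaling code in rpc_rdma.c and differs in detail, and "n" here is
 * purely illustrative):
 *
 *	n = rpcrdma_register_external(seg, nsegs, writing, r_xprt);
 *	if (n < 0)
 *		return -EIO;	// fall back or fail the marshal
 *	// advertise seg->mr_rkey, seg->mr_base, seg->mr_len in the read
 *	// or write chunk list, then after the RPC completes:
 *	rpcrdma_deregister_external(seg, r_xprt, NULL);
 *
 * A return of -1 means the strategy-specific helper failed; otherwise the
 * return value is the number of segments actually covered.
 */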
1763 int
1764 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1765 struct rpcrdma_xprt *r_xprt, void *r)
1767 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1768 int nsegs = seg->mr_nsegs, rc;
1770 switch (ia->ri_memreg_strategy) {
1772 #if RPCRDMA_PERSISTENT_REGISTRATION
1773 case RPCRDMA_ALLPHYSICAL:
1774 BUG_ON(nsegs != 1);
1775 rpcrdma_unmap_one(ia, seg);
1776 rc = 0;
1777 break;
1778 #endif
1780 case RPCRDMA_FRMR:
1781 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1782 break;
1784 case RPCRDMA_MTHCAFMR:
1785 rc = rpcrdma_deregister_fmr_external(seg, ia);
1786 break;
1788 case RPCRDMA_MEMWINDOWS_ASYNC:
1789 case RPCRDMA_MEMWINDOWS:
1790 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1791 break;
1793 default:
1794 rc = rpcrdma_deregister_default_external(seg, ia);
1795 break;
1797 if (r) {
1798 struct rpcrdma_rep *rep = r;
1799 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1800 rep->rr_func = NULL;
1801 func(rep); /* dereg done, callback now */
1803 return nsegs;
1807 * Prepost any receive buffer, then post send.
1809 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1811 int
1812 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1813 struct rpcrdma_ep *ep,
1814 struct rpcrdma_req *req)
1816 struct ib_send_wr send_wr, *send_wr_fail;
1817 struct rpcrdma_rep *rep = req->rl_reply;
1818 int rc;
1820 if (rep) {
1821 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1822 if (rc)
1823 goto out;
1824 req->rl_reply = NULL;
1827 send_wr.next = NULL;
1828 send_wr.wr_id = 0ULL; /* no send cookie */
1829 send_wr.sg_list = req->rl_send_iov;
1830 send_wr.num_sge = req->rl_niovs;
1831 send_wr.opcode = IB_WR_SEND;
1832 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1833 ib_dma_sync_single_for_device(ia->ri_id->device,
1834 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1835 DMA_TO_DEVICE);
1836 ib_dma_sync_single_for_device(ia->ri_id->device,
1837 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1838 DMA_TO_DEVICE);
1839 ib_dma_sync_single_for_device(ia->ri_id->device,
1840 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1841 DMA_TO_DEVICE);
1843 if (DECR_CQCOUNT(ep) > 0)
1844 send_wr.send_flags = 0;
1845 else { /* Provider must take a send completion every now and then */
1846 INIT_CQCOUNT(ep);
1847 send_wr.send_flags = IB_SEND_SIGNALED;
1850 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1851 if (rc)
1852 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1853 rc);
1854 out:
1855 return rc;
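/*
 * Send-completion throttling: sends are normally posted unsignaled, and
 * DECR_CQCOUNT()/INIT_CQCOUNT() (defined in xprt_rdma.h) count down from
 * ep->rep_cqinit so that roughly every rep_cqinit-th work request goes out
 * with IB_SEND_SIGNALED.  That periodic completion is what lets the
 * provider retire the accumulated unsignaled send WQEs; without it the
 * send queue would eventually fill.
 */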
1859 * (Re)post a receive buffer.
1861 int
1862 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1863 struct rpcrdma_ep *ep,
1864 struct rpcrdma_rep *rep)
1866 struct ib_recv_wr recv_wr, *recv_wr_fail;
1867 int rc;
1869 recv_wr.next = NULL;
1870 recv_wr.wr_id = (u64) (unsigned long) rep;
1871 recv_wr.sg_list = &rep->rr_iov;
1872 recv_wr.num_sge = 1;
1874 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1875 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1877 DECR_CQCOUNT(ep);
1878 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1880 if (rc)
1881 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1882 rc);
1883 return rc;