/*
 * Copyright(c) 2015, 2016 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/io.h>
#include <rdma/rdma_vt.h>
#include <rdma/rdmavt_qp.h>

#include "hfi.h"
#include "qp.h"
#include "verbs_txreq.h"
#include "trace.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x

/**
 * hfi1_add_retry_timer - add/start a retry timer
 * @qp: the QP
 *
 * add a retry timer on the QP
 */
static inline void hfi1_add_retry_timer(struct rvt_qp *qp)
{
	struct ib_qp *ibqp = &qp->ibqp;
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);

	qp->s_flags |= RVT_S_TIMER;
	/* 4.096 usec. * (1 << qp->timeout) */
	qp->s_timer.expires = jiffies + qp->timeout_jiffies +
			      rdi->busy_jiffies;
	add_timer(&qp->s_timer);
}
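
/*
 * Illustrative note (added for exposition, not driver logic): the retry
 * interval above is the IBTA timeout of 4.096 usec * 2^qp->timeout, which
 * rdmavt pre-converts into qp->timeout_jiffies, roughly
 *
 *	usecs_to_jiffies((4096UL * (1UL << qp->timeout)) / 1000UL)
 *
 * e.g. timeout = 14 gives ~67 ms.  rdi->busy_jiffies then pads the wait
 * as the device gets busier, so heavily loaded nodes retry less eagerly.
 */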

/**
 * hfi1_add_rnr_timer - add/start an rnr timer
 * @qp: the QP
 * @to: timeout in usecs
 *
 * add an rnr timer on the QP
 */
void hfi1_add_rnr_timer(struct rvt_qp *qp, u32 to)
{
	struct hfi1_qp_priv *priv = qp->priv;

	qp->s_flags |= RVT_S_WAIT_RNR;
	qp->s_timer.expires = jiffies + usecs_to_jiffies(to);
	add_timer(&priv->s_rnr_timer);
}

/**
 * hfi1_mod_retry_timer - mod a retry timer
 * @qp: the QP
 *
 * Modify a potentially already running retry timer
 */
static inline void hfi1_mod_retry_timer(struct rvt_qp *qp)
{
	struct ib_qp *ibqp = &qp->ibqp;
	struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);

	qp->s_flags |= RVT_S_TIMER;
	/* 4.096 usec. * (1 << qp->timeout) */
	mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies +
		  rdi->busy_jiffies);
}

/**
 * hfi1_stop_retry_timer - stop a retry timer
 * @qp: the QP
 *
 * stop a retry timer and return if the timer
 * had been pending.
 */
static inline int hfi1_stop_retry_timer(struct rvt_qp *qp)
{
	int rval = 0;

	/* Remove QP from retry */
	if (qp->s_flags & RVT_S_TIMER) {
		qp->s_flags &= ~RVT_S_TIMER;
		rval = del_timer(&qp->s_timer);
	}
	return rval;
}

/**
 * hfi1_stop_rc_timers - stop all timers
 * @qp: the QP
 *
 * stop any pending timers
 */
void hfi1_stop_rc_timers(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv = qp->priv;

	/* Remove QP from all timers */
	if (qp->s_flags & (RVT_S_TIMER | RVT_S_WAIT_RNR)) {
		qp->s_flags &= ~(RVT_S_TIMER | RVT_S_WAIT_RNR);
		del_timer(&qp->s_timer);
		del_timer(&priv->s_rnr_timer);
	}
}

/**
 * hfi1_stop_rnr_timer - stop an rnr timer
 * @qp: the QP
 *
 * stop an rnr timer and return if the timer
 * had been pending.
 */
static inline int hfi1_stop_rnr_timer(struct rvt_qp *qp)
{
	int rval = 0;
	struct hfi1_qp_priv *priv = qp->priv;

	/* Remove QP from rnr timer */
	if (qp->s_flags & RVT_S_WAIT_RNR) {
		qp->s_flags &= ~RVT_S_WAIT_RNR;
		rval = del_timer(&priv->s_rnr_timer);
	}
	return rval;
}

/**
 * hfi1_del_timers_sync - wait for any timeout routines to exit
 * @qp: the QP
 */
void hfi1_del_timers_sync(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv = qp->priv;

	del_timer_sync(&qp->s_timer);
	del_timer_sync(&priv->s_rnr_timer);
}

/* only opcode mask for adaptive pio */
const u32 rc_only_opcode =
	BIT(OP(SEND_ONLY) & 0x1f) |
	BIT(OP(SEND_ONLY_WITH_IMMEDIATE) & 0x1f) |
	BIT(OP(RDMA_WRITE_ONLY) & 0x1f) |
	BIT(OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE) & 0x1f) |
	BIT(OP(RDMA_READ_REQUEST) & 0x1f) |
	BIT(OP(ACKNOWLEDGE) & 0x1f) |
	BIT(OP(ATOMIC_ACKNOWLEDGE) & 0x1f) |
	BIT(OP(COMPARE_SWAP) & 0x1f) |
	BIT(OP(FETCH_ADD) & 0x1f);
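
/*
 * Illustrative note (exposition only): RC opcodes occupy 0x00-0x1f, so the
 * low five bits identify the operation and the whole set fits in one u32.
 * A sketch of the membership test this mask enables:
 *
 *	single_packet = rc_only_opcode & BIT(opcode & 0x1f);
 *
 * which is nonzero exactly for the "only", ack, and atomic opcodes listed
 * above -- the packets eligible for adaptive pio.
 */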

static u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = delta_psn(psn, wqe->psn) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ss->total_len = wqe->length;
	hfi1_skip_sge(ss, len, 0);
	return wqe->length - len;
}
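
/*
 * Worked example (exposition only): restarting at psn == wqe->psn + 3 with
 * a 4096-byte path MTU makes len = 3 * 4096 = 12288, so hfi1_skip_sge()
 * advances 12288 bytes into the SGE list and the caller still has
 * wqe->length - 12288 bytes to (re)send.
 */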

/**
 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @ps: the xmit packet state
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		       struct hfi1_other_headers *ohdr,
		       struct hfi1_pkt_state *ps)
{
	struct rvt_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	int middle = 0;
	u32 pmtu = qp->pmtu;
	struct hfi1_qp_priv *priv = qp->priv;

	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto bail;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > HFI1_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
		/* FALLTHROUGH */
	case OP(SEND_ONLY):
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & RVT_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			ps->s_txreq->mr = e->rdma_sge.mr;
			if (ps->s_txreq->mr)
				rvt_get_mr(ps->s_txreq->mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = hfi1_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = hfi1_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = mask_psn(e->psn);
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		qp->s_cur_sge = &qp->s_ack_rdma_sge;
		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
		if (ps->s_txreq->mr)
			rvt_get_mr(ps->s_txreq->mr);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
		} else {
			ohdr->u.aeth = hfi1_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
		break;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~RVT_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
					    (qp->s_nak_state <<
					     HFI1_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = hfi1_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = mask_psn(qp->s_ack_psn);
	}
	qp->s_rdma_ack_cnt++;
	qp->s_hdrwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	qp->s_cur_size = len;
	hfi1_make_ruc_header(qp, ohdr, bth0, bth2, middle, ps);
	/* pbc */
	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
	return 1;

bail:
	qp->s_ack_state = OP(ACKNOWLEDGE);
	/*
	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
	 * RVT_S_RESP_PENDING
	 */
	smp_wmb();
	qp->s_flags &= ~(RVT_S_RESP_PENDING
				| RVT_S_ACK_PENDING
				| RVT_S_AHG_VALID);
	return 0;
}
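
/*
 * Reference note (exposition only): s_ack_queue is a small ring indexed by
 * s_tail_ack_queue (next response to construct) and r_head_ack_queue (next
 * free slot, advanced by the receive side).  As the comments above note,
 * the tail only advances once the final packet of an entry's response has
 * been built, so a duplicate RDMA read or atomic request can still find
 * its entry and have the response replayed.
 */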

/**
 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 *
 * Assumes s_lock is held.
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
	struct hfi1_other_headers *ohdr;
	struct rvt_sge_state *ss;
	struct rvt_swqe *wqe;
	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	u32 hwords = 5;
	u32 len;
	u32 bth0 = 0;
	u32 bth2;
	u32 pmtu = qp->pmtu;
	char newreq;
	int middle = 0;
	int delta;

	ps->s_txreq = get_txreq(ps->dev, qp);
	if (IS_ERR(ps->s_txreq))
		goto bail_no_tx;

	ohdr = &ps->s_txreq->phdr.hdr.u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &ps->s_txreq->phdr.hdr.u.l.oth;

	/* Sending responses has higher priority over sending requests. */
	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
	    make_rc_ack(dev, qp, ohdr, ps))
		return 1;

	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		smp_read_barrier_depends(); /* see post_one_send() */
		if (qp->s_last == ACCESS_ONCE(qp->s_head))
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (iowait_sdma_pending(&priv->s_iowait)) {
			qp->s_flags |= RVT_S_WAIT_DMA;
			goto bail;
		}
		clear_ahg(qp);
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
				   IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done_free_tx;
	}

	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK))
		goto bail;

	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= RVT_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
	}

	/* Send a request. */
	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head) {
				clear_ahg(qp);
				goto bail;
			}
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_FENCE;
				goto bail;
			}
			newreq = 1;
			qp->s_psn = wqe->psn;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = mask_psn(qp->s_psn);
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
			    cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			if (len > pmtu) {
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND) {
				qp->s_state = OP(SEND_ONLY);
			} else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT) &&
			    cmp_msn(wqe->ssn, qp->s_lsn + 1) > 0) {
				qp->s_flags |= RVT_S_WAIT_SSN_CREDIT;
				goto bail;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->rdma_wr.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			if (len > pmtu) {
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
				qp->s_state = OP(RDMA_WRITE_ONLY);
			} else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= RVT_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->rdma_wr.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= RVT_S_WAIT_RDMAR;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->atomic_wr.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->atomic_wr.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->atomic_wr.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->atomic_wr.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->atomic_wr.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->atomic_wr.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_sge.total_len = wqe->length;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else
			qp->s_psn++;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND) {
			qp->s_state = OP(SEND_LAST);
		} else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate a RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
			qp->s_state = OP(RDMA_WRITE_LAST);
		} else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate a RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->rdma_wr.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->rdma_wr.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	if (delta && delta % HFI1_PSN_CREDIT == 0)
		bth2 |= IB_BTH_REQ_ACK;
	if (qp->s_flags & RVT_S_SEND_ONE) {
		qp->s_flags &= ~RVT_S_SEND_ONE;
		qp->s_flags |= RVT_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth2,
		middle,
		ps);
	/* pbc */
	ps->s_txreq->hdr_dwords = qp->s_hdrwords + 2;
	return 1;

done_free_tx:
	hfi1_put_txreq(ps->s_txreq);
	ps->s_txreq = NULL;
	return 1;

bail:
	hfi1_put_txreq(ps->s_txreq);

bail_no_tx:
	ps->s_txreq = NULL;
	qp->s_flags &= ~RVT_S_BUSY;
	qp->s_hdrwords = 0;
	return 0;
}

/**
 * hfi1_send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
void hfi1_send_rc_ack(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp,
		      int is_fecn)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	u64 pbc, pbc_flags = 0;
	u16 lrh0;
	u16 sc5;
	u32 bth0;
	u32 hwords;
	u32 vl, plen;
	struct send_context *sc;
	struct pio_buf *pbuf;
	struct hfi1_ib_header hdr;
	struct hfi1_other_headers *ohdr;
	unsigned long flags;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if (qp->s_flags & RVT_S_RESP_PENDING)
		goto queue_ack;

	/* Ensure s_rdma_ack_cnt changes are committed */
	smp_read_barrier_depends();
	if (qp->s_rdma_ack_cnt)
		goto queue_ack;

	/* Construct the header */
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += hfi1_make_grh(ibp, &hdr.u.l.grh,
					&qp->remote_ah_attr.grh, hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = HFI1_LRH_GRH;
	} else {
		ohdr = &hdr.u.oth;
		lrh0 = HFI1_LRH_BTH;
	}
	/* read pkey_index w/o lock (its atomic) */
	bth0 = hfi1_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & HFI1_MSN_MASK) |
					    (qp->r_nak_state <<
					     HFI1_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = hfi1_compute_aeth(qp);
	sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
	pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);
	lrh0 |= (sc5 & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[1] |= cpu_to_be32((!!is_fecn) << HFI1_BECN_SHIFT);
	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));

	/* Don't try to send ACKs if the link isn't ACTIVE */
	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
		return;

	sc = rcd->sc;
	plen = 2 /* PBC */ + hwords;
	vl = sc_to_vlt(ppd->dd, sc5);
	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps, vl, plen);

	pbuf = sc_buffer_alloc(sc, plen, NULL, NULL);
	if (!pbuf) {
		/*
		 * We have no room to send at the moment.  Pass
		 * responsibility for sending the ACK to the send tasklet
		 * so that when enough buffer space becomes available,
		 * the ACK is sent ahead of other outgoing packets.
		 */
		goto queue_ack;
	}

	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device), &hdr);

	/* write the pbc and data */
	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc, &hdr, hwords);

	return;

queue_ack:
	this_cpu_inc(*ibp->rvp.rc_qacks);
	spin_lock_irqsave(&qp->s_lock, flags);
	qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	if (is_fecn)
		qp->s_flags |= RVT_S_ECN;

	/* Schedule the send tasklet. */
	hfi1_schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
}
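
/*
 * Reference sketch of the AETH word built above (per IBTA 9.7.5.2.3,
 * exposition only): bits 31:29 carry the ACK type (0 = ACK, 1 = RNR NAK,
 * 3 = NAK), bits 28:24 (HFI1_AETH_CREDIT_SHIFT/_MASK) carry the credit
 * count or NAK/RNR code, and bits 23:0 (HFI1_MSN_MASK) carry the MSN.
 * hfi1_compute_aeth() produces the credit form; the r_nak_state branch
 * overlays a NAK code instead.
 */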

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct rvt_qp *qp, u32 psn)
{
	u32 n = qp->s_acked;
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (cmp_psn(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = rvt_get_swqe_ptr(qp, n);
		diff = cmp_psn(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See hfi1_make_rc_req().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since its only
		 * one PSN per req.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
	/*
	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
	 * asynchronously before the send tasklet can get scheduled.
	 * Doing it in hfi1_make_rc_req() is too late.
	 */
	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
		qp->s_flags |= RVT_S_WAIT_PSN;
	qp->s_flags &= ~RVT_S_AHG_VALID;
}
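
/*
 * Reference note for the switch above (exposition only): the three
 * RDMA_READ_RESPONSE_* opcodes can never be legitimate s_state values on
 * the requester side, so reset_psn() borrows them as restart markers that
 * hfi1_make_rc_req() decodes back into "resume mid-request":
 *
 *	SEND(_WITH_IMM)       -> OP(RDMA_READ_RESPONSE_FIRST)
 *	RDMA_WRITE(_WITH_IMM) -> OP(RDMA_READ_RESPONSE_LAST)
 *	RDMA_READ             -> OP(RDMA_READ_RESPONSE_MIDDLE)
 */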

/*
 * Back up requester to resend the last un-ACKed request.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 */
static void restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	struct hfi1_ibport *ibp;

	if (qp->s_retry == 0) {
		if (qp->s_mig_state == IB_MIG_ARMED) {
			hfi1_migrate_qp(qp);
			qp->s_retry = qp->s_retry_cnt;
		} else if (qp->s_last == qp->s_acked) {
			hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
			rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			return;
		} else { /* need to handle delayed completion */
			return;
		}
	} else {
		qp->s_retry--;
	}

	ibp = to_iport(qp->ibqp.device, qp->port_num);
	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		ibp->rvp.n_rc_resends++;
	else
		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);

	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
			 RVT_S_WAIT_ACK);
	if (wait)
		qp->s_flags |= RVT_S_SEND_ONE;
	reset_psn(qp, psn);
}

/*
 * This is called from s_timer for missing responses.
 */
void hfi1_rc_timeout(unsigned long arg)
{
	struct rvt_qp *qp = (struct rvt_qp *)arg;
	struct hfi1_ibport *ibp;
	unsigned long flags;

	spin_lock_irqsave(&qp->r_lock, flags);
	spin_lock(&qp->s_lock);
	if (qp->s_flags & RVT_S_TIMER) {
		ibp = to_iport(qp->ibqp.device, qp->port_num);
		ibp->rvp.n_rc_timeouts++;
		qp->s_flags &= ~RVT_S_TIMER;
		del_timer(&qp->s_timer);
		trace_hfi1_rc_timeout(qp, qp->s_last_psn + 1);
		restart_rc(qp, qp->s_last_psn + 1, 1);
		hfi1_schedule_send(qp);
	}
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, flags);
}

/*
 * This is called from s_timer for RNR timeouts.
 */
void hfi1_rc_rnr_retry(unsigned long arg)
{
	struct rvt_qp *qp = (struct rvt_qp *)arg;
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	hfi1_stop_rnr_timer(qp);
	hfi1_schedule_send(qp);
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads are present.
 */
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
	struct rvt_swqe *wqe;
	u32 n = qp->s_last;

	/* Find the work request corresponding to the given PSN. */
	for (;;) {
		wqe = rvt_get_swqe_ptr(qp, n);
		if (cmp_psn(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
			break;
		}
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
	}
}

/*
 * This should be called with the QP s_lock held and interrupts disabled.
 */
void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_ib_header *hdr)
{
	struct hfi1_other_headers *ohdr;
	struct rvt_swqe *wqe;
	struct ib_wc wc;
	unsigned i;
	u32 opcode;
	u32 psn;

	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
		return;

	/* Find out where the BTH is */
	if ((be16_to_cpu(hdr->lrh[0]) & 3) == HFI1_LRH_BTH)
		ohdr = &hdr->u.oth;
	else
		ohdr = &hdr->u.l.oth;

	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = be32_to_cpu(ohdr->bth[2]);
	reset_sending_psn(qp, psn);

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
	    !(qp->s_flags &
	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		hfi1_add_retry_timer(qp);

	while (qp->s_last != qp->s_acked) {
		u32 s_last;

		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
			break;
		s_last = qp->s_last;
		if (++s_last >= qp->s_size)
			s_last = 0;
		qp->s_last = s_last;
		/* see post_send() */
		barrier();
		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct rvt_sge *sge = &wqe->sg_list[i];

			rvt_put_mr(sge->mr);
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			memset(&wc, 0, sizeof(wc));
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
		}
	}
	/*
	 * If we were waiting for sends to complete before re-sending,
	 * and they are now complete, restart sending.
	 */
	trace_hfi1_rc_sendcomplete(qp, psn);
	if (qp->s_flags & RVT_S_WAIT_PSN &&
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		qp->s_flags &= ~RVT_S_WAIT_PSN;
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
		hfi1_schedule_send(qp);
	}
}

static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
{
	qp->s_last_psn = psn;
}

/*
 * Generate a SWQE completion.
 * This is similar to hfi1_send_complete but has to check to be sure
 * that the SGEs are not being referenced if the SWQE is being resent.
 */
static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
					 struct rvt_swqe *wqe,
					 struct hfi1_ibport *ibp)
{
	struct ib_wc wc;
	unsigned i;

	/*
	 * Don't decrement refcount and don't generate a
	 * completion if the SWQE is being resent until the send
	 * is finished.
	 */
	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		u32 s_last;

		for (i = 0; i < wqe->wr.num_sge; i++) {
			struct rvt_sge *sge = &wqe->sg_list[i];

			rvt_put_mr(sge->mr);
		}
		s_last = qp->s_last;
		if (++s_last >= qp->s_size)
			s_last = 0;
		qp->s_last = s_last;
		/* see post_send() */
		barrier();
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & RVT_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			memset(&wc, 0, sizeof(wc));
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_hfi1_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.qp = &qp->ibqp;
			rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, 0);
		}
	} else {
		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);

		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
		/*
		 * If send progress not running attempt to progress
		 * SDMA queue.
		 */
		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
			struct sdma_engine *engine;
			u8 sc5;

			/* For now use sc to find engine */
			sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
			engine = qp_to_sdma_engine(qp, sc5);
			sdma_engine_progress_schedule(engine);
		}
	}

	qp->s_retry = qp->s_retry_cnt;
	update_last_psn(qp, wqe->lpsn);

	/*
	 * If we are completing a request which is in the process of
	 * being resent, we can stop re-sending it since we know the
	 * responder has already seen it.
	 */
	if (qp->s_acked == qp->s_cur) {
		if (++qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_acked = qp->s_cur;
		wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
		if (qp->s_acked != qp->s_tail) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
	} else {
		if (++qp->s_acked >= qp->s_size)
			qp->s_acked = 0;
		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
			qp->s_draining = 0;
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	}
	return wqe;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * May be called at interrupt level, with the QP s_lock held.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
		     u64 val, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp;
	enum ib_wc_status status;
	struct rvt_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;
	unsigned long to;

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	ibp = to_iport(qp->ibqp.device, qp->port_num);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail_stop;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/* Retry this request. */
			if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
				qp->r_flags |= RVT_R_RDMAR_SEQ;
				restart_rc(qp, qp->s_last_psn + 1, 0);
				if (list_empty(&qp->rspwait)) {
					qp->r_flags |= RVT_R_RSP_SEND;
					atomic_inc(&qp->refcount);
					list_add_tail(&qp->rspwait,
						      &rcd->qp_wait_list);
				}
			}
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail_stop;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
			u64 *vaddr = wqe->sg_list[0].vaddr;
			*vaddr = val;
		}
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~(RVT_S_WAIT_FENCE |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			} else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
				qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			}
		}
		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
	}

	switch (aeth >> 29) {
	case 0:         /* ACK */
		this_cpu_inc(*ibp->rvp.rc_acks);
		if (qp->s_acked != qp->s_tail) {
			/*
			 * We are expecting more ACKs so
			 * mod the retry timer.
			 */
			hfi1_mod_retry_timer(qp);
			/*
			 * We can stop re-sending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (cmp_psn(qp->s_psn, psn) <= 0)
				reset_psn(qp, psn + 1);
		} else {
			/* No more acks - kill all timers */
			hfi1_stop_rc_timers(qp);
			if (cmp_psn(qp->s_psn, psn) <= 0) {
				qp->s_state = OP(SEND_LAST);
				qp->s_psn = psn + 1;
			}
		}
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}
		hfi1_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		return 1;

	case 1:         /* RNR NAK */
		ibp->rvp.n_rnr_naks++;
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		if (qp->s_flags & RVT_S_WAIT_RNR)
			goto bail_stop;
		if (qp->s_rnr_retry == 0) {
			status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);

		reset_psn(qp, psn);

		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
		hfi1_stop_rc_timers(qp);
		to =
			ib_hfi1_rnr_table[(aeth >> HFI1_AETH_CREDIT_SHIFT) &
					  HFI1_AETH_CREDIT_MASK];
		hfi1_add_rnr_timer(qp, to);
		return 0;

	case 3:         /* NAK */
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> HFI1_AETH_CREDIT_SHIFT) &
			HFI1_AETH_CREDIT_MASK) {
		case 0: /* PSN sequence error */
			ibp->rvp.n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			restart_rc(qp, psn, 0);
			hfi1_schedule_send(qp);
			break;

		case 1: /* Invalid Request */
			status = IB_WC_REM_INV_REQ_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 2: /* Remote Access Error */
			status = IB_WC_REM_ACCESS_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 3: /* Remote Operation Error */
			status = IB_WC_REM_OP_ERR;
			ibp->rvp.n_other_naks++;
class_b:
			if (qp->s_last == qp->s_acked) {
				hfi1_send_complete(qp, wqe, status);
				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_retry = qp->s_retry_cnt;
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail_stop;

	default:                /* 2: reserved */
reserved:
		/* Ignore reserved NAK codes. */
		goto bail_stop;
	}
	/* cannot be reached  */
bail_stop:
	hfi1_stop_rc_timers(qp);
	return ret;
}
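
/*
 * Reference note for the NAK handling above (exposition only): the switch
 * on bits 28:24 of the AETH distinguishes 0 = PSN sequence error (back up
 * and retry from the responder's expected PSN), 1 = invalid request,
 * 2 = remote access error, and 3 = remote operation error; codes 1-3 are
 * fatal "class B" errors that complete the WQE in error and flush the QP.
 */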

/*
 * We have seen an out of sequence RDMA read middle or last packet.
 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
 */
static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
			 struct hfi1_ctxtdata *rcd)
{
	struct rvt_swqe *wqe;

	/* Remove QP from retry timer */
	hfi1_stop_rc_timers(qp);

	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);

	while (cmp_psn(psn, wqe->lpsn) > 0) {
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			break;
		wqe = do_rc_completion(qp, wqe, ibp);
	}

	ibp->rvp.n_rdma_seq++;
	qp->r_flags |= RVT_R_RDMAR_SEQ;
	restart_rc(qp, qp->s_last_psn + 1, 0);
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= RVT_R_RSP_SEND;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}

/**
 * rc_rcv_resp - process an incoming RC response packet
 * @ibp: the port this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static void rc_rcv_resp(struct hfi1_ibport *ibp,
			struct hfi1_other_headers *ohdr,
			void *data, u32 tlen, struct rvt_qp *qp,
			u32 opcode, u32 psn, u32 hdrsize, u32 pmtu,
			struct hfi1_ctxtdata *rcd)
{
	struct rvt_swqe *wqe;
	enum ib_wc_status status;
	unsigned long flags;
	int diff;
	u32 pad;
	u32 aeth;
	u64 val;

	spin_lock_irqsave(&qp->s_lock, flags);

	trace_hfi1_rc_ack(qp, psn);

	/* Ignore invalid responses. */
	smp_read_barrier_depends(); /* see post_one_send */
	if (cmp_psn(psn, ACCESS_ONCE(qp->s_next_psn)) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = cmp_psn(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			aeth = be32_to_cpu(ohdr->u.aeth);
			if ((aeth >> 29) == 0)
				hfi1_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	/*
	 * Skip everything other than the PSN we expect, if we are waiting
	 * for a reply to a restarted RDMA read or atomic op.
	 */
	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
	}

	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	status = IB_WC_SUCCESS;

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			__be32 *p = ohdr->u.at.atomic_ack_eth;

			val = ((u64)be32_to_cpu(p[0]) << 32) |
				be32_to_cpu(p[1]);
		} else {
			val = 0;
		}
		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/*
		 * We got a response so update the timeout.
		 * 4.096 usec. * (1 << qp->timeout)
		 */
		qp->s_flags |= RVT_S_TIMER;
		mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}

		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
			qp->s_retry = qp->s_retry_cnt;

		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0, 0);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
			goto ack_done;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 4)))
			goto ack_len_err;
read_last:
		tlen -= hdrsize + pad + 4;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		aeth = be32_to_cpu(ohdr->u.aeth);
		hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0, 0);
		WARN_ON(qp->s_rdma_read_sge.num_sge);
		(void)do_rc_ack(qp, aeth, psn,
				OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
		goto ack_done;
	}

ack_op_err:
	status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_seq_err:
	rdma_seq_err(qp, ibp, psn, rcd);
	goto ack_done;

ack_len_err:
	status = IB_WC_LOC_LEN_ERR;
ack_err:
	if (qp->s_last == qp->s_acked) {
		hfi1_send_complete(qp, wqe, status);
		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	}
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}

static inline void rc_defered_ack(struct hfi1_ctxtdata *rcd,
				  struct rvt_qp *qp)
{
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= RVT_R_RSP_NAK;
		atomic_inc(&qp->refcount);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}

static inline void rc_cancel_ack(struct rvt_qp *qp)
{
	struct hfi1_qp_priv *priv = qp->priv;

	priv->r_adefered = 0;
	if (list_empty(&qp->rspwait))
		return;
	list_del_init(&qp->rspwait);
	qp->r_flags &= ~RVT_R_RSP_NAK;
	if (atomic_dec_and_test(&qp->refcount))
		wake_up(&qp->wait);
}

/**
 * rc_rcv_error - process an incoming duplicate or error RC packet
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 *
 * This is called from hfi1_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static noinline int rc_rcv_error(struct hfi1_other_headers *ohdr, void *data,
				 struct rvt_qp *qp, u32 opcode, u32 psn,
				 int diff, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	struct rvt_ack_entry *e;
	unsigned long flags;
	u8 i, prev;
	int old_req;

	trace_hfi1_rc_rcv_error(qp, psn);
	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			ibp->rvp.n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			rc_defered_ack(rcd, qp);
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 * old_req is true if there is an older response that is scheduled
	 * to be sent before sending this one.
	 */
	e = NULL;
	old_req = 1;
	ibp->rvp.n_rc_dupreq++;

	spin_lock_irqsave(&qp->s_lock, flags);

	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = HFI1_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (cmp_psn(psn, e->psn) >= 0) {
			if (prev == qp->s_tail_ack_queue &&
			    cmp_psn(psn, e->lpsn) <= 0)
				old_req = 0;
			break;
		}
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
			goto unlock_done;
		/* RETH comes after BTH */
		reth = &ohdr->u.rc.reth;
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = delta_psn(psn, e->psn) * qp->pmtu;
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len != e->rdma_sge.sge_length))
			goto unlock_done;
		if (e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
					 IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->psn = psn;
		if (old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8)opcode || old_req)
			goto unlock_done;
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		/*
		 * Ignore this operation if it doesn't request an ACK
		 * or an earlier RDMA read or atomic is going to be resent.
		 */
		if (!(psn & IB_BTH_REQ_ACK) || old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}

		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags |= RVT_S_RESP_PENDING;
	qp->r_nak_state = 0;
	hfi1_schedule_send(qp);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}

void hfi1_rc_error(struct rvt_qp *qp, enum ib_wc_status err)
{
	unsigned long flags;
	int lastwqe;

	spin_lock_irqsave(&qp->s_lock, flags);
	lastwqe = rvt_error_qp(qp, err);
	spin_unlock_irqrestore(&qp->s_lock, flags);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}

static inline void update_ack_queue(struct rvt_qp *qp, unsigned n)
{
	unsigned next;

	next = n + 1;
	if (next > HFI1_MAX_RDMA_ATOMIC)
		next = 0;
	qp->s_tail_ack_queue = next;
	qp->s_ack_state = OP(ACKNOWLEDGE);
}
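
/*
 * Illustrative note (exposition only): s_ack_queue holds
 * HFI1_MAX_RDMA_ATOMIC + 1 entries so that r_head_ack_queue ==
 * s_tail_ack_queue unambiguously means "empty" while still allowing
 * HFI1_MAX_RDMA_ATOMIC outstanding responses -- hence the "> not >="
 * wrap test above and in hfi1_rc_rcv().
 */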

static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
			  u32 lqpn, u32 rqpn, u8 svc_type)
{
	struct opa_hfi1_cong_log_event_internal *cc_event;
	unsigned long flags;

	if (sl >= OPA_MAX_SLS)
		return;

	spin_lock_irqsave(&ppd->cc_log_lock, flags);

	ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
	ppd->threshold_event_counter++;

	cc_event = &ppd->cc_events[ppd->cc_log_idx++];
	if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
		ppd->cc_log_idx = 0;
	cc_event->lqpn = lqpn & RVT_QPN_MASK;
	cc_event->rqpn = rqpn & RVT_QPN_MASK;
	cc_event->sl = sl;
	cc_event->svc_type = svc_type;
	cc_event->rlid = rlid;
	/* keep timestamp in units of 1.024 usec */
	cc_event->timestamp = ktime_to_ns(ktime_get()) / 1024;

	spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
}

void process_becn(struct hfi1_pportdata *ppd, u8 sl, u16 rlid, u32 lqpn,
		  u32 rqpn, u8 svc_type)
{
	struct cca_timer *cca_timer;
	u16 ccti, ccti_incr, ccti_timer, ccti_limit;
	u8 trigger_threshold;
	struct cc_state *cc_state;
	unsigned long flags;

	if (sl >= OPA_MAX_SLS)
		return;

	cc_state = get_cc_state(ppd);

	if (!cc_state)
		return;

	/*
	 * 1) increase CCTI (for this SL)
	 * 2) select IPG (i.e., call set_link_ipg())
	 * 3) start timer
	 */
	ccti_limit = cc_state->cct.ccti_limit;
	ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
	trigger_threshold =
		cc_state->cong_setting.entries[sl].trigger_threshold;

	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

	cca_timer = &ppd->cca_timer[sl];
	if (cca_timer->ccti < ccti_limit) {
		if (cca_timer->ccti + ccti_incr <= ccti_limit)
			cca_timer->ccti += ccti_incr;
		else
			cca_timer->ccti = ccti_limit;
		set_link_ipg(ppd);
	}

	ccti = cca_timer->ccti;

	if (!hrtimer_active(&cca_timer->hrtimer)) {
		/* ccti_timer is in units of 1.024 usec */
		unsigned long nsec = 1024 * ccti_timer;

		hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
			      HRTIMER_MODE_REL);
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);

	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
}
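
/*
 * Worked example for the units above (exposition only): CCTI timer values
 * are kept in 1.024 usec ticks, so a ccti_timer of N arms the hrtimer for
 * N * 1024 ns; ccti_timer = 100 therefore fires after ~102.4 usec.
 */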
2068 * hfi1_rc_rcv - process an incoming RC packet
2069 * @rcd: the context pointer
2070 * @hdr: the header of this packet
2071 * @rcv_flags: flags relevant to rcv processing
2072 * @data: the packet data
2073 * @tlen: the packet length
2074 * @qp: the QP for this packet
2076 * This is called from qp_rcv() to process an incoming RC packet
2078 * May be called at interrupt level.
2080 void hfi1_rc_rcv(struct hfi1_packet
*packet
)
2082 struct hfi1_ctxtdata
*rcd
= packet
->rcd
;
2083 struct hfi1_ib_header
*hdr
= packet
->hdr
;
2084 u32 rcv_flags
= packet
->rcv_flags
;
2085 void *data
= packet
->ebuf
;
2086 u32 tlen
= packet
->tlen
;
2087 struct rvt_qp
*qp
= packet
->qp
;
2088 struct hfi1_ibport
*ibp
= to_iport(qp
->ibqp
.device
, qp
->port_num
);
2089 struct hfi1_pportdata
*ppd
= ppd_from_ibp(ibp
);
2090 struct hfi1_other_headers
*ohdr
= packet
->ohdr
;
2092 u32 hdrsize
= packet
->hlen
;
2096 u32 pmtu
= qp
->pmtu
;
2098 struct ib_reth
*reth
;
2099 unsigned long flags
;
2101 int ret
, is_fecn
= 0;
2104 bth0
= be32_to_cpu(ohdr
->bth
[0]);
2105 if (hfi1_ruc_check_hdr(ibp
, hdr
, rcv_flags
& HFI1_HAS_GRH
, qp
, bth0
))
2108 bth1
= be32_to_cpu(ohdr
->bth
[1]);
2109 if (unlikely(bth1
& (HFI1_BECN_SMASK
| HFI1_FECN_SMASK
))) {
2110 if (bth1
& HFI1_BECN_SMASK
) {
2111 u16 rlid
= qp
->remote_ah_attr
.dlid
;
2114 lqpn
= qp
->ibqp
.qp_num
;
2115 rqpn
= qp
->remote_qpn
;
2118 qp
->remote_ah_attr
.sl
,
2122 is_fecn
= bth1
& HFI1_FECN_SMASK
;
2125 psn
= be32_to_cpu(ohdr
->bth
[2]);
2126 opcode
= (bth0
>> 24) & 0xff;
2129 * Process responses (ACKs) before anything else. Note that the
2130 * packet sequence number will be for something in the send work
2131 * queue rather than the expected receive packet sequence number.
2132 * In other words, this QP is the requester.
2134 if (opcode
>= OP(RDMA_READ_RESPONSE_FIRST
) &&
2135 opcode
<= OP(ATOMIC_ACKNOWLEDGE
)) {
2136 rc_rcv_resp(ibp
, ohdr
, data
, tlen
, qp
, opcode
, psn
,
2137 hdrsize
, pmtu
, rcd
);
2143 /* Compute 24 bits worth of difference. */
2144 diff
= delta_psn(psn
, qp
->r_psn
);
2145 if (unlikely(diff
)) {
2146 if (rc_rcv_error(ohdr
, data
, qp
, opcode
, psn
, diff
, rcd
))
2151 /* Check for opcode sequence errors. */
2152 switch (qp
->r_state
) {
2153 case OP(SEND_FIRST
):
2154 case OP(SEND_MIDDLE
):
2155 if (opcode
== OP(SEND_MIDDLE
) ||
2156 opcode
== OP(SEND_LAST
) ||
2157 opcode
== OP(SEND_LAST_WITH_IMMEDIATE
))
2161 case OP(RDMA_WRITE_FIRST
):
2162 case OP(RDMA_WRITE_MIDDLE
):
2163 if (opcode
== OP(RDMA_WRITE_MIDDLE
) ||
2164 opcode
== OP(RDMA_WRITE_LAST
) ||
2165 opcode
== OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
))
2170 if (opcode
== OP(SEND_MIDDLE
) ||
2171 opcode
== OP(SEND_LAST
) ||
2172 opcode
== OP(SEND_LAST_WITH_IMMEDIATE
) ||
2173 opcode
== OP(RDMA_WRITE_MIDDLE
) ||
2174 opcode
== OP(RDMA_WRITE_LAST
) ||
2175 opcode
== OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
))
2178 * Note that it is up to the requester to not send a new
2179 * RDMA read or atomic operation before receiving an ACK
2180 * for the previous operation.
2185 if (qp
->state
== IB_QPS_RTR
&& !(qp
->r_flags
& RVT_R_COMM_EST
))
2188 /* OK, process the packet. */
2190 case OP(SEND_FIRST
):
2191 ret
= hfi1_rvt_get_rwqe(qp
, 0);
2198 case OP(SEND_MIDDLE
):
2199 case OP(RDMA_WRITE_MIDDLE
):
2201 /* Check for invalid length PMTU or posted rwqe len. */
2202 if (unlikely(tlen
!= (hdrsize
+ pmtu
+ 4)))
2204 qp
->r_rcv_len
+= pmtu
;
2205 if (unlikely(qp
->r_rcv_len
> qp
->r_len
))
2207 hfi1_copy_sge(&qp
->r_sge
, data
, pmtu
, 1, 0);
2210 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
):
2212 ret
= hfi1_rvt_get_rwqe(qp
, 1);
2220 case OP(SEND_ONLY_WITH_IMMEDIATE
):
2221 ret
= hfi1_rvt_get_rwqe(qp
, 0);
2227 if (opcode
== OP(SEND_ONLY
))
2228 goto no_immediate_data
;
2229 /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
2230 case OP(SEND_LAST_WITH_IMMEDIATE
):
2232 wc
.ex
.imm_data
= ohdr
->u
.imm_data
;
2233 wc
.wc_flags
= IB_WC_WITH_IMM
;
2235 case OP(RDMA_WRITE_LAST
):
2236 copy_last
= ibpd_to_rvtpd(qp
->ibqp
.pd
)->user
;
		/* Get the number of bytes the message was padded by. */
		pad = (bth0 >> 20) & 3;
		/* Check for invalid length. */
		/* LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		hfi1_copy_sge(&qp->r_sge, data, tlen, 1, copy_last);
		rvt_put_ss(&qp->r_sge);
		qp->r_msn++;
		if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
			break;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
		else
			wc.opcode = IB_WC_RECV;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.slid = qp->remote_ah_attr.dlid;
		/*
		 * It seems that IB mandates the presence of an SL in a
		 * work completion only for the UD transport (see section
		 * 11.4.2 of IBTA Vol. 1).
		 *
		 * However, the way the SL is chosen below is consistent
		 * with the way that IB/qib works and is trying to avoid
		 * introducing incompatibilities.
		 *
		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
		 */
		wc.sl = qp->remote_ah_attr.sl;
		/* zero fields that are N/A */
		wc.vendor_err = 0;
		wc.pkey_index = 0;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
			     (bth0 & IB_BTH_SOLICITED) != 0);
		break;
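	/*
	 * copy_last, passed to hfi1_copy_sge() above, presumably tunes how
	 * the tail of an RDMA write into user memory is copied; it is set
	 * only for RDMA_WRITE_LAST on user PDs and for RDMA_WRITE_ONLY
	 * below.
	 */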
	case OP(RDMA_WRITE_ONLY):
		copy_last = 1;
		/* fall through */
	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
			goto nack_inv;
		/* consume RWQE */
		reth = &ohdr->u.rc.reth;
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		qp->r_sge.sg_list = NULL;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
					 rkey, IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
			qp->r_sge.num_sge = 1;
		} else {
			qp->r_sge.num_sge = 0;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto no_immediate_data;
		ret = hfi1_rvt_get_rwqe(qp, 1);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		wc.ex.imm_data = ohdr->u.rc.imm_data;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;
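	/*
	 * Reads and atomics are not answered inline; each is parked in an
	 * entry of s_ack_queue and the response is generated later by the
	 * send engine once RVT_S_RESP_PENDING is set.  The queue holds
	 * HFI1_MAX_RDMA_ATOMIC + 1 entries, with r_head_ack_queue and
	 * s_tail_ack_queue forming a ring, which is why the wrap test
	 * below uses > rather than >=.
	 */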
	case OP(RDMA_READ_REQUEST): {
		struct rvt_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		/* s_ack_queue is size HFI1_MAX_RDMA_ATOMIC+1 so use > not >= */
		if (next > HFI1_MAX_RDMA_ATOMIC)
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		reth = &ohdr->u.rc.reth;
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					 rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc_unlck;
			/*
			 * Update the next expected PSN.  We add 1 later
			 * below, so only add the remainder here.
			 */
			if (len > pmtu)
				qp->r_psn += (len - 1) / pmtu;
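			/*
			 * E.g. a read of 2.5 * pmtu bytes spans three
			 * response packets: (len - 1) / pmtu == 2 here,
			 * and the r_psn++ on the common path below
			 * accounts for the last one.
			 */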
		} else {
			e->rdma_sge.mr = NULL;
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = qp->r_psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Schedule the send tasklet. */
		qp->s_flags |= RVT_S_RESP_PENDING;
		hfi1_schedule_send(qp);

		spin_unlock_irqrestore(&qp->s_lock, flags);
		if (is_fecn)
			goto send_ack;
		return;
	}
	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct rvt_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		if (next > HFI1_MAX_RDMA_ATOMIC)
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
			rvt_put_mr(e->rdma_sge.mr);
			e->rdma_sge.mr = NULL;
		}
		ateth = &ohdr->u.atomic_eth;
		vaddr = ((u64)be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv_unlck;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
					  vaddr, rkey,
					  IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc_unlck;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64)atomic64_add_return(sdata, maddr) - sdata :
			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
				     be64_to_cpu(ateth->compare_data),
				     sdata);
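		/*
		 * Both operations must return the *original* value to the
		 * requester: atomic64_add_return() yields the new value, so
		 * sdata is subtracted back out, while cmpxchg() returns the
		 * prior contents whether or not the swap actually happened.
		 */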
		rvt_put_mr(qp->r_sge.sge.mr);
		qp->r_sge.num_sge = 0;
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = psn;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Schedule the send tasklet. */
		qp->s_flags |= RVT_S_RESP_PENDING;
		hfi1_schedule_send(qp);

		spin_unlock_irqrestore(&qp->s_lock, flags);
		if (is_fecn)
			goto send_ack;
		return;
	}
	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & IB_BTH_REQ_ACK) {
		struct hfi1_qp_priv *priv = qp->priv;

		/*
		 * ACKs are coalesced: r_adefered counts requests that have
		 * been acknowledged only lazily.  The deferral is skipped
		 * and the ACK pushed out immediately once HFI1_PSN_CREDIT
		 * deferrals accumulate, when the packet carried a FECN
		 * mark, or when numpkt indicates nothing further is coming
		 * that could piggyback the ACK.
		 */
		if (packet->numpkt == 0) {
			rc_cancel_ack(qp);
			goto send_ack;
		}
		if (priv->r_adefered >= HFI1_PSN_CREDIT) {
			rc_cancel_ack(qp);
			goto send_ack;
		}
		if (unlikely(is_fecn)) {
			rc_cancel_ack(qp);
			goto send_ack;
		}
		priv->r_adefered++;
		rc_defered_ack(rcd, qp);
	}
	return;
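	/*
	 * Everything below is reached only via goto.  Each label records
	 * the NAK code and the PSN to acknowledge, then either queues the
	 * response for later (rc_defered_ack()) or sends it right away
	 * through send_ack.
	 */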
rnr_nak:
	qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
	qp->r_ack_psn = qp->r_psn;
	/* Queue RNR NAK for later */
	rc_defered_ack(rcd, qp);
	return;
nack_op_err:
	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	rc_defered_ack(rcd, qp);
	return;
nack_inv_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
	hfi1_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	rc_defered_ack(rcd, qp);
	return;
nack_acc_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
	hfi1_rc_error(qp, IB_WC_LOC_PROT_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;
send_ack:
	hfi1_send_rc_ack(rcd, qp, is_fecn);
}
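
/**
 * hfi1_rc_hdrerr - handle a receive error on an RC QP
 * @rcd - the receive context
 * @hdr - the packet header
 * @rcv_flags - receive flags (HFI1_HAS_GRH indicates a GRH is present)
 * @qp - the QP the packet was addressed to
 *
 * For request opcodes a PSN-error NAK is queued so the requester
 * retransmits from the expected PSN; response opcodes are ignored here.
 */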
void hfi1_rc_hdrerr(
	struct hfi1_ctxtdata *rcd,
	struct hfi1_ib_header *hdr,
	u32 rcv_flags,
	struct rvt_qp *qp)
{
	int has_grh = rcv_flags & HFI1_HAS_GRH;
	struct hfi1_other_headers *ohdr;
	struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
	int diff;
	u32 opcode;
	u32 psn, bth0;

	/* Check for GRH */
	ohdr = &hdr->u.oth;
	if (has_grh)
		ohdr = &hdr->u.l.oth;

	bth0 = be32_to_cpu(ohdr->bth[0]);
	if (hfi1_ruc_check_hdr(ibp, hdr, has_grh, qp, bth0))
		return;

	psn = be32_to_cpu(ohdr->bth[2]);
	opcode = (bth0 >> 24) & 0xff;
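	/*
	 * A request reaching this point was presumably dropped, so if it
	 * is at or ahead of the expected PSN (diff >= 0) the requester
	 * must be told to back up and resend; NAKing with the expected PSN
	 * restarts the stream at the right place.
	 */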
	/* Only deal with RDMA Writes for now */
	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
		diff = delta_psn(psn, qp->r_psn);
		if (!qp->r_nak_state && diff >= 0) {
			ibp->rvp.n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			rc_defered_ack(rcd, qp);
		} /* Out of sequence NAK */
	} /* QP Request NAKs */
}