/*
 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ipath_verbs.h"
#include "ips_common.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x
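/*
 * For example, OP(SEND_FIRST) expands to IB_OPCODE_RC_SEND_FIRST and
 * OP(RDMA_READ_REQUEST) to IB_OPCODE_RC_RDMA_READ_REQUEST.
 */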
/**
 * ipath_init_restart - initialize qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue to initialize the QP's SGE from
 *
 * The QP s_lock should be held.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
	struct ipath_ibdev *dev;
	u32 len;

	len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) *
		ib_mtu_enum_to_int(qp->path_mtu);
	qp->s_sge.sge = wqe->sg_list[0];
	qp->s_sge.sg_list = wqe->sg_list + 1;
	qp->s_sge.num_sge = wqe->wr.num_sge;
	ipath_skip_sge(&qp->s_sge, len);
	qp->s_len = wqe->length - len;
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (list_empty(&qp->timerwait))
		list_add_tail(&qp->timerwait,
			      &dev->pending[dev->pending_index]);
	spin_unlock(&dev->pending_lock);
}
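/*
 * A worked example of the restart arithmetic above: if qp->s_psn is 3
 * packets past the WQE's first PSN and the path MTU is 2048 bytes, then
 * len = 3 * 2048 = 6144, so the SGE is advanced past the 6144 bytes the
 * responder has already received and s_len holds what remains to resend.
 */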
/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return bth0 if constructed; otherwise, return 0.
 * Note the QP s_lock must be held.
 */
static inline u32 ipath_make_rc_ack(struct ipath_qp *qp,
				    struct ipath_other_headers *ohdr,
				    u32 pmtu)
{
	struct ipath_sge_state *ss;
	u32 hwords;
	u32 len;
	u32 bth0;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	/*
	 * Send a response.  Note that we are in the responder's
	 * side of the QP context.
	 */
	switch (qp->s_ack_state) {
	case OP(RDMA_READ_REQUEST):
		ss = &qp->s_rdma_sge;
		len = qp->s_rdma_len;
		if (len > pmtu) {
			len = pmtu;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
		} else
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
		qp->s_rdma_len -= len;
		bth0 = qp->s_ack_state << 24;
		ohdr->u.aeth = ipath_compute_aeth(qp);
		hwords++;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		ss = &qp->s_rdma_sge;
		len = qp->s_rdma_len;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
		}
		qp->s_rdma_len -= len;
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		/*
		 * We have to prevent new requests from changing
		 * the r_sge state while an ipath_verbs_send()
		 * is in progress.
		 * Changing r_state allows the receiver
		 * to continue processing new packets.
		 * We do it here now instead of above so
		 * that we are sure the packet was sent before
		 * changing the state.
		 */
		qp->r_state = OP(RDMA_READ_RESPONSE_LAST);
		qp->s_ack_state = OP(ACKNOWLEDGE);
		return 0;

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD):
		ss = NULL;
		len = 0;
		qp->r_state = OP(SEND_LAST);
		qp->s_ack_state = OP(ACKNOWLEDGE);
		bth0 = IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
		ohdr->u.at.aeth = ipath_compute_aeth(qp);
		ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
		hwords += sizeof(ohdr->u.at) / 4;
		break;

	default:
		/* Send a regular ACK. */
		ss = NULL;
		len = 0;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		bth0 = qp->s_ack_state << 24;
		ohdr->u.aeth = ipath_compute_aeth(qp);
		hwords++;
	}
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;

	return bth0;
}
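/*
 * The bth0 value returned above becomes BTH word 0 of the response: the
 * opcode sits in bits 31:24 (e.g. OP(ACKNOWLEDGE) << 24), and the caller
 * ORs in the P_Key and pad count before byte-swapping it onto the wire.
 */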
/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 * @bth0p: pointer to the BTH opcode word
 * @bth2p: pointer to the BTH PSN word
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note the QP s_lock must be held.
 */
static inline int ipath_make_rc_req(struct ipath_qp *qp,
				    struct ipath_other_headers *ohdr,
				    u32 pmtu, u32 *bth0p, u32 *bth2p)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ipath_sge_state *ss;
	struct ipath_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	char newreq;

	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
	    qp->s_rnr_timeout)
		goto done;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
	bth0 = 0;

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto done;
			qp->s_psn = wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_len = len = wqe->length;
		ss = &qp->s_sge;
		bth2 = 0;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto done;
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq)
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto done;
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / 4;
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the RETH */
				ohdr->u.rc.imm_data = wqe->wr.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= 1 << 23;
			}
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / 4;
			if (newreq) {
				qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
			}
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP)
				qp->s_state = OP(COMPARE_SWAP);
			else
				qp->s_state = OP(FETCH_ADD);
			ohdr->u.atomic_eth.vaddr = cpu_to_be64(
				wqe->wr.wr.atomic.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->wr.wr.atomic.rkey);
			ohdr->u.atomic_eth.swap_data = cpu_to_be64(
				wqe->wr.wr.atomic.swap);
			ohdr->u.atomic_eth.compare_data = cpu_to_be64(
				wqe->wr.wr.atomic.compare_add);
			hwords += sizeof(struct ib_atomic_eth) / 4;
			if (newreq) {
				qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			ss = NULL;
			len = 0;
			break;

		default:
			goto done;
		}
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		bth2 |= qp->s_psn++ & IPS_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		break;
	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * This case can only happen if a send is restarted.  See
		 * ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = qp->s_psn++ & IPS_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			/*
			 * Request an ACK every 1/2 MB to avoid retransmit
			 * timeouts.
			 */
			if (((wqe->length - len) % (512 * 1024)) == 0)
				bth2 |= 1 << 31;
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= 1 << 23;
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * This case can only happen if an RDMA write is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = qp->s_psn++ & IPS_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			/*
			 * Request an ACK every 1/2 MB to avoid retransmit
			 * timeouts.
			 */
			if (((wqe->length - len) % (512 * 1024)) == 0)
				bth2 |= 1 << 31;
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
		}
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * This case can only happen if an RDMA read is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		len = ((qp->s_psn - wqe->psn) & IPS_PSN_MASK) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->wr.wr.rdma.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / 4;
		bth2 = qp->s_psn++ & IPS_PSN_MASK;
		if ((int)(qp->s_psn - qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_REQUEST):
	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD):
		/*
		 * We shouldn't start anything new until this request is
		 * finished.  The ACK will handle rescheduling us.  XXX The
		 * number of outstanding ones is negotiated at connection
		 * setup time (see pg. 258,289)?  XXX Also, if we support
		 * multiple outstanding requests, we need to check the WQE
		 * IB_SEND_FENCE flag and not send a new request if an RDMA
		 * read or atomic is pending.
		 */
		goto done;
	}
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	*bth0p = bth0 | (qp->s_state << 24);
	*bth2p = bth2;
	return 1;

done:
	return 0;
}
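/*
 * On return 1, the caller gets the two BTH words it still needs:
 * *bth0p carries the request opcode in bits 31:24 plus flag bits such
 * as the solicited-event bit (1 << 23), and *bth2p carries the 24-bit
 * PSN with bit 31 set whenever an ACK is being requested.
 */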
/**
 * ipath_make_rc_grh - construct a GRH header
 * @qp: a pointer to the QP
 * @grh: the global route address to send to
 * @nwords: the number of 32-bit words of data being sent
 */
static inline void ipath_make_rc_grh(struct ipath_qp *qp,
				     struct ib_global_route *grh,
				     u32 nwords)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);

	/* GRH header size in 32-bit words. */
	qp->s_hdrwords += 10;
	qp->s_hdr.u.l.grh.version_tclass_flow =
		cpu_to_be32((6 << 28) |
			    (grh->traffic_class << 20) |
			    grh->flow_label);
	qp->s_hdr.u.l.grh.paylen =
		cpu_to_be16(((qp->s_hdrwords - 12) + nwords +
			     SIZE_OF_CRC) << 2);
	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
	qp->s_hdr.u.l.grh.next_hdr = 0x1B;
	qp->s_hdr.u.l.grh.hop_limit = grh->hop_limit;
	/* The SGID is 32-bit aligned. */
	qp->s_hdr.u.l.grh.sgid.global.subnet_prefix = dev->gid_prefix;
	qp->s_hdr.u.l.grh.sgid.global.interface_id =
		ipath_layer_get_guid(dev->dd);
	qp->s_hdr.u.l.grh.dgid = grh->dgid;
}
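/*
 * PayLen arithmetic above: s_hdrwords includes the 2-word LRH and the
 * 10-word GRH just added, so (s_hdrwords - 12) counts the BTH plus any
 * extension headers; adding the payload words and the ICRC word and
 * shifting left by 2 converts the total following the GRH into bytes.
 */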
/**
 * ipath_do_rc_send - perform a send on an RC QP
 * @data: contains a pointer to the QP
 *
 * Process entries in the send work queue until credit or queue is
 * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
 * Otherwise, after we drop the QP s_lock, two threads could send
 * packets out of order.
 */
void ipath_do_rc_send(unsigned long data)
{
	struct ipath_qp *qp = (struct ipath_qp *)data;
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	unsigned long flags;
	u16 lrh0;
	u32 nwords;
	u32 extra_bytes;
	u32 bth0;
	u32 bth2;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	struct ipath_other_headers *ohdr;

	if (test_and_set_bit(IPATH_S_BUSY, &qp->s_flags))
		goto bail;

	if (unlikely(qp->remote_ah_attr.dlid ==
		     ipath_layer_get_lid(dev->dd))) {
		struct ib_wc wc;

		/*
		 * Pass in an uninitialized ib_wc to be consistent with
		 * other places where ipath_ruc_loopback() is called.
		 */
		ipath_ruc_loopback(qp, &wc);
		goto clear;
	}

	ohdr = &qp->s_hdr.u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &qp->s_hdr.u.l.oth;

again:
	/* Check for a constructed packet to be sent. */
	if (qp->s_hdrwords != 0) {
		/*
		 * If no PIO bufs are available, return.  An interrupt will
		 * call ipath_ib_piobufavail() when one is available.
		 */
		_VERBS_INFO("h %u %p\n", qp->s_hdrwords, &qp->s_hdr);
		_VERBS_INFO("d %u %p %u %p %u %u %u %u\n", qp->s_cur_size,
			    qp->s_cur_sge->sg_list,
			    qp->s_cur_sge->num_sge,
			    qp->s_cur_sge->sge.vaddr,
			    qp->s_cur_sge->sge.sge_length,
			    qp->s_cur_sge->sge.length,
			    qp->s_cur_sge->sge.m,
			    qp->s_cur_sge->sge.n);
		if (ipath_verbs_send(dev->dd, qp->s_hdrwords,
				     (u32 *) &qp->s_hdr, qp->s_cur_size,
				     qp->s_cur_sge)) {
			ipath_no_bufs_available(qp, dev);
			goto bail;
		}
		dev->n_unicast_xmit++;
		/* Record that we sent the packet and s_hdr is empty. */
		qp->s_hdrwords = 0;
	}

	/*
	 * The lock is needed to synchronize between setting
	 * qp->s_ack_state, resend timer, and post_send().
	 */
	spin_lock_irqsave(&qp->s_lock, flags);

	/* Sending responses has higher priority than sending requests. */
	if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
	    (bth0 = ipath_make_rc_ack(qp, ohdr, pmtu)) != 0)
		bth2 = qp->s_ack_psn++ & IPS_PSN_MASK;
	else if (!ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2))
		goto done;

	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* Construct the header. */
	extra_bytes = (4 - qp->s_cur_size) & 3;
	nwords = (qp->s_cur_size + extra_bytes) >> 2;
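	/*
	 * Padding example: for a 5-byte payload, extra_bytes =
	 * (4 - 5) & 3 = 3 pad bytes and nwords = (5 + 3) >> 2 = 2
	 * 32-bit words on the wire.
	 */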
	lrh0 = IPS_LRH_BTH;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, nwords);
		lrh0 = IPS_LRH_GRH;
	}
	lrh0 |= qp->remote_ah_attr.sl << 4;
	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
				       SIZE_OF_CRC);
	qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
	bth0 |= ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
	bth0 |= extra_bytes << 20;
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(bth2);

	/* Check for more work to do. */
	goto again;

done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
clear:
	clear_bit(IPATH_S_BUSY, &qp->s_flags);
bail:
	return;
}
/**
 * send_rc_ack - construct and send an ACK packet
 * @qp: a pointer to the QP
 */
static void send_rc_ack(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	u16 lrh0;
	u32 bth0;
	struct ipath_other_headers *ohdr;

	/* Construct the header. */
	ohdr = &qp->s_hdr.u.oth;
	lrh0 = IPS_LRH_BTH;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
	qp->s_hdrwords = 6;
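	/*
	 * That is: the LRH is 8 bytes (2 words), the BTH 12 bytes
	 * (3 words), and the AETH 4 bytes (1 word), giving 6 32-bit
	 * words before any GRH or atomic ACK data is added below.
	 */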
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		ipath_make_rc_grh(qp, &qp->remote_ah_attr.grh, 0);
		ohdr = &qp->s_hdr.u.l.oth;
		lrh0 = IPS_LRH_GRH;
	}
	bth0 = ipath_layer_get_pkey(dev->dd, qp->s_pkey_index);
	ohdr->u.aeth = ipath_compute_aeth(qp);
	if (qp->s_ack_state >= OP(COMPARE_SWAP)) {
		bth0 |= IB_OPCODE_ATOMIC_ACKNOWLEDGE << 24;
		ohdr->u.at.atomic_ack_eth = cpu_to_be64(qp->s_ack_atomic);
		qp->s_hdrwords += sizeof(ohdr->u.at.atomic_ack_eth) / 4;
	} else
		bth0 |= OP(ACKNOWLEDGE) << 24;
	lrh0 |= qp->remote_ah_attr.sl << 4;
	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + SIZE_OF_CRC);
	qp->s_hdr.lrh[3] = cpu_to_be16(ipath_layer_get_lid(dev->dd));
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->s_ack_psn & IPS_PSN_MASK);

	/*
	 * If we can send the ACK, clear the ACK state.
	 */
	if (ipath_verbs_send(dev->dd, qp->s_hdrwords, (u32 *) &qp->s_hdr,
			     0, NULL) == 0) {
		qp->s_ack_state = OP(ACKNOWLEDGE);
		dev->n_unicast_xmit++;
	}
}
/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 * @wc: the work completion request
 *
 * The QP s_lock should be held.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
{
	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
	struct ipath_ibdev *dev;
	u32 n;

	/*
	 * If there are no requests pending, we are done.
	 */
	if (ipath_cmp24(psn, qp->s_next_psn) >= 0 ||
	    qp->s_last == qp->s_tail)
		goto done;

	if (qp->s_retry == 0) {
		wc->wr_id = wqe->wr.wr_id;
		wc->status = IB_WC_RETRY_EXC_ERR;
		wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
		wc->vendor_err = 0;
		wc->byte_len = 0;
		wc->qp_num = qp->ibqp.qp_num;
		wc->src_qp = qp->remote_qpn;
		wc->pkey_index = 0;
		wc->slid = qp->remote_ah_attr.dlid;
		wc->sl = qp->remote_ah_attr.sl;
		wc->dlid_path_bits = 0;
		wc->port_num = 0;
		ipath_sqerror_qp(qp, wc);
		goto bail;
	}
	qp->s_retry--;

	/*
	 * Remove the QP from the timeout queue.
	 * Note: it may already have been removed by ipath_ib_timer().
	 */
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		dev->n_rc_resends++;
	else
		dev->n_rc_resends += (int)qp->s_psn - (int)psn;

	/*
	 * If we are starting the request from the beginning, let the normal
	 * send code handle initialization.
	 */
	qp->s_cur = qp->s_last;
	if (ipath_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		qp->s_psn = wqe->psn;
	} else {
		n = qp->s_cur;
		for (;;) {
			if (++n == qp->s_size)
				n = 0;
			if (n == qp->s_tail) {
				if (ipath_cmp24(psn, qp->s_next_psn) >= 0) {
					qp->s_cur = n;
					wqe = get_swqe_ptr(qp, n);
				}
				break;
			}
			wqe = get_swqe_ptr(qp, n);
			if (ipath_cmp24(psn, wqe->psn) < 0)
				break;
			qp->s_cur = n;
		}
		qp->s_psn = psn;

		/*
		 * Reset the state to restart in the middle of a request.
		 * Don't change the s_sge, s_cur_sge, or s_cur_size.
		 * See ipath_do_rc_send().
		 */
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
			break;

		case IB_WR_RDMA_WRITE:
		case IB_WR_RDMA_WRITE_WITH_IMM:
			qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
			break;

		case IB_WR_RDMA_READ:
			qp->s_state =
				OP(RDMA_READ_RESPONSE_MIDDLE);
			break;

		default:
			/*
			 * This case shouldn't happen since it's only
			 * reached for one of the opcodes above.
			 */
			qp->s_state = OP(SEND_LAST);
		}
	}

done:
	tasklet_hi_schedule(&qp->s_task);

bail:
	return;
}
/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
	struct ipath_swqe *wqe;
	u32 n;

	n = qp->s_cur;
	wqe = get_swqe_ptr(qp, n);
	for (;;) {
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail) {
			if (ipath_cmp24(psn, qp->s_next_psn) >= 0) {
				qp->s_cur = n;
				wqe = get_swqe_ptr(qp, n);
			}
			break;
		}
		wqe = get_swqe_ptr(qp, n);
		if (ipath_cmp24(psn, wqe->psn) < 0)
			break;
		qp->s_cur = n;
	}
	qp->s_psn = psn;

	/*
	 * Set the state to restart in the middle of a
	 * request.  Don't change the s_sge, s_cur_sge, or
	 * s_cur_size.  See ipath_do_rc_send().
	 */
	switch (wqe->wr.opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since it's only
		 * reached for one of the opcodes above.
		 */
		qp->s_state = OP(SEND_LAST);
	}
}
/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 * Returns 1 if OK, 0 if the current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_wc wc;
	struct ipath_swqe *wqe;
	int ret = 0;

	/*
	 * Remove the QP from the timeout queue (or RNR timeout queue).
	 * If ipath_ib_timer() has already removed it,
	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
	 * just won't find anything to restart if we ACK everything.
	 */
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	wqe = get_swqe_ptr(qp, qp->s_last);

	/* Nothing is pending to ACK/NAK. */
	if (qp->s_last == qp->s_tail)
		goto bail;

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while (ipath_cmp24(psn, wqe->lpsn) >= 0) {
		/* If we are ACKing a WQE, the MSN should be >= the SSN. */
		if (ipath_cmp24(aeth, wqe->ssn) < 0)
			break;
		/*
		 * If this request is an RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK an RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     opcode != OP(RDMA_READ_RESPONSE_LAST)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
		      ipath_cmp24(wqe->psn, psn) != 0))) {
			/*
			 * The last valid PSN seen is the previous
			 * request's.
			 */
			qp->s_last_psn = wqe->psn - 1;
			/* Retry this request. */
			ipath_restart_rc(qp, wqe->psn, &wc);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		/* Post a send completion queue entry if requested. */
		if (!test_bit(IPATH_S_SIGNAL_REQ_WR, &qp->s_flags) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.vendor_err = 0;
			wc.byte_len = wqe->length;
			wc.qp_num = qp->ibqp.qp_num;
			wc.src_qp = qp->remote_qpn;
			wc.pkey_index = 0;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			wc.port_num = 0;
			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If we are completing a request which is in the process of
		 * being resent, we can stop resending it since we know the
		 * responder has already seen it.
		 */
		if (qp->s_last == qp->s_cur) {
			if (++qp->s_cur >= qp->s_size)
				qp->s_cur = 0;
			wqe = get_swqe_ptr(qp, qp->s_cur);
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
		if (++qp->s_last >= qp->s_size)
			qp->s_last = 0;
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (qp->s_last == qp->s_tail)
			break;
	}
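	/*
	 * AETH layout, per the IBTA spec: bits 31:29 distinguish an ACK
	 * (0), an RNR NAK (1), or a NAK (3); bits 28:24 carry the credit
	 * count, RNR timer value, or NAK code; bits 23:0 carry the MSN.
	 * That is what the switch below and the IPS_AETH_CREDIT_SHIFT /
	 * IPS_AETH_CREDIT_MASK extractions decode.
	 */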
	switch (aeth >> 29) {
	case 0:		/* ACK */
		dev->n_rc_acks++;
		/* If this is a partial ACK, reset the retransmit timer. */
		if (qp->s_last != qp->s_tail) {
			spin_lock(&dev->pending_lock);
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
			spin_unlock(&dev->pending_lock);
		}
		ipath_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		qp->s_last_psn = psn;
		ret = 1;
		goto bail;

	case 1:		/* RNR NAK */
		dev->n_rnr_naks++;
		if (qp->s_rnr_retry == 0) {
			if (qp->s_last == qp->s_tail)
				goto bail;

			wc.status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;
		if (qp->s_last == qp->s_tail)
			goto bail;

		/* The last valid PSN seen is the previous request's. */
		qp->s_last_psn = wqe->psn - 1;

		dev->n_rc_resends += (int)qp->s_psn - (int)psn;

		/*
		 * If we are starting the request from the beginning, let
		 * the normal send code handle initialization.
		 */
		qp->s_cur = qp->s_last;
		wqe = get_swqe_ptr(qp, qp->s_cur);
		if (ipath_cmp24(psn, wqe->psn) <= 0) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		} else
			reset_psn(qp, psn);

		qp->s_rnr_timeout =
			ib_ipath_rnr_table[(aeth >> IPS_AETH_CREDIT_SHIFT) &
					   IPS_AETH_CREDIT_MASK];
		ipath_insert_rnr_queue(qp);
		goto bail;

	case 3:		/* NAK */
		/* The last valid PSN seen is the previous request's. */
		if (qp->s_last != qp->s_tail)
			qp->s_last_psn = wqe->psn - 1;
		switch ((aeth >> IPS_AETH_CREDIT_SHIFT) &
			IPS_AETH_CREDIT_MASK) {
		case 0:	/* PSN sequence error */
			dev->n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.  XXX
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			if (qp->s_last == qp->s_tail)
				break;

			if (ipath_cmp24(psn, wqe->psn) < 0)
				break;

			/* Retry the request. */
			ipath_restart_rc(qp, psn, &wc);
			break;

		case 1:	/* Invalid Request */
			wc.status = IB_WC_REM_INV_REQ_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 2:	/* Remote Access Error */
			wc.status = IB_WC_REM_ACCESS_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 3:	/* Remote Operation Error */
			wc.status = IB_WC_REM_OP_ERR;
			dev->n_other_naks++;
		class_b:
			wc.wr_id = wqe->wr.wr_id;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.vendor_err = 0;
			wc.byte_len = 0;
			wc.qp_num = qp->ibqp.qp_num;
			wc.src_qp = qp->remote_qpn;
			wc.pkey_index = 0;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			wc.port_num = 0;
			ipath_sqerror_qp(qp, &wc);
			break;

		default:
			/* Ignore other reserved NAK error codes. */
			goto reserved;
		}
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default:		/* 2: reserved */
	reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}
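/*
 * A note on the PSN comparisons above: PSNs are 24-bit circular sequence
 * numbers, and ipath_cmp24() (a helper in ipath_verbs.h) compares them by
 * sign-extending the 24-bit difference, so e.g. comparing 0x000001 with
 * 0xFFFFFF yields a positive result because 0x000001 is one ahead of
 * 0xFFFFFF modulo 2^24.
 */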
/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data, u32 tlen,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn, u32 hdrsize, u32 pmtu,
				     int header_in_data)
{
	unsigned long flags;
	struct ib_wc wc;
	int diff;
	u32 pad;
	u32 aeth;

	spin_lock_irqsave(&qp->s_lock, flags);

	/* Ignore invalid responses. */
	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = ipath_cmp24(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			if (!header_in_data)
				aeth = be32_to_cpu(ohdr->u.aeth);
			else {
				aeth = be32_to_cpu(((__be32 *) data)[0]);
				data += sizeof(__be32);
			}
			if ((aeth >> 29) == 0)
				ipath_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		if (opcode == OP(ATOMIC_ACKNOWLEDGE))
			*(u64 *) qp->s_sge.sge.vaddr = *(u64 *) data;
		if (!do_rc_ack(qp, aeth, psn, opcode) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		hdrsize += 4;
		/*
		 * do_rc_ack() has already checked the PSN so skip
		 * the sequence check.
		 */
		goto rdma_read;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
	rdma_read:
		if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
			goto ack_done;
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_done;
		if (unlikely(pmtu >= qp->s_len))
			goto ack_done;
		/* We got a response so update the timeout. */
		if (unlikely(qp->s_last == qp->s_tail ||
			     get_swqe_ptr(qp, qp->s_last)->wr.opcode !=
			     IB_WR_RDMA_READ))
			goto ack_done;
		spin_lock(&dev->pending_lock);
		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
			list_move_tail(&qp->timerwait,
				       &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		/*
		 * Update the RDMA receive state but do the copy w/o holding
		 * the locks and blocking interrupts.  XXX Yet another place
		 * that affects relaxed RDMA order since we don't want s_sge
		 * modified.
		 */
		qp->s_len -= pmtu;
		qp->s_last_psn = psn;
		spin_unlock_irqrestore(&qp->s_lock, flags);
		ipath_copy_sge(&qp->s_sge, data, pmtu);
		goto bail;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_ONLY):
		if (unlikely(qp->s_state != OP(RDMA_READ_REQUEST)))
			goto ack_done;
		/*
		 * Get the number of bytes the message was padded by.
		 */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 8))) {
			/* XXX Need to generate an error CQ entry. */
			goto ack_done;
		}
		tlen -= hdrsize + pad + 8;
		if (unlikely(tlen != qp->s_len)) {
			/* XXX Need to generate an error CQ entry. */
			goto ack_done;
		}
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		ipath_copy_sge(&qp->s_sge, data, tlen);
		if (do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST))) {
			/*
			 * Change the state so we continue
			 * processing new requests.
			 */
			qp->s_state = OP(SEND_LAST);
		}
		goto ack_done;
	}

ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}
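/*
 * Byte accounting for the LAST/ONLY read-response checks above: tlen is
 * the full packet length, so stripping the header (hdrsize), the pad
 * bytes, the 4-byte AETH, and the 4-byte ICRC (hdrsize + pad + 8 in
 * total) leaves exactly the payload, which must match the bytes still
 * expected in qp->s_len.
 */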
/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent, with the s_lock still held.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn,
				     int diff,
				     int header_in_data)
{
	struct ib_reth *reth;

	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if an RDMA read, atomic, or
		 * NAK is pending though.
		 */
		spin_lock(&qp->s_lock);
		if ((qp->s_ack_state >= OP(RDMA_READ_REQUEST) &&
		     qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) ||
		    qp->s_nak_state != 0) {
			spin_unlock(&qp->s_lock);
			goto done;
		}
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_nak_state = IB_NAK_PSN_ERROR;
		/* Use the expected PSN. */
		qp->s_ack_psn = qp->r_psn;
		goto resched;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 */
	spin_lock(&qp->s_lock);
	if (qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE &&
	    ipath_cmp24(psn, qp->s_ack_psn) >= 0) {
		if (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST)
			qp->s_ack_psn = psn;
		spin_unlock(&qp->s_lock);
		goto done;
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST):
		/*
		 * We have to be careful to not change s_rdma_sge
		 * while ipath_do_rc_send() is using it and not
		 * holding the s_lock.
		 */
		if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
		    qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
			spin_unlock(&qp->s_lock);
			dev->n_rdma_dup_busy++;
			goto done;
		}
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		qp->s_rdma_len = be32_to_cpu(reth->length);
		if (qp->s_rdma_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/*
			 * Address range must be a subset of the original
			 * request and start on pmtu boundaries.
			 */
			ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
					   qp->s_rdma_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok)) {
				spin_unlock(&qp->s_lock);
				goto done;
			}
		} else {
			qp->s_rdma_sge.sg_list = NULL;
			qp->s_rdma_sge.num_sge = 0;
			qp->s_rdma_sge.sge.mr = NULL;
			qp->s_rdma_sge.sge.vaddr = NULL;
			qp->s_rdma_sge.sge.length = 0;
			qp->s_rdma_sge.sge.sge_length = 0;
		}
		break;

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD):
		/*
		 * Check for the PSN of the last atomic operation
		 * performed and resend the result if found.
		 */
		if ((psn & IPS_PSN_MASK) != qp->r_atomic_psn) {
			spin_unlock(&qp->s_lock);
			goto done;
		}
		qp->s_ack_atomic = qp->r_atomic_data;
		break;
	}
	qp->s_ack_state = opcode;
	qp->s_nak_state = 0;
	qp->s_ack_psn = psn;
resched:
	return 0;

done:
	return 1;
}
/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
	struct ipath_other_headers *ohdr;
	u32 opcode;
	u32 hdrsize;
	u32 psn;
	u32 pad;
	unsigned long flags;
	struct ib_wc wc;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	int diff;
	struct ib_reth *reth;
	int header_in_data;

	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;	/* LRH + BTH */
		psn = be32_to_cpu(ohdr->bth[2]);
		header_in_data = 0;
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
		/*
		 * The header with GRH is 60 bytes and the core driver sets
		 * the eager header buffer size to 56 bytes so the last 4
		 * bytes of the BTH header (PSN) is in the data buffer.
		 */
		header_in_data =
			ipath_layer_get_rcvhdrentsize(dev->dd) == 16;
		if (header_in_data) {
			psn = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		} else
			psn = be32_to_cpu(ohdr->bth[2]);
	}
	/*
	 * The opcode is in the low byte when it's in network order
	 * (top byte when in host order).
	 */
	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
				  hdrsize, pmtu, header_in_data);
		goto bail;
	}

	spin_lock_irqsave(&qp->r_rq.lock, flags);

	/* Compute 24 bits worth of difference. */
	diff = ipath_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
				       psn, diff, header_in_data))
			goto done;
		goto resched;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
	nack_inv:
		/*
		 * A NAK will ACK earlier sends and RDMA writes.  Don't
		 * queue the NAK if an RDMA read, atomic, or NAK is
		 * pending though.
		 */
		spin_lock(&qp->s_lock);
		if (qp->s_ack_state >= OP(RDMA_READ_REQUEST) &&
		    qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
			spin_unlock(&qp->s_lock);
			goto done;
		}
		/* XXX Flush WQEs */
		qp->state = IB_QPS_ERR;
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_nak_state = IB_NAK_INVALID_REQUEST;
		qp->s_ack_psn = qp->r_psn;
		goto resched;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	case OP(RDMA_READ_REQUEST):
	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD):
		/*
		 * Drop all new requests until a response has been sent.  A
		 * new request then ACKs the RDMA response we sent.  Relaxed
		 * ordering would allow new requests to be processed but we
		 * would need to keep a queue of rwqe's for all that are in
		 * progress.  Note that we can't RNR NAK this request since
		 * the RDMA READ or atomic response is already queued to be
		 * sent (unless we implement a response send queue).
		 */
		goto done;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		break;
	}

	wc.imm_data = 0;
	wc.wc_flags = 0;

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		if (!ipath_get_rwqe(qp, 0)) {
		rnr_nak:
			/*
			 * An RNR NAK will ACK earlier sends and RDMA
			 * writes.  Don't queue the NAK if an RDMA read
			 * or atomic is pending though.
			 */
			spin_lock(&qp->s_lock);
			if (qp->s_ack_state >=
			    OP(RDMA_READ_REQUEST) &&
			    qp->s_ack_state != IB_OPCODE_ACKNOWLEDGE) {
				spin_unlock(&qp->s_lock);
				goto done;
			}
			qp->s_ack_state = OP(SEND_ONLY);
			qp->s_nak_state = IB_RNR_NAK | qp->s_min_rnr_timer;
			qp->s_ack_psn = qp->r_psn;
			goto resched;
		}
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
	send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, pmtu);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		if (!ipath_get_rwqe(qp, 0))
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto send_last;
		/* FALLTHROUGH */
	case OP(SEND_LAST_WITH_IMMEDIATE):
	send_last_imm:
		if (header_in_data) {
			wc.imm_data = *(__be32 *) data;
			data += sizeof(__be32);
		} else {
			/* Immediate data comes after BTH */
			wc.imm_data = ohdr->u.imm_data;
		}
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		/* FALLTHROUGH */
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
	send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, tlen);
		atomic_inc(&qp->msn);
		if (opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_ONLY))
			break;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.opcode = IB_WC_RECV;
		wc.vendor_err = 0;
		wc.qp_num = qp->ibqp.qp_num;
		wc.src_qp = qp->remote_qpn;
		wc.pkey_index = 0;
		wc.slid = qp->remote_ah_attr.dlid;
		wc.sl = qp->remote_ah_attr.sl;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			       (ohdr->bth[0] &
				__constant_cpu_to_be32(1 << 23)) != 0);
		break;
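		/*
		 * Note on the test above: ohdr->bth[0] carries the opcode
		 * in bits 31:24 and the solicited-event bit in bit 23, so
		 * masking with cpu_to_be32(1 << 23) checks whether the
		 * sender asked for a completion event to be signaled.
		 */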
	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		/* consume RWQE */
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		hdrsize += sizeof(*reth);
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(dev, &qp->r_sge,
					   qp->r_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok)) {
			nack_acc:
				/*
				 * A NAK will ACK earlier sends and RDMA
				 * writes.  Don't queue the NAK if an RDMA
				 * read, atomic, or NAK is pending though.
				 */
				spin_lock(&qp->s_lock);
				if (qp->s_ack_state >=
				    OP(RDMA_READ_REQUEST) &&
				    qp->s_ack_state !=
				    IB_OPCODE_ACKNOWLEDGE) {
					spin_unlock(&qp->s_lock);
					goto done;
				}
				/* XXX Flush WQEs */
				qp->state = IB_QPS_ERR;
				qp->s_ack_state = OP(RDMA_WRITE_ONLY);
				qp->s_nak_state =
					IB_NAK_REMOTE_ACCESS_ERROR;
				qp->s_ack_psn = qp->r_psn;
				goto resched;
			}
		} else {
			qp->r_sge.sg_list = NULL;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_WRITE)))
			goto nack_acc;
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto send_last;
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(RDMA_READ_REQUEST):
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		spin_lock(&qp->s_lock);
		if (qp->s_ack_state != OP(ACKNOWLEDGE) &&
		    qp->s_ack_state >= IB_OPCODE_RDMA_READ_REQUEST) {
			spin_unlock(&qp->s_lock);
			goto done;
		}
		qp->s_rdma_len = be32_to_cpu(reth->length);
		if (qp->s_rdma_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(dev, &qp->s_rdma_sge,
					   qp->s_rdma_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok)) {
				spin_unlock(&qp->s_lock);
				goto nack_acc;
			}
			/*
			 * Update the next expected PSN.  We add 1 later
			 * below, so only add the remainder here.
			 */
			if (qp->s_rdma_len > pmtu)
				qp->r_psn += (qp->s_rdma_len - 1) / pmtu;
		} else {
			qp->s_rdma_sge.sg_list = NULL;
			qp->s_rdma_sge.num_sge = 0;
			qp->s_rdma_sge.sge.mr = NULL;
			qp->s_rdma_sge.sge.vaddr = NULL;
			qp->s_rdma_sge.sge.length = 0;
			qp->s_rdma_sge.sge.sge_length = 0;
		}
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_READ)))
			goto nack_acc;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		atomic_inc(&qp->msn);
		qp->s_ack_state = opcode;
		qp->s_nak_state = 0;
		qp->s_ack_psn = psn;
		qp->r_psn++;
		qp->r_state = opcode;
		goto rdmadone;

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		u64 vaddr;
		u64 sdata;
		u32 rkey;

		if (!header_in_data)
			ateth = &ohdr->u.atomic_eth;
		else {
			ateth = (struct ib_atomic_eth *)data;
			data += sizeof(*ateth);
		}
		vaddr = be64_to_cpu(ateth->vaddr);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!ipath_rkey_ok(dev, &qp->r_sge,
					    sizeof(u64), vaddr, rkey,
					    IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		/* Perform atomic OP and save result. */
		sdata = be64_to_cpu(ateth->swap_data);
		spin_lock(&dev->pending_lock);
		qp->r_atomic_data = *(u64 *) qp->r_sge.sge.vaddr;
		if (opcode == OP(FETCH_ADD))
			*(u64 *) qp->r_sge.sge.vaddr =
				qp->r_atomic_data + sdata;
		else if (qp->r_atomic_data ==
			 be64_to_cpu(ateth->compare_data))
			*(u64 *) qp->r_sge.sge.vaddr = sdata;
		spin_unlock(&dev->pending_lock);
		atomic_inc(&qp->msn);
		qp->r_atomic_psn = psn & IPS_PSN_MASK;
		psn |= 1 << 31;
		break;
	}
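	/*
	 * Setting bit 31 in the local copy of the PSN above forces the
	 * "ACK if requested" test below to take the resched path, since
	 * atomic operations always require an ACK carrying the saved
	 * qp->r_atomic_data.
	 */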
	default:
		/* Drop packet for unknown opcodes. */
		goto done;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	/* Send an ACK if requested or required. */
	if (psn & (1 << 31)) {
		/*
		 * Coalesce ACKs unless there is an RDMA READ or
		 * ATOMIC pending.
		 */
		spin_lock(&qp->s_lock);
		if (qp->s_ack_state == OP(ACKNOWLEDGE) ||
		    qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST) {
			qp->s_ack_state = opcode;
			qp->s_nak_state = 0;
			qp->s_ack_psn = psn;
			qp->s_ack_atomic = qp->r_atomic_data;
			goto resched;
		}
		spin_unlock(&qp->s_lock);
	}
done:
	spin_unlock_irqrestore(&qp->r_rq.lock, flags);
	goto bail;

resched:
	/*
	 * Try to send the ACK right away, but not if ipath_do_rc_send()
	 * is active.
	 */
	if (qp->s_hdrwords == 0 &&
	    (qp->s_ack_state < IB_OPCODE_RDMA_READ_REQUEST ||
	     qp->s_ack_state >= IB_OPCODE_COMPARE_SWAP))
		send_rc_ack(qp);

rdmadone:
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_rq.lock, flags);

	/* Call ipath_do_rc_send() in another thread. */
	tasklet_hi_schedule(&qp->s_task);

bail:
	return;
}