4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
29 * Tavor Queue Pair Processing Routines
31 * Implements all the routines necessary for allocating, freeing, and
32 * querying the Tavor queue pairs.
35 #include <sys/types.h>
38 #include <sys/sunddi.h>
39 #include <sys/modctl.h>
40 #include <sys/bitmap.h>
41 #include <sys/sysmacros.h>
43 #include <sys/ib/adapters/tavor/tavor.h>
44 #include <sys/ib/ib_pkt_hdrs.h>
46 static int tavor_qp_create_qpn(tavor_state_t
*state
, tavor_qphdl_t qp
,
48 static int tavor_qpn_avl_compare(const void *q
, const void *e
);
49 static int tavor_special_qp_rsrc_alloc(tavor_state_t
*state
,
50 ibt_sqp_type_t type
, uint_t port
, tavor_rsrc_t
**qp_rsrc
);
51 static int tavor_special_qp_rsrc_free(tavor_state_t
*state
, ibt_sqp_type_t type
,
53 static void tavor_qp_sgl_to_logwqesz(tavor_state_t
*state
, uint_t num_sgl
,
54 tavor_qp_wq_type_t wq_type
, uint_t
*logwqesz
, uint_t
*max_sgl
);
58 * Context: Can be called only from user or kernel context.
61 tavor_qp_alloc(tavor_state_t
*state
, tavor_qp_info_t
*qpinfo
,
62 uint_t sleepflag
, tavor_qp_options_t
*op
)
64 tavor_rsrc_pool_info_t
*rsrc_pool
;
65 tavor_rsrc_t
*qpc
, *rsrc
, *rdb
;
66 tavor_umap_db_entry_t
*umapdb
;
68 ibt_qp_alloc_attr_t
*attr_p
;
70 ibtl_qp_hdl_t ibt_qphdl
;
71 ibt_chan_sizes_t
*queuesz_p
;
74 ibt_mr_attr_t mr_attr
;
75 tavor_mr_options_t mr_op
;
78 tavor_cqhdl_t sq_cq
, rq_cq
;
80 uint64_t value
, qp_desc_off
;
81 uint32_t *sq_buf
, *rq_buf
;
82 uint32_t log_qp_sq_size
, log_qp_rq_size
;
83 uint32_t sq_size
, rq_size
;
84 uint32_t sq_wqe_size
, rq_wqe_size
;
85 uint32_t max_rdb
, max_sgl
, uarpg
;
86 uint_t wq_location
, dma_xfer_mode
, qp_is_umap
;
91 TAVOR_TNF_ENTER(tavor_qp_alloc
);
93 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p
, *queuesz_p
))
96 * Check the "options" flag. Currently this flag tells the driver
97 * whether or not the QP's work queues should be come from normal
98 * system memory or whether they should be allocated from DDR memory.
101 wq_location
= TAVOR_QUEUE_LOCATION_NORMAL
;
103 wq_location
= op
->qpo_wq_loc
;
107 * Extract the necessary info from the tavor_qp_info_t structure
109 attr_p
= qpinfo
->qpi_attrp
;
110 type
= qpinfo
->qpi_type
;
111 ibt_qphdl
= qpinfo
->qpi_ibt_qphdl
;
112 queuesz_p
= qpinfo
->qpi_queueszp
;
113 qpn
= qpinfo
->qpi_qpn
;
114 qphdl
= &qpinfo
->qpi_qphdl
;
117 * Determine whether QP is being allocated for userland access or
118 * whether it is being allocated for kernel access. If the QP is
119 * being allocated for userland access, then lookup the UAR doorbell
120 * page number for the current process. Note: If this is not found
121 * (e.g. if the process has not previously open()'d the Tavor driver),
122 * then an error is returned.
124 qp_is_umap
= (attr_p
->qp_alloc_flags
& IBT_QP_USER_MAP
) ? 1 : 0;
126 status
= tavor_umap_db_find(state
->ts_instance
, ddi_get_pid(),
127 MLNX_UMAP_UARPG_RSRC
, &value
, 0, NULL
);
128 if (status
!= DDI_SUCCESS
) {
129 /* Set "status" and "errormsg" and goto failure */
130 TAVOR_TNF_FAIL(IBT_INVALID_PARAM
, "failed UAR page");
133 uarpg
= ((tavor_rsrc_t
*)(uintptr_t)value
)->tr_indx
;
137 * Determine whether QP is being associated with an SRQ
139 qp_srq_en
= (attr_p
->qp_alloc_flags
& IBT_QP_USES_SRQ
) ? 1 : 0;
142 * Check for valid SRQ handle pointers
144 if (attr_p
->qp_ibc_srq_hdl
== NULL
) {
145 /* Set "status" and "errormsg" and goto failure */
146 TAVOR_TNF_FAIL(IBT_SRQ_HDL_INVALID
,
147 "invalid SRQ handle");
150 srq
= (tavor_srqhdl_t
)attr_p
->qp_ibc_srq_hdl
;
154 * Check for valid QP service type (only UD/RC/UC supported)
156 if (((type
!= IBT_UD_RQP
) && (type
!= IBT_RC_RQP
) &&
157 (type
!= IBT_UC_RQP
))) {
158 /* Set "status" and "errormsg" and goto failure */
159 TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID
, "invalid serv type");
164 * Only RC is supported on an SRQ -- This is a Tavor hardware
165 * limitation. Arbel native mode will not have this shortcoming.
167 if (qp_srq_en
&& type
!= IBT_RC_RQP
) {
168 /* Set "status" and "errormsg" and goto failure */
169 TAVOR_TNF_FAIL(IBT_INVALID_PARAM
, "invalid serv type with SRQ");
174 * Check for valid PD handle pointer
176 if (attr_p
->qp_pd_hdl
== NULL
) {
177 /* Set "status" and "errormsg" and goto failure */
178 TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID
, "invalid PD handle");
181 pd
= (tavor_pdhdl_t
)attr_p
->qp_pd_hdl
;
184 * If on an SRQ, check to make sure the PD is the same
186 if (qp_srq_en
&& (pd
->pd_pdnum
!= srq
->srq_pdhdl
->pd_pdnum
)) {
187 /* Set "status" and "errormsg" and goto failure */
188 TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID
, "invalid PD handle");
192 /* Increment the reference count on the protection domain (PD) */
193 tavor_pd_refcnt_inc(pd
);
196 * Check for valid CQ handle pointers
198 if ((attr_p
->qp_ibc_scq_hdl
== NULL
) ||
199 (attr_p
->qp_ibc_rcq_hdl
== NULL
)) {
200 /* Set "status" and "errormsg" and goto failure */
201 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID
, "invalid CQ handle");
204 sq_cq
= (tavor_cqhdl_t
)attr_p
->qp_ibc_scq_hdl
;
205 rq_cq
= (tavor_cqhdl_t
)attr_p
->qp_ibc_rcq_hdl
;
208 * Increment the reference count on the CQs. One or both of these
209 * could return error if we determine that the given CQ is already
210 * being used with a special (SMI/GSI) QP.
212 status
= tavor_cq_refcnt_inc(sq_cq
, TAVOR_CQ_IS_NORMAL
);
213 if (status
!= DDI_SUCCESS
) {
214 /* Set "status" and "errormsg" and goto failure */
215 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID
, "invalid CQ handle");
218 status
= tavor_cq_refcnt_inc(rq_cq
, TAVOR_CQ_IS_NORMAL
);
219 if (status
!= DDI_SUCCESS
) {
220 /* Set "status" and "errormsg" and goto failure */
221 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID
, "invalid CQ handle");
226 * Allocate an QP context entry. This will be filled in with all
227 * the necessary parameters to define the Queue Pair. Unlike
228 * other Tavor hardware resources, ownership is not immediately
229 * given to hardware in the final step here. Instead, we must
230 * wait until the QP is later transitioned to the "Init" state before
231 * passing the QP to hardware. If we fail here, we must undo all
232 * the reference count (CQ and PD).
234 status
= tavor_rsrc_alloc(state
, TAVOR_QPC
, 1, sleepflag
, &qpc
);
235 if (status
!= DDI_SUCCESS
) {
236 /* Set "status" and "errormsg" and goto failure */
237 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed QP context");
242 * Allocate the software structure for tracking the queue pair
243 * (i.e. the Tavor Queue Pair handle). If we fail here, we must
244 * undo the reference counts and the previous resource allocation.
246 status
= tavor_rsrc_alloc(state
, TAVOR_QPHDL
, 1, sleepflag
, &rsrc
);
247 if (status
!= DDI_SUCCESS
) {
248 /* Set "status" and "errormsg" and goto failure */
249 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed QP handle");
252 qp
= (tavor_qphdl_t
)rsrc
->tr_addr
;
253 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp
))
256 * Calculate the QP number from QPC index. This routine handles
257 * all of the operations necessary to keep track of used, unused,
258 * and released QP numbers.
260 status
= tavor_qp_create_qpn(state
, qp
, qpc
);
261 if (status
!= DDI_SUCCESS
) {
262 /* Set "status" and "errormsg" and goto failure */
263 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed QPN create");
268 * If this will be a user-mappable QP, then allocate an entry for
269 * the "userland resources database". This will later be added to
270 * the database (after all further QP operations are successful).
271 * If we fail here, we must undo the reference counts and the
272 * previous resource allocation.
275 umapdb
= tavor_umap_db_alloc(state
->ts_instance
, qp
->qp_qpnum
,
276 MLNX_UMAP_QPMEM_RSRC
, (uint64_t)(uintptr_t)rsrc
);
277 if (umapdb
== NULL
) {
278 /* Set "status" and "errormsg" and goto failure */
279 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed umap add");
285 * If this is an RC QP, then pre-allocate the maximum number of RDB
286 * entries. This allows us to ensure that we can later cover all
287 * the resources needed by hardware for handling multiple incoming
288 * RDMA Reads. Note: These resources are obviously not always
289 * necessary. They are allocated here anyway. Someday maybe this
290 * can be modified to allocate these on-the-fly (i.e. only if RDMA
291 * Read or Atomic operations are enabled) XXX
292 * If we fail here, we have a bunch of resource and reference count
295 if (type
== IBT_RC_RQP
) {
296 max_rdb
= state
->ts_cfg_profile
->cp_hca_max_rdma_in_qp
;
297 status
= tavor_rsrc_alloc(state
, TAVOR_RDB
, max_rdb
,
299 if (status
!= DDI_SUCCESS
) {
300 /* Set "status" and "errormsg" and goto failure */
301 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed RDB");
304 qp
->qp_rdbrsrcp
= rdb
;
305 /* Calculate offset (into DDR memory) of RDB entries */
306 rsrc_pool
= &state
->ts_rsrc_hdl
[TAVOR_RDB
];
307 qp
->qp_rdb_ddraddr
= (uintptr_t)rsrc_pool
->rsrc_ddr_offset
+
308 (rdb
->tr_indx
<< TAVOR_RDB_SIZE_SHIFT
);
312 * Calculate the appropriate size for the work queues.
313 * Note: All Tavor QP work queues must be a power-of-2 in size. Also
314 * they may not be any smaller than TAVOR_QP_MIN_SIZE. This step is
315 * to round the requested size up to the next highest power-of-2
317 attr_p
->qp_sizes
.cs_sq
= max(attr_p
->qp_sizes
.cs_sq
, TAVOR_QP_MIN_SIZE
);
318 attr_p
->qp_sizes
.cs_rq
= max(attr_p
->qp_sizes
.cs_rq
, TAVOR_QP_MIN_SIZE
);
319 log_qp_sq_size
= highbit(attr_p
->qp_sizes
.cs_sq
);
320 if (ISP2(attr_p
->qp_sizes
.cs_sq
)) {
321 log_qp_sq_size
= log_qp_sq_size
- 1;
323 log_qp_rq_size
= highbit(attr_p
->qp_sizes
.cs_rq
);
324 if (ISP2(attr_p
->qp_sizes
.cs_rq
)) {
325 log_qp_rq_size
= log_qp_rq_size
- 1;
329 * Next we verify that the rounded-up size is valid (i.e. consistent
330 * with the device limits and/or software-configured limits). If not,
331 * then obviously we have a lot of cleanup to do before returning.
333 if ((log_qp_sq_size
> state
->ts_cfg_profile
->cp_log_max_qp_sz
) ||
334 (!qp_srq_en
&& (log_qp_rq_size
>
335 state
->ts_cfg_profile
->cp_log_max_qp_sz
))) {
336 /* Set "status" and "errormsg" and goto failure */
337 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED
, "max QP size");
342 * Next we verify that the requested number of SGL is valid (i.e.
343 * consistent with the device limits and/or software-configured
344 * limits). If not, then obviously the same cleanup needs to be done.
346 max_sgl
= state
->ts_cfg_profile
->cp_wqe_real_max_sgl
;
347 if ((attr_p
->qp_sizes
.cs_sq_sgl
> max_sgl
) ||
348 (!qp_srq_en
&& (attr_p
->qp_sizes
.cs_rq_sgl
> max_sgl
))) {
349 /* Set "status" and "errormsg" and goto failure */
350 TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED
, "max QP SGL");
355 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
356 * This will depend on the requested number of SGLs. Note: this
357 * has the side-effect of also calculating the real number of SGLs
358 * (for the calculated WQE size).
360 * For QP's on an SRQ, we set these to 0.
363 qp
->qp_rq_log_wqesz
= 0;
366 tavor_qp_sgl_to_logwqesz(state
, attr_p
->qp_sizes
.cs_rq_sgl
,
367 TAVOR_QP_WQ_TYPE_RECVQ
, &qp
->qp_rq_log_wqesz
,
370 tavor_qp_sgl_to_logwqesz(state
, attr_p
->qp_sizes
.cs_sq_sgl
,
371 TAVOR_QP_WQ_TYPE_SENDQ
, &qp
->qp_sq_log_wqesz
, &qp
->qp_sq_sgl
);
374 * Allocate the memory for QP work queues. Note: The location from
375 * which we will allocate these work queues has been passed in
376 * through the tavor_qp_options_t structure. Since Tavor work queues
377 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
378 * the work queue memory is very important. We used to allocate
379 * work queues (the combined receive and send queues) so that they
380 * would be aligned on their combined size. That alignment guaranteed
381 * that they would never cross the 4GB boundary (Tavor work queues
382 * are on the order of MBs at maximum). Now we are able to relax
383 * this alignment constraint by ensuring that the IB address assigned
384 * to the queue memory (as a result of the tavor_mr_register() call)
385 * is offset from zero.
386 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
387 * guarantee the alignment, but when attempting to use IOMMU bypass
388 * mode we found that we were not allowed to specify any alignment
389 * that was more restrictive than the system page size.
390 * So we avoided this constraint by passing two alignment values,
391 * one for the memory allocation itself and the other for the DMA
392 * handle (for later bind). This used to cause more memory than
393 * necessary to be allocated (in order to guarantee the more
394 * restrictive alignment contraint). But be guaranteeing the
395 * zero-based IB virtual address for the queue, we are able to
396 * conserve this memory.
397 * Note: If QP is not user-mappable, then it may come from either
398 * kernel system memory or from HCA-attached local DDR memory.
400 sq_wqe_size
= 1 << qp
->qp_sq_log_wqesz
;
401 sq_size
= (1 << log_qp_sq_size
) * sq_wqe_size
;
403 /* QP on SRQ sets these to 0 */
408 rq_wqe_size
= 1 << qp
->qp_rq_log_wqesz
;
409 rq_size
= (1 << log_qp_rq_size
) * rq_wqe_size
;
412 qp
->qp_wqinfo
.qa_size
= sq_size
+ rq_size
;
413 qp
->qp_wqinfo
.qa_alloc_align
= max(sq_wqe_size
, rq_wqe_size
);
414 qp
->qp_wqinfo
.qa_bind_align
= max(sq_wqe_size
, rq_wqe_size
);
416 qp
->qp_wqinfo
.qa_location
= TAVOR_QUEUE_LOCATION_USERLAND
;
418 qp
->qp_wqinfo
.qa_location
= wq_location
;
420 status
= tavor_queue_alloc(state
, &qp
->qp_wqinfo
, sleepflag
);
421 if (status
!= DDI_SUCCESS
) {
422 /* Set "status" and "errormsg" and goto failure */
423 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed work queue");
426 if (sq_wqe_size
> rq_wqe_size
) {
427 sq_buf
= qp
->qp_wqinfo
.qa_buf_aligned
;
430 * If QP's on an SRQ, we set the rq_buf to NULL
435 rq_buf
= (uint32_t *)((uintptr_t)sq_buf
+ sq_size
);
437 rq_buf
= qp
->qp_wqinfo
.qa_buf_aligned
;
438 sq_buf
= (uint32_t *)((uintptr_t)rq_buf
+ rq_size
);
442 * Register the memory for the QP work queues. The memory for the
443 * QP must be registered in the Tavor TPT tables. This gives us the
444 * LKey to specify in the QP context later. Note: The memory for
445 * Tavor work queues (both Send and Recv) must be contiguous and
446 * registered as a single memory region. Note also: If the work
447 * queue is to be allocated from DDR memory, then only a "bypass"
448 * mapping is appropriate. And if the QP memory is user-mappable,
449 * then we force DDI_DMA_CONSISTENT mapping.
450 * Also, in order to meet the alignment restriction, we pass the
451 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
452 * This guarantees that the resulting IB vaddr will be zero-based
453 * (modulo the offset into the first page).
454 * If we fail here, we still have the bunch of resource and reference
455 * count cleanup to do.
457 flag
= (sleepflag
== TAVOR_SLEEP
) ? IBT_MR_SLEEP
:
459 mr_attr
.mr_vaddr
= (uint64_t)(uintptr_t)qp
->qp_wqinfo
.qa_buf_aligned
;
460 mr_attr
.mr_len
= qp
->qp_wqinfo
.qa_size
;
461 mr_attr
.mr_as
= NULL
;
462 mr_attr
.mr_flags
= flag
;
464 mr_op
.mro_bind_type
= state
->ts_cfg_profile
->cp_iommu_bypass
;
466 if (wq_location
== TAVOR_QUEUE_LOCATION_NORMAL
) {
467 mr_op
.mro_bind_type
=
468 state
->ts_cfg_profile
->cp_iommu_bypass
;
470 state
->ts_cfg_profile
->cp_streaming_consistent
;
471 if (dma_xfer_mode
== DDI_DMA_STREAMING
) {
472 mr_attr
.mr_flags
|= IBT_MR_NONCOHERENT
;
475 mr_op
.mro_bind_type
= TAVOR_BINDMEM_BYPASS
;
478 mr_op
.mro_bind_dmahdl
= qp
->qp_wqinfo
.qa_dmahdl
;
479 mr_op
.mro_bind_override_addr
= 1;
480 status
= tavor_mr_register(state
, pd
, &mr_attr
, &mr
, &mr_op
);
481 if (status
!= DDI_SUCCESS
) {
482 /* Set "status" and "errormsg" and goto failure */
483 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed register mr");
488 * Calculate the offset between the kernel virtual address space
489 * and the IB virtual address space. This will be used when
490 * posting work requests to properly initialize each WQE.
492 qp_desc_off
= (uint64_t)(uintptr_t)qp
->qp_wqinfo
.qa_buf_aligned
-
493 (uint64_t)mr
->mr_bindinfo
.bi_addr
;
496 * Fill in all the return arguments (if necessary). This includes
497 * real work queue sizes, real SGLs, and QP number
499 if (queuesz_p
!= NULL
) {
500 queuesz_p
->cs_sq
= (1 << log_qp_sq_size
);
501 queuesz_p
->cs_sq_sgl
= qp
->qp_sq_sgl
;
503 /* QP on an SRQ set these to 0 */
505 queuesz_p
->cs_rq
= 0;
506 queuesz_p
->cs_rq_sgl
= 0;
508 queuesz_p
->cs_rq
= (1 << log_qp_rq_size
);
509 queuesz_p
->cs_rq_sgl
= qp
->qp_rq_sgl
;
513 *qpn
= (ib_qpn_t
)qp
->qp_qpnum
;
517 * Fill in the rest of the Tavor Queue Pair handle. We can update
518 * the following fields for use in further operations on the QP.
520 qp
->qp_qpcrsrcp
= qpc
;
522 qp
->qp_state
= TAVOR_QP_RESET
;
525 qp
->qp_sq_sigtype
= (attr_p
->qp_flags
& IBT_WR_SIGNALED
) ?
526 TAVOR_QP_SQ_WR_SIGNALED
: TAVOR_QP_SQ_ALL_SIGNALED
;
527 qp
->qp_is_special
= 0;
528 qp
->qp_is_umap
= qp_is_umap
;
529 qp
->qp_uarpg
= (qp
->qp_is_umap
) ? uarpg
: 0;
530 qp
->qp_umap_dhp
= (devmap_cookie_t
)NULL
;
531 qp
->qp_sq_cqhdl
= sq_cq
;
532 qp
->qp_sq_lastwqeaddr
= NULL
;
533 qp
->qp_sq_bufsz
= (1 << log_qp_sq_size
);
534 qp
->qp_sq_buf
= sq_buf
;
535 qp
->qp_desc_off
= qp_desc_off
;
536 qp
->qp_rq_cqhdl
= rq_cq
;
537 qp
->qp_rq_lastwqeaddr
= NULL
;
538 qp
->qp_rq_buf
= rq_buf
;
540 /* QP on an SRQ sets this to 0 */
544 qp
->qp_rq_bufsz
= (1 << log_qp_rq_size
);
547 qp
->qp_forward_sqd_event
= 0;
548 qp
->qp_sqd_still_draining
= 0;
549 qp
->qp_hdlrarg
= (void *)ibt_qphdl
;
550 qp
->qp_mcg_refcnt
= 0;
553 * If this QP is to be associated with an SRQ, then set the SRQ handle
558 qp
->qp_srq_en
= TAVOR_QP_SRQ_ENABLED
;
559 tavor_srq_refcnt_inc(qp
->qp_srqhdl
);
561 qp
->qp_srqhdl
= NULL
;
562 qp
->qp_srq_en
= TAVOR_QP_SRQ_DISABLED
;
565 /* Determine if later ddi_dma_sync will be necessary */
566 qp
->qp_sync
= TAVOR_QP_IS_SYNC_REQ(state
, qp
->qp_wqinfo
);
568 /* Determine the QP service type */
569 if (type
== IBT_RC_RQP
) {
570 qp
->qp_serv_type
= TAVOR_QP_RC
;
571 } else if (type
== IBT_UD_RQP
) {
572 qp
->qp_serv_type
= TAVOR_QP_UD
;
574 qp
->qp_serv_type
= TAVOR_QP_UC
;
577 /* Zero out the QP context */
578 bzero(&qp
->qpc
, sizeof (tavor_hw_qpc_t
));
581 * Put QP handle in Tavor QPNum-to-QPHdl list. Then fill in the
582 * "qphdl" and return success
584 ASSERT(state
->ts_qphdl
[qpc
->tr_indx
] == NULL
);
585 state
->ts_qphdl
[qpc
->tr_indx
] = qp
;
588 * If this is a user-mappable QP, then we need to insert the previously
589 * allocated entry into the "userland resources database". This will
590 * allow for later lookup during devmap() (i.e. mmap()) calls.
593 tavor_umap_db_add(umapdb
);
598 TAVOR_TNF_EXIT(tavor_qp_alloc
);
599 return (DDI_SUCCESS
);
602 * The following is cleanup for all possible failure cases in this routine
605 tavor_queue_free(state
, &qp
->qp_wqinfo
);
607 if (type
== IBT_RC_RQP
) {
608 tavor_rsrc_free(state
, &rdb
);
612 tavor_umap_db_free(umapdb
);
616 * Releasing the QPN will also free up the QPC context. Update
617 * the QPC context pointer to indicate this.
619 tavor_qp_release_qpn(state
, qp
->qp_qpn_hdl
, TAVOR_QPN_RELEASE
);
622 tavor_rsrc_free(state
, &rsrc
);
625 tavor_rsrc_free(state
, &qpc
);
628 tavor_cq_refcnt_dec(rq_cq
);
630 tavor_cq_refcnt_dec(sq_cq
);
632 tavor_pd_refcnt_dec(pd
);
634 TNF_PROBE_1(tavor_qp_alloc_fail
, TAVOR_TNF_ERROR
, "",
635 tnf_string
, msg
, errormsg
);
636 TAVOR_TNF_EXIT(tavor_qp_alloc
);
643 * tavor_special_qp_alloc()
644 * Context: Can be called only from user or kernel context.
647 tavor_special_qp_alloc(tavor_state_t
*state
, tavor_qp_info_t
*qpinfo
,
648 uint_t sleepflag
, tavor_qp_options_t
*op
)
650 tavor_rsrc_t
*qpc
, *rsrc
;
652 ibt_qp_alloc_attr_t
*attr_p
;
655 ibtl_qp_hdl_t ibt_qphdl
;
656 ibt_chan_sizes_t
*queuesz_p
;
657 tavor_qphdl_t
*qphdl
;
658 ibt_mr_attr_t mr_attr
;
659 tavor_mr_options_t mr_op
;
661 tavor_cqhdl_t sq_cq
, rq_cq
;
663 uint64_t qp_desc_off
;
664 uint32_t *sq_buf
, *rq_buf
;
665 uint32_t log_qp_sq_size
, log_qp_rq_size
;
666 uint32_t sq_size
, rq_size
, max_sgl
;
667 uint32_t sq_wqe_size
, rq_wqe_size
;
668 uint_t wq_location
, dma_xfer_mode
;
672 TAVOR_TNF_ENTER(tavor_special_qp_alloc
);
675 * Check the "options" flag. Currently this flag tells the driver
676 * whether or not the QP's work queues should be come from normal
677 * system memory or whether they should be allocated from DDR memory.
680 wq_location
= TAVOR_QUEUE_LOCATION_NORMAL
;
682 wq_location
= op
->qpo_wq_loc
;
686 * Extract the necessary info from the tavor_qp_info_t structure
688 attr_p
= qpinfo
->qpi_attrp
;
689 type
= qpinfo
->qpi_type
;
690 port
= qpinfo
->qpi_port
;
691 ibt_qphdl
= qpinfo
->qpi_ibt_qphdl
;
692 queuesz_p
= qpinfo
->qpi_queueszp
;
693 qphdl
= &qpinfo
->qpi_qphdl
;
696 * Check for valid special QP type (only SMI & GSI supported)
698 if ((type
!= IBT_SMI_SQP
) && (type
!= IBT_GSI_SQP
)) {
699 /* Set "status" and "errormsg" and goto failure */
700 TAVOR_TNF_FAIL(IBT_QP_SPECIAL_TYPE_INVALID
, "invalid QP type");
701 goto spec_qpalloc_fail
;
705 * Check for valid port number
707 if (!tavor_portnum_is_valid(state
, port
)) {
708 /* Set "status" and "errormsg" and goto failure */
709 TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID
, "invalid port num");
710 goto spec_qpalloc_fail
;
715 * Check for valid PD handle pointer
717 if (attr_p
->qp_pd_hdl
== NULL
) {
718 /* Set "status" and "errormsg" and goto failure */
719 TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID
, "invalid PD handle");
720 goto spec_qpalloc_fail
;
722 pd
= (tavor_pdhdl_t
)attr_p
->qp_pd_hdl
;
724 /* Increment the reference count on the PD */
725 tavor_pd_refcnt_inc(pd
);
728 * Check for valid CQ handle pointers
730 if ((attr_p
->qp_ibc_scq_hdl
== NULL
) ||
731 (attr_p
->qp_ibc_rcq_hdl
== NULL
)) {
732 /* Set "status" and "errormsg" and goto failure */
733 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID
, "invalid CQ handle");
734 goto spec_qpalloc_fail1
;
736 sq_cq
= (tavor_cqhdl_t
)attr_p
->qp_ibc_scq_hdl
;
737 rq_cq
= (tavor_cqhdl_t
)attr_p
->qp_ibc_rcq_hdl
;
740 * Increment the reference count on the CQs. One or both of these
741 * could return error if we determine that the given CQ is already
742 * being used with a non-special QP (i.e. a normal QP).
744 status
= tavor_cq_refcnt_inc(sq_cq
, TAVOR_CQ_IS_SPECIAL
);
745 if (status
!= DDI_SUCCESS
) {
746 /* Set "status" and "errormsg" and goto failure */
747 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID
, "invalid CQ handle");
748 goto spec_qpalloc_fail1
;
750 status
= tavor_cq_refcnt_inc(rq_cq
, TAVOR_CQ_IS_SPECIAL
);
751 if (status
!= DDI_SUCCESS
) {
752 /* Set "status" and "errormsg" and goto failure */
753 TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID
, "invalid CQ handle");
754 goto spec_qpalloc_fail2
;
758 * Allocate the special QP resources. Essentially, this allocation
759 * amounts to checking if the request special QP has already been
760 * allocated. If successful, the QP context return is an actual
761 * QP context that has been "aliased" to act as a special QP of the
762 * appropriate type (and for the appropriate port). Just as in
763 * tavor_qp_alloc() above, ownership for this QP context is not
764 * immediately given to hardware in the final step here. Instead, we
765 * wait until the QP is later transitioned to the "Init" state before
766 * passing the QP to hardware. If we fail here, we must undo all
767 * the reference count (CQ and PD).
769 status
= tavor_special_qp_rsrc_alloc(state
, type
, port
, &qpc
);
770 if (status
!= DDI_SUCCESS
) {
771 /* Set "status" and "errormsg" and goto failure */
772 TAVOR_TNF_FAIL(status
, "failed special QP rsrc");
773 goto spec_qpalloc_fail3
;
777 * Allocate the software structure for tracking the special queue
778 * pair (i.e. the Tavor Queue Pair handle). If we fail here, we
779 * must undo the reference counts and the previous resource allocation.
781 status
= tavor_rsrc_alloc(state
, TAVOR_QPHDL
, 1, sleepflag
, &rsrc
);
782 if (status
!= DDI_SUCCESS
) {
783 /* Set "status" and "errormsg" and goto failure */
784 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed QP handle");
785 goto spec_qpalloc_fail4
;
787 qp
= (tavor_qphdl_t
)rsrc
->tr_addr
;
788 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp
))
791 * Actual QP number is a combination of the index of the QPC and
792 * the port number. This is because the special QP contexts must
793 * be allocated two-at-a-time.
795 qp
->qp_qpnum
= qpc
->tr_indx
+ port
;
798 * Calculate the appropriate size for the work queues.
799 * Note: All Tavor QP work queues must be a power-of-2 in size. Also
800 * they may not be any smaller than TAVOR_QP_MIN_SIZE. This step is
801 * to round the requested size up to the next highest power-of-2
803 attr_p
->qp_sizes
.cs_sq
= max(attr_p
->qp_sizes
.cs_sq
, TAVOR_QP_MIN_SIZE
);
804 attr_p
->qp_sizes
.cs_rq
= max(attr_p
->qp_sizes
.cs_rq
, TAVOR_QP_MIN_SIZE
);
805 log_qp_sq_size
= highbit(attr_p
->qp_sizes
.cs_sq
);
806 if (ISP2(attr_p
->qp_sizes
.cs_sq
)) {
807 log_qp_sq_size
= log_qp_sq_size
- 1;
809 log_qp_rq_size
= highbit(attr_p
->qp_sizes
.cs_rq
);
810 if (ISP2(attr_p
->qp_sizes
.cs_rq
)) {
811 log_qp_rq_size
= log_qp_rq_size
- 1;
815 * Next we verify that the rounded-up size is valid (i.e. consistent
816 * with the device limits and/or software-configured limits). If not,
817 * then obviously we have a bit of cleanup to do before returning.
819 if ((log_qp_sq_size
> state
->ts_cfg_profile
->cp_log_max_qp_sz
) ||
820 (log_qp_rq_size
> state
->ts_cfg_profile
->cp_log_max_qp_sz
)) {
821 /* Set "status" and "errormsg" and goto failure */
822 TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED
, "max QP size");
823 goto spec_qpalloc_fail5
;
827 * Next we verify that the requested number of SGL is valid (i.e.
828 * consistent with the device limits and/or software-configured
829 * limits). If not, then obviously the same cleanup needs to be done.
831 max_sgl
= state
->ts_cfg_profile
->cp_wqe_real_max_sgl
;
832 if ((attr_p
->qp_sizes
.cs_sq_sgl
> max_sgl
) ||
833 (attr_p
->qp_sizes
.cs_rq_sgl
> max_sgl
)) {
834 /* Set "status" and "errormsg" and goto failure */
835 TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED
, "max QP SGL");
836 goto spec_qpalloc_fail5
;
840 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
841 * This will depend on the requested number of SGLs. Note: this
842 * has the side-effect of also calculating the real number of SGLs
843 * (for the calculated WQE size).
845 tavor_qp_sgl_to_logwqesz(state
, attr_p
->qp_sizes
.cs_rq_sgl
,
846 TAVOR_QP_WQ_TYPE_RECVQ
, &qp
->qp_rq_log_wqesz
, &qp
->qp_rq_sgl
);
847 if (type
== IBT_SMI_SQP
) {
848 tavor_qp_sgl_to_logwqesz(state
, attr_p
->qp_sizes
.cs_sq_sgl
,
849 TAVOR_QP_WQ_TYPE_SENDMLX_QP0
, &qp
->qp_sq_log_wqesz
,
852 tavor_qp_sgl_to_logwqesz(state
, attr_p
->qp_sizes
.cs_sq_sgl
,
853 TAVOR_QP_WQ_TYPE_SENDMLX_QP1
, &qp
->qp_sq_log_wqesz
,
858 * Allocate the memory for QP work queues. Note: The location from
859 * which we will allocate these work queues has been passed in
860 * through the tavor_qp_options_t structure. Since Tavor work queues
861 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
862 * the work queue memory is very important. We used to allocate
863 * work queues (the combined receive and send queues) so that they
864 * would be aligned on their combined size. That alignment guaranteed
865 * that they would never cross the 4GB boundary (Tavor work queues
866 * are on the order of MBs at maximum). Now we are able to relax
867 * this alignment constraint by ensuring that the IB address assigned
868 * to the queue memory (as a result of the tavor_mr_register() call)
869 * is offset from zero.
870 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
871 * guarantee the alignment, but when attempting to use IOMMU bypass
872 * mode we found that we were not allowed to specify any alignment
873 * that was more restrictive than the system page size.
874 * So we avoided this constraint by passing two alignment values,
875 * one for the memory allocation itself and the other for the DMA
876 * handle (for later bind). This used to cause more memory than
877 * necessary to be allocated (in order to guarantee the more
878 * restrictive alignment contraint). But be guaranteeing the
879 * zero-based IB virtual address for the queue, we are able to
880 * conserve this memory.
882 sq_wqe_size
= 1 << qp
->qp_sq_log_wqesz
;
883 rq_wqe_size
= 1 << qp
->qp_rq_log_wqesz
;
884 sq_size
= (1 << log_qp_sq_size
) * sq_wqe_size
;
885 rq_size
= (1 << log_qp_rq_size
) * rq_wqe_size
;
886 qp
->qp_wqinfo
.qa_size
= sq_size
+ rq_size
;
887 qp
->qp_wqinfo
.qa_alloc_align
= max(sq_wqe_size
, rq_wqe_size
);
888 qp
->qp_wqinfo
.qa_bind_align
= max(sq_wqe_size
, rq_wqe_size
);
889 qp
->qp_wqinfo
.qa_location
= wq_location
;
890 status
= tavor_queue_alloc(state
, &qp
->qp_wqinfo
, sleepflag
);
891 if (status
!= NULL
) {
892 /* Set "status" and "errormsg" and goto failure */
893 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed work queue");
894 goto spec_qpalloc_fail5
;
896 if (sq_wqe_size
> rq_wqe_size
) {
897 sq_buf
= qp
->qp_wqinfo
.qa_buf_aligned
;
898 rq_buf
= (uint32_t *)((uintptr_t)sq_buf
+ sq_size
);
900 rq_buf
= qp
->qp_wqinfo
.qa_buf_aligned
;
901 sq_buf
= (uint32_t *)((uintptr_t)rq_buf
+ rq_size
);
905 * Register the memory for the special QP work queues. The memory for
906 * the special QP must be registered in the Tavor TPT tables. This
907 * gives us the LKey to specify in the QP context later. Note: The
908 * memory for Tavor work queues (both Send and Recv) must be contiguous
909 * and registered as a single memory region. Note also: If the work
910 * queue is to be allocated from DDR memory, then only a "bypass"
911 * mapping is appropriate.
912 * Also, in order to meet the alignment restriction, we pass the
913 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
914 * This guarantees that the resulting IB vaddr will be zero-based
915 * (modulo the offset into the first page).
916 * If we fail here, we have a bunch of resource and reference count
919 flag
= (sleepflag
== TAVOR_SLEEP
) ? IBT_MR_SLEEP
:
921 mr_attr
.mr_vaddr
= (uint64_t)(uintptr_t)qp
->qp_wqinfo
.qa_buf_aligned
;
922 mr_attr
.mr_len
= qp
->qp_wqinfo
.qa_size
;
923 mr_attr
.mr_as
= NULL
;
924 mr_attr
.mr_flags
= flag
;
925 if (wq_location
== TAVOR_QUEUE_LOCATION_NORMAL
) {
926 mr_op
.mro_bind_type
= state
->ts_cfg_profile
->cp_iommu_bypass
;
928 dma_xfer_mode
= state
->ts_cfg_profile
->cp_streaming_consistent
;
929 if (dma_xfer_mode
== DDI_DMA_STREAMING
) {
930 mr_attr
.mr_flags
|= IBT_MR_NONCOHERENT
;
933 mr_op
.mro_bind_type
= TAVOR_BINDMEM_BYPASS
;
935 mr_op
.mro_bind_dmahdl
= qp
->qp_wqinfo
.qa_dmahdl
;
936 mr_op
.mro_bind_override_addr
= 1;
937 status
= tavor_mr_register(state
, pd
, &mr_attr
, &mr
, &mr_op
);
938 if (status
!= DDI_SUCCESS
) {
939 /* Set "status" and "errormsg" and goto failure */
940 TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE
, "failed register mr");
941 goto spec_qpalloc_fail6
;
945 * Calculate the offset between the kernel virtual address space
946 * and the IB virtual address space. This will be used when
947 * posting work requests to properly initialize each WQE.
949 qp_desc_off
= (uint64_t)(uintptr_t)qp
->qp_wqinfo
.qa_buf_aligned
-
950 (uint64_t)mr
->mr_bindinfo
.bi_addr
;
953 * Fill in all the return arguments (if necessary). This includes
954 * real work queue sizes, real SGLs, and QP number (which will be
955 * either zero or one, depending on the special QP type)
957 if (queuesz_p
!= NULL
) {
958 queuesz_p
->cs_sq
= (1 << log_qp_sq_size
);
959 queuesz_p
->cs_sq_sgl
= qp
->qp_sq_sgl
;
960 queuesz_p
->cs_rq
= (1 << log_qp_rq_size
);
961 queuesz_p
->cs_rq_sgl
= qp
->qp_rq_sgl
;
965 * Fill in the rest of the Tavor Queue Pair handle. We can update
966 * the following fields for use in further operations on the QP.
968 qp
->qp_qpcrsrcp
= qpc
;
970 qp
->qp_state
= TAVOR_QP_RESET
;
973 qp
->qp_sq_sigtype
= (attr_p
->qp_flags
& IBT_WR_SIGNALED
) ?
974 TAVOR_QP_SQ_WR_SIGNALED
: TAVOR_QP_SQ_ALL_SIGNALED
;
975 qp
->qp_is_special
= (type
== IBT_SMI_SQP
) ?
976 TAVOR_QP_SMI
: TAVOR_QP_GSI
;
979 qp
->qp_sq_cqhdl
= sq_cq
;
980 qp
->qp_sq_lastwqeaddr
= NULL
;
981 qp
->qp_sq_bufsz
= (1 << log_qp_sq_size
);
982 qp
->qp_sq_buf
= sq_buf
;
983 qp
->qp_desc_off
= qp_desc_off
;
984 qp
->qp_rq_cqhdl
= rq_cq
;
985 qp
->qp_rq_lastwqeaddr
= NULL
;
986 qp
->qp_rq_bufsz
= (1 << log_qp_rq_size
);
987 qp
->qp_rq_buf
= rq_buf
;
988 qp
->qp_portnum
= port
;
990 qp
->qp_hdlrarg
= (void *)ibt_qphdl
;
991 qp
->qp_mcg_refcnt
= 0;
993 qp
->qp_srqhdl
= NULL
;
995 /* Determine if later ddi_dma_sync will be necessary */
996 qp
->qp_sync
= TAVOR_QP_IS_SYNC_REQ(state
, qp
->qp_wqinfo
);
998 /* All special QPs are UD QP service type */
999 qp
->qp_serv_type
= TAVOR_QP_UD
;
1001 /* Zero out the QP context */
1002 bzero(&qp
->qpc
, sizeof (tavor_hw_qpc_t
));
1005 * Put QP handle in Tavor QPNum-to-QPHdl list. Then fill in the
1006 * "qphdl" and return success
1008 ASSERT(state
->ts_qphdl
[qpc
->tr_indx
+ port
] == NULL
);
1009 state
->ts_qphdl
[qpc
->tr_indx
+ port
] = qp
;
1013 TAVOR_TNF_EXIT(tavor_special_qp_alloc
);
1014 return (DDI_SUCCESS
);
1017 * The following is cleanup for all possible failure cases in this routine
1020 tavor_queue_free(state
, &qp
->qp_wqinfo
);
1022 tavor_rsrc_free(state
, &rsrc
);
1024 if (tavor_special_qp_rsrc_free(state
, type
, port
) != DDI_SUCCESS
) {
1025 TAVOR_WARNING(state
, "failed to free special QP rsrc");
1028 tavor_cq_refcnt_dec(rq_cq
);
1030 tavor_cq_refcnt_dec(sq_cq
);
1032 tavor_pd_refcnt_dec(pd
);
1034 TNF_PROBE_1(tavor_special_qp_alloc_fail
, TAVOR_TNF_ERROR
, "",
1035 tnf_string
, msg
, errormsg
);
1036 TAVOR_TNF_EXIT(tavor_special_qp_alloc
);
1043 * This function frees up the QP resources. Depending on the value
1044 * of the "free_qp_flags", the QP number may not be released until
1045 * a subsequent call to tavor_qp_release_qpn().
1047 * Context: Can be called only from user or kernel context.
1051 tavor_qp_free(tavor_state_t
*state
, tavor_qphdl_t
*qphdl
,
1052 ibc_free_qp_flags_t free_qp_flags
, ibc_qpn_hdl_t
*qpnh
,
1055 tavor_rsrc_t
*qpc
, *rdb
, *rsrc
;
1056 tavor_umap_db_entry_t
*umapdb
;
1057 tavor_qpn_entry_t
*entry
;
1060 tavor_cqhdl_t sq_cq
, rq_cq
;
1070 TAVOR_TNF_ENTER(tavor_qp_free
);
1073 * Pull all the necessary information from the Tavor Queue Pair
1074 * handle. This is necessary here because the resource for the
1075 * QP handle is going to be freed up as part of this operation.
1078 mutex_enter(&qp
->qp_lock
);
1079 qpc
= qp
->qp_qpcrsrcp
;
1080 rsrc
= qp
->qp_rsrcp
;
1082 srq
= qp
->qp_srqhdl
;
1084 rq_cq
= qp
->qp_rq_cqhdl
;
1085 sq_cq
= qp
->qp_sq_cqhdl
;
1086 rdb
= qp
->qp_rdbrsrcp
;
1087 port
= qp
->qp_portnum
;
1088 qp_srq_en
= qp
->qp_srq_en
;
1091 * If the QP is part of an MCG, then we fail the qp_free
1093 if (qp
->qp_mcg_refcnt
!= 0) {
1094 mutex_exit(&qp
->qp_lock
);
1095 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "QP part of MCG on free");
1100 * If the QP is not already in "Reset" state, then transition to
1101 * "Reset". This is necessary because software does not reclaim
1102 * ownership of the QP context until the QP is in the "Reset" state.
1103 * If the ownership transfer fails for any reason, then it is an
1104 * indication that something (either in HW or SW) has gone seriously
1105 * wrong. So we print a warning message and return.
1107 if (qp
->qp_state
!= TAVOR_QP_RESET
) {
1108 if (tavor_qp_to_reset(state
, qp
) != DDI_SUCCESS
) {
1109 mutex_exit(&qp
->qp_lock
);
1110 TAVOR_WARNING(state
, "failed to reset QP context");
1111 /* Set "status" and "errormsg" and goto failure */
1112 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1113 "reset QP context");
1116 qp
->qp_state
= TAVOR_QP_RESET
;
1119 * Do any additional handling necessary for the transition
1120 * to the "Reset" state (e.g. update the WRID lists)
1122 tavor_wrid_to_reset_handling(state
, qp
);
1126 * If this was a user-mappable QP, then we need to remove its entry
1127 * from the "userland resources database". If it is also currently
1128 * mmap()'d out to a user process, then we need to call
1129 * devmap_devmem_remap() to remap the QP memory to an invalid mapping.
1130 * We also need to invalidate the QP tracking information for the
1133 if (qp
->qp_is_umap
) {
1134 status
= tavor_umap_db_find(state
->ts_instance
, qp
->qp_qpnum
,
1135 MLNX_UMAP_QPMEM_RSRC
, &value
, TAVOR_UMAP_DB_REMOVE
,
1137 if (status
!= DDI_SUCCESS
) {
1138 mutex_exit(&qp
->qp_lock
);
1139 TAVOR_WARNING(state
, "failed to find in database");
1140 TAVOR_TNF_EXIT(tavor_qp_free
);
1141 return (ibc_get_ci_failure(0));
1143 tavor_umap_db_free(umapdb
);
1144 if (qp
->qp_umap_dhp
!= NULL
) {
1145 maxprot
= (PROT_READ
| PROT_WRITE
| PROT_USER
);
1146 status
= devmap_devmem_remap(qp
->qp_umap_dhp
,
1147 state
->ts_dip
, 0, 0, qp
->qp_wqinfo
.qa_size
,
1148 maxprot
, DEVMAP_MAPPING_INVALID
, NULL
);
1149 if (status
!= DDI_SUCCESS
) {
1150 mutex_exit(&qp
->qp_lock
);
1151 TAVOR_WARNING(state
, "failed in QP memory "
1152 "devmap_devmem_remap()");
1153 TAVOR_TNF_EXIT(tavor_qp_free
);
1154 return (ibc_get_ci_failure(0));
1156 qp
->qp_umap_dhp
= (devmap_cookie_t
)NULL
;
1161 * Put NULL into the Tavor QPNum-to-QPHdl list. This will allow any
1162 * in-progress events to detect that the QP corresponding to this
1163 * number has been freed. Note: it does depend in whether we are
1164 * freeing a special QP or not.
1166 if (qp
->qp_is_special
) {
1167 state
->ts_qphdl
[qpc
->tr_indx
+ port
] = NULL
;
1169 state
->ts_qphdl
[qpc
->tr_indx
] = NULL
;
1174 * At this point the lock is no longer necessary. We cannot
1175 * protect from multiple simultaneous calls to free the same QP.
1176 * In addition, since the QP lock is contained in the QP "software
1177 * handle" resource, which we will free (see below), it is
1178 * important that we have no further references to that memory.
1180 mutex_exit(&qp
->qp_lock
);
1181 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp
))
1184 * Free the QP resources
1185 * Start by deregistering and freeing the memory for work queues.
1186 * Next free any previously allocated context information
1187 * (depending on QP type)
1188 * Finally, decrement the necessary reference counts.
1189 * If this fails for any reason, then it is an indication that
1190 * something (either in HW or SW) has gone seriously wrong. So we
1191 * print a warning message and return.
1193 status
= tavor_mr_deregister(state
, &mr
, TAVOR_MR_DEREG_ALL
,
1195 if (status
!= DDI_SUCCESS
) {
1196 TAVOR_WARNING(state
, "failed to deregister QP memory");
1197 /* Set "status" and "errormsg" and goto failure */
1198 TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed deregister mr");
1202 /* Free the memory for the QP */
1203 tavor_queue_free(state
, &qp
->qp_wqinfo
);
1206 * Free up the remainder of the QP resources. Note: we have a few
1207 * different resources to free up depending on whether the QP is a
1208 * special QP or not. As described above, if any of these fail for
1209 * any reason it is an indication that something (either in HW or SW)
1210 * has gone seriously wrong. So we print a warning message and
1213 if (qp
->qp_is_special
) {
1214 type
= (qp
->qp_is_special
== TAVOR_QP_SMI
) ?
1215 IBT_SMI_SQP
: IBT_GSI_SQP
;
1217 /* Free up resources for the special QP */
1218 status
= tavor_special_qp_rsrc_free(state
, type
, port
);
1219 if (status
!= DDI_SUCCESS
) {
1220 TAVOR_WARNING(state
, "failed to free special QP rsrc");
1221 /* Set "status" and "errormsg" and goto failure */
1222 TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
1223 "failed special QP rsrc");
1228 type
= qp
->qp_serv_type
;
1230 /* Free up the RDB entries resource */
1231 if (type
== TAVOR_QP_RC
) {
1232 tavor_rsrc_free(state
, &rdb
);
1236 * Check the flags and determine whether to release the
1237 * QPN or not, based on their value.
1239 if (free_qp_flags
== IBC_FREE_QP_ONLY
) {
1240 entry
= qp
->qp_qpn_hdl
;
1241 tavor_qp_release_qpn(state
, qp
->qp_qpn_hdl
,
1242 TAVOR_QPN_FREE_ONLY
);
1243 *qpnh
= (ibc_qpn_hdl_t
)entry
;
1245 tavor_qp_release_qpn(state
, qp
->qp_qpn_hdl
,
1250 /* Free the Tavor Queue Pair handle */
1251 tavor_rsrc_free(state
, &rsrc
);
1253 /* Decrement the reference counts on CQs, PD and SRQ (if needed) */
1254 tavor_cq_refcnt_dec(rq_cq
);
1255 tavor_cq_refcnt_dec(sq_cq
);
1256 tavor_pd_refcnt_dec(pd
);
1257 if (qp_srq_en
== TAVOR_QP_SRQ_ENABLED
) {
1258 tavor_srq_refcnt_dec(srq
);
1261 /* Set the qphdl pointer to NULL and return success */
1264 TAVOR_TNF_EXIT(tavor_qp_free
);
1265 return (DDI_SUCCESS
);
1268 TNF_PROBE_1(tavor_qp_free_fail
, TAVOR_TNF_ERROR
, "",
1269 tnf_string
, msg
, errormsg
);
1270 TAVOR_TNF_EXIT(tavor_qp_free
);
1277 * Context: Can be called from interrupt or base context.
1280 tavor_qp_query(tavor_state_t
*state
, tavor_qphdl_t qp
,
1281 ibt_qp_query_attr_t
*attr_p
)
1283 ibt_cep_state_t qp_state
;
1284 ibt_qp_ud_attr_t
*ud
;
1285 ibt_qp_rc_attr_t
*rc
;
1286 ibt_qp_uc_attr_t
*uc
;
1287 ibt_cep_flags_t enable_flags
;
1288 tavor_hw_addr_path_t
*qpc_path
, *qpc_alt_path
;
1289 ibt_cep_path_t
*path_ptr
, *alt_path_ptr
;
1290 tavor_hw_qpc_t
*qpc
;
1293 TAVOR_TNF_ENTER(tavor_qp_query
);
1295 mutex_enter(&qp
->qp_lock
);
1298 * Grab the temporary QPC entry from QP software state
1302 /* Convert the current Tavor QP state to IBTF QP state */
1303 switch (qp
->qp_state
) {
1304 case TAVOR_QP_RESET
:
1305 qp_state
= IBT_STATE_RESET
; /* "Reset" */
1308 qp_state
= IBT_STATE_INIT
; /* Initialized */
1311 qp_state
= IBT_STATE_RTR
; /* Ready to Receive */
1314 qp_state
= IBT_STATE_RTS
; /* Ready to Send */
1316 case TAVOR_QP_SQERR
:
1317 qp_state
= IBT_STATE_SQE
; /* Send Queue Error */
1320 if (qp
->qp_sqd_still_draining
) {
1321 qp_state
= IBT_STATE_SQDRAIN
; /* SQ Draining */
1323 qp_state
= IBT_STATE_SQD
; /* SQ Drained */
1327 qp_state
= IBT_STATE_ERROR
; /* Error */
1330 mutex_exit(&qp
->qp_lock
);
1331 TNF_PROBE_1(tavor_qp_query_inv_qpstate_fail
,
1332 TAVOR_TNF_ERROR
, "", tnf_uint
, qpstate
, qp
->qp_state
);
1333 TAVOR_TNF_EXIT(tavor_qp_query
);
1334 return (ibc_get_ci_failure(0));
1336 attr_p
->qp_info
.qp_state
= qp_state
;
1339 attr_p
->qp_srq
= NULL
;
1342 * The following QP information is always returned, regardless of
1343 * the current QP state. Note: Some special handling is necessary
1344 * for calculating the QP number on special QP (QP0 and QP1).
1346 attr_p
->qp_sq_cq
= qp
->qp_sq_cqhdl
->cq_hdlrarg
;
1347 attr_p
->qp_rq_cq
= qp
->qp_rq_cqhdl
->cq_hdlrarg
;
1348 if (qp
->qp_is_special
) {
1349 attr_p
->qp_qpn
= (qp
->qp_is_special
== TAVOR_QP_SMI
) ? 0 : 1;
1351 attr_p
->qp_qpn
= (ib_qpn_t
)qp
->qp_qpnum
;
1353 attr_p
->qp_sq_sgl
= qp
->qp_sq_sgl
;
1354 attr_p
->qp_rq_sgl
= qp
->qp_rq_sgl
;
1355 attr_p
->qp_info
.qp_sq_sz
= qp
->qp_sq_bufsz
;
1356 attr_p
->qp_info
.qp_rq_sz
= qp
->qp_rq_bufsz
;
1359 * If QP is currently in the "Reset" state, then only the above are
1362 if (qp_state
== IBT_STATE_RESET
) {
1363 mutex_exit(&qp
->qp_lock
);
1364 TAVOR_TNF_EXIT(tavor_qp_query
);
1365 return (DDI_SUCCESS
);
1369 * Post QUERY_QP command to firmware
1371 * We do a TAVOR_NOSLEEP here because we are holding the "qp_lock".
1372 * Since we may be in the interrupt context (or subsequently raised
1373 * to interrupt level by priority inversion), we do not want to block
1374 * in this routine waiting for success.
1376 status
= tavor_cmn_query_cmd_post(state
, QUERY_QP
, qp
->qp_qpnum
,
1377 qpc
, sizeof (tavor_hw_qpc_t
), TAVOR_CMD_NOSLEEP_SPIN
);
1378 if (status
!= TAVOR_CMD_SUCCESS
) {
1379 mutex_exit(&qp
->qp_lock
);
1380 cmn_err(CE_CONT
, "Tavor: QUERY_QP command failed: %08x\n",
1382 TNF_PROBE_1(tavor_qp_query_cmd_fail
, TAVOR_TNF_ERROR
, "",
1383 tnf_uint
, status
, status
);
1384 TAVOR_TNF_EXIT(tavor_qp_query
);
1385 return (ibc_get_ci_failure(0));
1389 * Fill in the additional QP info based on the QP's transport type.
1391 if (qp
->qp_serv_type
== TAVOR_QP_UD
) {
1393 /* Fill in the UD-specific info */
1394 ud
= &attr_p
->qp_info
.qp_transport
.ud
;
1395 ud
->ud_qkey
= (ib_qkey_t
)qpc
->qkey
;
1396 ud
->ud_sq_psn
= qpc
->next_snd_psn
;
1397 ud
->ud_pkey_ix
= qpc
->pri_addr_path
.pkey_indx
;
1398 ud
->ud_port
= qpc
->pri_addr_path
.portnum
;
1400 attr_p
->qp_info
.qp_trans
= IBT_UD_SRV
;
1402 } else if (qp
->qp_serv_type
== TAVOR_QP_RC
) {
1404 /* Fill in the RC-specific info */
1405 rc
= &attr_p
->qp_info
.qp_transport
.rc
;
1406 rc
->rc_sq_psn
= qpc
->next_snd_psn
;
1407 rc
->rc_rq_psn
= qpc
->next_rcv_psn
;
1408 rc
->rc_dst_qpn
= qpc
->rem_qpn
;
1410 /* Grab the path migration state information */
1411 if (qpc
->pm_state
== TAVOR_QP_PMSTATE_MIGRATED
) {
1412 rc
->rc_mig_state
= IBT_STATE_MIGRATED
;
1413 } else if (qpc
->pm_state
== TAVOR_QP_PMSTATE_REARM
) {
1414 rc
->rc_mig_state
= IBT_STATE_REARMED
;
1416 rc
->rc_mig_state
= IBT_STATE_ARMED
;
1418 rc
->rc_rdma_ra_out
= (1 << qpc
->sra_max
);
1419 rc
->rc_rdma_ra_in
= (1 << qpc
->rra_max
);
1420 rc
->rc_min_rnr_nak
= qpc
->min_rnr_nak
;
1421 rc
->rc_path_mtu
= qpc
->mtu
;
1422 rc
->rc_retry_cnt
= qpc
->retry_cnt
;
1424 /* Get the common primary address path fields */
1425 qpc_path
= &qpc
->pri_addr_path
;
1426 path_ptr
= &rc
->rc_path
;
1427 tavor_get_addr_path(state
, qpc_path
, &path_ptr
->cep_adds_vect
,
1428 TAVOR_ADDRPATH_QP
, qp
);
1430 /* Fill in the additional primary address path fields */
1431 path_ptr
->cep_pkey_ix
= qpc_path
->pkey_indx
;
1432 path_ptr
->cep_hca_port_num
= qpc_path
->portnum
;
1433 path_ptr
->cep_timeout
= qpc_path
->ack_timeout
;
1435 /* Get the common alternate address path fields */
1436 qpc_alt_path
= &qpc
->alt_addr_path
;
1437 alt_path_ptr
= &rc
->rc_alt_path
;
1438 tavor_get_addr_path(state
, qpc_alt_path
,
1439 &alt_path_ptr
->cep_adds_vect
, TAVOR_ADDRPATH_QP
, qp
);
1441 /* Fill in the additional alternate address path fields */
1442 alt_path_ptr
->cep_pkey_ix
= qpc_alt_path
->pkey_indx
;
1443 alt_path_ptr
->cep_hca_port_num
= qpc_alt_path
->portnum
;
1444 alt_path_ptr
->cep_timeout
= qpc_alt_path
->ack_timeout
;
1446 /* Get the RNR retry time from primary path */
1447 rc
->rc_rnr_retry_cnt
= qpc_path
->rnr_retry
;
1449 /* Set the enable flags based on RDMA/Atomic enable bits */
1450 enable_flags
= IBT_CEP_NO_FLAGS
;
1451 enable_flags
|= ((qpc
->rre
== 0) ? 0 : IBT_CEP_RDMA_RD
);
1452 enable_flags
|= ((qpc
->rwe
== 0) ? 0 : IBT_CEP_RDMA_WR
);
1453 enable_flags
|= ((qpc
->rae
== 0) ? 0 : IBT_CEP_ATOMIC
);
1454 attr_p
->qp_info
.qp_flags
= enable_flags
;
1456 attr_p
->qp_info
.qp_trans
= IBT_RC_SRV
;
1458 } else if (qp
->qp_serv_type
== TAVOR_QP_UC
) {
1460 /* Fill in the UC-specific info */
1461 uc
= &attr_p
->qp_info
.qp_transport
.uc
;
1462 uc
->uc_sq_psn
= qpc
->next_snd_psn
;
1463 uc
->uc_rq_psn
= qpc
->next_rcv_psn
;
1464 uc
->uc_dst_qpn
= qpc
->rem_qpn
;
1466 /* Grab the path migration state information */
1467 if (qpc
->pm_state
== TAVOR_QP_PMSTATE_MIGRATED
) {
1468 uc
->uc_mig_state
= IBT_STATE_MIGRATED
;
1469 } else if (qpc
->pm_state
== TAVOR_QP_PMSTATE_REARM
) {
1470 uc
->uc_mig_state
= IBT_STATE_REARMED
;
1472 uc
->uc_mig_state
= IBT_STATE_ARMED
;
1474 uc
->uc_path_mtu
= qpc
->mtu
;
1476 /* Get the common primary address path fields */
1477 qpc_path
= &qpc
->pri_addr_path
;
1478 path_ptr
= &uc
->uc_path
;
1479 tavor_get_addr_path(state
, qpc_path
, &path_ptr
->cep_adds_vect
,
1480 TAVOR_ADDRPATH_QP
, qp
);
1482 /* Fill in the additional primary address path fields */
1483 path_ptr
->cep_pkey_ix
= qpc_path
->pkey_indx
;
1484 path_ptr
->cep_hca_port_num
= qpc_path
->portnum
;
1486 /* Get the common alternate address path fields */
1487 qpc_alt_path
= &qpc
->alt_addr_path
;
1488 alt_path_ptr
= &uc
->uc_alt_path
;
1489 tavor_get_addr_path(state
, qpc_alt_path
,
1490 &alt_path_ptr
->cep_adds_vect
, TAVOR_ADDRPATH_QP
, qp
);
1492 /* Fill in the additional alternate address path fields */
1493 alt_path_ptr
->cep_pkey_ix
= qpc_alt_path
->pkey_indx
;
1494 alt_path_ptr
->cep_hca_port_num
= qpc_alt_path
->portnum
;
1497 * Set the enable flags based on RDMA enable bits (by
1498 * definition UC doesn't support Atomic or RDMA Read)
1500 enable_flags
= ((qpc
->rwe
== 0) ? 0 : IBT_CEP_RDMA_WR
);
1501 attr_p
->qp_info
.qp_flags
= enable_flags
;
1503 attr_p
->qp_info
.qp_trans
= IBT_UC_SRV
;
1506 TAVOR_WARNING(state
, "unexpected QP transport type");
1507 mutex_exit(&qp
->qp_lock
);
1508 return (ibc_get_ci_failure(0));
1512 * Under certain circumstances it is possible for the Tavor hardware
1513 * to transition to one of the error states without software directly
1514 * knowing about it. The QueryQP() call is the one place where we
1515 * have an opportunity to sample and update our view of the QP state.
1517 if (qpc
->state
== TAVOR_QP_SQERR
) {
1518 attr_p
->qp_info
.qp_state
= IBT_STATE_SQE
;
1519 qp
->qp_state
= TAVOR_QP_SQERR
;
1521 if (qpc
->state
== TAVOR_QP_ERR
) {
1522 attr_p
->qp_info
.qp_state
= IBT_STATE_ERROR
;
1523 qp
->qp_state
= TAVOR_QP_ERR
;
1525 mutex_exit(&qp
->qp_lock
);
1527 TAVOR_TNF_EXIT(tavor_qp_query
);
1528 return (DDI_SUCCESS
);
1533 * tavor_qp_create_qpn()
1534 * Context: Can be called from interrupt or base context.
1537 tavor_qp_create_qpn(tavor_state_t
*state
, tavor_qphdl_t qp
, tavor_rsrc_t
*qpc
)
1539 tavor_qpn_entry_t query
;
1540 tavor_qpn_entry_t
*entry
;
1543 TAVOR_TNF_ENTER(tavor_qp_create_qpn
);
1546 * Build a query (for the AVL tree lookup) and attempt to find
1547 * a previously added entry that has a matching QPC index. If
1548 * no matching entry is found, then allocate, initialize, and
1549 * add an entry to the AVL tree.
1550 * If a matching entry is found, then increment its QPN counter
1551 * and reference counter.
1553 query
.qpn_indx
= qpc
->tr_indx
;
1554 mutex_enter(&state
->ts_qpn_avl_lock
);
1555 entry
= (tavor_qpn_entry_t
*)avl_find(&state
->ts_qpn_avl
,
1557 if (entry
== NULL
) {
1559 * Allocate and initialize a QPN entry, then insert
1560 * it into the AVL tree.
1562 entry
= (tavor_qpn_entry_t
*)kmem_zalloc(
1563 sizeof (tavor_qpn_entry_t
), KM_NOSLEEP
);
1564 if (entry
== NULL
) {
1565 mutex_exit(&state
->ts_qpn_avl_lock
);
1566 TAVOR_TNF_EXIT(tavor_qp_create_qpn
);
1567 return (DDI_FAILURE
);
1569 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry
))
1571 entry
->qpn_indx
= qpc
->tr_indx
;
1572 entry
->qpn_refcnt
= 0;
1573 entry
->qpn_counter
= 0;
1575 avl_insert(&state
->ts_qpn_avl
, entry
, where
);
1579 * Make the AVL tree entry point to the QP context resource that
1580 * it will be responsible for tracking
1582 entry
->qpn_qpc
= qpc
;
1585 * Setup the QP handle to point to the AVL tree entry. Then
1586 * generate the new QP number from the entry's QPN counter value
1587 * and the hardware's QP context table index.
1589 qp
->qp_qpn_hdl
= entry
;
1590 qp
->qp_qpnum
= ((entry
->qpn_counter
<<
1591 state
->ts_cfg_profile
->cp_log_num_qp
) | qpc
->tr_indx
) &
1592 TAVOR_QP_MAXNUMBER_MSK
;
1595 * Increment the reference counter and QPN counter. The QPN
1596 * counter always indicates the next available number for use.
1598 entry
->qpn_counter
++;
1599 entry
->qpn_refcnt
++;
1601 mutex_exit(&state
->ts_qpn_avl_lock
);
1602 TAVOR_TNF_EXIT(tavor_qp_create_qpn
);
1603 return (DDI_SUCCESS
);
1608 * tavor_qp_release_qpn()
1609 * Context: Can be called only from user or kernel context.
1612 tavor_qp_release_qpn(tavor_state_t
*state
, tavor_qpn_entry_t
*entry
, int flags
)
1614 TAVOR_TNF_ENTER(tavor_qp_release_qpn
);
1616 ASSERT(entry
!= NULL
);
1618 mutex_enter(&state
->ts_qpn_avl_lock
);
1621 * If we are releasing the QP number here, then we decrement the
1622 * reference count and check for zero references. If there are
1623 * zero references, then we free the QPC context (if it hadn't
1624 * already been freed during a TAVOR_QPN_FREE_ONLY free, i.e. for
1625 * reuse with another similar QP number) and remove the tracking
1626 * structure from the QP number AVL tree and free the structure.
1627 * If we are not releasing the QP number here, then, as long as we
1628 * have not exhausted the usefulness of the QPC context (that is,
1629 * re-used it too many times without the reference count having
1630 * gone to zero), we free up the QPC context for use by another
1631 * thread (which will use it to construct a different QP number
1632 * from the same QPC table index).
1634 if (flags
== TAVOR_QPN_RELEASE
) {
1635 entry
->qpn_refcnt
--;
1638 * If the reference count is zero, then we free the QPC
1639 * context (if it hadn't already been freed in an early
1640 * step, e.g. TAVOR_QPN_FREE_ONLY) and remove/free the
1641 * tracking structure from the QP number AVL tree.
1643 if (entry
->qpn_refcnt
== 0) {
1644 if (entry
->qpn_qpc
!= NULL
) {
1645 tavor_rsrc_free(state
, &entry
->qpn_qpc
);
1649 * If the current entry has served it's useful
1650 * purpose (i.e. been reused the maximum allowable
1651 * number of times), then remove it from QP number
1652 * AVL tree and free it up.
1654 if (entry
->qpn_counter
>= (1 <<
1655 (24 - state
->ts_cfg_profile
->cp_log_num_qp
))) {
1656 avl_remove(&state
->ts_qpn_avl
, entry
);
1657 kmem_free(entry
, sizeof (tavor_qpn_entry_t
));
1661 } else if (flags
== TAVOR_QPN_FREE_ONLY
) {
1663 * Even if we are not freeing the QP number, that will not
1664 * always prevent us from releasing the QPC context. In fact,
1665 * since the QPC context only forms part of the whole QPN,
1666 * we want to free it up for use by other consumers. But
1667 * if the reference count is non-zero (which it will always
1668 * be when we are doing TAVOR_QPN_FREE_ONLY) and the counter
1669 * has reached its maximum value, then we cannot reuse the
1670 * QPC context until the reference count eventually reaches
1671 * zero (in TAVOR_QPN_RELEASE, above).
1673 if (entry
->qpn_counter
< (1 <<
1674 (24 - state
->ts_cfg_profile
->cp_log_num_qp
))) {
1675 tavor_rsrc_free(state
, &entry
->qpn_qpc
);
1678 mutex_exit(&state
->ts_qpn_avl_lock
);
1680 TAVOR_TNF_EXIT(tavor_qp_release_qpn
);
1685 * tavor_qpn_db_compare()
1686 * Context: Can be called from user or kernel context.
1689 tavor_qpn_avl_compare(const void *q
, const void *e
)
1691 tavor_qpn_entry_t
*entry
, *query
;
1693 TAVOR_TNF_ENTER(tavor_qpn_avl_compare
);
1695 entry
= (tavor_qpn_entry_t
*)e
;
1696 query
= (tavor_qpn_entry_t
*)q
;
1698 if (query
->qpn_indx
< entry
->qpn_indx
) {
1699 TAVOR_TNF_EXIT(tavor_qpn_avl_compare
);
1701 } else if (query
->qpn_indx
> entry
->qpn_indx
) {
1702 TAVOR_TNF_EXIT(tavor_qpn_avl_compare
);
1705 TAVOR_TNF_EXIT(tavor_qpn_avl_compare
);
1712 * tavor_qpn_avl_init()
1713 * Context: Only called from attach() path context
1716 tavor_qpn_avl_init(tavor_state_t
*state
)
1718 TAVOR_TNF_ENTER(tavor_qpn_avl_init
);
1720 /* Initialize the lock used for QP number (QPN) AVL tree access */
1721 mutex_init(&state
->ts_qpn_avl_lock
, NULL
, MUTEX_DRIVER
,
1722 DDI_INTR_PRI(state
->ts_intrmsi_pri
));
1724 /* Initialize the AVL tree for the QP number (QPN) storage */
1725 avl_create(&state
->ts_qpn_avl
, tavor_qpn_avl_compare
,
1726 sizeof (tavor_qpn_entry_t
),
1727 offsetof(tavor_qpn_entry_t
, qpn_avlnode
));
1729 TAVOR_TNF_EXIT(tavor_qpn_avl_init
);
1734 * tavor_qpn_avl_fini()
1735 * Context: Only called from attach() and/or detach() path contexts
1738 tavor_qpn_avl_fini(tavor_state_t
*state
)
1740 tavor_qpn_entry_t
*entry
;
1743 TAVOR_TNF_ENTER(tavor_qpn_avl_fini
);
1746 * Empty all entries (if necessary) and destroy the AVL tree
1747 * that was used for QP number (QPN) tracking.
1750 while ((entry
= (tavor_qpn_entry_t
*)avl_destroy_nodes(
1751 &state
->ts_qpn_avl
, &cookie
)) != NULL
) {
1752 kmem_free(entry
, sizeof (tavor_qpn_entry_t
));
1754 avl_destroy(&state
->ts_qpn_avl
);
1756 /* Destroy the lock used for QP number (QPN) AVL tree access */
1757 mutex_destroy(&state
->ts_qpn_avl_lock
);
1759 TAVOR_TNF_EXIT(tavor_qpn_avl_fini
);
/*
 * tavor_qphdl_from_qpnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the QP number is critical to the detection of a
 *    potential race condition in the QP event handler code (i.e. the
 *    case where a QP is freed and alloc'd again before an event for
 *    the "old" QP can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new QP owner.  Note:
 *    this solution does not scale well because the number of
 *    constrained bits increases (and, hence, the number of
 *    unconstrained bits decreases) as the number of supported QPs
 *    grows.  For small and intermediate values, it should hopefully
 *    provide sufficient protection.
 */
tavor_qphdl_t
tavor_qphdl_from_qpnum(tavor_state_t *state, uint_t qpnum)
{
	uint_t	qpindx, qpmask;

	/* Calculate the QP table index from the qpnum */
	qpmask = (1 << state->ts_cfg_profile->cp_log_num_qp) - 1;
	qpindx = qpnum & qpmask;
	return (state->ts_qphdl[qpindx]);
}
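
/*
 * Illustrative sketch (not part of the driver): how a 24-bit QP
 * number splits into the "constrained" table index (the low
 * cp_log_num_qp bits masked off in tavor_qphdl_from_qpnum() above)
 * and the "unconstrained" reuse counter (the remaining high bits,
 * whose limit of 1 << (24 - cp_log_num_qp) also appears in
 * tavor_qp_release_qpn()).  The cp_log_num_qp and QPN values below
 * are hypothetical examples, not driver defaults.
 */
#if 0	/* user-level illustration only; never compiled into the driver */
#include <stdio.h>

int
main(void)
{
	unsigned int log_num_qp = 16;			/* hypothetical */
	unsigned int qpn        = 0x030004;		/* hypothetical QPN */
	unsigned int qpmask     = (1 << log_num_qp) - 1;
	unsigned int qpindx     = qpn & qpmask;		/* constrained bits */
	unsigned int counter    = qpn >> log_num_qp;	/* unconstrained bits */
	unsigned int max_reuse  = 1 << (24 - log_num_qp);

	/* index 0x4 is on its 3rd reuse and may be reused 256 times total */
	(void) printf("indx=0x%x counter=%u max_reuse=%u\n",
	    qpindx, counter, max_reuse);
	return (0);
}
#endif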
/*
 * tavor_special_qp_rsrc_alloc
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_alloc(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port, tavor_rsrc_t **qp_rsrc)
{
	uint_t		mask, flags;
	int		status;

	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_alloc);

	mutex_enter(&state->ts_spec_qplock);
	flags = state->ts_spec_qpflags;
	if (type == IBT_SMI_SQP) {
		/*
		 * Check here to see if the driver has been configured
		 * to instruct the Tavor firmware to handle all incoming
		 * SMP messages (i.e. messages sent to the SMA).  If so,
		 * then we will treat QP0 as if it has already been
		 * allocated (for internal use).  Otherwise, if we allow
		 * the allocation to happen, it will cause unexpected
		 * behaviors (e.g. the Tavor SMA becomes unresponsive).
		 */
		if (state->ts_cfg_profile->cp_qp0_agents_in_fw != 0) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_0(tavor_special_qp0_alloc_already_in_fw,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}

		/*
		 * If this is the first QP0 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp0->tr_indx, TAVOR_CMD_QP_SMI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_1(tavor_ts_spec_qp0_alloc_already,
			    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp0;

	} else {
		/*
		 * If this is the first QP1 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp1->tr_indx, TAVOR_CMD_QP_GSI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_0(tavor_ts_spec_qp1_alloc_already,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp1;
	}

	mutex_exit(&state->ts_spec_qplock);
	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
	return (DDI_SUCCESS);
}
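
/*
 * Illustrative sketch (not part of the driver): the special QP
 * bookkeeping above keeps one bit per (special QP type, port) pair in
 * ts_spec_qpflags.  The CONF_SPECIAL_QP firmware command is posted
 * only while the whole mask for that QP type is still clear, i.e. on
 * the first allocation (and again, to disable, on the last free).
 * The bit positions and mask below are hypothetical stand-ins, not
 * the driver's TAVOR_SPECIAL_QP0_RSRC/TAVOR_SPECIAL_QP0_RSRC_MASK
 * values.
 */
#if 0	/* user-level illustration only; never compiled into the driver */
#include <stdio.h>

#define	EX_QP0_RSRC	0		/* hypothetical base bit for QP0 */
#define	EX_QP0_MASK	0x3		/* hypothetical mask: QP0, both ports */

int
main(void)
{
	unsigned int flags = 0;
	unsigned int port, mask;

	for (port = 0; port < 2; port++) {
		mask = 1 << (EX_QP0_RSRC + port);
		if ((flags & EX_QP0_MASK) == 0)
			(void) printf("port %u: first QP0 alloc, "
			    "post CONF_SPECIAL_QP\n", port);
		if (flags & mask) {
			(void) printf("port %u: already allocated\n", port);
			continue;
		}
		flags |= mask;
	}
	return (0);
}
#endif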
/*
 * tavor_special_qp_rsrc_free
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port)
{
	uint_t		mask, flags;
	int		status;

	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_free);

	mutex_enter(&state->ts_spec_qplock);
	if (type == IBT_SMI_SQP) {
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP0 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_SMI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
				return (ibc_get_ci_failure(0));
			}
		}

	} else {
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP1 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_GSI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
				return (ibc_get_ci_failure(0));
			}
		}
	}

	mutex_exit(&state->ts_spec_qplock);
	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
	return (DDI_SUCCESS);
}
/*
 * tavor_qp_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_qp_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_SENDQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 number of cachelines).
		 */
		max_size = (TAVOR_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_SND_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Same as above (except for Recv WQEs)
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP0:
		/*
		 * Same as above (except for MLX transport WQEs).  For these
		 * WQEs we have to account for the space consumed by the
		 * "inline" packet headers.  (This is smaller than for QP1
		 * below because QP0 is not allowed to send packets with
		 * a GRH.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP0_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP1:
		/*
		 * Same as above.  For these WQEs we again have to account for
		 * the space consumed by the "inline" packet headers.  (This
		 * is larger than for QP0 above because we have to account for
		 * the possibility of a GRH in each packet - and this
		 * introduces an alignment issue that causes us to consume
		 * an additional 8 bytes.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP1_HDRS) >> 4;
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_qp_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		break;
	}

	/* Fill in the return values */
	*logwqesz = log2;
	*max_sgl = min(state->ts_cfg_profile->cp_wqe_real_max_sgl, actual_sgl);

	TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
}
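
/*
 * Illustrative sketch (not part of the driver): the descriptor sizing
 * above takes the header bytes for the queue type plus 16 bytes per
 * requested SGL entry, rounds that up to the next power of two (the
 * highbit()/ISP2() pair, decrementing when the size is already an
 * exact power of two), clamps to the minimum WQE size, and then
 * recomputes how many SGL entries actually fit in the rounded-up
 * descriptor.  The header size and minimum log2 below are
 * hypothetical stand-ins, not the driver's TAVOR_QP_WQE_* constants.
 */
#if 0	/* user-level illustration only; never compiled into the driver */
#include <stdio.h>

/* Position of the most significant set bit, 1-based (like highbit(9F)) */
static unsigned int
ex_highbit(unsigned int v)
{
	unsigned int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	unsigned int hdrs = 64;		/* hypothetical header bytes */
	unsigned int log_min = 6;	/* hypothetical minimum (64B WQE) */
	unsigned int num_sgl = 8;	/* requested SGL entries */
	unsigned int max_size, log2, actual_sgl;

	max_size = hdrs + (num_sgl << 4);		/* 64 + 128 = 192 */
	log2 = ex_highbit(max_size);			/* 8 -> 256B WQE */
	if ((max_size & (max_size - 1)) == 0)		/* exact power of 2 */
		log2 = log2 - 1;
	if (log2 < log_min)
		log2 = log_min;
	actual_sgl = ((1 << log2) - hdrs) >> 4;		/* (256-64)/16 = 12 */

	(void) printf("log2=%u wqe=%u bytes actual_sgl=%u\n",
	    log2, 1U << log2, actual_sgl);
	return (0);
}
#endif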