/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * tavor_qp.c
 *    Tavor Queue Pair Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing, and
 *    querying the Tavor queue pairs.
 */
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>

#include <sys/ib/adapters/tavor/tavor.h>
#include <sys/ib/ib_pkt_hdrs.h>
static int tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp,
    tavor_rsrc_t *qpc);
static int tavor_qpn_avl_compare(const void *q, const void *e);
static int tavor_special_qp_rsrc_alloc(tavor_state_t *state,
    ibt_sqp_type_t type, uint_t port, tavor_rsrc_t **qp_rsrc);
static int tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port);
static void tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
/*
 * tavor_qp_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
    uint_t sleepflag, tavor_qp_options_t *op)
{
	tavor_rsrc_pool_info_t	*rsrc_pool;
	tavor_rsrc_t		*qpc, *rsrc, *rdb;
	tavor_umap_db_entry_t	*umapdb;
	tavor_qphdl_t		qp;
	ibt_qp_alloc_attr_t	*attr_p;
	ibt_qp_type_t		type;
	ibtl_qp_hdl_t		ibt_qphdl;
	ibt_chan_sizes_t	*queuesz_p;
	ib_qpn_t		*qpn;
	tavor_qphdl_t		*qphdl;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_srqhdl_t		srq;
	tavor_pdhdl_t		pd;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_mrhdl_t		mr;
	uint64_t		value, qp_desc_off;
	uint32_t		*sq_buf, *rq_buf;
	uint32_t		log_qp_sq_size, log_qp_rq_size;
	uint32_t		sq_size, rq_size;
	uint32_t		sq_wqe_size, rq_wqe_size;
	uint32_t		max_rdb, max_sgl, uarpg;
	uint_t			wq_location, dma_xfer_mode, qp_is_umap;
	uint_t			qp_srq_en;
	int			status, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_qp_alloc);

	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*attr_p, *queuesz_p))
	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the QP's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->qpo_wq_loc;
	}
	/*
	 * Extract the necessary info from the tavor_qp_info_t structure
	 */
	attr_p	  = qpinfo->qpi_attrp;
	type	  = qpinfo->qpi_type;
	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
	queuesz_p = qpinfo->qpi_queueszp;
	qpn	  = qpinfo->qpi_qpn;
	qphdl	  = &qpinfo->qpi_qphdl;
	/*
	 * Determine whether QP is being allocated for userland access or
	 * whether it is being allocated for kernel access.  If the QP is
	 * being allocated for userland access, then lookup the UAR doorbell
	 * page number for the current process.  Note:  If this is not found
	 * (e.g. if the process has not previously open()'d the Tavor driver),
	 * then an error is returned.
	 */
	qp_is_umap = (attr_p->qp_alloc_flags & IBT_QP_USER_MAP) ? 1 : 0;
	if (qp_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
		    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
			goto qpalloc_fail;
		}
		uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
	}
	/*
	 * Determine whether QP is being associated with an SRQ
	 */
	qp_srq_en = (attr_p->qp_alloc_flags & IBT_QP_USES_SRQ) ? 1 : 0;
	if (qp_srq_en) {
		/*
		 * Check for valid SRQ handle pointers
		 */
		if (attr_p->qp_ibc_srq_hdl == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_SRQ_HDL_INVALID,
			    "invalid SRQ handle");
			goto qpalloc_fail;
		}
		srq = (tavor_srqhdl_t)attr_p->qp_ibc_srq_hdl;
	}
	/*
	 * Check for valid QP service type (only UD/RC/UC supported)
	 */
	if (((type != IBT_UD_RQP) && (type != IBT_RC_RQP) &&
	    (type != IBT_UC_RQP))) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_QP_SRV_TYPE_INVALID, "invalid serv type");
		goto qpalloc_fail;
	}

	/*
	 * Only RC is supported on an SRQ -- this is a Tavor hardware
	 * limitation.  Arbel native mode will not have this shortcoming.
	 */
	if (qp_srq_en && type != IBT_RC_RQP) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "invalid serv type with SRQ");
		goto qpalloc_fail;
	}
	/*
	 * Check for valid PD handle pointer
	 */
	if (attr_p->qp_pd_hdl == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
		goto qpalloc_fail;
	}
	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;

	/*
	 * If on an SRQ, check to make sure the PD is the same
	 */
	if (qp_srq_en && (pd->pd_pdnum != srq->srq_pdhdl->pd_pdnum)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
		goto qpalloc_fail;
	}

	/* Increment the reference count on the protection domain (PD) */
	tavor_pd_refcnt_inc(pd);
	/*
	 * Check for valid CQ handle pointers
	 */
	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto qpalloc_fail1;
	}
	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;

	/*
	 * Increment the reference count on the CQs.  One or both of these
	 * could return error if we determine that the given CQ is already
	 * being used with a special (SMI/GSI) QP.
	 */
	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_NORMAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto qpalloc_fail1;
	}
	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_NORMAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto qpalloc_fail2;
	}
	/*
	 * Allocate a QP context entry.  This will be filled in with all
	 * the necessary parameters to define the Queue Pair.  Unlike
	 * other Tavor hardware resources, ownership is not immediately
	 * given to hardware in the final step here.  Instead, we must
	 * wait until the QP is later transitioned to the "Init" state before
	 * passing the QP to hardware.  If we fail here, we must undo all
	 * the reference counts (CQ and PD).
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPC, 1, sleepflag, &qpc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP context");
		goto qpalloc_fail3;
	}
	/*
	 * Allocate the software structure for tracking the queue pair
	 * (i.e. the Tavor Queue Pair handle).  If we fail here, we must
	 * undo the reference counts and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle");
		goto qpalloc_fail4;
	}
	qp = (tavor_qphdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
	/*
	 * Calculate the QP number from QPC index.  This routine handles
	 * all of the operations necessary to keep track of used, unused,
	 * and released QP numbers.
	 */
	status = tavor_qp_create_qpn(state, qp, qpc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QPN create");
		goto qpalloc_fail5;
	}
	/*
	 * If this will be a user-mappable QP, then allocate an entry for
	 * the "userland resources database".  This will later be added to
	 * the database (after all further QP operations are successful).
	 * If we fail here, we must undo the reference counts and the
	 * previous resource allocation.
	 */
	if (qp_is_umap) {
		umapdb = tavor_umap_db_alloc(state->ts_instance, qp->qp_qpnum,
		    MLNX_UMAP_QPMEM_RSRC, (uint64_t)(uintptr_t)rsrc);
		if (umapdb == NULL) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
			goto qpalloc_fail6;
		}
	}
	/*
	 * If this is an RC QP, then pre-allocate the maximum number of RDB
	 * entries.  This allows us to ensure that we can later cover all
	 * the resources needed by hardware for handling multiple incoming
	 * RDMA Reads.  Note: These resources are obviously not always
	 * necessary.  They are allocated here anyway.  Someday maybe this
	 * can be modified to allocate these on-the-fly (i.e. only if RDMA
	 * Read or Atomic operations are enabled) XXX
	 * If we fail here, we have a bunch of resource and reference count
	 * cleanup to do.
	 */
	if (type == IBT_RC_RQP) {
		max_rdb = state->ts_cfg_profile->cp_hca_max_rdma_in_qp;
		status = tavor_rsrc_alloc(state, TAVOR_RDB, max_rdb,
		    sleepflag, &rdb);
		if (status != DDI_SUCCESS) {
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed RDB");
			goto qpalloc_fail7;
		}
		qp->qp_rdbrsrcp = rdb;
		/* Calculate offset (into DDR memory) of RDB entries */
		rsrc_pool = &state->ts_rsrc_hdl[TAVOR_RDB];
		qp->qp_rdb_ddraddr = (uintptr_t)rsrc_pool->rsrc_ddr_offset +
		    (rdb->tr_indx << TAVOR_RDB_SIZE_SHIFT);
	}
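	/*
	 * Illustrative example (hypothetical values): with an RDB pool DDR
	 * offset of 0x100000, an RDB index of 8, and a 32-byte RDB entry
	 * (i.e. a TAVOR_RDB_SIZE_SHIFT of 5), the RDB entries for this QP
	 * would start at DDR address 0x100000 + (8 << 5) == 0x100100.
	 */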
	/*
	 * Calculate the appropriate size for the work queues.
	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2.
	 */
	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
	if (ISP2(attr_p->qp_sizes.cs_sq)) {
		log_qp_sq_size = log_qp_sq_size - 1;
	}
	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
	if (ISP2(attr_p->qp_sizes.cs_rq)) {
		log_qp_rq_size = log_qp_rq_size - 1;
	}
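	/*
	 * Worked example of the rounding above: highbit() returns the
	 * 1-indexed position of the highest set bit, so for cs_sq == 9
	 * (not a power of 2) highbit() gives 4 and the send queue is
	 * rounded up to 1 << 4 == 16 entries; for cs_sq == 8 (already a
	 * power of 2) highbit() gives 4, ISP2() is true, and the size
	 * stays at 1 << 3 == 8 entries.
	 */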
	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a lot of cleanup to do before returning.
	 */
	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
	    (!qp_srq_en && (log_qp_rq_size >
	    state->ts_cfg_profile->cp_log_max_qp_sz))) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size");
		goto qpalloc_fail8;
	}
	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
	    (!qp_srq_en && (attr_p->qp_sizes.cs_rq_sgl > max_sgl))) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL");
		goto qpalloc_fail8;
	}
	/*
	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
	 * This will depend on the requested number of SGLs.  Note: this
	 * has the side-effect of also calculating the real number of SGLs
	 * (for the calculated WQE size).
	 *
	 * For QP's on an SRQ, we set these to 0.
	 */
	if (qp_srq_en) {
		qp->qp_rq_log_wqesz = 0;
		qp->qp_rq_sgl = 0;
	} else {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
		    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz,
		    &qp->qp_rq_sgl);
	}
	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
	    TAVOR_QP_WQ_TYPE_SENDQ, &qp->qp_sq_log_wqesz, &qp->qp_sq_sgl);
	/*
	 * Allocate the memory for QP work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in
	 * through the tavor_qp_options_t structure.  Since Tavor work queues
	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
	 * the work queue memory is very important.  We used to allocate
	 * work queues (the combined receive and send queues) so that they
	 * would be aligned on their combined size.  That alignment guaranteed
	 * that they would never cross the 4GB boundary (Tavor work queues
	 * are on the order of MBs at maximum).  Now we are able to relax
	 * this alignment constraint by ensuring that the IB address assigned
	 * to the queue memory (as a result of the tavor_mr_register() call)
	 * is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment
	 * that was more restrictive than the system page size.
	 * So we avoided this constraint by passing two alignment values,
	 * one for the memory allocation itself and the other for the DMA
	 * handle (for later bind).  This used to cause more memory than
	 * necessary to be allocated (in order to guarantee the more
	 * restrictive alignment constraint).  But by guaranteeing the
	 * zero-based IB virtual address for the queue, we are able to
	 * conserve this memory.
	 * Note: If QP is not user-mappable, then it may come from either
	 * kernel system memory or from HCA-attached local DDR memory.
	 */
	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
	sq_size = (1 << log_qp_sq_size) * sq_wqe_size;
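	/*
	 * For example: log_qp_sq_size == 6 (64 WQEs) with a 64-byte send
	 * WQE (qp_sq_log_wqesz == 6) yields a 4KB send queue.
	 */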
	/* QP on SRQ sets these to 0 */
	if (qp_srq_en) {
		rq_wqe_size = 0;
		rq_size = 0;
	} else {
		rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
		rq_size = (1 << log_qp_rq_size) * rq_wqe_size;
	}

	qp->qp_wqinfo.qa_size = sq_size + rq_size;
	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_bind_align = max(sq_wqe_size, rq_wqe_size);
	if (qp_is_umap) {
		qp->qp_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
	} else {
		qp->qp_wqinfo.qa_location = wq_location;
	}
	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue");
		goto qpalloc_fail8;
	}
	if (sq_wqe_size > rq_wqe_size) {
		sq_buf = qp->qp_wqinfo.qa_buf_aligned;

		/*
		 * If QP's on an SRQ, we set the rq_buf to NULL
		 */
		if (qp_srq_en)
			rq_buf = NULL;
		else
			rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
	} else {
		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
	}
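	/*
	 * Illustrative layout (for the sq_wqe_size > rq_wqe_size case):
	 *
	 *   qa_buf_aligned
	 *   +----------------------+----------------------+
	 *   |  Send WQEs (sq_size) |  Recv WQEs (rq_size) |
	 *   +----------------------+----------------------+
	 *
	 * Placing the queue with the larger WQE stride first presumably
	 * keeps the second queue naturally aligned to its own (smaller,
	 * power-of-2) WQE size within the combined allocation.
	 */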
	/*
	 * Register the memory for the QP work queues.  The memory for the
	 * QP must be registered in the Tavor TPT tables.  This gives us the
	 * LKey to specify in the QP context later.  Note: The memory for
	 * Tavor work queues (both Send and Recv) must be contiguous and
	 * registered as a single memory region.  Note also: If the work
	 * queue is to be allocated from DDR memory, then only a "bypass"
	 * mapping is appropriate.  And if the QP memory is user-mappable,
	 * then we force DDI_DMA_CONSISTENT mapping.
	 * Also, in order to meet the alignment restriction, we pass the
	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
	 * This guarantees that the resulting IB vaddr will be zero-based
	 * (modulo the offset into the first page).
	 * If we fail here, we still have the bunch of resource and reference
	 * count cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
	mr_attr.mr_len	 = qp->qp_wqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag;
	if (qp_is_umap) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
	} else {
		if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
			mr_op.mro_bind_type =
			    state->ts_cfg_profile->cp_iommu_bypass;
			dma_xfer_mode =
			    state->ts_cfg_profile->cp_streaming_consistent;
			if (dma_xfer_mode == DDI_DMA_STREAMING) {
				mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
			}
		} else {
			mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
		}
	}
	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto qpalloc_fail9;
	}
	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
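	/*
	 * For example (hypothetical addresses): if qa_buf_aligned is
	 * 0xffffff0012340000 and the zero-based registration puts bi_addr
	 * at 0x0, then qp_desc_off == 0xffffff0012340000 and a WQE's IB
	 * address is simply its kernel address minus qp_desc_off.
	 */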
	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real work queue sizes, real SGLs, and QP number
	 */
	if (queuesz_p != NULL) {
		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;

		/* QP on an SRQ set these to 0 */
		if (qp_srq_en) {
			queuesz_p->cs_rq	= 0;
			queuesz_p->cs_rq_sgl	= 0;
		} else {
			queuesz_p->cs_rq	= (1 << log_qp_rq_size);
			queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
		}
	}
	if (qpn != NULL) {
		*qpn = (ib_qpn_t)qp->qp_qpnum;
	}
	/*
	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
	 * the following fields for use in further operations on the QP.
	 */
	qp->qp_qpcrsrcp		= qpc;
	qp->qp_rsrcp		= rsrc;
	qp->qp_state		= TAVOR_QP_RESET;
	qp->qp_pdhdl		= pd;
	qp->qp_mrhdl		= mr;
	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
	qp->qp_is_special	= 0;
	qp->qp_is_umap		= qp_is_umap;
	qp->qp_uarpg		= (qp->qp_is_umap) ? uarpg : 0;
	qp->qp_umap_dhp		= (devmap_cookie_t)NULL;
	qp->qp_sq_cqhdl		= sq_cq;
	qp->qp_sq_lastwqeaddr	= NULL;
	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
	qp->qp_sq_buf		= sq_buf;
	qp->qp_desc_off		= qp_desc_off;
	qp->qp_rq_cqhdl		= rq_cq;
	qp->qp_rq_lastwqeaddr	= NULL;
	qp->qp_rq_buf		= rq_buf;

	/* QP on an SRQ sets this to 0 */
	if (qp_srq_en) {
		qp->qp_rq_bufsz	= 0;
	} else {
		qp->qp_rq_bufsz	= (1 << log_qp_rq_size);
	}

	qp->qp_forward_sqd_event  = 0;
	qp->qp_sqd_still_draining = 0;
	qp->qp_hdlrarg		= (void *)ibt_qphdl;
	qp->qp_mcg_refcnt	= 0;
	/*
	 * If this QP is to be associated with an SRQ, then set the SRQ handle
	 * appropriately.
	 */
	if (qp_srq_en) {
		qp->qp_srqhdl = srq;
		qp->qp_srq_en = TAVOR_QP_SRQ_ENABLED;
		tavor_srq_refcnt_inc(qp->qp_srqhdl);
	} else {
		qp->qp_srqhdl = NULL;
		qp->qp_srq_en = TAVOR_QP_SRQ_DISABLED;
	}

	/* Determine if later ddi_dma_sync will be necessary */
	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);
	/* Determine the QP service type */
	if (type == IBT_RC_RQP) {
		qp->qp_serv_type = TAVOR_QP_RC;
	} else if (type == IBT_UD_RQP) {
		qp->qp_serv_type = TAVOR_QP_UD;
	} else {
		qp->qp_serv_type = TAVOR_QP_UC;
	}

	/* Zero out the QP context */
	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));
	/*
	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
	 * "qphdl" and return success
	 */
	ASSERT(state->ts_qphdl[qpc->tr_indx] == NULL);
	state->ts_qphdl[qpc->tr_indx] = qp;

	/*
	 * If this is a user-mappable QP, then we need to insert the previously
	 * allocated entry into the "userland resources database".  This will
	 * allow for later lookup during devmap() (i.e. mmap()) calls.
	 */
	if (qp_is_umap) {
		tavor_umap_db_add(umapdb);
	}

	*qphdl = qp;

	TAVOR_TNF_EXIT(tavor_qp_alloc);
	return (DDI_SUCCESS);
	/*
	 * The following is cleanup for all possible failure cases in this
	 * routine
	 */
qpalloc_fail9:
	tavor_queue_free(state, &qp->qp_wqinfo);
qpalloc_fail8:
	if (type == IBT_RC_RQP) {
		tavor_rsrc_free(state, &rdb);
	}
qpalloc_fail7:
	if (qp_is_umap) {
		tavor_umap_db_free(umapdb);
	}
qpalloc_fail6:
	/*
	 * Releasing the QPN will also free up the QPC context.  Update
	 * the QPC context pointer to indicate this.
	 */
	tavor_qp_release_qpn(state, qp->qp_qpn_hdl, TAVOR_QPN_RELEASE);
	qpc = NULL;
qpalloc_fail5:
	tavor_rsrc_free(state, &rsrc);
qpalloc_fail4:
	if (qpc) {
		tavor_rsrc_free(state, &qpc);
	}
qpalloc_fail3:
	tavor_cq_refcnt_dec(rq_cq);
qpalloc_fail2:
	tavor_cq_refcnt_dec(sq_cq);
qpalloc_fail1:
	tavor_pd_refcnt_dec(pd);
qpalloc_fail:
	TNF_PROBE_1(tavor_qp_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_qp_alloc);
	return (status);
}
/*
 * tavor_special_qp_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_special_qp_alloc(tavor_state_t *state, tavor_qp_info_t *qpinfo,
    uint_t sleepflag, tavor_qp_options_t *op)
{
	tavor_rsrc_t		*qpc, *rsrc;
	tavor_qphdl_t		qp;
	ibt_qp_alloc_attr_t	*attr_p;
	ibt_sqp_type_t		type;
	uint8_t			port;
	ibtl_qp_hdl_t		ibt_qphdl;
	ibt_chan_sizes_t	*queuesz_p;
	tavor_qphdl_t		*qphdl;
	ibt_mr_attr_t		mr_attr;
	tavor_mr_options_t	mr_op;
	tavor_pdhdl_t		pd;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_mrhdl_t		mr;
	uint64_t		qp_desc_off;
	uint32_t		*sq_buf, *rq_buf;
	uint32_t		log_qp_sq_size, log_qp_rq_size;
	uint32_t		sq_size, rq_size, max_sgl;
	uint32_t		sq_wqe_size, rq_wqe_size;
	uint_t			wq_location, dma_xfer_mode;
	int			status, flag;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_special_qp_alloc);
	/*
	 * Check the "options" flag.  Currently this flag tells the driver
	 * whether or not the QP's work queues should come from normal
	 * system memory or whether they should be allocated from DDR memory.
	 */
	if (op == NULL) {
		wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
	} else {
		wq_location = op->qpo_wq_loc;
	}
	/*
	 * Extract the necessary info from the tavor_qp_info_t structure
	 */
	attr_p	  = qpinfo->qpi_attrp;
	type	  = qpinfo->qpi_type;
	port	  = qpinfo->qpi_port;
	ibt_qphdl = qpinfo->qpi_ibt_qphdl;
	queuesz_p = qpinfo->qpi_queueszp;
	qphdl	  = &qpinfo->qpi_qphdl;
	/*
	 * Check for valid special QP type (only SMI & GSI supported)
	 */
	if ((type != IBT_SMI_SQP) && (type != IBT_GSI_SQP)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_QP_SPECIAL_TYPE_INVALID, "invalid QP type");
		goto spec_qpalloc_fail;
	}

	/*
	 * Check for valid port number
	 */
	if (!tavor_portnum_is_valid(state, port)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_PORT_INVALID, "invalid port num");
		goto spec_qpalloc_fail;
	}
	port = port - 1;
	/*
	 * Check for valid PD handle pointer
	 */
	if (attr_p->qp_pd_hdl == NULL) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_PD_HDL_INVALID, "invalid PD handle");
		goto spec_qpalloc_fail;
	}
	pd = (tavor_pdhdl_t)attr_p->qp_pd_hdl;

	/* Increment the reference count on the PD */
	tavor_pd_refcnt_inc(pd);
	/*
	 * Check for valid CQ handle pointers
	 */
	if ((attr_p->qp_ibc_scq_hdl == NULL) ||
	    (attr_p->qp_ibc_rcq_hdl == NULL)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto spec_qpalloc_fail1;
	}
	sq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_scq_hdl;
	rq_cq = (tavor_cqhdl_t)attr_p->qp_ibc_rcq_hdl;

	/*
	 * Increment the reference count on the CQs.  One or both of these
	 * could return error if we determine that the given CQ is already
	 * being used with a non-special QP (i.e. a normal QP).
	 */
	status = tavor_cq_refcnt_inc(sq_cq, TAVOR_CQ_IS_SPECIAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto spec_qpalloc_fail1;
	}
	status = tavor_cq_refcnt_inc(rq_cq, TAVOR_CQ_IS_SPECIAL);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_CQ_HDL_INVALID, "invalid CQ handle");
		goto spec_qpalloc_fail2;
	}
	/*
	 * Allocate the special QP resources.  Essentially, this allocation
	 * amounts to checking if the requested special QP has already been
	 * allocated.  If successful, the QP context returned is an actual
	 * QP context that has been "aliased" to act as a special QP of the
	 * appropriate type (and for the appropriate port).  Just as in
	 * tavor_qp_alloc() above, ownership for this QP context is not
	 * immediately given to hardware in the final step here.  Instead, we
	 * wait until the QP is later transitioned to the "Init" state before
	 * passing the QP to hardware.  If we fail here, we must undo all
	 * the reference counts (CQ and PD).
	 */
	status = tavor_special_qp_rsrc_alloc(state, type, port, &qpc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(status, "failed special QP rsrc");
		goto spec_qpalloc_fail3;
	}
	/*
	 * Allocate the software structure for tracking the special queue
	 * pair (i.e. the Tavor Queue Pair handle).  If we fail here, we
	 * must undo the reference counts and the previous resource allocation.
	 */
	status = tavor_rsrc_alloc(state, TAVOR_QPHDL, 1, sleepflag, &rsrc);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed QP handle");
		goto spec_qpalloc_fail4;
	}
	qp = (tavor_qphdl_t)rsrc->tr_addr;
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
	/*
	 * Actual QP number is a combination of the index of the QPC and
	 * the port number.  This is because the special QP contexts must
	 * be allocated two-at-a-time.
	 */
	qp->qp_qpnum = qpc->tr_indx + port;
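	/*
	 * For example: since "port" was decremented above, a special QP on
	 * port 1 gets QP number tr_indx + 0 and the same special QP type
	 * on port 2 gets tr_indx + 1, i.e. the two per-port contexts of a
	 * pair sit at adjacent indices.
	 */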
	/*
	 * Calculate the appropriate size for the work queues.
	 * Note:  All Tavor QP work queues must be a power-of-2 in size.  Also
	 * they may not be any smaller than TAVOR_QP_MIN_SIZE.  This step is
	 * to round the requested size up to the next highest power-of-2.
	 */
	attr_p->qp_sizes.cs_sq = max(attr_p->qp_sizes.cs_sq, TAVOR_QP_MIN_SIZE);
	attr_p->qp_sizes.cs_rq = max(attr_p->qp_sizes.cs_rq, TAVOR_QP_MIN_SIZE);
	log_qp_sq_size = highbit(attr_p->qp_sizes.cs_sq);
	if (ISP2(attr_p->qp_sizes.cs_sq)) {
		log_qp_sq_size = log_qp_sq_size - 1;
	}
	log_qp_rq_size = highbit(attr_p->qp_sizes.cs_rq);
	if (ISP2(attr_p->qp_sizes.cs_rq)) {
		log_qp_rq_size = log_qp_rq_size - 1;
	}
	/*
	 * Next we verify that the rounded-up size is valid (i.e. consistent
	 * with the device limits and/or software-configured limits).  If not,
	 * then obviously we have a bit of cleanup to do before returning.
	 */
	if ((log_qp_sq_size > state->ts_cfg_profile->cp_log_max_qp_sz) ||
	    (log_qp_rq_size > state->ts_cfg_profile->cp_log_max_qp_sz)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max QP size");
		goto spec_qpalloc_fail5;
	}
	/*
	 * Next we verify that the requested number of SGL is valid (i.e.
	 * consistent with the device limits and/or software-configured
	 * limits).  If not, then obviously the same cleanup needs to be done.
	 */
	max_sgl = state->ts_cfg_profile->cp_wqe_real_max_sgl;
	if ((attr_p->qp_sizes.cs_sq_sgl > max_sgl) ||
	    (attr_p->qp_sizes.cs_rq_sgl > max_sgl)) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max QP SGL");
		goto spec_qpalloc_fail5;
	}
	/*
	 * Determine this QP's WQE sizes (for both the Send and Recv WQEs).
	 * This will depend on the requested number of SGLs.  Note: this
	 * has the side-effect of also calculating the real number of SGLs
	 * (for the calculated WQE size).
	 */
	tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_rq_sgl,
	    TAVOR_QP_WQ_TYPE_RECVQ, &qp->qp_rq_log_wqesz, &qp->qp_rq_sgl);
	if (type == IBT_SMI_SQP) {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
		    TAVOR_QP_WQ_TYPE_SENDMLX_QP0, &qp->qp_sq_log_wqesz,
		    &qp->qp_sq_sgl);
	} else {
		tavor_qp_sgl_to_logwqesz(state, attr_p->qp_sizes.cs_sq_sgl,
		    TAVOR_QP_WQ_TYPE_SENDMLX_QP1, &qp->qp_sq_log_wqesz,
		    &qp->qp_sq_sgl);
	}
	/*
	 * Allocate the memory for QP work queues.  Note:  The location from
	 * which we will allocate these work queues has been passed in
	 * through the tavor_qp_options_t structure.  Since Tavor work queues
	 * are not allowed to cross a 32-bit (4GB) boundary, the alignment of
	 * the work queue memory is very important.  We used to allocate
	 * work queues (the combined receive and send queues) so that they
	 * would be aligned on their combined size.  That alignment guaranteed
	 * that they would never cross the 4GB boundary (Tavor work queues
	 * are on the order of MBs at maximum).  Now we are able to relax
	 * this alignment constraint by ensuring that the IB address assigned
	 * to the queue memory (as a result of the tavor_mr_register() call)
	 * is offset from zero.
	 * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
	 * guarantee the alignment, but when attempting to use IOMMU bypass
	 * mode we found that we were not allowed to specify any alignment
	 * that was more restrictive than the system page size.
	 * So we avoided this constraint by passing two alignment values,
	 * one for the memory allocation itself and the other for the DMA
	 * handle (for later bind).  This used to cause more memory than
	 * necessary to be allocated (in order to guarantee the more
	 * restrictive alignment constraint).  But by guaranteeing the
	 * zero-based IB virtual address for the queue, we are able to
	 * conserve this memory.
	 */
	sq_wqe_size = 1 << qp->qp_sq_log_wqesz;
	rq_wqe_size = 1 << qp->qp_rq_log_wqesz;
	sq_size = (1 << log_qp_sq_size) * sq_wqe_size;
	rq_size = (1 << log_qp_rq_size) * rq_wqe_size;
	qp->qp_wqinfo.qa_size = sq_size + rq_size;
	qp->qp_wqinfo.qa_alloc_align = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_bind_align  = max(sq_wqe_size, rq_wqe_size);
	qp->qp_wqinfo.qa_location = wq_location;
	status = tavor_queue_alloc(state, &qp->qp_wqinfo, sleepflag);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed work queue");
		goto spec_qpalloc_fail5;
	}
	if (sq_wqe_size > rq_wqe_size) {
		sq_buf = qp->qp_wqinfo.qa_buf_aligned;
		rq_buf = (uint32_t *)((uintptr_t)sq_buf + sq_size);
	} else {
		rq_buf = qp->qp_wqinfo.qa_buf_aligned;
		sq_buf = (uint32_t *)((uintptr_t)rq_buf + rq_size);
	}
	/*
	 * Register the memory for the special QP work queues.  The memory for
	 * the special QP must be registered in the Tavor TPT tables.  This
	 * gives us the LKey to specify in the QP context later.  Note: The
	 * memory for Tavor work queues (both Send and Recv) must be contiguous
	 * and registered as a single memory region.  Note also: If the work
	 * queue is to be allocated from DDR memory, then only a "bypass"
	 * mapping is appropriate.
	 * Also, in order to meet the alignment restriction, we pass the
	 * "mro_bind_override_addr" flag in the call to tavor_mr_register().
	 * This guarantees that the resulting IB vaddr will be zero-based
	 * (modulo the offset into the first page).
	 * If we fail here, we have a bunch of resource and reference count
	 * cleanup to do.
	 */
	flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
	    IBT_MR_NOSLEEP;
	mr_attr.mr_vaddr = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned;
	mr_attr.mr_len	 = qp->qp_wqinfo.qa_size;
	mr_attr.mr_as	 = NULL;
	mr_attr.mr_flags = flag;
	if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
		mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;

		dma_xfer_mode = state->ts_cfg_profile->cp_streaming_consistent;
		if (dma_xfer_mode == DDI_DMA_STREAMING) {
			mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
		}
	} else {
		mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
	}
	mr_op.mro_bind_dmahdl = qp->qp_wqinfo.qa_dmahdl;
	mr_op.mro_bind_override_addr = 1;
	status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
	if (status != DDI_SUCCESS) {
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
		goto spec_qpalloc_fail6;
	}
	/*
	 * Calculate the offset between the kernel virtual address space
	 * and the IB virtual address space.  This will be used when
	 * posting work requests to properly initialize each WQE.
	 */
	qp_desc_off = (uint64_t)(uintptr_t)qp->qp_wqinfo.qa_buf_aligned -
	    (uint64_t)mr->mr_bindinfo.bi_addr;
	/*
	 * Fill in all the return arguments (if necessary).  This includes
	 * real work queue sizes, real SGLs, and QP number (which will be
	 * either zero or one, depending on the special QP type)
	 */
	if (queuesz_p != NULL) {
		queuesz_p->cs_sq	= (1 << log_qp_sq_size);
		queuesz_p->cs_sq_sgl	= qp->qp_sq_sgl;
		queuesz_p->cs_rq	= (1 << log_qp_rq_size);
		queuesz_p->cs_rq_sgl	= qp->qp_rq_sgl;
	}
	/*
	 * Fill in the rest of the Tavor Queue Pair handle.  We can update
	 * the following fields for use in further operations on the QP.
	 */
	qp->qp_qpcrsrcp		= qpc;
	qp->qp_rsrcp		= rsrc;
	qp->qp_state		= TAVOR_QP_RESET;
	qp->qp_pdhdl		= pd;
	qp->qp_mrhdl		= mr;
	qp->qp_sq_sigtype	= (attr_p->qp_flags & IBT_WR_SIGNALED) ?
	    TAVOR_QP_SQ_WR_SIGNALED : TAVOR_QP_SQ_ALL_SIGNALED;
	qp->qp_is_special	= (type == IBT_SMI_SQP) ?
	    TAVOR_QP_SMI : TAVOR_QP_GSI;
	qp->qp_is_umap		= 0;
	qp->qp_uarpg		= 0;
	qp->qp_sq_cqhdl		= sq_cq;
	qp->qp_sq_lastwqeaddr	= NULL;
	qp->qp_sq_bufsz		= (1 << log_qp_sq_size);
	qp->qp_sq_buf		= sq_buf;
	qp->qp_desc_off		= qp_desc_off;
	qp->qp_rq_cqhdl		= rq_cq;
	qp->qp_rq_lastwqeaddr	= NULL;
	qp->qp_rq_bufsz		= (1 << log_qp_rq_size);
	qp->qp_rq_buf		= rq_buf;
	qp->qp_portnum		= port;
	qp->qp_pkeyindx		= 0;
	qp->qp_hdlrarg		= (void *)ibt_qphdl;
	qp->qp_mcg_refcnt	= 0;
	qp->qp_srq_en		= 0;
	qp->qp_srqhdl		= NULL;

	/* Determine if later ddi_dma_sync will be necessary */
	qp->qp_sync = TAVOR_QP_IS_SYNC_REQ(state, qp->qp_wqinfo);
	/* All special QPs are UD QP service type */
	qp->qp_serv_type = TAVOR_QP_UD;

	/* Zero out the QP context */
	bzero(&qp->qpc, sizeof (tavor_hw_qpc_t));

	/*
	 * Put QP handle in Tavor QPNum-to-QPHdl list.  Then fill in the
	 * "qphdl" and return success
	 */
	ASSERT(state->ts_qphdl[qpc->tr_indx + port] == NULL);
	state->ts_qphdl[qpc->tr_indx + port] = qp;

	*qphdl = qp;

	TAVOR_TNF_EXIT(tavor_special_qp_alloc);
	return (DDI_SUCCESS);
	/*
	 * The following is cleanup for all possible failure cases in this
	 * routine
	 */
spec_qpalloc_fail6:
	tavor_queue_free(state, &qp->qp_wqinfo);
spec_qpalloc_fail5:
	tavor_rsrc_free(state, &rsrc);
spec_qpalloc_fail4:
	if (tavor_special_qp_rsrc_free(state, type, port) != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to free special QP rsrc");
	}
spec_qpalloc_fail3:
	tavor_cq_refcnt_dec(rq_cq);
spec_qpalloc_fail2:
	tavor_cq_refcnt_dec(sq_cq);
spec_qpalloc_fail1:
	tavor_pd_refcnt_dec(pd);
spec_qpalloc_fail:
	TNF_PROBE_1(tavor_special_qp_alloc_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_special_qp_alloc);
	return (status);
}
/*
 * tavor_qp_free()
 *    This function frees up the QP resources.  Depending on the value
 *    of the "free_qp_flags", the QP number may not be released until
 *    a subsequent call to tavor_qp_release_qpn().
 *
 *    Context: Can be called only from user or kernel context.
 */
/* ARGSUSED */
int
tavor_qp_free(tavor_state_t *state, tavor_qphdl_t *qphdl,
    ibc_free_qp_flags_t free_qp_flags, ibc_qpn_hdl_t *qpnh,
    uint_t sleepflag)
{
	tavor_rsrc_t		*qpc, *rdb, *rsrc;
	tavor_umap_db_entry_t	*umapdb;
	tavor_qpn_entry_t	*entry;
	tavor_pdhdl_t		pd;
	tavor_mrhdl_t		mr;
	tavor_cqhdl_t		sq_cq, rq_cq;
	tavor_srqhdl_t		srq;
	tavor_qphdl_t		qp;
	uint64_t		value;
	uint_t			type, port;
	uint_t			maxprot;
	uint_t			qp_srq_en;
	int			status;
	char			*errormsg;

	TAVOR_TNF_ENTER(tavor_qp_free);
	/*
	 * Pull all the necessary information from the Tavor Queue Pair
	 * handle.  This is necessary here because the resource for the
	 * QP handle is going to be freed up as part of this operation.
	 */
	qp	= *qphdl;
	mutex_enter(&qp->qp_lock);
	qpc	= qp->qp_qpcrsrcp;
	rsrc	= qp->qp_rsrcp;
	pd	= qp->qp_pdhdl;
	srq	= qp->qp_srqhdl;
	mr	= qp->qp_mrhdl;
	rq_cq	= qp->qp_rq_cqhdl;
	sq_cq	= qp->qp_sq_cqhdl;
	rdb	= qp->qp_rdbrsrcp;
	port	= qp->qp_portnum;
	qp_srq_en = qp->qp_srq_en;
	/*
	 * If the QP is part of an MCG, then we fail the qp_free
	 */
	if (qp->qp_mcg_refcnt != 0) {
		mutex_exit(&qp->qp_lock);
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "QP part of MCG on free");
		goto qpfree_fail;
	}
	/*
	 * If the QP is not already in "Reset" state, then transition to
	 * "Reset".  This is necessary because software does not reclaim
	 * ownership of the QP context until the QP is in the "Reset" state.
	 * If the ownership transfer fails for any reason, then it is an
	 * indication that something (either in HW or SW) has gone seriously
	 * wrong.  So we print a warning message and return.
	 */
	if (qp->qp_state != TAVOR_QP_RESET) {
		if (tavor_qp_to_reset(state, qp) != DDI_SUCCESS) {
			mutex_exit(&qp->qp_lock);
			TAVOR_WARNING(state, "failed to reset QP context");
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "reset QP context");
			goto qpfree_fail;
		}
		qp->qp_state = TAVOR_QP_RESET;

		/*
		 * Do any additional handling necessary for the transition
		 * to the "Reset" state (e.g. update the WRID lists)
		 */
		tavor_wrid_to_reset_handling(state, qp);
	}
	/*
	 * If this was a user-mappable QP, then we need to remove its entry
	 * from the "userland resources database".  If it is also currently
	 * mmap()'d out to a user process, then we need to call
	 * devmap_devmem_remap() to remap the QP memory to an invalid mapping.
	 * We also need to invalidate the QP tracking information for the
	 * user mapping.
	 */
	if (qp->qp_is_umap) {
		status = tavor_umap_db_find(state->ts_instance, qp->qp_qpnum,
		    MLNX_UMAP_QPMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
		    &umapdb);
		if (status != DDI_SUCCESS) {
			mutex_exit(&qp->qp_lock);
			TAVOR_WARNING(state, "failed to find in database");
			TAVOR_TNF_EXIT(tavor_qp_free);
			return (ibc_get_ci_failure(0));
		}
		tavor_umap_db_free(umapdb);
		if (qp->qp_umap_dhp != NULL) {
			maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
			status = devmap_devmem_remap(qp->qp_umap_dhp,
			    state->ts_dip, 0, 0, qp->qp_wqinfo.qa_size,
			    maxprot, DEVMAP_MAPPING_INVALID, NULL);
			if (status != DDI_SUCCESS) {
				mutex_exit(&qp->qp_lock);
				TAVOR_WARNING(state, "failed in QP memory "
				    "devmap_devmem_remap()");
				TAVOR_TNF_EXIT(tavor_qp_free);
				return (ibc_get_ci_failure(0));
			}
			qp->qp_umap_dhp = (devmap_cookie_t)NULL;
		}
	}
	/*
	 * Put NULL into the Tavor QPNum-to-QPHdl list.  This will allow any
	 * in-progress events to detect that the QP corresponding to this
	 * number has been freed.  Note: it does depend on whether we are
	 * freeing a special QP or not.
	 */
	if (qp->qp_is_special) {
		state->ts_qphdl[qpc->tr_indx + port] = NULL;
	} else {
		state->ts_qphdl[qpc->tr_indx] = NULL;
	}
	/*
	 * Drop the QP lock
	 *    At this point the lock is no longer necessary.  We cannot
	 *    protect from multiple simultaneous calls to free the same QP.
	 *    In addition, since the QP lock is contained in the QP "software
	 *    handle" resource, which we will free (see below), it is
	 *    important that we have no further references to that memory.
	 */
	mutex_exit(&qp->qp_lock);
	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*qp))
	/*
	 * Free the QP resources
	 *    Start by deregistering and freeing the memory for work queues.
	 *    Next free any previously allocated context information
	 *    (depending on QP type)
	 *    Finally, decrement the necessary reference counts.
	 * If this fails for any reason, then it is an indication that
	 * something (either in HW or SW) has gone seriously wrong.  So we
	 * print a warning message and return.
	 */
	status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
	    sleepflag);
	if (status != DDI_SUCCESS) {
		TAVOR_WARNING(state, "failed to deregister QP memory");
		/* Set "status" and "errormsg" and goto failure */
		TAVOR_TNF_FAIL(ibc_get_ci_failure(0), "failed deregister mr");
		goto qpfree_fail;
	}

	/* Free the memory for the QP */
	tavor_queue_free(state, &qp->qp_wqinfo);
	/*
	 * Free up the remainder of the QP resources.  Note: we have a few
	 * different resources to free up depending on whether the QP is a
	 * special QP or not.  As described above, if any of these fail for
	 * any reason it is an indication that something (either in HW or SW)
	 * has gone seriously wrong.  So we print a warning message and
	 * return.
	 */
	if (qp->qp_is_special) {
		type = (qp->qp_is_special == TAVOR_QP_SMI) ?
		    IBT_SMI_SQP : IBT_GSI_SQP;

		/* Free up resources for the special QP */
		status = tavor_special_qp_rsrc_free(state, type, port);
		if (status != DDI_SUCCESS) {
			TAVOR_WARNING(state, "failed to free special QP rsrc");
			/* Set "status" and "errormsg" and goto failure */
			TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
			    "failed special QP rsrc");
			goto qpfree_fail;
		}

	} else {
		type = qp->qp_serv_type;

		/* Free up the RDB entries resource */
		if (type == TAVOR_QP_RC) {
			tavor_rsrc_free(state, &rdb);
		}

		/*
		 * Check the flags and determine whether to release the
		 * QPN or not, based on their value.
		 */
		if (free_qp_flags == IBC_FREE_QP_ONLY) {
			entry = qp->qp_qpn_hdl;
			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
			    TAVOR_QPN_FREE_ONLY);
			*qpnh = (ibc_qpn_hdl_t)entry;
		} else {
			tavor_qp_release_qpn(state, qp->qp_qpn_hdl,
			    TAVOR_QPN_RELEASE);
		}
	}
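	/*
	 * Note: with IBC_FREE_QP_ONLY (above) the QPN tracking entry is
	 * handed back to the caller as an opaque ibc_qpn_hdl_t, so the
	 * same QP number can be reclaimed later; the QPN is only fully
	 * released by a subsequent tavor_qp_release_qpn() call.
	 */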
	/* Free the Tavor Queue Pair handle */
	tavor_rsrc_free(state, &rsrc);

	/* Decrement the reference counts on CQs, PD and SRQ (if needed) */
	tavor_cq_refcnt_dec(rq_cq);
	tavor_cq_refcnt_dec(sq_cq);
	tavor_pd_refcnt_dec(pd);
	if (qp_srq_en == TAVOR_QP_SRQ_ENABLED) {
		tavor_srq_refcnt_dec(srq);
	}

	/* Set the qphdl pointer to NULL and return success */
	*qphdl = NULL;

	TAVOR_TNF_EXIT(tavor_qp_free);
	return (DDI_SUCCESS);

qpfree_fail:
	TNF_PROBE_1(tavor_qp_free_fail, TAVOR_TNF_ERROR, "",
	    tnf_string, msg, errormsg);
	TAVOR_TNF_EXIT(tavor_qp_free);
	return (status);
}
/*
 * tavor_qp_query()
 *    Context: Can be called from interrupt or base context.
 */
int
tavor_qp_query(tavor_state_t *state, tavor_qphdl_t qp,
    ibt_qp_query_attr_t *attr_p)
{
	ibt_cep_state_t		qp_state;
	ibt_qp_ud_attr_t	*ud;
	ibt_qp_rc_attr_t	*rc;
	ibt_qp_uc_attr_t	*uc;
	ibt_cep_flags_t		enable_flags;
	tavor_hw_addr_path_t	*qpc_path, *qpc_alt_path;
	ibt_cep_path_t		*path_ptr, *alt_path_ptr;
	tavor_hw_qpc_t		*qpc;
	int			status;

	TAVOR_TNF_ENTER(tavor_qp_query);
	mutex_enter(&qp->qp_lock);

	/*
	 * Grab the temporary QPC entry from QP software state
	 */
	qpc = &qp->qpc;

	/* Convert the current Tavor QP state to IBTF QP state */
	switch (qp->qp_state) {
	case TAVOR_QP_RESET:
		qp_state = IBT_STATE_RESET;		/* "Reset" */
		break;
	case TAVOR_QP_INIT:
		qp_state = IBT_STATE_INIT;		/* Initialized */
		break;
	case TAVOR_QP_RTR:
		qp_state = IBT_STATE_RTR;		/* Ready to Receive */
		break;
	case TAVOR_QP_RTS:
		qp_state = IBT_STATE_RTS;		/* Ready to Send */
		break;
	case TAVOR_QP_SQERR:
		qp_state = IBT_STATE_SQE;		/* Send Queue Error */
		break;
	case TAVOR_QP_SQD:
		if (qp->qp_sqd_still_draining) {
			qp_state = IBT_STATE_SQDRAIN;	/* SQ Draining */
		} else {
			qp_state = IBT_STATE_SQD;	/* SQ Drained */
		}
		break;
	case TAVOR_QP_ERR:
		qp_state = IBT_STATE_ERROR;		/* Error */
		break;
	default:
		mutex_exit(&qp->qp_lock);
		TNF_PROBE_1(tavor_qp_query_inv_qpstate_fail,
		    TAVOR_TNF_ERROR, "", tnf_uint, qpstate, qp->qp_state);
		TAVOR_TNF_EXIT(tavor_qp_query);
		return (ibc_get_ci_failure(0));
	}
	attr_p->qp_info.qp_state = qp_state;
	/* SRQ Hook. */
	attr_p->qp_srq = NULL;

	/*
	 * The following QP information is always returned, regardless of
	 * the current QP state.  Note: Some special handling is necessary
	 * for calculating the QP number on special QP (QP0 and QP1).
	 */
	attr_p->qp_sq_cq = qp->qp_sq_cqhdl->cq_hdlrarg;
	attr_p->qp_rq_cq = qp->qp_rq_cqhdl->cq_hdlrarg;
	if (qp->qp_is_special) {
		attr_p->qp_qpn = (qp->qp_is_special == TAVOR_QP_SMI) ? 0 : 1;
	} else {
		attr_p->qp_qpn = (ib_qpn_t)qp->qp_qpnum;
	}
	attr_p->qp_sq_sgl = qp->qp_sq_sgl;
	attr_p->qp_rq_sgl = qp->qp_rq_sgl;
	attr_p->qp_info.qp_sq_sz = qp->qp_sq_bufsz;
	attr_p->qp_info.qp_rq_sz = qp->qp_rq_bufsz;
	/*
	 * If QP is currently in the "Reset" state, then only the above are
	 * returned
	 */
	if (qp_state == IBT_STATE_RESET) {
		mutex_exit(&qp->qp_lock);
		TAVOR_TNF_EXIT(tavor_qp_query);
		return (DDI_SUCCESS);
	}
	/*
	 * Post QUERY_QP command to firmware
	 *
	 * We do a TAVOR_NOSLEEP here because we are holding the "qp_lock".
	 * Since we may be in the interrupt context (or subsequently raised
	 * to interrupt level by priority inversion), we do not want to block
	 * in this routine waiting for success.
	 */
	status = tavor_cmn_query_cmd_post(state, QUERY_QP, qp->qp_qpnum,
	    qpc, sizeof (tavor_hw_qpc_t), TAVOR_CMD_NOSLEEP_SPIN);
	if (status != TAVOR_CMD_SUCCESS) {
		mutex_exit(&qp->qp_lock);
		cmn_err(CE_CONT, "Tavor: QUERY_QP command failed: %08x\n",
		    status);
		TNF_PROBE_1(tavor_qp_query_cmd_fail, TAVOR_TNF_ERROR, "",
		    tnf_uint, status, status);
		TAVOR_TNF_EXIT(tavor_qp_query);
		return (ibc_get_ci_failure(0));
	}
	/*
	 * Fill in the additional QP info based on the QP's transport type.
	 */
	if (qp->qp_serv_type == TAVOR_QP_UD) {

		/* Fill in the UD-specific info */
		ud = &attr_p->qp_info.qp_transport.ud;
		ud->ud_qkey	= (ib_qkey_t)qpc->qkey;
		ud->ud_sq_psn	= qpc->next_snd_psn;
		ud->ud_pkey_ix	= qpc->pri_addr_path.pkey_indx;
		ud->ud_port	= qpc->pri_addr_path.portnum;

		attr_p->qp_info.qp_trans = IBT_UD_SRV;
	} else if (qp->qp_serv_type == TAVOR_QP_RC) {

		/* Fill in the RC-specific info */
		rc = &attr_p->qp_info.qp_transport.rc;
		rc->rc_sq_psn	= qpc->next_snd_psn;
		rc->rc_rq_psn	= qpc->next_rcv_psn;
		rc->rc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
			rc->rc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
			rc->rc_mig_state = IBT_STATE_REARMED;
		} else {
			rc->rc_mig_state = IBT_STATE_ARMED;
		}
		rc->rc_rdma_ra_out = (1 << qpc->sra_max);
		rc->rc_rdma_ra_in  = (1 << qpc->rra_max);
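		/*
		 * For example: sra_max == 2 in the QPC encodes
		 * 1 << 2 == 4 outstanding RDMA Reads/Atomics as the
		 * initiator; rra_max is decoded the same way for the
		 * responder side.
		 */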
		rc->rc_min_rnr_nak = qpc->min_rnr_nak;
		rc->rc_path_mtu	   = qpc->mtu;
		rc->rc_retry_cnt   = qpc->retry_cnt;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &rc->rc_path;
		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num = qpc_path->portnum;
		path_ptr->cep_timeout	   = qpc_path->ack_timeout;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &rc->rc_alt_path;
		tavor_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;
		alt_path_ptr->cep_timeout	= qpc_alt_path->ack_timeout;

		/* Get the RNR retry time from primary path */
		rc->rc_rnr_retry_cnt = qpc_path->rnr_retry;

		/* Set the enable flags based on RDMA/Atomic enable bits */
		enable_flags = IBT_CEP_NO_FLAGS;
		enable_flags |= ((qpc->rre == 0) ? 0 : IBT_CEP_RDMA_RD);
		enable_flags |= ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		enable_flags |= ((qpc->rae == 0) ? 0 : IBT_CEP_ATOMIC);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_RC_SRV;
	} else if (qp->qp_serv_type == TAVOR_QP_UC) {

		/* Fill in the UC-specific info */
		uc = &attr_p->qp_info.qp_transport.uc;
		uc->uc_sq_psn	= qpc->next_snd_psn;
		uc->uc_rq_psn	= qpc->next_rcv_psn;
		uc->uc_dst_qpn	= qpc->rem_qpn;

		/* Grab the path migration state information */
		if (qpc->pm_state == TAVOR_QP_PMSTATE_MIGRATED) {
			uc->uc_mig_state = IBT_STATE_MIGRATED;
		} else if (qpc->pm_state == TAVOR_QP_PMSTATE_REARM) {
			uc->uc_mig_state = IBT_STATE_REARMED;
		} else {
			uc->uc_mig_state = IBT_STATE_ARMED;
		}
		uc->uc_path_mtu = qpc->mtu;

		/* Get the common primary address path fields */
		qpc_path = &qpc->pri_addr_path;
		path_ptr = &uc->uc_path;
		tavor_get_addr_path(state, qpc_path, &path_ptr->cep_adds_vect,
		    TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional primary address path fields */
		path_ptr->cep_pkey_ix	   = qpc_path->pkey_indx;
		path_ptr->cep_hca_port_num = qpc_path->portnum;

		/* Get the common alternate address path fields */
		qpc_alt_path = &qpc->alt_addr_path;
		alt_path_ptr = &uc->uc_alt_path;
		tavor_get_addr_path(state, qpc_alt_path,
		    &alt_path_ptr->cep_adds_vect, TAVOR_ADDRPATH_QP, qp);

		/* Fill in the additional alternate address path fields */
		alt_path_ptr->cep_pkey_ix	= qpc_alt_path->pkey_indx;
		alt_path_ptr->cep_hca_port_num	= qpc_alt_path->portnum;

		/*
		 * Set the enable flags based on RDMA enable bits (by
		 * definition UC doesn't support Atomic or RDMA Read)
		 */
		enable_flags = ((qpc->rwe == 0) ? 0 : IBT_CEP_RDMA_WR);
		attr_p->qp_info.qp_flags = enable_flags;

		attr_p->qp_info.qp_trans = IBT_UC_SRV;

	} else {
		TAVOR_WARNING(state, "unexpected QP transport type");
		mutex_exit(&qp->qp_lock);
		return (ibc_get_ci_failure(0));
	}
	/*
	 * Under certain circumstances it is possible for the Tavor hardware
	 * to transition to one of the error states without software directly
	 * knowing about it.  The QueryQP() call is the one place where we
	 * have an opportunity to sample and update our view of the QP state.
	 */
	if (qpc->state == TAVOR_QP_SQERR) {
		attr_p->qp_info.qp_state = IBT_STATE_SQE;
		qp->qp_state = TAVOR_QP_SQERR;
	}
	if (qpc->state == TAVOR_QP_ERR) {
		attr_p->qp_info.qp_state = IBT_STATE_ERROR;
		qp->qp_state = TAVOR_QP_ERR;
	}
	mutex_exit(&qp->qp_lock);

	TAVOR_TNF_EXIT(tavor_qp_query);
	return (DDI_SUCCESS);
}
/*
 * tavor_qp_create_qpn()
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_qp_create_qpn(tavor_state_t *state, tavor_qphdl_t qp, tavor_rsrc_t *qpc)
{
	tavor_qpn_entry_t	query;
	tavor_qpn_entry_t	*entry;
	avl_index_t		where;

	TAVOR_TNF_ENTER(tavor_qp_create_qpn);

	/*
	 * Build a query (for the AVL tree lookup) and attempt to find
	 * a previously added entry that has a matching QPC index.  If
	 * no matching entry is found, then allocate, initialize, and
	 * add an entry to the AVL tree.
	 * If a matching entry is found, then increment its QPN counter
	 * and reference counter.
	 */
	query.qpn_indx = qpc->tr_indx;
	mutex_enter(&state->ts_qpn_avl_lock);
	entry = (tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl,
	    &query, &where);
	if (entry == NULL) {
		/*
		 * Allocate and initialize a QPN entry, then insert
		 * it into the AVL tree.
		 */
		entry = (tavor_qpn_entry_t *)kmem_zalloc(
		    sizeof (tavor_qpn_entry_t), KM_NOSLEEP);
		if (entry == NULL) {
			mutex_exit(&state->ts_qpn_avl_lock);
			TAVOR_TNF_EXIT(tavor_qp_create_qpn);
			return (DDI_FAILURE);
		}
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*entry))

		entry->qpn_indx	   = qpc->tr_indx;
		entry->qpn_refcnt  = 0;
		entry->qpn_counter = 0;

		avl_insert(&state->ts_qpn_avl, entry, where);
	}

	/*
	 * Make the AVL tree entry point to the QP context resource that
	 * it will be responsible for tracking
	 */
	entry->qpn_qpc = qpc;
	/*
	 * Setup the QP handle to point to the AVL tree entry.  Then
	 * generate the new QP number from the entry's QPN counter value
	 * and the hardware's QP context table index.
	 */
	qp->qp_qpn_hdl	= entry;
	qp->qp_qpnum	= ((entry->qpn_counter <<
	    state->ts_cfg_profile->cp_log_num_qp) | qpc->tr_indx) &
	    TAVOR_QP_MAXNUMBER_MSK;
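	/*
	 * For example (assuming cp_log_num_qp == 16): a QPC table index of
	 * 0x12 with qpn_counter == 1 yields QP number
	 * (1 << 16) | 0x12 == 0x10012.  The low bits are constrained by
	 * the QPC index; the remaining high bits (of the 24-bit QPN) vary
	 * with each reuse of that index.
	 */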
	/*
	 * Increment the reference counter and QPN counter.  The QPN
	 * counter always indicates the next available number for use.
	 */
	entry->qpn_counter++;
	entry->qpn_refcnt++;

	mutex_exit(&state->ts_qpn_avl_lock);
	TAVOR_TNF_EXIT(tavor_qp_create_qpn);
	return (DDI_SUCCESS);
}
/*
 * tavor_qp_release_qpn()
 *    Context: Can be called only from user or kernel context.
 */
void
tavor_qp_release_qpn(tavor_state_t *state, tavor_qpn_entry_t *entry, int flags)
{
	TAVOR_TNF_ENTER(tavor_qp_release_qpn);

	ASSERT(entry != NULL);

	mutex_enter(&state->ts_qpn_avl_lock);
	/*
	 * If we are releasing the QP number here, then we decrement the
	 * reference count and check for zero references.  If there are
	 * zero references, then we free the QPC context (if it hadn't
	 * already been freed during a TAVOR_QPN_FREE_ONLY free, i.e. for
	 * reuse with another similar QP number) and remove the tracking
	 * structure from the QP number AVL tree and free the structure.
	 * If we are not releasing the QP number here, then, as long as we
	 * have not exhausted the usefulness of the QPC context (that is,
	 * re-used it too many times without the reference count having
	 * gone to zero), we free up the QPC context for use by another
	 * thread (which will use it to construct a different QP number
	 * from the same QPC table index).
	 */
	if (flags == TAVOR_QPN_RELEASE) {
		entry->qpn_refcnt--;

		/*
		 * If the reference count is zero, then we free the QPC
		 * context (if it hadn't already been freed in an earlier
		 * step, e.g. TAVOR_QPN_FREE_ONLY) and remove/free the
		 * tracking structure from the QP number AVL tree.
		 */
		if (entry->qpn_refcnt == 0) {
			if (entry->qpn_qpc != NULL) {
				tavor_rsrc_free(state, &entry->qpn_qpc);
			}

			/*
			 * If the current entry has served its useful
			 * purpose (i.e. been reused the maximum allowable
			 * number of times), then remove it from QP number
			 * AVL tree and free it up.
			 */
			if (entry->qpn_counter >= (1 <<
			    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
				avl_remove(&state->ts_qpn_avl, entry);
				kmem_free(entry, sizeof (tavor_qpn_entry_t));
			}
		}
	} else if (flags == TAVOR_QPN_FREE_ONLY) {
		/*
		 * Even if we are not freeing the QP number, that will not
		 * always prevent us from releasing the QPC context.  In fact,
		 * since the QPC context only forms part of the whole QPN,
		 * we want to free it up for use by other consumers.  But
		 * if the reference count is non-zero (which it will always
		 * be when we are doing TAVOR_QPN_FREE_ONLY) and the counter
		 * has reached its maximum value, then we cannot reuse the
		 * QPC context until the reference count eventually reaches
		 * zero (in TAVOR_QPN_RELEASE, above).
		 */
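		/*
		 * For example (assuming cp_log_num_qp == 16): each QPC
		 * index can back at most 1 << (24 - 16) == 256 distinct
		 * QP numbers, so once qpn_counter reaches 256 the context
		 * is pinned until the last reference goes away.
		 */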
		if (entry->qpn_counter < (1 <<
		    (24 - state->ts_cfg_profile->cp_log_num_qp))) {
			tavor_rsrc_free(state, &entry->qpn_qpc);
		}
	}
	mutex_exit(&state->ts_qpn_avl_lock);

	TAVOR_TNF_EXIT(tavor_qp_release_qpn);
}

/*
 * tavor_qpn_avl_compare()
 *    Context: Can be called from user or kernel context.
 */
static int
tavor_qpn_avl_compare(const void *q, const void *e)
{
	tavor_qpn_entry_t	*entry, *query;

	TAVOR_TNF_ENTER(tavor_qpn_avl_compare);

	entry = (tavor_qpn_entry_t *)e;
	query = (tavor_qpn_entry_t *)q;

	if (query->qpn_indx < entry->qpn_indx) {
		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
		return (-1);
	} else if (query->qpn_indx > entry->qpn_indx) {
		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
		return (+1);
	} else {
		TAVOR_TNF_EXIT(tavor_qpn_avl_compare);
		return (0);
	}
}

/*
 * tavor_qpn_avl_init()
 *    Context: Only called from attach() path context
 */
void
tavor_qpn_avl_init(tavor_state_t *state)
{
	TAVOR_TNF_ENTER(tavor_qpn_avl_init);

	/* Initialize the lock used for QP number (QPN) AVL tree access */
	mutex_init(&state->ts_qpn_avl_lock, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(state->ts_intrmsi_pri));

	/* Initialize the AVL tree for the QP number (QPN) storage */
	avl_create(&state->ts_qpn_avl, tavor_qpn_avl_compare,
	    sizeof (tavor_qpn_entry_t),
	    offsetof(tavor_qpn_entry_t, qpn_avlnode));
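
	/*
	 * Lookups elsewhere in this file follow the usual AVL pattern
	 * (sketch only; "query" and "where" are locals of the caller):
	 *
	 *	query.qpn_indx = qpc->tr_indx;
	 *	entry = (tavor_qpn_entry_t *)avl_find(&state->ts_qpn_avl,
	 *	    &query, &where);
	 *
	 * with tavor_qpn_avl_compare() above supplying the ordering by
	 * QPC table index.
	 */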

	TAVOR_TNF_EXIT(tavor_qpn_avl_init);
}

/*
 * tavor_qpn_avl_fini()
 *    Context: Only called from attach() and/or detach() path contexts
 */
void
tavor_qpn_avl_fini(tavor_state_t *state)
{
	tavor_qpn_entry_t	*entry;
	void			*cookie;

	TAVOR_TNF_ENTER(tavor_qpn_avl_fini);

	/*
	 * Empty all entries (if necessary) and destroy the AVL tree
	 * that was used for QP number (QPN) tracking.
	 */
	cookie = NULL;
	while ((entry = (tavor_qpn_entry_t *)avl_destroy_nodes(
	    &state->ts_qpn_avl, &cookie)) != NULL) {
		kmem_free(entry, sizeof (tavor_qpn_entry_t));
	}
	avl_destroy(&state->ts_qpn_avl);

	/* Destroy the lock used for QP number (QPN) AVL tree access */
	mutex_destroy(&state->ts_qpn_avl_lock);

	TAVOR_TNF_EXIT(tavor_qpn_avl_fini);
}

/*
 * tavor_qphdl_from_qpnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the QP number is critical to the detection of a
 *    potential race condition in the QP event handler code (i.e. the case
 *    where a QP is freed and alloc'd again before an event for the
 *    "old" QP can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists),
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new QP owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported QPs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_qphdl_t
tavor_qphdl_from_qpnum(tavor_state_t *state, uint_t qpnum)
{
	uint_t	qpindx, qpmask;

	/* Calculate the QP table index from the qpnum */
	qpmask = (1 << state->ts_cfg_profile->cp_log_num_qp) - 1;
	qpindx = qpnum & qpmask;
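
	/*
	 * Illustrative example (values hypothetical): with
	 * cp_log_num_qp = 16, qpmask is 0xFFFF, so a 24-bit QP number
	 * such as 0x020234 maps to table index 0x0234; the upper
	 * (unconstrained) bits carry only the reuse counter.
	 */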
	return (state->ts_qphdl[qpindx]);
}
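
/*
 * A hedged sketch of how an event handler might use this mapping to
 * reject stale events (names as used elsewhere in this driver; the
 * exact check performed in the event path may differ):
 *
 *	qp = tavor_qphdl_from_qpnum(state, qpnum);
 *	if ((qp == NULL) || (qp->qp_qpnum != qpnum)) {
 *		// QP was freed and reallocated; drop the stale event
 *	}
 */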

/*
 * tavor_special_qp_rsrc_alloc
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_alloc(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port, tavor_rsrc_t **qp_rsrc)
{
	uint_t		mask, flags;
	int		status;

	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_alloc);

	mutex_enter(&state->ts_spec_qplock);
	flags = state->ts_spec_qpflags;
	if (type == IBT_SMI_SQP) {
		/*
		 * Check here to see if the driver has been configured
		 * to instruct the Tavor firmware to handle all incoming
		 * SMP messages (i.e. messages sent to SMA).  If so,
		 * then we will treat QP0 as if it has already been
		 * allocated (for internal use).  Otherwise, if we allow
		 * the allocation to happen, it will cause unexpected
		 * behaviors (e.g. Tavor SMA becomes unresponsive).
		 */
		if (state->ts_cfg_profile->cp_qp0_agents_in_fw != 0) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_0(tavor_special_qp0_alloc_already_in_fw,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}

		/*
		 * If this is the first QP0 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp0->tr_indx, TAVOR_CMD_QP_SMI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
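		/*
		 * Illustrative bit layout (positions hypothetical): if
		 * TAVOR_SPECIAL_QP0_RSRC named bit 0, then QP0 on port 0
		 * would be tracked by bit 0 and QP0 on port 1 by bit 1,
		 * with TAVOR_SPECIAL_QP0_RSRC_MASK covering both per-port
		 * bits tested above.
		 */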
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_1(tavor_ts_spec_qp0_alloc_already,
			    TAVOR_TNF_ERROR, "", tnf_uint, port, port);
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp0;

	} else {
		/*
		 * If this is the first QP1 allocation, then post
		 * a CONF_SPECIAL_QP firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state,
			    state->ts_spec_qp1->tr_indx, TAVOR_CMD_QP_GSI,
			    TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
				return (IBT_INSUFF_RESOURCE);
			}
		}

		/*
		 * Now check (and, if necessary, modify) the flags to indicate
		 * whether the allocation was successful
		 */
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		if (flags & mask) {
			mutex_exit(&state->ts_spec_qplock);
			TNF_PROBE_0(tavor_ts_spec_qp1_alloc_already,
			    TAVOR_TNF_ERROR, "");
			TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
			return (IBT_QP_IN_USE);
		}
		state->ts_spec_qpflags |= mask;
		*qp_rsrc = state->ts_spec_qp1;
	}

	mutex_exit(&state->ts_spec_qplock);
	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_alloc);
	return (DDI_SUCCESS);
}

/*
 * tavor_special_qp_rsrc_free
 *    Context: Can be called from interrupt or base context.
 */
static int
tavor_special_qp_rsrc_free(tavor_state_t *state, ibt_sqp_type_t type,
    uint_t port)
{
	uint_t		mask, flags;
	int		status;

	TAVOR_TNF_ENTER(tavor_special_qp_rsrc_free);

	mutex_enter(&state->ts_spec_qplock);
	if (type == IBT_SMI_SQP) {
		mask = (1 << (TAVOR_SPECIAL_QP0_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP0 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP0_RSRC_MASK) == 0) {
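			/*
			 * Hedged note: the QPN index posted below is 0
			 * (vs. the ts_spec_qp0 index used at alloc time),
			 * which appears to be how the firmware is told to
			 * unbind the special QP mapping.
			 */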
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_SMI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
				return (ibc_get_ci_failure(0));
			}
		}
	} else {
		mask = (1 << (TAVOR_SPECIAL_QP1_RSRC + port));
		state->ts_spec_qpflags &= ~mask;
		flags = state->ts_spec_qpflags;

		/*
		 * If this is the last QP1 free, then post a CONF_SPECIAL_QP
		 * firmware command
		 */
		if ((flags & TAVOR_SPECIAL_QP1_RSRC_MASK) == 0) {
			status = tavor_conf_special_qp_cmd_post(state, 0,
			    TAVOR_CMD_QP_GSI, TAVOR_CMD_NOSLEEP_SPIN);
			if (status != TAVOR_CMD_SUCCESS) {
				mutex_exit(&state->ts_spec_qplock);
				cmn_err(CE_CONT, "Tavor: CONF_SPECIAL_QP "
				    "command failed: %08x\n", status);
				TNF_PROBE_1(tavor_conf_special_qp_cmd_fail,
				    TAVOR_TNF_ERROR, "", tnf_uint, status,
				    status);
				TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
				return (ibc_get_ci_failure(0));
			}
		}
	}

	mutex_exit(&state->ts_spec_qplock);
	TAVOR_TNF_EXIT(tavor_special_qp_rsrc_free);
	return (DDI_SUCCESS);
}

/*
 * tavor_qp_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_qp_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
	uint_t	max_size, log2, actual_sgl;

	TAVOR_TNF_ENTER(tavor_qp_sgl_to_logwqesz);

	switch (wq_type) {
	case TAVOR_QP_WQ_TYPE_SENDQ:
		/*
		 * Use requested maximum SGL to calculate max descriptor size
		 * (while guaranteeing that the descriptor size is a
		 * power-of-2 number of cachelines).
		 */
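		/*
		 * Worked example (header size hypothetical): if the send
		 * header area were 64 bytes and num_sgl were 8, then
		 * max_size = 64 + (8 << 4) = 192.  highbit(192) returns 8
		 * and 192 is not a power of 2, so the WQE size becomes
		 * 1 << 8 = 256 bytes, leaving room for
		 * (256 - 64) >> 4 = 12 SGL entries.
		 */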
		max_size = (TAVOR_QP_WQE_MLX_SND_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_SND_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_RECVQ:
		/*
		 * Same as above (except for Recv WQEs)
		 */
		max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP0:
		/*
		 * Same as above (except for MLX transport WQEs).  For these
		 * WQEs we have to account for the space consumed by the
		 * "inline" packet headers.  (This is smaller than for QP1
		 * below because QP0 is not allowed to send packets with
		 * a GRH.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP0_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP0_HDRS) >> 4;
		break;

	case TAVOR_QP_WQ_TYPE_SENDMLX_QP1:
		/*
		 * Same as above.  For these WQEs we again have to account for
		 * the space consumed by the "inline" packet headers.  (This
		 * is larger than for QP0 above because we have to account for
		 * the possibility of a GRH in each packet - and this
		 * introduces an alignment issue that causes us to consume
		 * an additional 8 bytes.)
		 */
		max_size = (TAVOR_QP_WQE_MLX_QP1_HDRS + (num_sgl << 4));
		log2 = highbit(max_size);
		if (ISP2(max_size)) {
			log2 = log2 - 1;
		}

		/* Make sure descriptor is at least the minimum size */
		log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

		/* Calculate actual number of SGL (given WQE size) */
		actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_QP1_HDRS) >> 4;
		break;

	default:
		TAVOR_WARNING(state, "unexpected work queue type");
		TNF_PROBE_0(tavor_qp_sgl_to_logwqesz_inv_wqtype_fail,
		    TAVOR_TNF_ERROR, "");
		break;
	}

	/* Fill in the return values */
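	/*
	 * Note that the returned SGL count is clamped to the configured
	 * cp_wqe_real_max_sgl, even when the rounded-up WQE size could
	 * hold more scatter-gather entries.
	 */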
	*logwqesz = log2;
	*max_sgl = min(state->ts_cfg_profile->cp_wqe_real_max_sgl, actual_sgl);

	TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
}