/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Tavor Shared Receive Queue Processing Routines
 *
 *    Implements all the routines necessary for allocating, freeing,
 *    querying, modifying, and posting shared receive queues.
 */

#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/bitmap.h>

#include <sys/ib/adapters/tavor/tavor.h>

static void tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl);
/*
 * tavor_srq_alloc()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_alloc(tavor_state_t *state, tavor_srq_info_t *srqinfo,
    uint_t sleepflag, tavor_srq_options_t *op)
{
        ibt_srq_hdl_t           ibt_srqhdl;
        ibt_srq_sizes_t         *sizes;
        ibt_srq_sizes_t         *real_sizes;
        tavor_srqhdl_t          *srqhdl;
        ibt_srq_flags_t         flags;
        tavor_rsrc_t            *srqc, *rsrc;
        tavor_hw_srqc_t         srqc_entry;
        tavor_umap_db_entry_t   *umapdb;
        ibt_mr_attr_t           mr_attr;
        tavor_mr_options_t      mr_op;
        uint64_t                value, srq_desc_off;
        uint32_t                log_srq_size;
        uint_t                  wq_location, dma_xfer_mode, srq_is_umap;

        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*sizes))

        TAVOR_TNF_ENTER(tavor_srq_alloc);
        /*
         * Check the "options" flag.  Currently this flag tells the driver
         * whether or not the SRQ's work queues should come from normal
         * system memory or whether they should be allocated from DDR memory.
         */
        if (op == NULL) {
                wq_location = TAVOR_QUEUE_LOCATION_NORMAL;
        } else {
                wq_location = op->srqo_wq_loc;
        }
        /*
         * Extract the necessary info from the tavor_srq_info_t structure
         */
        real_sizes = srqinfo->srqi_real_sizes;
        sizes = srqinfo->srqi_sizes;
        pd = srqinfo->srqi_pd;
        ibt_srqhdl = srqinfo->srqi_ibt_srqhdl;
        flags = srqinfo->srqi_flags;
        srqhdl = srqinfo->srqi_srqhdl;
        /*
         * Determine whether SRQ is being allocated for userland access or
         * whether it is being allocated for kernel access.  If the SRQ is
         * being allocated for userland access, then lookup the UAR doorbell
         * page number for the current process.  Note:  If this is not found
         * (e.g. if the process has not previously open()'d the Tavor driver),
         * then an error is returned.
         */
        srq_is_umap = (flags & IBT_SRQ_USER_MAP) ? 1 : 0;
        if (srq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance, ddi_get_pid(),
                    MLNX_UMAP_UARPG_RSRC, &value, 0, NULL);
                if (status != DDI_SUCCESS) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INVALID_PARAM, "failed UAR page");
                }
                uarpg = ((tavor_rsrc_t *)(uintptr_t)value)->tr_indx;
        }
        /* Increase PD refcnt */
        tavor_pd_refcnt_inc(pd);

        /* Allocate an SRQ context entry */
        status = tavor_rsrc_alloc(state, TAVOR_SRQC, 1, sleepflag, &srqc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ context");
        }

        /* Allocate the SRQ Handle entry */
        status = tavor_rsrc_alloc(state, TAVOR_SRQHDL, 1, sleepflag, &rsrc);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed SRQ handle");
        }

        srq = (tavor_srqhdl_t)rsrc->tr_addr;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq))

        srq->srq_srqnum = srqc->tr_indx;        /* just use index */
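
        /*
         * Note (descriptive): the SRQ number used here is simply the index
         * of the allocated SRQ context entry; tavor_srqhdl_from_srqnum()
         * below relies on this when mapping an event's SRQ number back to
         * this handle via the ts_srqhdl[] table.
         */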
        /*
         * If this will be a user-mappable SRQ, then allocate an entry for
         * the "userland resources database".  This will later be added to
         * the database (after all further SRQ operations are successful).
         * If we fail here, we must undo the reference counts and the
         * previous resource allocation.
         */
        if (srq_is_umap) {
                umapdb = tavor_umap_db_alloc(state->ts_instance,
                    srq->srq_srqnum, MLNX_UMAP_SRQMEM_RSRC,
                    (uint64_t)(uintptr_t)rsrc);
                if (umapdb == NULL) {
                        /* Set "status" and "errormsg" and goto failure */
                        TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed umap add");
                }
        }
        /*
         * Calculate the appropriate size for the SRQ.
         * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
         * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
         * is to round the requested size up to the next highest power-of-2
         */
        sizes->srq_wr_sz = max(sizes->srq_wr_sz, TAVOR_SRQ_MIN_SIZE);
        log_srq_size = highbit(sizes->srq_wr_sz);
        if (ISP2(sizes->srq_wr_sz)) {
                log_srq_size = log_srq_size - 1;
        }
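
        /*
         * Illustrative example (not from the original code): a request for
         * 1000 work requests gives highbit(1000) = 10, so the SRQ is sized
         * to 2^10 = 1024 entries; an exact power-of-2 request such as 512
         * gives highbit(512) - 1 = 9, leaving the size at 512.
         */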
        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits).  If not,
         * then obviously we have a lot of cleanup to do before returning.
         */
        if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
        }

        /*
         * Next we verify that the requested number of SGL is valid (i.e.
         * consistent with the device limits and/or software-configured
         * limits).  If not, then obviously the same cleanup needs to be done.
         */
        max_sgl = state->ts_cfg_profile->cp_srq_max_sgl;
        if (sizes->srq_sgl_sz > max_sgl) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_SGL_EXCEEDED, "max SRQ SGL");
        }
        /*
         * Determine the SRQ's WQE sizes.  This depends on the requested
         * number of SGLs.  Note: This also has the side-effect of
         * calculating the real number of SGLs (for the calculated WQE size)
         */
        tavor_srq_sgl_to_logwqesz(state, sizes->srq_sgl_sz,
            TAVOR_QP_WQ_TYPE_RECVQ, &srq->srq_wq_log_wqesz,
            &srq->srq_wq_sgl);
        /*
         * Allocate the memory for SRQ work queues.  Note:  The location from
         * which we will allocate these work queues has been passed in through
         * the tavor_qp_options_t structure.  Since Tavor work queues are not
         * allowed to cross a 32-bit (4GB) boundary, the alignment of the work
         * queue memory is very important.  We used to allocate work queues
         * (the combined receive and send queues) so that they would be aligned
         * on their combined size.  That alignment guaranteed that they would
         * never cross the 4GB boundary (Tavor work queues are on the order of
         * MBs at maximum).  Now we are able to relax this alignment constraint
         * by ensuring that the IB address assigned to the queue memory (as a
         * result of the tavor_mr_register() call) is offset from zero.
         * Previously, we had wanted to use the ddi_dma_mem_alloc() routine to
         * guarantee the alignment, but when attempting to use IOMMU bypass
         * mode we found that we were not allowed to specify any alignment that
         * was more restrictive than the system page size.  So we avoided this
         * constraint by passing two alignment values, one for the memory
         * allocation itself and the other for the DMA handle (for later bind).
         * This used to cause more memory than necessary to be allocated (in
         * order to guarantee the more restrictive alignment constraint).  But
         * by guaranteeing the zero-based IB virtual address for the queue, we
         * are able to conserve this memory.
         *
         * Note: If SRQ is not user-mappable, then it may come from either
         * kernel system memory or from HCA-attached local DDR memory.
         *
         * Note2: We align this queue on a pagesize boundary.  This is required
         * to make sure that all the resulting IB addresses will start at 0,
         * for a zero-based queue.  By making sure we are aligned on at least
         * a page, any offset we use into our queue will be the same as when
         * we perform tavor_srq_modify() operations later.
         */
        wqesz = (1 << srq->srq_wq_log_wqesz);
        srq->srq_wqinfo.qa_size = (1 << log_srq_size) * wqesz;
        srq->srq_wqinfo.qa_alloc_align = PAGESIZE;
        srq->srq_wqinfo.qa_bind_align = PAGESIZE;
        if (srq_is_umap) {
                srq->srq_wqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                srq->srq_wqinfo.qa_location = wq_location;
        }
        status = tavor_queue_alloc(state, &srq->srq_wqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
        }
        buf = (uint32_t *)srq->srq_wqinfo.qa_buf_aligned;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
        /*
         * Register the memory for the SRQ work queues.  The memory for the
         * SRQ must be registered in the Tavor TPT tables.  This gives us the
         * LKey to specify in the SRQ context later.  Note: If the work queue
         * is to be allocated from DDR memory, then only a "bypass" mapping is
         * appropriate.  And if the SRQ memory is user-mappable, then we force
         * DDI_DMA_CONSISTENT mapping.  Also, in order to meet the alignment
         * restriction, we pass the "mro_bind_override_addr" flag in the call
         * to tavor_mr_register().  This guarantees that the resulting IB vaddr
         * will be zero-based (modulo the offset into the first page).  If we
         * fail here, we still have the bunch of resource and reference count
         * cleanup to do before returning.
         */
        flag = (sleepflag == TAVOR_SLEEP) ? IBT_MR_SLEEP :
            IBT_MR_NOSLEEP;
        mr_attr.mr_vaddr = (uint64_t)(uintptr_t)buf;
        mr_attr.mr_len = srq->srq_wqinfo.qa_size;
        mr_attr.mr_as = NULL;
        mr_attr.mr_flags = flag | IBT_MR_ENABLE_LOCAL_WRITE;
        if (srq->srq_is_umap) {
                mr_op.mro_bind_type = state->ts_cfg_profile->cp_iommu_bypass;
        } else {
                if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
                        mr_op.mro_bind_type =
                            state->ts_cfg_profile->cp_iommu_bypass;
                        dma_xfer_mode =
                            state->ts_cfg_profile->cp_streaming_consistent;
                        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                                mr_attr.mr_flags |= IBT_MR_NONCOHERENT;
                        }
                } else {
                        mr_op.mro_bind_type = TAVOR_BINDMEM_BYPASS;
                }
        }
        mr_op.mro_bind_dmahdl = srq->srq_wqinfo.qa_dmahdl;
        mr_op.mro_bind_override_addr = 1;
        status = tavor_mr_register(state, pd, &mr_attr, &mr, &mr_op);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed register mr");
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*mr))
        addr = mr->mr_bindinfo.bi_addr;
        /*
         * Calculate the offset between the kernel virtual address space
         * and the IB virtual address space.  This will be used when
         * posting work requests to properly initialize each WQE.
         */
        srq_desc_off = (uint64_t)(uintptr_t)srq->srq_wqinfo.qa_buf_aligned -
            (uint64_t)mr->mr_bindinfo.bi_addr;
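
        /*
         * Illustrative note: because the registration above forces a
         * zero-based IB virtual address, "srq_desc_off" is effectively the
         * kernel virtual address of the queue start (modulo the offset into
         * the first page), so the IB address of any WQE can later be formed
         * as (kernel VA of the WQE) - srq_desc_off.
         */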
        /*
         * Create WQL and Wridlist for use by this SRQ
         */
        srq->srq_wrid_wql = tavor_wrid_wql_create(state);
        if (srq->srq_wrid_wql == NULL) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wql create");
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wrid_wql)))

        srq->srq_wridlist = tavor_wrid_get_list(1 << log_srq_size);
        if (srq->srq_wridlist == NULL) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed wridlist create");
        }
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*(srq->srq_wridlist)))

        srq->srq_wridlist->wl_srq_en = 1;
        srq->srq_wridlist->wl_free_list_indx = -1;
        /*
         * Fill in all the return arguments (if necessary).  This includes
         * real queue size and real SGLs.
         */
        if (real_sizes != NULL) {
                real_sizes->srq_wr_sz = (1 << log_srq_size);
                real_sizes->srq_sgl_sz = srq->srq_wq_sgl;
        }
        /*
         * Fill in the SRQC entry.  This is the final step before passing
         * ownership of the SRQC entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the SRQC.  Note: If this SRQ is going to be
         * used for userland access, then we need to set the UAR page number
         * appropriately (otherwise it's a "don't care")
         */
        bzero(&srqc_entry, sizeof (tavor_hw_srqc_t));
        srqc_entry.wqe_addr_h = (addr >> 32);
        srqc_entry.next_wqe_addr_l = 0;
        srqc_entry.ds = (wqesz >> 4);
        srqc_entry.state = TAVOR_SRQ_STATE_HW_OWNER;
        srqc_entry.pd = pd->pd_pdnum;
        srqc_entry.lkey = lkey;
        srqc_entry.wqe_cnt = 0;
        if (srq_is_umap) {
                srqc_entry.uar = uarpg;
        } else {
                srqc_entry.uar = 0;
        }
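
        /*
         * Descriptive note: "ds" above is the descriptor size expressed in
         * 16-byte units (hence "wqesz >> 4"), and "uar" is only meaningful
         * for user-mappable SRQs, as described in the preceding comment.
         */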
        /*
         * Write the SRQC entry to hardware.  Lastly, we pass ownership of
         * the entry to the hardware (using the Tavor SW2HW_SRQ firmware
         * command).  Note: In general, this operation shouldn't fail.  But
         * if it does, we have to undo everything we've done above before
         * returning.
         */
        status = tavor_cmn_ownership_cmd_post(state, SW2HW_SRQ, &srqc_entry,
            sizeof (tavor_hw_srqc_t), srq->srq_srqnum,
            sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: SW2HW_SRQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_srq_alloc_sw2hw_srq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_FAILURE, "tavor SW2HW_SRQ command");
        }
        /*
         * Fill in the rest of the Tavor SRQ handle.  We can update
         * the following fields for use in further operations on the SRQ.
         */
        srq->srq_srqcrsrcp = srqc;
        srq->srq_rsrcp = rsrc;
        srq->srq_is_umap = srq_is_umap;
        srq->srq_uarpg = (srq->srq_is_umap) ? uarpg : 0;
        srq->srq_umap_dhp = (devmap_cookie_t)NULL;
        srq->srq_wq_lastwqeindx = -1;
        srq->srq_wq_bufsz = (1 << log_srq_size);
        srq->srq_wq_buf = buf;
        srq->srq_desc_off = srq_desc_off;
        srq->srq_hdlrarg = (void *)ibt_srqhdl;
        srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);
        srq->srq_real_sizes.srq_sgl_sz = srq->srq_wq_sgl;

        /* Determine if later ddi_dma_sync will be necessary */
        srq->srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);
        /*
         * Put SRQ handle in Tavor SRQNum-to-SRQhdl list.  Then fill in the
         * "srqhdl" and return success
         */
        ASSERT(state->ts_srqhdl[srqc->tr_indx] == NULL);
        state->ts_srqhdl[srqc->tr_indx] = srq;
        /*
         * If this is a user-mappable SRQ, then we need to insert the
         * previously allocated entry into the "userland resources database".
         * This will allow for later lookup during devmap() (i.e. mmap()).
         */
        if (srq->srq_is_umap) {
                tavor_umap_db_add(umapdb);
        } else {
                mutex_enter(&srq->srq_wrid_wql->wql_lock);
                tavor_wrid_list_srq_init(srq->srq_wridlist, srq, 0);
                mutex_exit(&srq->srq_wrid_wql->wql_lock);
        }

        *srqhdl = srq;

        TAVOR_TNF_EXIT(tavor_srq_alloc);
        return (DDI_SUCCESS);

/*
 * The following is cleanup for all possible failure cases in this routine
 */
        kmem_free(srq->srq_wridlist->wl_wre, srq->srq_wridlist->wl_size *
            sizeof (tavor_wrid_entry_t));
        kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));
        tavor_wql_refcnt_dec(srq->srq_wrid_wql);
        if (tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            TAVOR_SLEEPFLAG_FOR_CONTEXT()) != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister SRQ memory");
        }
        tavor_queue_free(state, &srq->srq_wqinfo);
        tavor_umap_db_free(umapdb);
        tavor_rsrc_free(state, &rsrc);
        tavor_rsrc_free(state, &srqc);
        tavor_pd_refcnt_dec(pd);

        TNF_PROBE_1(tavor_srq_alloc_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_srq_alloc);
        return (status);
}
/*
 * tavor_srq_free()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_free(tavor_state_t *state, tavor_srqhdl_t *srqhdl, uint_t sleepflag)
{
        tavor_rsrc_t            *srqc, *rsrc;
        tavor_umap_db_entry_t   *umapdb;
        tavor_hw_srqc_t         srqc_entry;

        TAVOR_TNF_ENTER(tavor_srq_free);
        /*
         * Pull all the necessary information from the Tavor Shared Receive
         * Queue handle.  This is necessary here because the resource for the
         * SRQ handle is going to be freed up as part of this operation.
         */
        srq = *srqhdl;
        mutex_enter(&srq->srq_lock);
        srqc = srq->srq_srqcrsrcp;
        rsrc = srq->srq_rsrcp;
        srqnum = srq->srq_srqnum;
        /*
         * If there are work queues still associated with the SRQ, then return
         * an error.  Otherwise, we will be holding the SRQ lock.
         */
        if (srq->srq_refcnt != 0) {
                mutex_exit(&srq->srq_lock);
                TNF_PROBE_1(tavor_srq_free_refcnt_fail, TAVOR_TNF_ERROR, "",
                    tnf_int, refcnt, srq->srq_refcnt);
                TAVOR_TNF_EXIT(tavor_srq_free);
                return (IBT_SRQ_IN_USE);
        }
        /*
         * If this was a user-mappable SRQ, then we need to remove its entry
         * from the "userland resources database".  If it is also currently
         * mmap()'d out to a user process, then we need to call
         * devmap_devmem_remap() to remap the SRQ memory to an invalid mapping.
         * We also need to invalidate the SRQ tracking information for the
         * user mapping.
         */
        if (srq->srq_is_umap) {
                status = tavor_umap_db_find(state->ts_instance,
                    srq->srq_srqnum,
                    MLNX_UMAP_SRQMEM_RSRC, &value, TAVOR_UMAP_DB_REMOVE,
                    &umapdb);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&srq->srq_lock);
                        TAVOR_WARNING(state, "failed to find in database");
                        TAVOR_TNF_EXIT(tavor_srq_free);
                        return (ibc_get_ci_failure(0));
                }
                tavor_umap_db_free(umapdb);
                if (srq->srq_umap_dhp != NULL) {
                        maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                        status = devmap_devmem_remap(srq->srq_umap_dhp,
                            state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size,
                            maxprot, DEVMAP_MAPPING_INVALID, NULL);
                        if (status != DDI_SUCCESS) {
                                mutex_exit(&srq->srq_lock);
                                TAVOR_WARNING(state, "failed in SRQ memory "
                                    "devmap_devmem_remap()");
                                TAVOR_TNF_EXIT(tavor_srq_free);
                                return (ibc_get_ci_failure(0));
                        }
                        srq->srq_umap_dhp = (devmap_cookie_t)NULL;
                }
        }
        /*
         * Put NULL into the Tavor SRQNum-to-SRQHdl list.  This will allow any
         * in-progress events to detect that the SRQ corresponding to this
         * number has been freed.
         */
        state->ts_srqhdl[srqc->tr_indx] = NULL;

        mutex_exit(&srq->srq_lock);
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq));
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*srq->srq_wridlist));
        /*
         * Reclaim SRQC entry from hardware (using the Tavor HW2SW_SRQ
         * firmware command).  If the ownership transfer fails for any reason,
         * then it is an indication that something (either in HW or SW) has
         * gone seriously wrong.
         */
        status = tavor_cmn_ownership_cmd_post(state, HW2SW_SRQ, &srqc_entry,
            sizeof (tavor_hw_srqc_t), srqnum, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                TAVOR_WARNING(state, "failed to reclaim SRQC ownership");
                cmn_err(CE_CONT, "Tavor: HW2SW_SRQ command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_srq_free_hw2sw_srq_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_EXIT(tavor_srq_free);
                return (IBT_FAILURE);
        }
        /*
         * Deregister the memory for the Shared Receive Queue.  If this fails
         * for any reason, then it is an indication that something (either
         * in HW or SW) has gone seriously wrong.  So we print a warning
         * message and return.
         */
        status = tavor_mr_deregister(state, &mr, TAVOR_MR_DEREG_ALL,
            sleepflag);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to deregister SRQ memory");
                TNF_PROBE_0(tavor_srq_free_dereg_mr_fail, TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_srq_free);
                return (IBT_FAILURE);
        }
        /* Calculate the size and free the wridlist container */
        if (srq->srq_wridlist != NULL) {
                size = (srq->srq_wridlist->wl_size *
                    sizeof (tavor_wrid_entry_t));
                kmem_free(srq->srq_wridlist->wl_wre, size);
                kmem_free(srq->srq_wridlist, sizeof (tavor_wrid_list_hdr_t));

                /*
                 * Release reference to WQL; If this is the last reference,
                 * this call also has the side effect of freeing up the
                 * 'srq_wrid_wql' memory.
                 */
                tavor_wql_refcnt_dec(srq->srq_wrid_wql);
        }
        /* Free the memory for the SRQ */
        tavor_queue_free(state, &srq->srq_wqinfo);

        /* Free the Tavor SRQ Handle */
        tavor_rsrc_free(state, &rsrc);

        /* Free the SRQC entry resource */
        tavor_rsrc_free(state, &srqc);

        /* Decrement the reference count on the protection domain (PD) */
        tavor_pd_refcnt_dec(pd);

        /* Set the srqhdl pointer to NULL and return success */
        *srqhdl = NULL;

        TAVOR_TNF_EXIT(tavor_srq_free);
        return (DDI_SUCCESS);
}
/*
 * tavor_srq_modify()
 *    Context: Can be called only from user or kernel context.
 */
int
tavor_srq_modify(tavor_state_t *state, tavor_srqhdl_t srq, uint_t size,
    uint_t *real_size, uint_t sleepflag)
{
        tavor_qalloc_info_t     new_srqinfo, old_srqinfo;
        tavor_rsrc_t            *mtt, *mpt, *old_mtt;
        tavor_bind_info_t       bind;
        tavor_bind_info_t       old_bind;
        tavor_rsrc_pool_info_t  *rsrc_pool;
        tavor_hw_mpt_t          mpt_entry;
        tavor_wrid_entry_t      *wre_new, *wre_old;
        uint64_t                mtt_ddrbaseaddr, mtt_addr;
        uint64_t                srq_desc_off;
        uint32_t                *buf, srq_old_bufsz;
        uint_t                  dma_xfer_mode, mtt_pgsize_bits;
        uint_t                  srq_sync, log_srq_size, maxprot;

        TAVOR_TNF_ENTER(tavor_srq_modify);
        /*
         * Check the "inddr" flag.  This flag tells the driver whether or not
         * the SRQ's work queues should come from normal system memory or
         * whether they should be allocated from DDR memory.
         */
        wq_location = state->ts_cfg_profile->cp_srq_wq_inddr;
        /*
         * If size requested is larger than device capability, return
         * Insufficient Resources
         */
        max_srq_size = (1 << state->ts_cfg_profile->cp_log_max_srq_sz);
        if (size > max_srq_size) {
                TNF_PROBE_0(tavor_srq_modify_size_larger_than_maxsize,
                    TAVOR_TNF_ERROR, "");
                TAVOR_TNF_EXIT(tavor_srq_modify);
                return (IBT_HCA_WR_EXCEEDED);
        }
        /*
         * Calculate the appropriate size for the SRQ.
         * Note:  All Tavor SRQs must be a power-of-2 in size.  Also
         * they may not be any smaller than TAVOR_SRQ_MIN_SIZE.  This step
         * is to round the requested size up to the next highest power-of-2
         */
        size = max(size, TAVOR_SRQ_MIN_SIZE);
        log_srq_size = highbit(size);
        if (ISP2(size)) {
                log_srq_size = log_srq_size - 1;
        }
        /*
         * Next we verify that the rounded-up size is valid (i.e. consistent
         * with the device limits and/or software-configured limits).
         */
        if (log_srq_size > state->ts_cfg_profile->cp_log_max_srq_sz) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_HCA_WR_EXCEEDED, "max SRQ size");
        }
        /*
         * Allocate the memory for newly resized Shared Receive Queue.
         *
         * Note: If SRQ is not user-mappable, then it may come from either
         * kernel system memory or from HCA-attached local DDR memory.
         *
         * Note2: We align this queue on a pagesize boundary.  This is
         * required to make sure that all the resulting IB addresses will
         * start at 0, for a zero-based queue.  By making sure we are aligned
         * on at least a page, any offset we use into our queue will be the
         * same as it was when we allocated it at tavor_srq_alloc() time.
         */
        wqesz = (1 << srq->srq_wq_log_wqesz);
        new_srqinfo.qa_size = (1 << log_srq_size) * wqesz;
        new_srqinfo.qa_alloc_align = PAGESIZE;
        new_srqinfo.qa_bind_align = PAGESIZE;
        if (srq->srq_is_umap) {
                new_srqinfo.qa_location = TAVOR_QUEUE_LOCATION_USERLAND;
        } else {
                new_srqinfo.qa_location = wq_location;
        }
        status = tavor_queue_alloc(state, &new_srqinfo, sleepflag);
        if (status != DDI_SUCCESS) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE, "failed srq");
        }
        buf = (uint32_t *)new_srqinfo.qa_buf_aligned;
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*buf))
        /*
         * Allocate the memory for the new WRE list.  This will be used later
         * when we resize the wridlist based on the new SRQ size.
         */
        wre_new = (tavor_wrid_entry_t *)kmem_zalloc((1 << log_srq_size) *
            sizeof (tavor_wrid_entry_t), sleepflag);
        if (wre_new == NULL) {
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(IBT_INSUFF_RESOURCE,
                    "failed wre_new alloc");
        }
        /*
         * Fill in the "bind" struct.  This struct provides the majority
         * of the information that will be used to distinguish between an
         * "addr" binding (as is the case here) and a "buf" binding (see
         * below).  The "bind" struct is later passed to tavor_mr_mem_bind()
         * which does most of the "heavy lifting" for the Tavor memory
         * registration routines.
         */
        _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(bind))
        bzero(&bind, sizeof (tavor_bind_info_t));
        bind.bi_type = TAVOR_BINDHDL_VADDR;
        bind.bi_addr = (uint64_t)(uintptr_t)buf;
        bind.bi_len = new_srqinfo.qa_size;
        bind.bi_flags = sleepflag == TAVOR_SLEEP ? IBT_MR_SLEEP :
            IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
        if (srq->srq_is_umap) {
                bind.bi_bypass = state->ts_cfg_profile->cp_iommu_bypass;
        } else {
                if (wq_location == TAVOR_QUEUE_LOCATION_NORMAL) {
                        bind.bi_bypass =
                            state->ts_cfg_profile->cp_iommu_bypass;
                        dma_xfer_mode =
                            state->ts_cfg_profile->cp_streaming_consistent;
                        if (dma_xfer_mode == DDI_DMA_STREAMING) {
                                bind.bi_flags |= IBT_MR_NONCOHERENT;
                        }
                } else {
                        bind.bi_bypass = TAVOR_BINDMEM_BYPASS;
                }
        }
= tavor_mr_mtt_bind(state
, &bind
, new_srqinfo
.qa_dmahdl
, &mtt
,
780 if (status
!= DDI_SUCCESS
) {
781 /* Set "status" and "errormsg" and goto failure */
782 TAVOR_TNF_FAIL(status
, "failed mtt bind");
783 kmem_free(wre_new
, srq
->srq_wq_bufsz
*
784 sizeof (tavor_wrid_entry_t
));
785 tavor_queue_free(state
, &new_srqinfo
);
        /*
         * Calculate the offset between the kernel virtual address space
         * and the IB virtual address space.  This will be used when
         * posting work requests to properly initialize each WQE.
         *
         * Note: bind addr is zero-based (from alloc) so we calculate the
         * correct new offset here.
         */
        bind.bi_addr = bind.bi_addr & ((1 << mtt_pgsize_bits) - 1);
        srq_desc_off = (uint64_t)(uintptr_t)new_srqinfo.qa_buf_aligned -
            (uint64_t)bind.bi_addr;
        /*
         * Get the base address for the MTT table.  This will be necessary
         * below when we are modifying the MPT entry.
         */
        rsrc_pool = &state->ts_rsrc_hdl[TAVOR_MTT];
        mtt_ddrbaseaddr = (uint64_t)(uintptr_t)rsrc_pool->rsrc_ddr_offset;
        /*
         * Fill in the MPT entry.  This is the final step before passing
         * ownership of the MPT entry to the Tavor hardware.  We use all of
         * the information collected/calculated above to fill in the
         * requisite portions of the MPT.
         */
        bzero(&mpt_entry, sizeof (tavor_hw_mpt_t));
        mpt_entry.reg_win_len = bind.bi_len;
        mtt_addr = mtt_ddrbaseaddr + (mtt->tr_indx << TAVOR_MTT_SIZE_SHIFT);
        mpt_entry.mttseg_addr_h = mtt_addr >> 32;
        mpt_entry.mttseg_addr_l = mtt_addr >> 6;
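
        /*
         * Descriptive note: the MTT segment address is split across two MPT
         * fields, with the upper 32 bits in "mttseg_addr_h" and the low
         * portion stored shifted down by 6 (i.e. in 64-byte units) in
         * "mttseg_addr_l", matching the ">> 32" and ">> 6" shifts above.
         */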
        /*
         * Now we grab the SRQ lock.  Since we will be updating the actual
         * SRQ location and the producer/consumer indexes, we should hold
         * the lock.
         *
         * We do a TAVOR_NOSLEEP here (and below), though, because we are
         * holding the "srq_lock" and if we got raised to interrupt level
         * by priority inversion, we would not want to block in this routine
         * waiting for success.
         */
        mutex_enter(&srq->srq_lock);
        /*
         * Copy old entries to new buffer
         */
        srq_old_bufsz = srq->srq_wq_bufsz;
        bcopy(srq->srq_wq_buf, buf, srq_old_bufsz * wqesz);
        /* Determine if later ddi_dma_sync will be necessary */
        srq_sync = TAVOR_SRQ_IS_SYNC_REQ(state, srq->srq_wqinfo);

        /* Sync entire "new" SRQ for use by hardware (if necessary) */
        if (srq_sync) {
                (void) ddi_dma_sync(bind.bi_dmahdl, 0,
                    new_srqinfo.qa_size, DDI_DMA_SYNC_FORDEV);
        }
        /*
         * Setup MPT information for use in the MODIFY_MPT command
         */
        mr = srq->srq_mrhdl;
        mutex_enter(&mr->mr_lock);
        mpt = srq->srq_mrhdl->mr_mptrsrcp;
        /*
         * If this fails for any reason, then it is an indication that
         * something (either in HW or SW) has gone seriously wrong.  So we
         * print a warning message and return.
         */
        status = tavor_modify_mpt_cmd_post(state, &mpt_entry, mpt->tr_indx,
            TAVOR_CMD_MODIFY_MPT_RESIZESRQ, sleepflag);
        if (status != TAVOR_CMD_SUCCESS) {
                cmn_err(CE_CONT, "Tavor: MODIFY_MPT command failed: %08x\n",
                    status);
                TNF_PROBE_1(tavor_mr_common_reg_sw2hw_mpt_cmd_fail,
                    TAVOR_TNF_ERROR, "", tnf_uint, status, status);
                TAVOR_TNF_FAIL(status, "MODIFY_MPT command failed");
                (void) tavor_mr_mtt_unbind(state, &srq->srq_mrhdl->mr_bindinfo,
                    srq->srq_mrhdl->mr_mttrsrcp);
                kmem_free(wre_new, srq->srq_wq_bufsz *
                    sizeof (tavor_wrid_entry_t));
                tavor_queue_free(state, &new_srqinfo);
                mutex_exit(&mr->mr_lock);
                mutex_exit(&srq->srq_lock);
                return (ibc_get_ci_failure(0));
        }
        /*
         * Update the Tavor Shared Receive Queue handle with all the new
         * information.  At the same time, save away all the necessary
         * information for freeing up the old resources
         */
        old_srqinfo = srq->srq_wqinfo;
        old_mtt = srq->srq_mrhdl->mr_mttrsrcp;
        bcopy(&srq->srq_mrhdl->mr_bindinfo, &old_bind,
            sizeof (tavor_bind_info_t));
        /* Now set the new info */
        srq->srq_wqinfo = new_srqinfo;
        srq->srq_wq_buf = buf;
        srq->srq_wq_bufsz = (1 << log_srq_size);
        bcopy(&bind, &srq->srq_mrhdl->mr_bindinfo, sizeof (tavor_bind_info_t));
        srq->srq_mrhdl->mr_mttrsrcp = mtt;
        srq->srq_desc_off = srq_desc_off;
        srq->srq_real_sizes.srq_wr_sz = (1 << log_srq_size);

        /* Update MR mtt pagesize */
        mr->mr_logmttpgsz = mtt_pgsize_bits;
        mutex_exit(&mr->mr_lock);
#ifdef __lock_lint
        mutex_enter(&srq->srq_wrid_wql->wql_lock);
#else
        if (srq->srq_wrid_wql != NULL) {
                mutex_enter(&srq->srq_wrid_wql->wql_lock);
        }
#endif
        /*
         * Initialize new wridlist, if needed.
         *
         * If a wridlist already is setup on an SRQ (the QP associated with an
         * SRQ has moved "from_reset") then we must update this wridlist based
         * on the new SRQ size.  We allocate the new size of Work Request ID
         * Entries, copy over the old entries to the new list, and
         * re-initialize the srq wridlist in non-umap case
         */
        wre_old = NULL;
        if (srq->srq_wridlist != NULL) {
                wre_old = srq->srq_wridlist->wl_wre;

                bcopy(wre_old, wre_new, srq_old_bufsz *
                    sizeof (tavor_wrid_entry_t));

                /* Setup new sizes in wre */
                srq->srq_wridlist->wl_wre = wre_new;
                srq->srq_wridlist->wl_size = srq->srq_wq_bufsz;
                if (!srq->srq_is_umap) {
                        tavor_wrid_list_srq_init(srq->srq_wridlist, srq,
                            srq_old_bufsz);
                }
        }
#ifdef __lock_lint
        mutex_exit(&srq->srq_wrid_wql->wql_lock);
#else
        if (srq->srq_wrid_wql != NULL) {
                mutex_exit(&srq->srq_wrid_wql->wql_lock);
        }
#endif
        /*
         * If "old" SRQ was a user-mappable SRQ that is currently mmap()'d out
         * to a user process, then we need to call devmap_devmem_remap() to
         * invalidate the mapping to the SRQ memory.  We also need to
         * invalidate the SRQ tracking information for the user mapping.
         *
         * Note: The remap really shouldn't ever fail.  So, if it does, it is
         * an indication that something has gone seriously wrong.  So we
         * print a warning message and return error (knowing, of course,
         * that the "old" SRQ memory will be leaked)
         */
        if ((srq->srq_is_umap) && (srq->srq_umap_dhp != NULL)) {
                maxprot = (PROT_READ | PROT_WRITE | PROT_USER);
                status = devmap_devmem_remap(srq->srq_umap_dhp,
                    state->ts_dip, 0, 0, srq->srq_wqinfo.qa_size, maxprot,
                    DEVMAP_MAPPING_INVALID, NULL);
                if (status != DDI_SUCCESS) {
                        mutex_exit(&srq->srq_lock);
                        TAVOR_WARNING(state, "failed in SRQ memory "
                            "devmap_devmem_remap()");
                        /* We can, however, free the memory for old wre */
                        if (wre_old != NULL) {
                                kmem_free(wre_old, srq_old_bufsz *
                                    sizeof (tavor_wrid_entry_t));
                        }
                        TAVOR_TNF_EXIT(tavor_srq_modify);
                        return (ibc_get_ci_failure(0));
                }
                srq->srq_umap_dhp = (devmap_cookie_t)NULL;
        }
        /*
         * Drop the SRQ lock now.  The only thing left to do is to free up
         * the old resources.
         */
        mutex_exit(&srq->srq_lock);
        /*
         * Unbind the MTT entries.
         */
        status = tavor_mr_mtt_unbind(state, &old_bind, old_mtt);
        if (status != DDI_SUCCESS) {
                TAVOR_WARNING(state, "failed to unbind old SRQ memory");
                /* Set "status" and "errormsg" and goto failure */
                TAVOR_TNF_FAIL(ibc_get_ci_failure(0),
                    "failed to unbind (old)");
        }
        /* Free the memory for old wre */
        if (wre_old != NULL) {
                kmem_free(wre_old, srq_old_bufsz *
                    sizeof (tavor_wrid_entry_t));
        }

        /* Free the memory for the old SRQ */
        tavor_queue_free(state, &old_srqinfo);
        /*
         * Fill in the return arguments (if necessary).  This includes the
         * real new SRQ size.
         */
        if (real_size != NULL) {
                *real_size = (1 << log_srq_size);
        }

        TAVOR_TNF_EXIT(tavor_srq_modify);
        return (DDI_SUCCESS);

        TNF_PROBE_1(tavor_srq_modify_fail, TAVOR_TNF_ERROR, "",
            tnf_string, msg, errormsg);
        TAVOR_TNF_EXIT(tavor_srq_modify);
        return (status);
}
/*
 * tavor_srq_refcnt_inc()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_inc(tavor_srqhdl_t srq)
{
        mutex_enter(&srq->srq_lock);
        TNF_PROBE_1_DEBUG(tavor_srq_refcnt_inc, TAVOR_TNF_TRACE, "",
            tnf_uint, refcnt, srq->srq_refcnt);
        srq->srq_refcnt++;
        mutex_exit(&srq->srq_lock);
}
/*
 * tavor_srq_refcnt_dec()
 *    Context: Can be called from interrupt or base context.
 */
void
tavor_srq_refcnt_dec(tavor_srqhdl_t srq)
{
        mutex_enter(&srq->srq_lock);
        srq->srq_refcnt--;
        TNF_PROBE_1_DEBUG(tavor_srq_refcnt_dec, TAVOR_TNF_TRACE, "",
            tnf_uint, refcnt, srq->srq_refcnt);
        mutex_exit(&srq->srq_lock);
}
/*
 * tavor_srqhdl_from_srqnum()
 *    Context: Can be called from interrupt or base context.
 *
 *    This routine is important because changing the unconstrained
 *    portion of the SRQ number is critical to the detection of a
 *    potential race condition in the SRQ handler code (i.e. the case
 *    where a SRQ is freed and alloc'd again before an event for the
 *    "old" SRQ can be handled).
 *
 *    While this is not a perfect solution (not sure that one exists)
 *    it does help to mitigate the chance that this race condition will
 *    cause us to deliver a "stale" event to the new SRQ owner.  Note:
 *    this solution does not scale well because the number of constrained
 *    bits increases (and, hence, the number of unconstrained bits
 *    decreases) as the number of supported SRQs grows.  For small and
 *    intermediate values, it should hopefully provide sufficient
 *    protection.
 */
tavor_srqhdl_t
tavor_srqhdl_from_srqnum(tavor_state_t *state, uint_t srqnum)
{
        uint_t  srqindx, srqmask;

        /* Calculate the SRQ table index from the srqnum */
        srqmask = (1 << state->ts_cfg_profile->cp_log_num_srq) - 1;
        srqindx = srqnum & srqmask;

        return (state->ts_srqhdl[srqindx]);
}
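
/*
 * Illustrative example (assuming cp_log_num_srq = 10): srqmask is then
 * 0x3FF, so an SRQ number of 0x1404 carried in an event maps to table
 * index 0x004; the extra high-order ("unconstrained") bits are what allow
 * a stale event for a previously freed SRQ to be distinguished from an
 * event for the current owner of that table slot.
 */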
/*
 * tavor_srq_sgl_to_logwqesz()
 *    Context: Can be called from interrupt or base context.
 */
static void
tavor_srq_sgl_to_logwqesz(tavor_state_t *state, uint_t num_sgl,
    tavor_qp_wq_type_t wq_type, uint_t *logwqesz, uint_t *max_sgl)
{
        uint_t  max_size, log2, actual_sgl;

        TAVOR_TNF_ENTER(tavor_srq_sgl_to_logwqesz);

        switch (wq_type) {
        case TAVOR_QP_WQ_TYPE_RECVQ:
                /*
                 * Use requested maximum SGL to calculate max descriptor size
                 * (while guaranteeing that the descriptor size is a
                 * power-of-2 cachelines).
                 */
                max_size = (TAVOR_QP_WQE_MLX_RCV_HDRS + (num_sgl << 4));
                log2 = highbit(max_size);
                if (ISP2(max_size)) {
                        log2 = log2 - 1;
                }

                /* Make sure descriptor is at least the minimum size */
                log2 = max(log2, TAVOR_QP_WQE_LOG_MINIMUM);

                /* Calculate actual number of SGL (given WQE size) */
                actual_sgl = ((1 << log2) - TAVOR_QP_WQE_MLX_RCV_HDRS) >> 4;
                break;

        default:
                TAVOR_WARNING(state, "unexpected work queue type");
                TNF_PROBE_0(tavor_srq_sgl_to_logwqesz_inv_wqtype_fail,
                    TAVOR_TNF_ERROR, "");
                break;
        }

        /* Fill in the return values */
        *logwqesz = log2;
        *max_sgl = min(state->ts_cfg_profile->cp_srq_max_sgl, actual_sgl);

        TAVOR_TNF_EXIT(tavor_qp_sgl_to_logwqesz);
}
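
/*
 * Illustrative example (not from the original code): for num_sgl = 8 the
 * maximum receive descriptor needs TAVOR_QP_WQE_MLX_RCV_HDRS + 128 bytes;
 * that value is rounded up to the next power-of-2 (and clamped to at least
 * TAVOR_QP_WQE_LOG_MINIMUM), and the SGL count actually reported back is
 * recomputed from the rounded descriptor size.
 */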