4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2017, Joyent, Inc.
26 #include <sys/types.h>
27 #include <sys/callb.h>
28 #include <sys/cpupart.h>
30 #include <sys/pool_pset.h>
32 #include <sys/strsubr.h>
33 #include <sys/strsun.h>
35 #include <inet/ipsec_impl.h>
36 #include <inet/ip_impl.h>
37 #include <inet/sadb.h>
38 #include <inet/ipsecesp.h>
39 #include <inet/ipsecah.h>
41 #include <sys/mac_impl.h>
42 #include <sys/mac_client_impl.h>
43 #include <sys/mac_client_priv.h>
44 #include <sys/mac_soft_ring.h>
45 #include <sys/mac_flow_impl.h>
46 #include <sys/mac_stat.h>
48 static void mac_srs_soft_rings_signal(mac_soft_ring_set_t
*, uint_t
);
49 static void mac_srs_update_fanout_list(mac_soft_ring_set_t
*);
50 static void mac_srs_poll_unbind(mac_soft_ring_set_t
*);
51 static void mac_srs_worker_unbind(mac_soft_ring_set_t
*);
52 static void mac_srs_soft_rings_quiesce(mac_soft_ring_set_t
*, uint_t
);
54 static int mac_srs_cpu_setup(cpu_setup_t
, int, void *);
55 static void mac_srs_worker_bind(mac_soft_ring_set_t
*, processorid_t
);
56 static void mac_srs_poll_bind(mac_soft_ring_set_t
*, processorid_t
);
57 static void mac_srs_threads_unbind(mac_soft_ring_set_t
*);
58 static void mac_srs_add_glist(mac_soft_ring_set_t
*);
59 static void mac_srs_remove_glist(mac_soft_ring_set_t
*);
60 static void mac_srs_fanout_list_free(mac_soft_ring_set_t
*);
61 static void mac_soft_ring_remove(mac_soft_ring_set_t
*, mac_soft_ring_t
*);
63 static int mac_compute_soft_ring_count(flow_entry_t
*, int, int);
64 static void mac_walk_srs_and_bind(int);
65 static void mac_walk_srs_and_unbind(int);
67 extern boolean_t mac_latency_optimize
;
69 static kmem_cache_t
*mac_srs_cache
;
70 kmem_cache_t
*mac_soft_ring_cache
;
73 * The duration in msec we wait before signalling the soft ring
74 * worker thread in case packets get queued.
76 uint32_t mac_soft_ring_worker_wait
= 0;
79 * A global tunable for turning polling on/off. By default, dynamic
80 * polling is always on and is always very beneficial. It should be
81 * turned off with absolute care and for the rare workload (very
82 * low latency sensitive traffic).
84 int mac_poll_enable
= B_TRUE
;
87 * Need to set mac_soft_ring_max_q_cnt based on bandwidth and perhaps latency.
88 * Large values could end up in consuming lot of system memory and cause
91 int mac_soft_ring_max_q_cnt
= 1024;
92 int mac_soft_ring_min_q_cnt
= 256;
93 int mac_soft_ring_poll_thres
= 16;
95 boolean_t mac_tx_serialize
= B_FALSE
;
98 * mac_tx_srs_hiwat is the queue depth threshold at which callers of
99 * mac_tx() will be notified of flow control condition.
101 * TCP does not honour flow control condition sent up by mac_tx().
102 * Thus provision is made for TCP to allow more packets to be queued
103 * in SRS upto a maximum of mac_tx_srs_max_q_cnt.
105 * Note that mac_tx_srs_hiwat is always be lesser than
106 * mac_tx_srs_max_q_cnt.
108 uint32_t mac_tx_srs_max_q_cnt
= 100000;
109 uint32_t mac_tx_srs_hiwat
= 1000;
112 * mac_rx_soft_ring_count, mac_soft_ring_10gig_count:
114 * Global tunables that determines the number of soft rings to be used for
115 * fanning out incoming traffic on a link. These count will be used only
116 * when no explicit set of CPUs was assigned to the data-links.
118 * mac_rx_soft_ring_count tunable will come into effect only if
119 * mac_soft_ring_enable is set. mac_soft_ring_enable is turned on by
120 * default only for sun4v platforms.
122 * mac_rx_soft_ring_10gig_count will come into effect if you are running on a
123 * 10Gbps link and is not dependent upon mac_soft_ring_enable.
125 * The number of soft rings for fanout for a link or a flow is determined
126 * by mac_compute_soft_ring_count() routine. This routine will take into
127 * account mac_soft_ring_enable, mac_rx_soft_ring_count and
128 * mac_rx_soft_ring_10gig_count to determine the soft ring count for a link.
130 * If a bandwidth is specified, the determination of the number of soft
131 * rings is based on specified bandwidth, CPU speed and number of CPUs in
134 uint_t mac_rx_soft_ring_count
= 8;
135 uint_t mac_rx_soft_ring_10gig_count
= 8;
138 * Every Tx and Rx mac_soft_ring_set_t (mac_srs) created gets added
139 * to mac_srs_g_list and mac_srs_g_lock protects mac_srs_g_list. The
140 * list is used to walk the list of all MAC threads when a CPU is
141 * coming online or going offline.
143 static mac_soft_ring_set_t
*mac_srs_g_list
= NULL
;
144 static krwlock_t mac_srs_g_lock
;
147 * Whether the SRS threads should be bound, or not.
149 boolean_t mac_srs_thread_bind
= B_TRUE
;
152 * Whether Rx/Tx interrupts should be re-targeted. Disabled by default.
153 * dladm command would override this.
155 boolean_t mac_tx_intr_retarget
= B_FALSE
;
156 boolean_t mac_rx_intr_retarget
= B_FALSE
;
159 * If cpu bindings are specified by user, then Tx SRS and its soft
160 * rings should also be bound to the CPUs specified by user. The
161 * CPUs for Tx bindings are at the end of the cpu list provided by
162 * the user. If enough CPUs are not available (for Tx and Rx
163 * SRSes), then the CPUs are shared by both Tx and Rx SRSes.
165 #define BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs, mrp) { \
166 processorid_t cpuid; \
168 mac_soft_ring_t *softring; \
169 mac_cpus_t *srs_cpu; \
171 srs_cpu = &mac_tx_srs->srs_cpu; \
172 cpuid = srs_cpu->mc_tx_fanout_cpus[0]; \
173 mac_srs_worker_bind(mac_tx_srs, cpuid); \
174 if (MAC_TX_SOFT_RINGS(mac_tx_srs)) { \
175 for (i = 0; i < mac_tx_srs->srs_tx_ring_count; i++) { \
176 cpuid = srs_cpu->mc_tx_fanout_cpus[i]; \
177 softring = mac_tx_srs->srs_tx_soft_rings[i]; \
179 (void) mac_soft_ring_bind(softring, \
/*
 * A client may have its interrupts re-targeted only when it owns an
 * exclusive (reserved) group or when it is the primary client.
 */
#define	RETARGETABLE_CLIENT(group, mcip)				\
	(((group) != NULL &&						\
	    (group)->mrg_state == MAC_GROUP_STATE_RESERVED) ||		\
	    mac_is_primary_client(mcip))
/*
 * True when the ring exists, has a DDI interrupt handle, and that
 * interrupt is not marked shared.
 */
#define	MAC_RING_RETARGETABLE(ring)					\
	((ring) != NULL &&						\
	    (ring)->mr_info.mri_intr.mi_ddi_handle != NULL &&		\
	    !(ring)->mr_info.mri_intr.mi_ddi_shared)
200 /* INIT and FINI ROUTINES */
203 mac_soft_ring_init(void)
205 mac_soft_ring_cache
= kmem_cache_create("mac_soft_ring_cache",
206 sizeof (mac_soft_ring_t
), 64, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
208 mac_srs_cache
= kmem_cache_create("mac_srs_cache",
209 sizeof (mac_soft_ring_set_t
),
210 64, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
212 rw_init(&mac_srs_g_lock
, NULL
, RW_DEFAULT
, NULL
);
213 mutex_enter(&cpu_lock
);
214 register_cpu_setup_func(mac_srs_cpu_setup
, NULL
);
215 mutex_exit(&cpu_lock
);
219 mac_soft_ring_finish(void)
221 mutex_enter(&cpu_lock
);
222 unregister_cpu_setup_func(mac_srs_cpu_setup
, NULL
);
223 mutex_exit(&cpu_lock
);
224 rw_destroy(&mac_srs_g_lock
);
225 kmem_cache_destroy(mac_soft_ring_cache
);
226 kmem_cache_destroy(mac_srs_cache
);
230 mac_srs_soft_rings_free(mac_soft_ring_set_t
*mac_srs
)
232 mac_soft_ring_t
*softring
, *next
, *head
;
235 * Synchronize with mac_walk_srs_bind/unbind which are callbacks from
236 * DR. The callbacks from DR are called with cpu_lock held, and hence
237 * can't wait to grab the mac perimeter. The soft ring list is hence
238 * protected for read access by srs_lock. Changing the soft ring list
239 * needs the mac perimeter and the srs_lock.
241 mutex_enter(&mac_srs
->srs_lock
);
243 head
= mac_srs
->srs_soft_ring_head
;
244 mac_srs
->srs_soft_ring_head
= NULL
;
245 mac_srs
->srs_soft_ring_tail
= NULL
;
246 mac_srs
->srs_soft_ring_count
= 0;
248 mutex_exit(&mac_srs
->srs_lock
);
250 for (softring
= head
; softring
!= NULL
; softring
= next
) {
251 next
= softring
->s_ring_next
;
252 mac_soft_ring_free(softring
);
257 mac_srs_add_glist(mac_soft_ring_set_t
*mac_srs
)
259 ASSERT(mac_srs
->srs_next
== NULL
&& mac_srs
->srs_prev
== NULL
);
260 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mac_srs
->srs_mcip
->mci_mip
));
262 rw_enter(&mac_srs_g_lock
, RW_WRITER
);
263 mutex_enter(&mac_srs
->srs_lock
);
265 ASSERT((mac_srs
->srs_state
& SRS_IN_GLIST
) == 0);
267 if (mac_srs_g_list
== NULL
) {
268 mac_srs_g_list
= mac_srs
;
270 mac_srs
->srs_next
= mac_srs_g_list
;
271 mac_srs_g_list
->srs_prev
= mac_srs
;
272 mac_srs
->srs_prev
= NULL
;
273 mac_srs_g_list
= mac_srs
;
275 mac_srs
->srs_state
|= SRS_IN_GLIST
;
277 mutex_exit(&mac_srs
->srs_lock
);
278 rw_exit(&mac_srs_g_lock
);
282 mac_srs_remove_glist(mac_soft_ring_set_t
*mac_srs
)
284 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mac_srs
->srs_mcip
->mci_mip
));
286 rw_enter(&mac_srs_g_lock
, RW_WRITER
);
287 mutex_enter(&mac_srs
->srs_lock
);
289 ASSERT((mac_srs
->srs_state
& SRS_IN_GLIST
) != 0);
291 if (mac_srs
== mac_srs_g_list
) {
292 mac_srs_g_list
= mac_srs
->srs_next
;
293 if (mac_srs_g_list
!= NULL
)
294 mac_srs_g_list
->srs_prev
= NULL
;
296 mac_srs
->srs_prev
->srs_next
= mac_srs
->srs_next
;
297 if (mac_srs
->srs_next
!= NULL
)
298 mac_srs
->srs_next
->srs_prev
= mac_srs
->srs_prev
;
300 mac_srs
->srs_state
&= ~SRS_IN_GLIST
;
302 mutex_exit(&mac_srs
->srs_lock
);
303 rw_exit(&mac_srs_g_lock
);
306 /* POLLING SETUP AND TEAR DOWN ROUTINES */
309 * mac_srs_client_poll_quiesce and mac_srs_client_poll_restart
311 * These routines are used to call back into the upper layer
312 * (primarily TCP squeue) to stop polling the soft rings or
316 mac_srs_client_poll_quiesce(mac_client_impl_t
*mcip
,
317 mac_soft_ring_set_t
*mac_srs
)
319 mac_soft_ring_t
*softring
;
321 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mcip
->mci_mip
));
323 if (!(mac_srs
->srs_type
& SRST_CLIENT_POLL_ENABLED
)) {
324 ASSERT(!(mac_srs
->srs_type
& SRST_DLS_BYPASS
));
328 for (softring
= mac_srs
->srs_soft_ring_head
;
329 softring
!= NULL
; softring
= softring
->s_ring_next
) {
330 if ((softring
->s_ring_type
& ST_RING_TCP
) &&
331 (softring
->s_ring_rx_arg2
!= NULL
)) {
332 mcip
->mci_resource_quiesce(mcip
->mci_resource_arg
,
333 softring
->s_ring_rx_arg2
);
339 mac_srs_client_poll_restart(mac_client_impl_t
*mcip
,
340 mac_soft_ring_set_t
*mac_srs
)
342 mac_soft_ring_t
*softring
;
344 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mcip
->mci_mip
));
346 if (!(mac_srs
->srs_type
& SRST_CLIENT_POLL_ENABLED
)) {
347 ASSERT(!(mac_srs
->srs_type
& SRST_DLS_BYPASS
));
351 for (softring
= mac_srs
->srs_soft_ring_head
;
352 softring
!= NULL
; softring
= softring
->s_ring_next
) {
353 if ((softring
->s_ring_type
& ST_RING_TCP
) &&
354 (softring
->s_ring_rx_arg2
!= NULL
)) {
355 mcip
->mci_resource_restart(mcip
->mci_resource_arg
,
356 softring
->s_ring_rx_arg2
);
362 * Register the given SRS and associated soft rings with the consumer and
363 * enable the polling interface used by the consumer.(i.e IP) over this
364 * SRS and associated soft rings.
367 mac_srs_client_poll_enable(mac_client_impl_t
*mcip
,
368 mac_soft_ring_set_t
*mac_srs
)
371 mac_soft_ring_t
*softring
;
373 ASSERT(mac_srs
->srs_mcip
== mcip
);
374 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mcip
->mci_mip
));
376 if (!(mcip
->mci_state_flags
& MCIS_CLIENT_POLL_CAPABLE
))
379 bzero(&mrf
, sizeof (mac_rx_fifo_t
));
380 mrf
.mrf_type
= MAC_RX_FIFO
;
383 * A SRS is capable of acting as a soft ring for cases
384 * where no fanout is needed. This is the case for userland
387 if (mac_srs
->srs_type
& SRST_NO_SOFT_RINGS
)
390 mrf
.mrf_receive
= (mac_receive_t
)mac_soft_ring_poll
;
391 mrf
.mrf_intr_enable
= (mac_intr_enable_t
)mac_soft_ring_intr_enable
;
392 mrf
.mrf_intr_disable
= (mac_intr_disable_t
)mac_soft_ring_intr_disable
;
393 mac_srs
->srs_type
|= SRST_CLIENT_POLL_ENABLED
;
395 softring
= mac_srs
->srs_soft_ring_head
;
396 while (softring
!= NULL
) {
397 if (softring
->s_ring_type
& (ST_RING_TCP
| ST_RING_UDP
)) {
399 * TCP and UDP support DLS bypass. Squeue polling
400 * support implies DLS bypass since the squeue poll
401 * path does not have DLS processing.
403 mac_soft_ring_dls_bypass(softring
,
404 mcip
->mci_direct_rx_fn
, mcip
->mci_direct_rx_arg
);
407 * Non-TCP protocols don't support squeues. Hence we don't
408 * make any ring addition callbacks for non-TCP rings
410 if (!(softring
->s_ring_type
& ST_RING_TCP
)) {
411 softring
->s_ring_rx_arg2
= NULL
;
412 softring
= softring
->s_ring_next
;
415 mrf
.mrf_rx_arg
= softring
;
416 mrf
.mrf_intr_handle
= (mac_intr_handle_t
)softring
;
417 mrf
.mrf_cpu_id
= softring
->s_ring_cpuid
;
418 mrf
.mrf_flow_priority
= mac_srs
->srs_pri
;
420 softring
->s_ring_rx_arg2
= mcip
->mci_resource_add(
421 mcip
->mci_resource_arg
, (mac_resource_t
*)&mrf
);
423 softring
= softring
->s_ring_next
;
428 * Unregister the given SRS and associated soft rings with the consumer and
429 * disable the polling interface used by the consumer.(i.e IP) over this
430 * SRS and associated soft rings.
433 mac_srs_client_poll_disable(mac_client_impl_t
*mcip
,
434 mac_soft_ring_set_t
*mac_srs
)
436 mac_soft_ring_t
*softring
;
438 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mcip
->mci_mip
));
441 * A SRS is capable of acting as a soft ring for cases
442 * where no protocol fanout is needed. This is the case
443 * for userland flows. Nothing to do here.
445 if (mac_srs
->srs_type
& SRST_NO_SOFT_RINGS
)
448 mutex_enter(&mac_srs
->srs_lock
);
449 if (!(mac_srs
->srs_type
& SRST_CLIENT_POLL_ENABLED
)) {
450 ASSERT(!(mac_srs
->srs_type
& SRST_DLS_BYPASS
));
451 mutex_exit(&mac_srs
->srs_lock
);
454 mac_srs
->srs_type
&= ~(SRST_CLIENT_POLL_ENABLED
| SRST_DLS_BYPASS
);
455 mutex_exit(&mac_srs
->srs_lock
);
458 * DLS bypass is now disabled in the case of both TCP and UDP.
459 * Reset the soft ring callbacks to the standard 'mac_rx_deliver'
460 * callback. In addition, in the case of TCP, invoke IP's callback
463 for (softring
= mac_srs
->srs_soft_ring_head
;
464 softring
!= NULL
; softring
= softring
->s_ring_next
) {
465 if (!(softring
->s_ring_type
& (ST_RING_UDP
| ST_RING_TCP
)))
468 if ((softring
->s_ring_type
& ST_RING_TCP
) &&
469 softring
->s_ring_rx_arg2
!= NULL
) {
470 mcip
->mci_resource_remove(mcip
->mci_resource_arg
,
471 softring
->s_ring_rx_arg2
);
474 mutex_enter(&softring
->s_ring_lock
);
475 while (softring
->s_ring_state
& S_RING_PROC
) {
476 softring
->s_ring_state
|= S_RING_CLIENT_WAIT
;
477 cv_wait(&softring
->s_ring_client_cv
,
478 &softring
->s_ring_lock
);
480 softring
->s_ring_state
&= ~S_RING_CLIENT_WAIT
;
481 softring
->s_ring_rx_arg2
= NULL
;
482 softring
->s_ring_rx_func
= mac_rx_deliver
;
483 softring
->s_ring_rx_arg1
= mcip
;
484 mutex_exit(&softring
->s_ring_lock
);
489 * Enable or disable poll capability of the SRS on the underlying Rx ring.
491 * There is a need to enable or disable the poll capability of an SRS over an
492 * Rx ring depending on the number of mac clients sharing the ring and also
493 * whether user flows are configured on it. However the poll state is actively
494 * manipulated by the SRS worker and poll threads and uncoordinated changes by
495 * yet another thread to the underlying capability can surprise them leading
496 * to assert failures. Instead we quiesce the SRS, make the changes and then
500 mac_srs_poll_state_change(mac_soft_ring_set_t
*mac_srs
,
501 boolean_t turn_off_poll_capab
, mac_rx_func_t rx_func
)
503 boolean_t need_restart
= B_FALSE
;
504 mac_srs_rx_t
*srs_rx
= &mac_srs
->srs_rx
;
507 if (!SRS_QUIESCED(mac_srs
)) {
508 mac_rx_srs_quiesce(mac_srs
, SRS_QUIESCE
);
509 need_restart
= B_TRUE
;
512 ring
= mac_srs
->srs_ring
;
513 if ((ring
!= NULL
) &&
514 (ring
->mr_classify_type
== MAC_HW_CLASSIFIER
)) {
515 if (turn_off_poll_capab
)
516 mac_srs
->srs_state
&= ~SRS_POLLING_CAPAB
;
517 else if (mac_poll_enable
)
518 mac_srs
->srs_state
|= SRS_POLLING_CAPAB
;
520 srs_rx
->sr_lower_proc
= rx_func
;
523 mac_rx_srs_restart(mac_srs
);
526 /* CPU RECONFIGURATION AND FANOUT COMPUTATION ROUTINES */
529 * Return the next CPU to be used to bind a MAC kernel thread.
530 * If a cpupart is specified, the cpu chosen must be from that
534 mac_next_bind_cpu(cpupart_t
*cpupart
)
536 static cpu_t
*cp
= NULL
;
539 ASSERT(MUTEX_HELD(&cpu_lock
));
544 cp
= cp
->cpu_next_onln
;
548 if ((cpupart
== NULL
) || (cp
->cpu_part
== cpupart
))
551 } while ((cp
= cp
->cpu_next_onln
) != cp_start
);
558 mac_srs_cpu_setup(cpu_setup_t what
, int id
, void *arg
)
560 ASSERT(MUTEX_HELD(&cpu_lock
));
565 mac_walk_srs_and_bind(id
);
570 case CPU_CPUPART_OUT
:
571 mac_walk_srs_and_unbind(id
);
581 * mac_compute_soft_ring_count():
583 * This routine computes the number of soft rings needed to handle incoming
584 * load given a flow_entry.
586 * The routine does the following:
587 * 1) soft rings will be created if mac_soft_ring_enable is set.
588 * 2) If the underlying link is a 10Gbps link, then soft rings will be
589 * created even if mac_soft_ring_enable is not set. The number of soft
590 * rings, so created, will equal mac_rx_soft_ring_10gig_count.
591 * 3) On a sun4v platform (i.e., mac_soft_ring_enable is set), 2 times the
592 * mac_rx_soft_ring_10gig_count number of soft rings will be created for a
595 * If a bandwidth limit is specified, the number that gets computed is
596 * dependent upon CPU speed, the number of Rx rings configured, and
597 * the bandwidth limit.
598 * If more Rx rings are available, less number of soft rings is needed.
600 * mac_use_bw_heuristic is another "hidden" variable that can be used to
601 * override the default use of soft ring count computation. Depending upon
602 * the usefulness of it, mac_use_bw_heuristic can later be made into a
603 * data-link property or removed altogether.
605 * TODO: Cleanup and tighten some of the assumptions.
607 boolean_t mac_use_bw_heuristic
= B_TRUE
;
609 mac_compute_soft_ring_count(flow_entry_t
*flent
, int rx_srs_cnt
, int maxcpus
)
611 uint64_t cpu_speed
, bw
= 0;
613 boolean_t bw_enabled
= B_FALSE
;
615 ASSERT(!(flent
->fe_type
& FLOW_USER
));
616 if (flent
->fe_resource_props
.mrp_mask
& MRP_MAXBW
&&
617 mac_use_bw_heuristic
) {
618 /* bandwidth enabled */
620 bw
= flent
->fe_resource_props
.mrp_maxbw
;
623 /* No bandwidth enabled */
624 if (mac_soft_ring_enable
)
625 srings
= mac_rx_soft_ring_count
;
627 /* Is this a 10Gig link? */
628 flent
->fe_nic_speed
= mac_client_stat_get(flent
->fe_mcip
,
630 /* convert to Mbps */
631 if (((flent
->fe_nic_speed
)/1000000) > 1000 &&
632 mac_rx_soft_ring_10gig_count
> 0) {
633 /* This is a 10Gig link */
634 srings
= mac_rx_soft_ring_10gig_count
;
636 * Use 2 times mac_rx_soft_ring_10gig_count for
639 if (mac_soft_ring_enable
)
644 * Soft ring computation using CPU speed and specified
647 /* Assumption: all CPUs have the same frequency */
648 cpu_speed
= (uint64_t)CPU
->cpu_type_info
.pi_clock
;
650 /* cpu_speed is in MHz; make bw in units of Mbps. */
655 * bw is greater than or equal to 1Gbps.
656 * The number of soft rings required is a function
657 * of bandwidth and CPU speed. To keep this simple,
658 * let's use this rule: 1GHz CPU can handle 1Gbps.
659 * If bw is less than 1 Gbps, then there is no need
660 * for soft rings. Assumption is that CPU speeds
661 * (on modern systems) are at least 1GHz.
663 srings
= bw
/cpu_speed
;
664 if (srings
<= 1 && mac_soft_ring_enable
) {
666 * Give at least 2 soft rings
674 * If the flent has multiple Rx SRSs, then each SRS need not
675 * have that many soft rings on top of it. The number of
676 * soft rings for each Rx SRS is found by dividing srings by
679 if (rx_srs_cnt
> 1) {
682 remainder
= srings
%rx_srs_cnt
;
683 srings
= srings
/rx_srs_cnt
;
687 * Fanning out to 1 soft ring is not very useful.
688 * Set it as well to 0 and mac_srs_fanout_init()
689 * will take care of creating a single soft ring
695 /* Do some more massaging */
696 srings
= min(srings
, maxcpus
);
697 srings
= min(srings
, MAX_SR_FANOUT
);
703 * set up CPUs for Tx interrupt re-targeting and Tx worker
707 mac_tx_cpu_init(flow_entry_t
*flent
, mac_resource_props_t
*mrp
,
710 mac_soft_ring_set_t
*tx_srs
= flent
->fe_tx_srs
;
711 mac_srs_tx_t
*srs_tx
= &tx_srs
->srs_tx
;
712 mac_cpus_t
*srs_cpu
= &tx_srs
->srs_cpu
;
713 mac_soft_ring_t
*sringp
;
715 processorid_t worker_cpuid
;
716 boolean_t retargetable_client
= B_FALSE
;
719 if (RETARGETABLE_CLIENT((mac_group_t
*)flent
->fe_tx_ring_group
,
721 retargetable_client
= B_TRUE
;
724 if (MAC_TX_SOFT_RINGS(tx_srs
)) {
726 j
= mrp
->mrp_ncpus
- 1;
727 for (i
= 0; i
< tx_srs
->srs_tx_ring_count
; i
++) {
730 j
= mrp
->mrp_ncpus
- 1;
731 worker_cpuid
= mrp
->mrp_cpu
[j
];
734 * Bind interrupt to the next CPU available
735 * and leave the worker unbound.
739 sringp
= tx_srs
->srs_tx_soft_rings
[i
];
740 ring
= (mac_ring_t
*)sringp
->s_ring_tx_arg2
;
741 srs_cpu
->mc_tx_fanout_cpus
[i
] = worker_cpuid
;
742 if (MAC_RING_RETARGETABLE(ring
) &&
743 retargetable_client
) {
744 mutex_enter(&cpu_lock
);
745 srs_cpu
->mc_tx_intr_cpu
[i
] =
746 (mrp
!= NULL
) ? mrp
->mrp_cpu
[j
] :
747 (mac_tx_intr_retarget
?
748 mac_next_bind_cpu(cpupart
) : -1);
749 mutex_exit(&cpu_lock
);
751 srs_cpu
->mc_tx_intr_cpu
[i
] = -1;
757 /* Tx mac_ring_handle_t is stored in st_arg2 */
758 srs_cpu
->mc_tx_fanout_cpus
[0] =
759 (mrp
!= NULL
) ? mrp
->mrp_cpu
[mrp
->mrp_ncpus
- 1] : -1;
760 ring
= (mac_ring_t
*)srs_tx
->st_arg2
;
761 if (MAC_RING_RETARGETABLE(ring
) && retargetable_client
) {
762 mutex_enter(&cpu_lock
);
763 srs_cpu
->mc_tx_intr_cpu
[0] = (mrp
!= NULL
) ?
764 mrp
->mrp_cpu
[mrp
->mrp_ncpus
- 1] :
765 (mac_tx_intr_retarget
?
766 mac_next_bind_cpu(cpupart
) : -1);
767 mutex_exit(&cpu_lock
);
769 srs_cpu
->mc_tx_intr_cpu
[0] = -1;
/*
 * Assignment of user specified CPUs to a link.
 *
 * Minimum CPUs required to get an optimal assignment:
 *	For each Rx SRS, at least two CPUs are needed if the
 *	mac_latency_optimize flag is set -- one for polling, one for the
 *	fanout soft ring. If mac_latency_optimize is not set, then 3 CPUs
 *	are needed -- one for polling, one for the SRS worker thread and
 *	one for the fanout soft ring.
 *
 *	The number of CPUs needed for the Tx side is equal to the number
 *	of Tx rings.
 *
 * mac_flow_user_cpu_init() categorizes the CPU assignment, depending
 * upon the number of CPUs, into 3 different buckets.
 *
 * In the first bucket, the most optimal case is handled. The user has
 * passed enough CPUs and every thread gets its own CPU.
 *
 * The second and third are the sub-optimal cases: not enough CPUs are
 * available.
 *
 * The second bucket handles the case where at least one distinct CPU
 * is available for each of the Rx rings (Rx SRSes) and Tx rings (Tx
 * SRS or soft rings).
 *
 * In the third case (worst case scenario), the specified CPU count is
 * less than the number of Rx rings configured for the link. In this
 * case, we round robin the CPUs among the Rx SRSes and Tx SRS/soft rings.
 */
804 mac_flow_user_cpu_init(flow_entry_t
*flent
, mac_resource_props_t
*mrp
)
806 mac_soft_ring_set_t
*rx_srs
, *tx_srs
;
809 int no_of_cpus
, cpu_cnt
;
810 int rx_srs_cnt
, reqd_rx_cpu_cnt
;
811 int fanout_cpu_cnt
, reqd_tx_cpu_cnt
;
812 int reqd_poll_worker_cnt
, fanout_cnt_per_srs
;
813 mac_resource_props_t
*emrp
= &flent
->fe_effective_props
;
815 ASSERT(mrp
->mrp_fanout_mode
== MCM_CPUS
);
817 * The check for nbc_ncpus to be within limits for
818 * the user specified case was done earlier and if
819 * not within limits, an error would have been
820 * returned to the user.
822 ASSERT(mrp
->mrp_ncpus
> 0);
824 no_of_cpus
= mrp
->mrp_ncpus
;
826 if (mrp
->mrp_rx_intr_cpu
!= -1) {
828 * interrupt has been re-targetted. Poll
829 * thread needs to be bound to interrupt
832 * Find where in the list is the intr
833 * CPU and swap it with the first one.
834 * We will be using the first CPU in the
837 for (i
= 0; i
< no_of_cpus
; i
++) {
838 if (mrp
->mrp_cpu
[i
] == mrp
->mrp_rx_intr_cpu
)
841 mrp
->mrp_cpu
[i
] = mrp
->mrp_cpu
[0];
842 mrp
->mrp_cpu
[0] = mrp
->mrp_rx_intr_cpu
;
847 * The number of CPUs that each Rx ring needs is dependent
848 * upon mac_latency_optimize flag.
849 * 1) If set, atleast 2 CPUs are needed -- one for
850 * polling, one for fanout soft ring.
851 * 2) If not set, then atleast 3 CPUs are needed -- one
852 * for polling, one for srs worker thread, and one for
855 rx_srs_cnt
= (flent
->fe_rx_srs_cnt
> 1) ?
856 (flent
->fe_rx_srs_cnt
- 1) : flent
->fe_rx_srs_cnt
;
857 reqd_rx_cpu_cnt
= mac_latency_optimize
?
858 (rx_srs_cnt
* 2) : (rx_srs_cnt
* 3);
860 /* How many CPUs are needed for Tx side? */
861 tx_srs
= flent
->fe_tx_srs
;
862 reqd_tx_cpu_cnt
= MAC_TX_SOFT_RINGS(tx_srs
) ?
863 tx_srs
->srs_tx_ring_count
: 1;
865 /* CPUs needed for Rx SRSes poll and worker threads */
866 reqd_poll_worker_cnt
= mac_latency_optimize
?
867 rx_srs_cnt
: rx_srs_cnt
* 2;
869 /* Has the user provided enough CPUs? */
870 if (no_of_cpus
>= (reqd_rx_cpu_cnt
+ reqd_tx_cpu_cnt
)) {
872 * Best case scenario. There is enough CPUs. All
873 * Rx rings will get their own set of CPUs plus
874 * Tx soft rings will get their own.
877 * fanout_cpu_cnt is the number of CPUs available
878 * for Rx side fanout soft rings.
880 fanout_cpu_cnt
= no_of_cpus
-
881 reqd_poll_worker_cnt
- reqd_tx_cpu_cnt
;
884 * Divide fanout_cpu_cnt by rx_srs_cnt to find
885 * out how many fanout soft rings each Rx SRS
888 fanout_cnt_per_srs
= fanout_cpu_cnt
/rx_srs_cnt
;
890 /* fanout_cnt_per_srs should not be > MAX_SR_FANOUT */
891 fanout_cnt_per_srs
= min(fanout_cnt_per_srs
, MAX_SR_FANOUT
);
893 /* Do the assignment for the default Rx ring */
895 rx_srs
= flent
->fe_rx_srs
[0];
896 ASSERT(rx_srs
->srs_ring
== NULL
);
897 if (rx_srs
->srs_fanout_state
== SRS_FANOUT_INIT
)
898 rx_srs
->srs_fanout_state
= SRS_FANOUT_REINIT
;
899 srs_cpu
= &rx_srs
->srs_cpu
;
900 srs_cpu
->mc_ncpus
= no_of_cpus
;
902 srs_cpu
->mc_cpus
, sizeof (srs_cpu
->mc_cpus
));
903 srs_cpu
->mc_rx_fanout_cnt
= fanout_cnt_per_srs
;
904 srs_cpu
->mc_rx_pollid
= mrp
->mrp_cpu
[cpu_cnt
++];
905 /* Retarget the interrupt to the same CPU as the poll */
906 srs_cpu
->mc_rx_intr_cpu
= srs_cpu
->mc_rx_pollid
;
907 srs_cpu
->mc_rx_workerid
= (mac_latency_optimize
?
908 srs_cpu
->mc_rx_pollid
: mrp
->mrp_cpu
[cpu_cnt
++]);
909 for (i
= 0; i
< fanout_cnt_per_srs
; i
++)
910 srs_cpu
->mc_rx_fanout_cpus
[i
] = mrp
->mrp_cpu
[cpu_cnt
++];
912 /* Do the assignment for h/w Rx SRSes */
913 if (flent
->fe_rx_srs_cnt
> 1) {
916 srs_cnt
< flent
->fe_rx_srs_cnt
; srs_cnt
++) {
917 rx_srs
= flent
->fe_rx_srs
[srs_cnt
];
918 ASSERT(rx_srs
->srs_ring
!= NULL
);
919 if (rx_srs
->srs_fanout_state
==
921 rx_srs
->srs_fanout_state
=
924 srs_cpu
= &rx_srs
->srs_cpu
;
925 srs_cpu
->mc_ncpus
= no_of_cpus
;
926 bcopy(mrp
->mrp_cpu
, srs_cpu
->mc_cpus
,
927 sizeof (srs_cpu
->mc_cpus
));
928 srs_cpu
->mc_rx_fanout_cnt
= fanout_cnt_per_srs
;
929 /* The first CPU in the list is the intr CPU */
930 srs_cpu
->mc_rx_pollid
= mrp
->mrp_cpu
[cpu_cnt
++];
931 srs_cpu
->mc_rx_intr_cpu
= srs_cpu
->mc_rx_pollid
;
932 srs_cpu
->mc_rx_workerid
=
933 (mac_latency_optimize
?
934 srs_cpu
->mc_rx_pollid
:
935 mrp
->mrp_cpu
[cpu_cnt
++]);
936 for (i
= 0; i
< fanout_cnt_per_srs
; i
++) {
937 srs_cpu
->mc_rx_fanout_cpus
[i
] =
938 mrp
->mrp_cpu
[cpu_cnt
++];
940 ASSERT(cpu_cnt
<= no_of_cpus
);
948 * We have the following information:
949 * no_of_cpus - no. of cpus that user passed.
950 * rx_srs_cnt - no. of rx rings.
951 * reqd_rx_cpu_cnt = mac_latency_optimize?rx_srs_cnt*2:rx_srs_cnt*3
952 * reqd_tx_cpu_cnt - no. of cpus reqd. for Tx side.
953 * reqd_poll_worker_cnt = mac_latency_optimize?rx_srs_cnt:rx_srs_cnt*2
956 * If we bind the Rx fanout soft rings to the same CPUs
957 * as poll/worker, would that be enough?
959 if (no_of_cpus
>= (rx_srs_cnt
+ reqd_tx_cpu_cnt
)) {
960 boolean_t worker_assign
= B_FALSE
;
963 * If mac_latency_optimize is not set, are there
964 * enough CPUs to assign a CPU for worker also?
966 if (no_of_cpus
>= (reqd_poll_worker_cnt
+ reqd_tx_cpu_cnt
))
967 worker_assign
= B_TRUE
;
969 * Zero'th Rx SRS is the default Rx ring. It is not
970 * associated with h/w Rx ring.
972 rx_srs
= flent
->fe_rx_srs
[0];
973 ASSERT(rx_srs
->srs_ring
== NULL
);
974 if (rx_srs
->srs_fanout_state
== SRS_FANOUT_INIT
)
975 rx_srs
->srs_fanout_state
= SRS_FANOUT_REINIT
;
977 srs_cpu
= &rx_srs
->srs_cpu
;
978 srs_cpu
->mc_ncpus
= no_of_cpus
;
980 srs_cpu
->mc_cpus
, sizeof (srs_cpu
->mc_cpus
));
981 srs_cpu
->mc_rx_fanout_cnt
= 1;
982 srs_cpu
->mc_rx_pollid
= mrp
->mrp_cpu
[cpu_cnt
++];
983 /* Retarget the interrupt to the same CPU as the poll */
984 srs_cpu
->mc_rx_intr_cpu
= srs_cpu
->mc_rx_pollid
;
985 srs_cpu
->mc_rx_workerid
=
986 ((!mac_latency_optimize
&& worker_assign
) ?
987 mrp
->mrp_cpu
[cpu_cnt
++] : srs_cpu
->mc_rx_pollid
);
989 srs_cpu
->mc_rx_fanout_cpus
[0] = mrp
->mrp_cpu
[cpu_cnt
];
991 /* Do CPU bindings for SRSes having h/w Rx rings */
992 if (flent
->fe_rx_srs_cnt
> 1) {
995 srs_cnt
< flent
->fe_rx_srs_cnt
; srs_cnt
++) {
996 rx_srs
= flent
->fe_rx_srs
[srs_cnt
];
997 ASSERT(rx_srs
->srs_ring
!= NULL
);
998 if (rx_srs
->srs_fanout_state
==
1000 rx_srs
->srs_fanout_state
=
1003 srs_cpu
= &rx_srs
->srs_cpu
;
1004 srs_cpu
->mc_ncpus
= no_of_cpus
;
1005 bcopy(mrp
->mrp_cpu
, srs_cpu
->mc_cpus
,
1006 sizeof (srs_cpu
->mc_cpus
));
1007 srs_cpu
->mc_rx_pollid
=
1008 mrp
->mrp_cpu
[cpu_cnt
];
1009 srs_cpu
->mc_rx_intr_cpu
= srs_cpu
->mc_rx_pollid
;
1010 srs_cpu
->mc_rx_workerid
=
1011 ((!mac_latency_optimize
&& worker_assign
) ?
1012 mrp
->mrp_cpu
[++cpu_cnt
] :
1013 srs_cpu
->mc_rx_pollid
);
1014 srs_cpu
->mc_rx_fanout_cnt
= 1;
1015 srs_cpu
->mc_rx_fanout_cpus
[0] =
1016 mrp
->mrp_cpu
[cpu_cnt
];
1018 ASSERT(cpu_cnt
<= no_of_cpus
);
1025 * Real sub-optimal case. Not enough CPUs for poll and
1026 * Tx soft rings. Do a round robin assignment where
1027 * each Rx SRS will get the same CPU for poll, worker
1028 * and fanout soft ring.
1031 for (srs_cnt
= 0; srs_cnt
< flent
->fe_rx_srs_cnt
; srs_cnt
++) {
1032 rx_srs
= flent
->fe_rx_srs
[srs_cnt
];
1033 srs_cpu
= &rx_srs
->srs_cpu
;
1034 if (rx_srs
->srs_fanout_state
== SRS_FANOUT_INIT
)
1035 rx_srs
->srs_fanout_state
= SRS_FANOUT_REINIT
;
1036 srs_cpu
->mc_ncpus
= no_of_cpus
;
1038 srs_cpu
->mc_cpus
, sizeof (srs_cpu
->mc_cpus
));
1039 srs_cpu
->mc_rx_fanout_cnt
= 1;
1040 srs_cpu
->mc_rx_pollid
= mrp
->mrp_cpu
[cpu_cnt
];
1041 /* Retarget the interrupt to the same CPU as the poll */
1042 srs_cpu
->mc_rx_intr_cpu
= srs_cpu
->mc_rx_pollid
;
1043 srs_cpu
->mc_rx_workerid
= mrp
->mrp_cpu
[cpu_cnt
];
1044 srs_cpu
->mc_rx_fanout_cpus
[0] = mrp
->mrp_cpu
[cpu_cnt
];
1045 if (++cpu_cnt
>= no_of_cpus
)
1050 mac_tx_cpu_init(flent
, mrp
, NULL
);
1053 * Copy the user specified CPUs to the effective CPUs
1055 for (i
= 0; i
< mrp
->mrp_ncpus
; i
++) {
1056 emrp
->mrp_cpu
[i
] = mrp
->mrp_cpu
[i
];
1058 emrp
->mrp_ncpus
= mrp
->mrp_ncpus
;
1059 emrp
->mrp_mask
= mrp
->mrp_mask
;
1060 bzero(emrp
->mrp_pool
, MAXPATHLEN
);
1064 * mac_flow_cpu_init():
1066 * Each SRS has a mac_cpu_t structure, srs_cpu. This routine fills in
1067 * the CPU binding information in srs_cpu for all Rx SRSes associated
1071 mac_flow_cpu_init(flow_entry_t
*flent
, cpupart_t
*cpupart
)
1073 mac_soft_ring_set_t
*rx_srs
;
1074 processorid_t cpuid
;
1075 int i
, j
, k
, srs_cnt
, nscpus
, maxcpus
, soft_ring_cnt
= 0;
1076 mac_cpus_t
*srs_cpu
;
1077 mac_resource_props_t
*emrp
= &flent
->fe_effective_props
;
1078 uint32_t cpus
[MRP_NCPUS
];
1081 * The maximum number of CPUs available can either be
1082 * the number of CPUs in the pool or the number of CPUs
1085 maxcpus
= (cpupart
!= NULL
) ? cpupart
->cp_ncpus
: ncpus
;
1088 * Compute the number of soft rings needed on top for each Rx
1089 * SRS. "rx_srs_cnt-1" indicates the number of Rx SRS
1090 * associated with h/w Rx rings. Soft ring count needed for
1091 * each h/w Rx SRS is computed and the same is applied to
1092 * software classified Rx SRS. The first Rx SRS in fe_rx_srs[]
1093 * is the software classified Rx SRS.
1095 soft_ring_cnt
= mac_compute_soft_ring_count(flent
,
1096 flent
->fe_rx_srs_cnt
- 1, maxcpus
);
1097 if (soft_ring_cnt
== 0) {
1099 * Even when soft_ring_cnt is 0, we still need
1100 * to create a soft ring for TCP, UDP and
1101 * OTHER. So set it to 1.
1105 for (srs_cnt
= 0; srs_cnt
< flent
->fe_rx_srs_cnt
; srs_cnt
++) {
1106 rx_srs
= flent
->fe_rx_srs
[srs_cnt
];
1107 srs_cpu
= &rx_srs
->srs_cpu
;
1108 if (rx_srs
->srs_fanout_state
== SRS_FANOUT_INIT
)
1109 rx_srs
->srs_fanout_state
= SRS_FANOUT_REINIT
;
1110 srs_cpu
->mc_ncpus
= soft_ring_cnt
;
1111 srs_cpu
->mc_rx_fanout_cnt
= soft_ring_cnt
;
1112 mutex_enter(&cpu_lock
);
1113 for (j
= 0; j
< soft_ring_cnt
; j
++) {
1114 cpuid
= mac_next_bind_cpu(cpupart
);
1115 srs_cpu
->mc_cpus
[j
] = cpuid
;
1116 srs_cpu
->mc_rx_fanout_cpus
[j
] = cpuid
;
1118 cpuid
= mac_next_bind_cpu(cpupart
);
1119 srs_cpu
->mc_rx_pollid
= cpuid
;
1120 srs_cpu
->mc_rx_intr_cpu
= (mac_rx_intr_retarget
?
1121 srs_cpu
->mc_rx_pollid
: -1);
1122 /* increment ncpus to account for polling cpu */
1123 srs_cpu
->mc_ncpus
++;
1124 srs_cpu
->mc_cpus
[j
++] = cpuid
;
1125 if (!mac_latency_optimize
) {
1126 cpuid
= mac_next_bind_cpu(cpupart
);
1127 srs_cpu
->mc_ncpus
++;
1128 srs_cpu
->mc_cpus
[j
++] = cpuid
;
1130 srs_cpu
->mc_rx_workerid
= cpuid
;
1131 mutex_exit(&cpu_lock
);
1135 for (srs_cnt
= 0; srs_cnt
< flent
->fe_rx_srs_cnt
; srs_cnt
++) {
1136 rx_srs
= flent
->fe_rx_srs
[srs_cnt
];
1137 srs_cpu
= &rx_srs
->srs_cpu
;
1138 for (j
= 0; j
< srs_cpu
->mc_ncpus
; j
++) {
1139 cpus
[nscpus
++] = srs_cpu
->mc_cpus
[j
];
1145 * Copy cpu list to fe_effective_props
1146 * without duplicates.
1149 for (i
= 0; i
< nscpus
; i
++) {
1150 for (j
= 0; j
< k
; j
++) {
1151 if (emrp
->mrp_cpu
[j
] == cpus
[i
])
1155 emrp
->mrp_cpu
[k
++] = cpus
[i
];
1157 emrp
->mrp_ncpus
= k
;
1159 mac_tx_cpu_init(flent
, NULL
, cpupart
);
1163 * DATAPATH SETUP ROUTINES
1164 * (setup SRS and set/update FANOUT, B/W and PRIORITY)
1168 * mac_srs_fanout_list_alloc:
1170 * The underlying device can expose upto MAX_RINGS_PER_GROUP worth of
1171 * rings to a client. In such a case, MAX_RINGS_PER_GROUP worth of
1172 * array space is needed to store Tx soft rings. Thus we allocate so
1173 * much array space for srs_tx_soft_rings.
1175 * And when it is an aggr, again we allocate MAX_RINGS_PER_GROUP worth
1176 * of space to st_soft_rings. This array is used for quick access to
1177 * soft ring associated with a pseudo Tx ring based on the pseudo
1178 * ring's index (mr_index).
1181 mac_srs_fanout_list_alloc(mac_soft_ring_set_t
*mac_srs
)
1183 mac_client_impl_t
*mcip
= mac_srs
->srs_mcip
;
1185 if (mac_srs
->srs_type
& SRST_TX
) {
1186 mac_srs
->srs_tx_soft_rings
= (mac_soft_ring_t
**)
1187 kmem_zalloc(sizeof (mac_soft_ring_t
*) *
1188 MAX_RINGS_PER_GROUP
, KM_SLEEP
);
1189 if (mcip
->mci_state_flags
& MCIS_IS_AGGR
) {
1190 mac_srs_tx_t
*tx
= &mac_srs
->srs_tx
;
1192 tx
->st_soft_rings
= (mac_soft_ring_t
**)
1193 kmem_zalloc(sizeof (mac_soft_ring_t
*) *
1194 MAX_RINGS_PER_GROUP
, KM_SLEEP
);
1197 mac_srs
->srs_tcp_soft_rings
= (mac_soft_ring_t
**)
1198 kmem_zalloc(sizeof (mac_soft_ring_t
*) * MAX_SR_FANOUT
,
1200 mac_srs
->srs_udp_soft_rings
= (mac_soft_ring_t
**)
1201 kmem_zalloc(sizeof (mac_soft_ring_t
*) * MAX_SR_FANOUT
,
1203 mac_srs
->srs_oth_soft_rings
= (mac_soft_ring_t
**)
1204 kmem_zalloc(sizeof (mac_soft_ring_t
*) * MAX_SR_FANOUT
,
1210 mac_srs_worker_bind(mac_soft_ring_set_t
*mac_srs
, processorid_t cpuid
)
1213 boolean_t clear
= B_FALSE
;
1215 ASSERT(MUTEX_HELD(&cpu_lock
));
1217 if (!mac_srs_thread_bind
)
1220 cp
= cpu_get(cpuid
);
1221 if (cp
== NULL
|| !cpu_is_online(cp
))
1224 mutex_enter(&mac_srs
->srs_lock
);
1225 mac_srs
->srs_state
|= SRS_WORKER_BOUND
;
1226 if (mac_srs
->srs_worker_cpuid
!= -1)
1228 mac_srs
->srs_worker_cpuid
= cpuid
;
1229 mutex_exit(&mac_srs
->srs_lock
);
1232 thread_affinity_clear(mac_srs
->srs_worker
);
1234 thread_affinity_set(mac_srs
->srs_worker
, cpuid
);
1235 DTRACE_PROBE1(worker__CPU
, processorid_t
, cpuid
);
1239 mac_srs_poll_bind(mac_soft_ring_set_t
*mac_srs
, processorid_t cpuid
)
1242 boolean_t clear
= B_FALSE
;
1244 ASSERT(MUTEX_HELD(&cpu_lock
));
1246 if (!mac_srs_thread_bind
|| mac_srs
->srs_poll_thr
== NULL
)
1249 cp
= cpu_get(cpuid
);
1250 if (cp
== NULL
|| !cpu_is_online(cp
))
1253 mutex_enter(&mac_srs
->srs_lock
);
1254 mac_srs
->srs_state
|= SRS_POLL_BOUND
;
1255 if (mac_srs
->srs_poll_cpuid
!= -1)
1257 mac_srs
->srs_poll_cpuid
= cpuid
;
1258 mutex_exit(&mac_srs
->srs_lock
);
1261 thread_affinity_clear(mac_srs
->srs_poll_thr
);
1263 thread_affinity_set(mac_srs
->srs_poll_thr
, cpuid
);
1264 DTRACE_PROBE1(poll__CPU
, processorid_t
, cpuid
);
1268 * Re-target interrupt to the passed CPU. If re-target is successful,
1269 * set mc_rx_intr_cpu to the re-targeted CPU. Otherwise set it to -1.
1272 mac_rx_srs_retarget_intr(mac_soft_ring_set_t
*mac_srs
, processorid_t cpuid
)
1275 mac_ring_t
*ring
= mac_srs
->srs_ring
;
1276 mac_intr_t
*mintr
= &ring
->mr_info
.mri_intr
;
1277 flow_entry_t
*flent
= mac_srs
->srs_flent
;
1278 boolean_t primary
= mac_is_primary_client(mac_srs
->srs_mcip
);
1280 ASSERT(MUTEX_HELD(&cpu_lock
));
1283 * Don't re-target the interrupt for these cases:
1285 * 2) the interrupt is shared (mi_ddi_shared)
1286 * 3) ddi_handle is NULL and !primary
1287 * 4) primary, ddi_handle is NULL but fe_rx_srs_cnt > 2
1288 * Case 3 & 4 are because of mac_client_intr_cpu() routine.
1289 * This routine will re-target fixed interrupt for primary
1290 * mac client if the client has only one ring. In that
1291 * case, mc_rx_intr_cpu will already have the correct value.
1293 if (ring
== NULL
|| mintr
->mi_ddi_shared
|| cpuid
== -1 ||
1294 (mintr
->mi_ddi_handle
== NULL
&& !primary
) || (primary
&&
1295 mintr
->mi_ddi_handle
== NULL
&& flent
->fe_rx_srs_cnt
> 2)) {
1296 mac_srs
->srs_cpu
.mc_rx_intr_cpu
= -1;
1300 if (mintr
->mi_ddi_handle
== NULL
)
1303 cp
= cpu_get(cpuid
);
1304 if (cp
== NULL
|| !cpu_is_online(cp
))
1307 /* Drop the cpu_lock as set_intr_affinity() holds it */
1308 mutex_exit(&cpu_lock
);
1309 if (set_intr_affinity(mintr
->mi_ddi_handle
, cpuid
) == DDI_SUCCESS
)
1310 mac_srs
->srs_cpu
.mc_rx_intr_cpu
= cpuid
;
1312 mac_srs
->srs_cpu
.mc_rx_intr_cpu
= -1;
1313 mutex_enter(&cpu_lock
);
1317 * Re-target Tx interrupts
1320 mac_tx_srs_retarget_intr(mac_soft_ring_set_t
*mac_srs
)
1325 mac_soft_ring_t
*sringp
;
1326 mac_srs_tx_t
*srs_tx
;
1327 mac_cpus_t
*srs_cpu
;
1328 processorid_t cpuid
;
1331 ASSERT(MUTEX_HELD(&cpu_lock
));
1333 srs_cpu
= &mac_srs
->srs_cpu
;
1334 if (MAC_TX_SOFT_RINGS(mac_srs
)) {
1335 for (i
= 0; i
< mac_srs
->srs_tx_ring_count
; i
++) {
1336 sringp
= mac_srs
->srs_tx_soft_rings
[i
];
1337 ring
= (mac_ring_t
*)sringp
->s_ring_tx_arg2
;
1338 cpuid
= srs_cpu
->mc_tx_intr_cpu
[i
];
1339 cp
= cpu_get(cpuid
);
1340 if (cp
== NULL
|| !cpu_is_online(cp
) ||
1341 !MAC_RING_RETARGETABLE(ring
)) {
1342 srs_cpu
->mc_tx_retargeted_cpu
[i
] = -1;
1345 mintr
= &ring
->mr_info
.mri_intr
;
1347 * Drop the cpu_lock as set_intr_affinity()
1350 mutex_exit(&cpu_lock
);
1351 if (set_intr_affinity(mintr
->mi_ddi_handle
,
1352 cpuid
) == DDI_SUCCESS
) {
1353 srs_cpu
->mc_tx_retargeted_cpu
[i
] = cpuid
;
1355 srs_cpu
->mc_tx_retargeted_cpu
[i
] = -1;
1357 mutex_enter(&cpu_lock
);
1360 cpuid
= srs_cpu
->mc_tx_intr_cpu
[0];
1361 cp
= cpu_get(cpuid
);
1362 if (cp
== NULL
|| !cpu_is_online(cp
)) {
1363 srs_cpu
->mc_tx_retargeted_cpu
[0] = -1;
1366 srs_tx
= &mac_srs
->srs_tx
;
1367 ring
= (mac_ring_t
*)srs_tx
->st_arg2
;
1368 if (MAC_RING_RETARGETABLE(ring
)) {
1369 mintr
= &ring
->mr_info
.mri_intr
;
1370 mutex_exit(&cpu_lock
);
1371 if ((set_intr_affinity(mintr
->mi_ddi_handle
,
1372 cpuid
) == DDI_SUCCESS
)) {
1373 srs_cpu
->mc_tx_retargeted_cpu
[0] = cpuid
;
1375 srs_cpu
->mc_tx_retargeted_cpu
[0] = -1;
1377 mutex_enter(&cpu_lock
);
1383 * When a CPU comes back online, bind the MAC kernel threads which
1384 * were previously bound to that CPU, and had to be unbound because
1385 * the CPU was going away.
1387 * These functions are called with cpu_lock held and hence we can't
1388 * cv_wait to grab the mac perimeter. Since these functions walk the soft
1389 * ring list of an SRS without being in the perimeter, the list itself
1390 * is protected by the SRS lock.
1393 mac_walk_srs_and_bind(int cpuid
)
1395 mac_soft_ring_set_t
*mac_srs
;
1396 mac_soft_ring_t
*soft_ring
;
1398 rw_enter(&mac_srs_g_lock
, RW_READER
);
1400 if ((mac_srs
= mac_srs_g_list
) == NULL
)
1403 for (; mac_srs
!= NULL
; mac_srs
= mac_srs
->srs_next
) {
1404 if (mac_srs
->srs_worker_cpuid
== -1 &&
1405 mac_srs
->srs_worker_cpuid_save
== cpuid
) {
1406 mac_srs
->srs_worker_cpuid_save
= -1;
1407 mac_srs_worker_bind(mac_srs
, cpuid
);
1410 if (!(mac_srs
->srs_type
& SRST_TX
)) {
1411 if (mac_srs
->srs_poll_cpuid
== -1 &&
1412 mac_srs
->srs_poll_cpuid_save
== cpuid
) {
1413 mac_srs
->srs_poll_cpuid_save
= -1;
1414 mac_srs_poll_bind(mac_srs
, cpuid
);
1418 /* Next tackle the soft rings associated with the srs */
1419 mutex_enter(&mac_srs
->srs_lock
);
1420 for (soft_ring
= mac_srs
->srs_soft_ring_head
; soft_ring
!= NULL
;
1421 soft_ring
= soft_ring
->s_ring_next
) {
1422 if (soft_ring
->s_ring_cpuid
== -1 &&
1423 soft_ring
->s_ring_cpuid_save
== cpuid
) {
1424 soft_ring
->s_ring_cpuid_save
= -1;
1425 (void) mac_soft_ring_bind(soft_ring
, cpuid
);
1428 mutex_exit(&mac_srs
->srs_lock
);
1431 rw_exit(&mac_srs_g_lock
);
1435 * Change the priority of the SRS's poll and worker thread. Additionally,
1436 * update the priority of the worker threads for the SRS's soft rings.
1437 * Need to modify any associated squeue threads.
1440 mac_update_srs_priority(mac_soft_ring_set_t
*mac_srs
, pri_t prival
)
1442 mac_soft_ring_t
*ringp
;
1444 mac_srs
->srs_pri
= prival
;
1445 thread_lock(mac_srs
->srs_worker
);
1446 (void) thread_change_pri(mac_srs
->srs_worker
, mac_srs
->srs_pri
, 0);
1447 thread_unlock(mac_srs
->srs_worker
);
1448 if (mac_srs
->srs_poll_thr
!= NULL
) {
1449 thread_lock(mac_srs
->srs_poll_thr
);
1450 (void) thread_change_pri(mac_srs
->srs_poll_thr
,
1451 mac_srs
->srs_pri
, 0);
1452 thread_unlock(mac_srs
->srs_poll_thr
);
1454 if ((ringp
= mac_srs
->srs_soft_ring_head
) == NULL
)
1456 while (ringp
!= mac_srs
->srs_soft_ring_tail
) {
1457 thread_lock(ringp
->s_ring_worker
);
1458 (void) thread_change_pri(ringp
->s_ring_worker
,
1459 mac_srs
->srs_pri
, 0);
1460 thread_unlock(ringp
->s_ring_worker
);
1461 ringp
= ringp
->s_ring_next
;
1463 ASSERT(ringp
== mac_srs
->srs_soft_ring_tail
);
1464 thread_lock(ringp
->s_ring_worker
);
1465 (void) thread_change_pri(ringp
->s_ring_worker
, mac_srs
->srs_pri
, 0);
1466 thread_unlock(ringp
->s_ring_worker
);
1470 * Change the receive bandwidth limit.
1473 mac_rx_srs_update_bwlimit(mac_soft_ring_set_t
*srs
, mac_resource_props_t
*mrp
)
1475 mac_soft_ring_t
*softring
;
1477 mutex_enter(&srs
->srs_lock
);
1478 mutex_enter(&srs
->srs_bw
->mac_bw_lock
);
1480 if (mrp
->mrp_maxbw
== MRP_MAXBW_RESETVAL
) {
1481 /* Reset bandwidth limit */
1482 if (srs
->srs_type
& SRST_BW_CONTROL
) {
1483 softring
= srs
->srs_soft_ring_head
;
1484 while (softring
!= NULL
) {
1485 softring
->s_ring_type
&= ~ST_RING_BW_CTL
;
1486 softring
= softring
->s_ring_next
;
1488 srs
->srs_type
&= ~SRST_BW_CONTROL
;
1489 srs
->srs_drain_func
= mac_rx_srs_drain
;
1492 /* Set/Modify bandwidth limit */
1493 srs
->srs_bw
->mac_bw_limit
= FLOW_BYTES_PER_TICK(mrp
->mrp_maxbw
);
1495 * Give twice the queuing capability before
1496 * dropping packets. The unit is bytes/tick.
1498 srs
->srs_bw
->mac_bw_drop_threshold
=
1499 srs
->srs_bw
->mac_bw_limit
<< 1;
1500 if (!(srs
->srs_type
& SRST_BW_CONTROL
)) {
1501 softring
= srs
->srs_soft_ring_head
;
1502 while (softring
!= NULL
) {
1503 softring
->s_ring_type
|= ST_RING_BW_CTL
;
1504 softring
= softring
->s_ring_next
;
1506 srs
->srs_type
|= SRST_BW_CONTROL
;
1507 srs
->srs_drain_func
= mac_rx_srs_drain_bw
;
1511 mutex_exit(&srs
->srs_bw
->mac_bw_lock
);
1512 mutex_exit(&srs
->srs_lock
);
1515 /* Change the transmit bandwidth limit */
1517 mac_tx_srs_update_bwlimit(mac_soft_ring_set_t
*srs
, mac_resource_props_t
*mrp
)
1519 uint32_t tx_mode
, ring_info
= 0;
1520 mac_srs_tx_t
*srs_tx
= &srs
->srs_tx
;
1521 mac_client_impl_t
*mcip
= srs
->srs_mcip
;
1524 * We need to quiesce/restart the client here because mac_tx() and
1525 * srs->srs_tx->st_func do not hold srs->srs_lock while accessing
1526 * st_mode and related fields, which are modified by the code below.
1528 mac_tx_client_quiesce((mac_client_handle_t
)mcip
);
1530 mutex_enter(&srs
->srs_lock
);
1531 mutex_enter(&srs
->srs_bw
->mac_bw_lock
);
1533 tx_mode
= srs_tx
->st_mode
;
1534 if (mrp
->mrp_maxbw
== MRP_MAXBW_RESETVAL
) {
1535 /* Reset bandwidth limit */
1536 if (tx_mode
== SRS_TX_BW
) {
1537 if (srs_tx
->st_arg2
!= NULL
)
1538 ring_info
= mac_hwring_getinfo(srs_tx
->st_arg2
);
1539 if (mac_tx_serialize
||
1540 (ring_info
& MAC_RING_TX_SERIALIZE
)) {
1541 srs_tx
->st_mode
= SRS_TX_SERIALIZE
;
1543 srs_tx
->st_mode
= SRS_TX_DEFAULT
;
1545 } else if (tx_mode
== SRS_TX_BW_FANOUT
) {
1546 srs_tx
->st_mode
= SRS_TX_FANOUT
;
1547 } else if (tx_mode
== SRS_TX_BW_AGGR
) {
1548 srs_tx
->st_mode
= SRS_TX_AGGR
;
1550 srs
->srs_type
&= ~SRST_BW_CONTROL
;
1552 /* Set/Modify bandwidth limit */
1553 srs
->srs_bw
->mac_bw_limit
= FLOW_BYTES_PER_TICK(mrp
->mrp_maxbw
);
1555 * Give twice the queuing capability before
1556 * dropping packets. The unit is bytes/tick.
1558 srs
->srs_bw
->mac_bw_drop_threshold
=
1559 srs
->srs_bw
->mac_bw_limit
<< 1;
1560 srs
->srs_type
|= SRST_BW_CONTROL
;
1561 if (tx_mode
!= SRS_TX_BW
&& tx_mode
!= SRS_TX_BW_FANOUT
&&
1562 tx_mode
!= SRS_TX_BW_AGGR
) {
1563 if (tx_mode
== SRS_TX_SERIALIZE
||
1564 tx_mode
== SRS_TX_DEFAULT
) {
1565 srs_tx
->st_mode
= SRS_TX_BW
;
1566 } else if (tx_mode
== SRS_TX_FANOUT
) {
1567 srs_tx
->st_mode
= SRS_TX_BW_FANOUT
;
1568 } else if (tx_mode
== SRS_TX_AGGR
) {
1569 srs_tx
->st_mode
= SRS_TX_BW_AGGR
;
1576 srs_tx
->st_func
= mac_tx_get_func(srs_tx
->st_mode
);
1577 mutex_exit(&srs
->srs_bw
->mac_bw_lock
);
1578 mutex_exit(&srs
->srs_lock
);
1580 mac_tx_client_restart((mac_client_handle_t
)mcip
);
1584 * The uber function that deals with any update to bandwidth limits.
1587 mac_srs_update_bwlimit(flow_entry_t
*flent
, mac_resource_props_t
*mrp
)
1591 for (count
= 0; count
< flent
->fe_rx_srs_cnt
; count
++)
1592 mac_rx_srs_update_bwlimit(flent
->fe_rx_srs
[count
], mrp
);
1593 mac_tx_srs_update_bwlimit(flent
->fe_tx_srs
, mrp
);
1597 mac_srs_change_upcall(void *arg
, mac_direct_rx_t rx_func
, void *rx_arg1
)
1599 mac_soft_ring_set_t
*mac_srs
= arg
;
1600 mac_srs_rx_t
*srs_rx
= &mac_srs
->srs_rx
;
1601 mac_soft_ring_t
*softring
;
1603 mutex_enter(&mac_srs
->srs_lock
);
1604 ASSERT((mac_srs
->srs_type
& SRST_TX
) == 0);
1605 srs_rx
->sr_func
= rx_func
;
1606 srs_rx
->sr_arg1
= rx_arg1
;
1608 softring
= mac_srs
->srs_soft_ring_head
;
1609 while (softring
!= NULL
) {
1610 mutex_enter(&softring
->s_ring_lock
);
1611 softring
->s_ring_rx_func
= rx_func
;
1612 softring
->s_ring_rx_arg1
= rx_arg1
;
1613 mutex_exit(&softring
->s_ring_lock
);
1614 softring
= softring
->s_ring_next
;
1617 mutex_exit(&mac_srs
->srs_lock
);
1621 * When the first sub-flow is added to a link, we disable polling on the
1622 * link and also modify the entry point to mac_rx_srs_subflow_process.
1623 * (polling is disabled because with the subflow added, accounting
1624 * for polling needs additional logic, it is assumed that when a subflow is
1625 * added, we can take some hit as a result of disabling polling rather than
1626 * adding more complexity - if this becomes a perf. issue we need to
1627 * re-rvaluate this logic). When the last subflow is removed, we turn back
1628 * polling and also reset the entry point to mac_rx_srs_process.
1630 * In the future if there are multiple SRS, we can simply
1631 * take one and give it to the flow rather than disabling polling and
1632 * resetting the entry point.
1635 mac_client_update_classifier(mac_client_impl_t
*mcip
, boolean_t enable
)
1637 flow_entry_t
*flent
= mcip
->mci_flent
;
1639 mac_impl_t
*mip
= mcip
->mci_mip
;
1640 mac_rx_func_t rx_func
;
1642 boolean_t enable_classifier
;
1644 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mip
));
1646 enable_classifier
= !FLOW_TAB_EMPTY(mcip
->mci_subflow_tab
) && enable
;
1648 rx_func
= enable_classifier
? mac_rx_srs_subflow_process
:
1651 /* Tell mac_srs_poll_state_change to disable polling if necessary */
1652 if (mip
->mi_state_flags
& MIS_POLL_DISABLE
)
1653 enable_classifier
= B_TRUE
;
1656 * If receive function has already been configured correctly for
1657 * current subflow configuration, do nothing.
1659 if (flent
->fe_cb_fn
== (flow_fn_t
)rx_func
)
1662 rx_srs_cnt
= flent
->fe_rx_srs_cnt
;
1663 for (i
= 0; i
< rx_srs_cnt
; i
++) {
1664 ASSERT(flent
->fe_rx_srs
[i
] != NULL
);
1665 mac_srs_poll_state_change(flent
->fe_rx_srs
[i
],
1666 enable_classifier
, rx_func
);
1670 * Change the S/W classifier so that we can land in the
1671 * correct processing function with correct argument.
1672 * If all subflows have been removed we can revert to
1673 * mac_rx_srsprocess, else we need mac_rx_srs_subflow_process.
1675 mutex_enter(&flent
->fe_lock
);
1676 flent
->fe_cb_fn
= (flow_fn_t
)rx_func
;
1677 flent
->fe_cb_arg1
= (void *)mip
;
1678 flent
->fe_cb_arg2
= flent
->fe_rx_srs
[0];
1679 mutex_exit(&flent
->fe_lock
);
1683 mac_srs_update_fanout_list(mac_soft_ring_set_t
*mac_srs
)
1685 int tcp_count
= 0, udp_count
= 0, oth_count
= 0, tx_count
= 0;
1686 mac_soft_ring_t
*softring
;
1688 softring
= mac_srs
->srs_soft_ring_head
;
1689 if (softring
== NULL
) {
1690 ASSERT(mac_srs
->srs_soft_ring_count
== 0);
1691 mac_srs
->srs_tcp_ring_count
= 0;
1692 mac_srs
->srs_udp_ring_count
= 0;
1693 mac_srs
->srs_oth_ring_count
= 0;
1694 mac_srs
->srs_tx_ring_count
= 0;
1698 while (softring
!= NULL
) {
1699 if (softring
->s_ring_type
& ST_RING_TCP
) {
1700 mac_srs
->srs_tcp_soft_rings
[tcp_count
++] = softring
;
1701 } else if (softring
->s_ring_type
& ST_RING_UDP
) {
1702 mac_srs
->srs_udp_soft_rings
[udp_count
++] = softring
;
1703 } else if (softring
->s_ring_type
& ST_RING_OTH
) {
1704 mac_srs
->srs_oth_soft_rings
[oth_count
++] = softring
;
1706 ASSERT(softring
->s_ring_type
& ST_RING_TX
);
1707 mac_srs
->srs_tx_soft_rings
[tx_count
++] = softring
;
1709 softring
= softring
->s_ring_next
;
1712 ASSERT(mac_srs
->srs_soft_ring_count
==
1713 (tcp_count
+ udp_count
+ oth_count
+ tx_count
));
1714 mac_srs
->srs_tcp_ring_count
= tcp_count
;
1715 mac_srs
->srs_udp_ring_count
= udp_count
;
1716 mac_srs
->srs_oth_ring_count
= oth_count
;
1717 mac_srs
->srs_tx_ring_count
= tx_count
;
1721 mac_srs_create_proto_softrings(int id
, uint16_t type
, pri_t pri
,
1722 mac_client_impl_t
*mcip
, mac_soft_ring_set_t
*mac_srs
,
1723 processorid_t cpuid
, mac_direct_rx_t rx_func
, void *x_arg1
,
1724 mac_resource_handle_t x_arg2
, boolean_t set_bypass
)
1726 mac_soft_ring_t
*softring
;
1729 bzero(&mrf
, sizeof (mac_rx_fifo_t
));
1730 mrf
.mrf_type
= MAC_RX_FIFO
;
1731 mrf
.mrf_receive
= (mac_receive_t
)mac_soft_ring_poll
;
1732 mrf
.mrf_intr_enable
=
1733 (mac_intr_enable_t
)mac_soft_ring_intr_enable
;
1734 mrf
.mrf_intr_disable
=
1735 (mac_intr_disable_t
)mac_soft_ring_intr_disable
;
1736 mrf
.mrf_flow_priority
= pri
;
1738 softring
= mac_soft_ring_create(id
, mac_soft_ring_worker_wait
,
1739 (type
|ST_RING_TCP
), pri
, mcip
, mac_srs
,
1740 cpuid
, rx_func
, x_arg1
, x_arg2
);
1741 softring
->s_ring_rx_arg2
= NULL
;
1744 * TCP and UDP support DLS bypass. In addition TCP
1745 * squeue can also poll their corresponding soft rings.
1747 if (set_bypass
&& (mcip
->mci_resource_arg
!= NULL
)) {
1748 mac_soft_ring_dls_bypass(softring
,
1749 mcip
->mci_direct_rx_fn
,
1750 mcip
->mci_direct_rx_arg
);
1752 mrf
.mrf_rx_arg
= softring
;
1753 mrf
.mrf_intr_handle
= (mac_intr_handle_t
)softring
;
1756 * Make a call in IP to get a TCP squeue assigned to
1757 * this softring to maintain full CPU locality through
1758 * the stack and allow the squeue to be able to poll
1759 * the softring so the flow control can be pushed
1760 * all the way to H/W.
1762 softring
->s_ring_rx_arg2
=
1763 mcip
->mci_resource_add((void *)mcip
->mci_resource_arg
,
1764 (mac_resource_t
*)&mrf
);
1768 * Non-TCP protocols don't support squeues. Hence we
1769 * don't make any ring addition callbacks for non-TCP
1770 * rings. Now create the UDP softring and allow it to
1771 * bypass the DLS layer.
1773 softring
= mac_soft_ring_create(id
, mac_soft_ring_worker_wait
,
1774 (type
|ST_RING_UDP
), pri
, mcip
, mac_srs
,
1775 cpuid
, rx_func
, x_arg1
, x_arg2
);
1776 softring
->s_ring_rx_arg2
= NULL
;
1778 if (set_bypass
&& (mcip
->mci_resource_arg
!= NULL
)) {
1779 mac_soft_ring_dls_bypass(softring
,
1780 mcip
->mci_direct_rx_fn
,
1781 mcip
->mci_direct_rx_arg
);
1784 /* Create the Oth softrings which has to go through the DLS */
1785 softring
= mac_soft_ring_create(id
, mac_soft_ring_worker_wait
,
1786 (type
|ST_RING_OTH
), pri
, mcip
, mac_srs
,
1787 cpuid
, rx_func
, x_arg1
, x_arg2
);
1788 softring
->s_ring_rx_arg2
= NULL
;
1792 * This routine associates a CPU or a set of CPU to process incoming
1793 * traffic from a mac client. If multiple CPUs are specified, then
1794 * so many soft rings are created with each soft ring worker thread
1795 * bound to a CPU in the set. Each soft ring in turn will be
1796 * associated with an squeue and the squeue will be moved to the
1797 * same CPU as that of the soft ring's.
1800 mac_srs_fanout_modify(mac_client_impl_t
*mcip
, mac_direct_rx_t rx_func
,
1801 void *x_arg1
, mac_resource_handle_t x_arg2
,
1802 mac_soft_ring_set_t
*mac_rx_srs
, mac_soft_ring_set_t
*mac_tx_srs
)
1804 mac_soft_ring_t
*softring
;
1805 uint32_t soft_ring_flag
= 0;
1806 processorid_t cpuid
= -1;
1807 int i
, srings_present
, new_fanout_cnt
;
1808 mac_cpus_t
*srs_cpu
;
1810 /* fanout state is REINIT. Set it back to INIT */
1811 ASSERT(mac_rx_srs
->srs_fanout_state
== SRS_FANOUT_REINIT
);
1812 mac_rx_srs
->srs_fanout_state
= SRS_FANOUT_INIT
;
1814 /* how many are present right now */
1815 srings_present
= mac_rx_srs
->srs_tcp_ring_count
;
1817 srs_cpu
= &mac_rx_srs
->srs_cpu
;
1818 new_fanout_cnt
= srs_cpu
->mc_rx_fanout_cnt
;
1820 mutex_enter(&mac_rx_srs
->srs_lock
);
1821 if (mac_rx_srs
->srs_type
& SRST_BW_CONTROL
)
1822 soft_ring_flag
|= ST_RING_BW_CTL
;
1823 mutex_exit(&mac_rx_srs
->srs_lock
);
1825 if (new_fanout_cnt
> srings_present
) {
1826 /* soft rings increased */
1827 mutex_enter(&mac_rx_srs
->srs_lock
);
1828 mac_rx_srs
->srs_type
|= SRST_FANOUT_SRC_IP
;
1829 mutex_exit(&mac_rx_srs
->srs_lock
);
1831 for (i
= mac_rx_srs
->srs_tcp_ring_count
;
1832 i
< new_fanout_cnt
; i
++) {
1834 * Create the protocol softrings and set the
1835 * DLS bypass where possible.
1837 mac_srs_create_proto_softrings(i
, soft_ring_flag
,
1838 mac_rx_srs
->srs_pri
, mcip
, mac_rx_srs
, cpuid
,
1839 rx_func
, x_arg1
, x_arg2
, B_TRUE
);
1841 mac_srs_update_fanout_list(mac_rx_srs
);
1842 } else if (new_fanout_cnt
< srings_present
) {
1843 /* soft rings decreased */
1844 if (new_fanout_cnt
== 1) {
1845 mutex_enter(&mac_rx_srs
->srs_lock
);
1846 mac_rx_srs
->srs_type
&= ~SRST_FANOUT_SRC_IP
;
1847 ASSERT(mac_rx_srs
->srs_type
& SRST_FANOUT_PROTO
);
1848 mutex_exit(&mac_rx_srs
->srs_lock
);
1850 /* Get rid of extra soft rings */
1851 for (i
= new_fanout_cnt
;
1852 i
< mac_rx_srs
->srs_tcp_ring_count
; i
++) {
1853 softring
= mac_rx_srs
->srs_tcp_soft_rings
[i
];
1854 if (softring
->s_ring_rx_arg2
!= NULL
) {
1855 mcip
->mci_resource_remove(
1856 (void *)mcip
->mci_resource_arg
,
1857 softring
->s_ring_rx_arg2
);
1859 mac_soft_ring_remove(mac_rx_srs
,
1860 mac_rx_srs
->srs_tcp_soft_rings
[i
]);
1861 mac_soft_ring_remove(mac_rx_srs
,
1862 mac_rx_srs
->srs_udp_soft_rings
[i
]);
1863 mac_soft_ring_remove(mac_rx_srs
,
1864 mac_rx_srs
->srs_oth_soft_rings
[i
]);
1866 mac_srs_update_fanout_list(mac_rx_srs
);
1869 ASSERT(new_fanout_cnt
== mac_rx_srs
->srs_tcp_ring_count
);
1870 mutex_enter(&cpu_lock
);
1871 for (i
= 0; i
< mac_rx_srs
->srs_tcp_ring_count
; i
++) {
1872 cpuid
= srs_cpu
->mc_rx_fanout_cpus
[i
];
1873 (void) mac_soft_ring_bind(mac_rx_srs
->srs_udp_soft_rings
[i
],
1875 (void) mac_soft_ring_bind(mac_rx_srs
->srs_oth_soft_rings
[i
],
1877 (void) mac_soft_ring_bind(mac_rx_srs
->srs_tcp_soft_rings
[i
],
1879 softring
= mac_rx_srs
->srs_tcp_soft_rings
[i
];
1880 if (softring
->s_ring_rx_arg2
!= NULL
) {
1881 mcip
->mci_resource_bind((void *)mcip
->mci_resource_arg
,
1882 softring
->s_ring_rx_arg2
, cpuid
);
1886 mac_srs_worker_bind(mac_rx_srs
, srs_cpu
->mc_rx_workerid
);
1887 mac_srs_poll_bind(mac_rx_srs
, srs_cpu
->mc_rx_pollid
);
1888 mac_rx_srs_retarget_intr(mac_rx_srs
, srs_cpu
->mc_rx_intr_cpu
);
1890 * Bind Tx srs and soft ring threads too. Let's bind tx
1891 * srs to the last cpu in mrp list.
1893 if (mac_tx_srs
!= NULL
) {
1894 BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs
, mrp
);
1895 mac_tx_srs_retarget_intr(mac_tx_srs
);
1897 mutex_exit(&cpu_lock
);
1901 * Bind SRS threads and soft rings to CPUs/create fanout list.
1904 mac_srs_fanout_init(mac_client_impl_t
*mcip
, mac_resource_props_t
*mrp
,
1905 mac_direct_rx_t rx_func
, void *x_arg1
, mac_resource_handle_t x_arg2
,
1906 mac_soft_ring_set_t
*mac_rx_srs
, mac_soft_ring_set_t
*mac_tx_srs
,
1910 processorid_t cpuid
;
1911 uint32_t soft_ring_flag
= 0;
1913 mac_cpus_t
*srs_cpu
= &mac_rx_srs
->srs_cpu
;
1916 * Remove the no soft ring flag and we will adjust it
1917 * appropriately further down.
1919 mutex_enter(&mac_rx_srs
->srs_lock
);
1920 mac_rx_srs
->srs_type
&= ~SRST_NO_SOFT_RINGS
;
1921 mutex_exit(&mac_rx_srs
->srs_lock
);
1923 ASSERT(mac_rx_srs
->srs_soft_ring_head
== NULL
);
1925 if (mac_rx_srs
->srs_type
& SRST_BW_CONTROL
)
1926 soft_ring_flag
|= ST_RING_BW_CTL
;
1928 ASSERT(mac_rx_srs
->srs_fanout_state
== SRS_FANOUT_UNINIT
);
1929 mac_rx_srs
->srs_fanout_state
= SRS_FANOUT_INIT
;
1931 * Ring count can be 0 if no fanout is required and no cpu
1932 * were specified. Leave the SRS worker and poll thread
1935 ASSERT(mrp
!= NULL
);
1936 soft_ring_cnt
= srs_cpu
->mc_rx_fanout_cnt
;
1938 /* Step 1: bind cpu contains cpu list where threads need to bind */
1939 if (soft_ring_cnt
> 0) {
1940 mutex_enter(&cpu_lock
);
1941 for (i
= 0; i
< soft_ring_cnt
; i
++) {
1942 cpuid
= srs_cpu
->mc_rx_fanout_cpus
[i
];
1943 /* Create the protocol softrings */
1944 mac_srs_create_proto_softrings(i
, soft_ring_flag
,
1945 mac_rx_srs
->srs_pri
, mcip
, mac_rx_srs
, cpuid
,
1946 rx_func
, x_arg1
, x_arg2
, B_FALSE
);
1948 mac_srs_worker_bind(mac_rx_srs
, srs_cpu
->mc_rx_workerid
);
1949 mac_srs_poll_bind(mac_rx_srs
, srs_cpu
->mc_rx_pollid
);
1950 mac_rx_srs_retarget_intr(mac_rx_srs
, srs_cpu
->mc_rx_intr_cpu
);
1952 * Bind Tx srs and soft ring threads too.
1953 * Let's bind tx srs to the last cpu in
1956 if (mac_tx_srs
== NULL
) {
1957 mutex_exit(&cpu_lock
);
1961 BIND_TX_SRS_AND_SOFT_RINGS(mac_tx_srs
, mrp
);
1962 mac_tx_srs_retarget_intr(mac_tx_srs
);
1963 mutex_exit(&cpu_lock
);
1965 mutex_enter(&cpu_lock
);
1967 * For a subflow, mrp_workerid and mrp_pollid
1970 mac_srs_worker_bind(mac_rx_srs
, mrp
->mrp_rx_workerid
);
1971 mac_srs_poll_bind(mac_rx_srs
, mrp
->mrp_rx_pollid
);
1972 mutex_exit(&cpu_lock
);
1977 if (soft_ring_cnt
> 1)
1978 mac_rx_srs
->srs_type
|= SRST_FANOUT_SRC_IP
;
1979 mac_srs_update_fanout_list(mac_rx_srs
);
1980 mac_srs_client_poll_enable(mcip
, mac_rx_srs
);
1984 if (mac_rx_srs
->srs_type
& SRST_FANOUT_PROTO
) {
1985 mutex_enter(&cpu_lock
);
1986 cpuid
= mac_next_bind_cpu(cpupart
);
1987 /* Create the protocol softrings */
1988 mac_srs_create_proto_softrings(0, soft_ring_flag
,
1989 mac_rx_srs
->srs_pri
, mcip
, mac_rx_srs
, cpuid
,
1990 rx_func
, x_arg1
, x_arg2
, B_FALSE
);
1991 mutex_exit(&cpu_lock
);
1994 * This is the case when there is no fanout which is
1995 * true for subflows.
1997 mac_rx_srs
->srs_type
|= SRST_NO_SOFT_RINGS
;
1999 mac_srs_update_fanout_list(mac_rx_srs
);
2000 mac_srs_client_poll_enable(mcip
, mac_rx_srs
);
2006 * Calls mac_srs_fanout_init() or modify() depending upon whether
2007 * the SRS is getting initialized or re-initialized.
2010 mac_fanout_setup(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
2011 mac_resource_props_t
*mrp
, mac_direct_rx_t rx_func
, void *x_arg1
,
2012 mac_resource_handle_t x_arg2
, cpupart_t
*cpupart
)
2014 mac_soft_ring_set_t
*mac_rx_srs
, *mac_tx_srs
;
2017 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mcip
->mci_mip
));
2019 * This is an aggregation port. Fanout will be setup
2020 * over the aggregation itself.
2022 if (mcip
->mci_state_flags
& MCIS_EXCLUSIVE
)
2025 mac_rx_srs
= flent
->fe_rx_srs
[0];
2027 * Set up the fanout on the tx side only once, with the
2028 * first rx SRS. The CPU binding, fanout, and bandwidth
2029 * criteria are common to both RX and TX, so
2030 * initializing them along side avoids redundant code.
2032 mac_tx_srs
= flent
->fe_tx_srs
;
2033 rx_srs_cnt
= flent
->fe_rx_srs_cnt
;
2035 /* No fanout for subflows */
2036 if (flent
->fe_type
& FLOW_USER
) {
2037 mac_srs_fanout_init(mcip
, mrp
, rx_func
,
2038 x_arg1
, x_arg2
, mac_rx_srs
, mac_tx_srs
,
2043 if (mrp
->mrp_mask
& MRP_CPUS_USERSPEC
)
2044 mac_flow_user_cpu_init(flent
, mrp
);
2046 mac_flow_cpu_init(flent
, cpupart
);
2048 mrp
->mrp_rx_fanout_cnt
= mac_rx_srs
->srs_cpu
.mc_rx_fanout_cnt
;
2051 * Set up fanout for both SW (0th SRS) and HW classified
2052 * SRS (the rest of Rx SRSs in flent).
2054 for (i
= 0; i
< rx_srs_cnt
; i
++) {
2055 mac_rx_srs
= flent
->fe_rx_srs
[i
];
2058 switch (mac_rx_srs
->srs_fanout_state
) {
2059 case SRS_FANOUT_UNINIT
:
2060 mac_srs_fanout_init(mcip
, mrp
, rx_func
,
2061 x_arg1
, x_arg2
, mac_rx_srs
, mac_tx_srs
,
2064 case SRS_FANOUT_INIT
:
2066 case SRS_FANOUT_REINIT
:
2067 mac_rx_srs_quiesce(mac_rx_srs
, SRS_QUIESCE
);
2068 mac_srs_fanout_modify(mcip
, rx_func
, x_arg1
,
2069 x_arg2
, mac_rx_srs
, mac_tx_srs
);
2070 mac_rx_srs_restart(mac_rx_srs
);
2073 VERIFY(mac_rx_srs
->srs_fanout_state
<=
2083 * Create a mac_soft_ring_set_t (SRS). If soft_ring_fanout_type is
2084 * SRST_TX, an SRS for Tx side is created. Otherwise an SRS for Rx side
2085 * processing is created.
2087 * Details on Rx SRS:
2088 * Create a SRS and also add the necessary soft rings for TCP and
2089 * non-TCP based on fanout type and count specified.
2091 * mac_soft_ring_fanout, mac_srs_fanout_modify (?),
2092 * mac_soft_ring_stop_workers, mac_soft_ring_set_destroy, etc need
2093 * to be heavily modified.
2095 * mi_soft_ring_list_size, mi_soft_ring_size, etc need to disappear.
2097 mac_soft_ring_set_t
*
2098 mac_srs_create(mac_client_impl_t
*mcip
, flow_entry_t
*flent
, uint32_t srs_type
,
2099 mac_direct_rx_t rx_func
, void *x_arg1
, mac_resource_handle_t x_arg2
,
2102 mac_soft_ring_set_t
*mac_srs
;
2103 mac_srs_rx_t
*srs_rx
;
2104 mac_srs_tx_t
*srs_tx
;
2105 mac_bw_ctl_t
*mac_bw
;
2106 mac_resource_props_t
*mrp
;
2107 boolean_t is_tx_srs
= ((srs_type
& SRST_TX
) != 0);
2109 mac_srs
= kmem_cache_alloc(mac_srs_cache
, KM_SLEEP
);
2110 bzero(mac_srs
, sizeof (mac_soft_ring_set_t
));
2111 srs_rx
= &mac_srs
->srs_rx
;
2112 srs_tx
= &mac_srs
->srs_tx
;
2114 mutex_enter(&flent
->fe_lock
);
2117 * Get the bandwidth control structure from the flent. Get
2118 * rid of any residual values in the control structure for
2119 * the tx bw struct and also for the rx, if the rx srs is
2120 * the 1st one being brought up (the rx bw ctl struct may
2121 * be shared by multiple SRSs)
2124 mac_srs
->srs_bw
= &flent
->fe_tx_bw
;
2125 bzero(mac_srs
->srs_bw
, sizeof (mac_bw_ctl_t
));
2126 flent
->fe_tx_srs
= mac_srs
;
2129 * The bw counter (stored in the flent) is shared
2130 * by SRS's within an rx group.
2132 mac_srs
->srs_bw
= &flent
->fe_rx_bw
;
2133 /* First rx SRS, clear the bw structure */
2134 if (flent
->fe_rx_srs_cnt
== 0)
2135 bzero(mac_srs
->srs_bw
, sizeof (mac_bw_ctl_t
));
2138 * It is better to panic here rather than just assert because
2139 * on a non-debug kernel we might end up corrupting memory
2140 * and making it difficult to debug.
2142 if (flent
->fe_rx_srs_cnt
>= MAX_RINGS_PER_GROUP
) {
2143 panic("Array Overrun detected due to MAC client %p "
2144 " having more rings than %d", (void *)mcip
,
2145 MAX_RINGS_PER_GROUP
);
2147 flent
->fe_rx_srs
[flent
->fe_rx_srs_cnt
] = mac_srs
;
2148 flent
->fe_rx_srs_cnt
++;
2150 mac_srs
->srs_flent
= flent
;
2151 mutex_exit(&flent
->fe_lock
);
2153 mac_srs
->srs_state
= 0;
2154 mac_srs
->srs_type
= (srs_type
| SRST_NO_SOFT_RINGS
);
2155 mac_srs
->srs_worker_cpuid
= mac_srs
->srs_worker_cpuid_save
= -1;
2156 mac_srs
->srs_poll_cpuid
= mac_srs
->srs_poll_cpuid_save
= -1;
2157 mac_srs
->srs_mcip
= mcip
;
2158 mac_srs_fanout_list_alloc(mac_srs
);
2161 * For a flow we use the underlying MAC client's priority range with
2162 * the priority value to find an absolute priority value. For a MAC
2163 * client we use the MAC client's maximum priority as the value.
2165 mrp
= &flent
->fe_effective_props
;
2166 if ((mac_srs
->srs_type
& SRST_FLOW
) != 0) {
2167 mac_srs
->srs_pri
= FLOW_PRIORITY(mcip
->mci_min_pri
,
2168 mcip
->mci_max_pri
, mrp
->mrp_priority
);
2170 mac_srs
->srs_pri
= mcip
->mci_max_pri
;
2173 * We need to insert the SRS in the global list before
2174 * binding the SRS and SR threads. Otherwise there is a
2175 * small window where the cpu reconfig callbacks
2176 * may miss the SRS in the list walk and DR could fail
2177 * as there are bound threads.
2179 mac_srs_add_glist(mac_srs
);
2181 /* Initialize bw limit */
2182 if ((mrp
->mrp_mask
& MRP_MAXBW
) != 0) {
2183 mac_srs
->srs_drain_func
= mac_rx_srs_drain_bw
;
2185 mac_bw
= mac_srs
->srs_bw
;
2186 mutex_enter(&mac_bw
->mac_bw_lock
);
2187 mac_bw
->mac_bw_limit
= FLOW_BYTES_PER_TICK(mrp
->mrp_maxbw
);
2190 * Give twice the queuing capability before
2191 * dropping packets. The unit is bytes/tick.
2193 mac_bw
->mac_bw_drop_threshold
= mac_bw
->mac_bw_limit
<< 1;
2194 mutex_exit(&mac_bw
->mac_bw_lock
);
2195 mac_srs
->srs_type
|= SRST_BW_CONTROL
;
2197 mac_srs
->srs_drain_func
= mac_rx_srs_drain
;
2201 * We use the following policy to control Receive
2202 * Side Dynamic Polling:
2203 * 1) We switch to poll mode anytime the processing thread causes
2204 * a backlog to build up in SRS and its associated Soft Rings
2205 * (sr_poll_pkt_cnt > 0).
2206 * 2) As long as the backlog stays under the low water mark
2207 * (sr_lowat), we poll the H/W for more packets.
2208 * 3) If the backlog (sr_poll_pkt_cnt) exceeds low water mark, we
2209 * stay in poll mode but don't poll the H/W for more packets.
2210 * 4) Anytime in polling mode, if we poll the H/W for packets and
2211 * find nothing plus we have an existing backlog
2212 * (sr_poll_pkt_cnt > 0), we stay in polling mode but don't poll
2213 * the H/W for packets anymore (let the polling thread go to sleep).
2214 * 5) Once the backlog is relieved (packets are processed) we reenable
2215 * polling (by signalling the poll thread) only when the backlog
2216 * dips below sr_poll_thres.
2217 * 6) sr_hiwat is used exclusively when we are not polling capable
2218 * and is used to decide when to drop packets so the SRS queue
2219 * length doesn't grow infinitely.
2222 srs_rx
->sr_hiwat
= mac_soft_ring_max_q_cnt
;
2223 /* Low water mark needs to be less than high water mark */
2224 srs_rx
->sr_lowat
= mac_soft_ring_min_q_cnt
<=
2225 mac_soft_ring_max_q_cnt
? mac_soft_ring_min_q_cnt
:
2226 (mac_soft_ring_max_q_cnt
>> 2);
2227 /* Poll threshold needs to be half of low water mark or less */
2228 srs_rx
->sr_poll_thres
= mac_soft_ring_poll_thres
<=
2229 (srs_rx
->sr_lowat
>> 1) ? mac_soft_ring_poll_thres
:
2230 (srs_rx
->sr_lowat
>> 1);
2231 if (mac_latency_optimize
)
2232 mac_srs
->srs_state
|= SRS_LATENCY_OPT
;
2234 mac_srs
->srs_state
|= SRS_SOFTRING_QUEUE
;
2237 mac_srs
->srs_worker
= thread_create(NULL
, 0,
2238 mac_srs_worker
, mac_srs
, 0, &p0
, TS_RUN
, mac_srs
->srs_pri
);
2241 /* Handle everything about Tx SRS and return */
2242 mac_srs
->srs_drain_func
= mac_tx_srs_drain
;
2243 srs_tx
->st_max_q_cnt
= mac_tx_srs_max_q_cnt
;
2245 (mac_tx_srs_hiwat
> mac_tx_srs_max_q_cnt
) ?
2246 mac_tx_srs_max_q_cnt
: mac_tx_srs_hiwat
;
2247 srs_tx
->st_arg1
= x_arg1
;
2248 srs_tx
->st_arg2
= x_arg2
;
2252 if ((srs_type
& SRST_FLOW
) != 0 ||
2253 FLOW_TAB_EMPTY(mcip
->mci_subflow_tab
))
2254 srs_rx
->sr_lower_proc
= mac_rx_srs_process
;
2256 srs_rx
->sr_lower_proc
= mac_rx_srs_subflow_process
;
2258 srs_rx
->sr_func
= rx_func
;
2259 srs_rx
->sr_arg1
= x_arg1
;
2260 srs_rx
->sr_arg2
= x_arg2
;
2265 /* Is the mac_srs created over the RX default group? */
2266 if (ring
->mr_gh
== (mac_group_handle_t
)
2267 MAC_DEFAULT_RX_GROUP(mcip
->mci_mip
)) {
2268 mac_srs
->srs_type
|= SRST_DEFAULT_GRP
;
2270 mac_srs
->srs_ring
= ring
;
2271 ring
->mr_srs
= mac_srs
;
2272 ring
->mr_classify_type
= MAC_HW_CLASSIFIER
;
2273 ring
->mr_flag
|= MR_INCIPIENT
;
2275 if (!(mcip
->mci_mip
->mi_state_flags
& MIS_POLL_DISABLE
) &&
2276 FLOW_TAB_EMPTY(mcip
->mci_subflow_tab
) && mac_poll_enable
)
2277 mac_srs
->srs_state
|= SRS_POLLING_CAPAB
;
2279 mac_srs
->srs_poll_thr
= thread_create(NULL
, 0,
2280 mac_rx_srs_poll_ring
, mac_srs
, 0, &p0
, TS_RUN
,
2283 * Some drivers require serialization and don't send
2284 * packet chains in interrupt context. For such
2285 * drivers, we should always queue in soft ring
2286 * so that we get a chance to switch into a polling
2287 * mode under backlog.
2289 ring_info
= mac_hwring_getinfo((mac_ring_handle_t
)ring
);
2290 if (ring_info
& MAC_RING_RX_ENQUEUE
)
2291 mac_srs
->srs_state
|= SRS_SOFTRING_QUEUE
;
2294 mac_srs_stat_create(mac_srs
);
2299 * Figure out the number of soft rings required. It's dependent on
2300 * if protocol fanout is required (for LINKs), global settings
2301 * require us to do fanout for performance (based on mac_soft_ring_enable),
2302 * or user has specifically requested fanout.
2305 mac_find_fanout(flow_entry_t
*flent
, uint32_t link_type
)
2307 uint32_t fanout_type
;
2308 mac_resource_props_t
*mrp
= &flent
->fe_effective_props
;
2310 /* no fanout for subflows */
2311 switch (link_type
) {
2313 fanout_type
= SRST_NO_SOFT_RINGS
;
2316 fanout_type
= SRST_FANOUT_PROTO
;
2320 /* A primary NIC/link is being plumbed */
2321 if (flent
->fe_type
& FLOW_PRIMARY_MAC
) {
2322 if (mac_soft_ring_enable
&& mac_rx_soft_ring_count
> 1) {
2323 fanout_type
|= SRST_FANOUT_SRC_IP
;
2325 } else if (flent
->fe_type
& FLOW_VNIC
) {
2326 /* A VNIC is being created */
2327 if (mrp
!= NULL
&& mrp
->mrp_ncpus
> 0) {
2328 fanout_type
|= SRST_FANOUT_SRC_IP
;
2332 return (fanout_type
);
2336 * Change a group from h/w to s/w classification.
2339 mac_rx_switch_grp_to_sw(mac_group_t
*group
)
2342 mac_soft_ring_set_t
*mac_srs
;
2344 for (ring
= group
->mrg_rings
; ring
!= NULL
; ring
= ring
->mr_next
) {
2345 if (ring
->mr_classify_type
== MAC_HW_CLASSIFIER
) {
2347 * Remove the SRS associated with the HW ring.
2348 * As a result, polling will be disabled.
2350 mac_srs
= ring
->mr_srs
;
2351 ASSERT(mac_srs
!= NULL
);
2352 mac_rx_srs_remove(mac_srs
);
2353 ring
->mr_srs
= NULL
;
2356 if (ring
->mr_state
!= MR_INUSE
)
2357 (void) mac_start_ring(ring
);
2360 * We need to perform SW classification
2361 * for packets landing in these rings
2364 ring
->mr_classify_type
= MAC_SW_CLASSIFIER
;
2369 * Create the Rx SRS for S/W classifier and for each ring in the
2370 * group (if exclusive group). Also create the Tx SRS.
2373 mac_srs_group_setup(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
2377 mac_resource_props_t
*mrp
= MCIP_RESOURCE_PROPS(mcip
);
2378 mac_resource_props_t
*emrp
= MCIP_EFFECTIVE_PROPS(mcip
);
2379 boolean_t use_default
= B_FALSE
;
2381 mac_rx_srs_group_setup(mcip
, flent
, link_type
);
2382 mac_tx_srs_group_setup(mcip
, flent
, link_type
);
2385 cpupart
= mac_pset_find(mrp
, &use_default
);
2386 mac_fanout_setup(mcip
, flent
, MCIP_RESOURCE_PROPS(mcip
),
2387 mac_rx_deliver
, mcip
, NULL
, cpupart
);
2388 mac_set_pool_effective(use_default
, cpupart
, mrp
, emrp
);
2393 * Set up the RX SRSs. If the S/W SRS is not set, set it up, if there
2394 * is a group associated with this MAC client, set up SRSs for individual
2398 mac_rx_srs_group_setup(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
2401 mac_impl_t
*mip
= mcip
->mci_mip
;
2402 mac_soft_ring_set_t
*mac_srs
;
2404 uint32_t fanout_type
;
2405 mac_group_t
*rx_group
= flent
->fe_rx_ring_group
;
2407 fanout_type
= mac_find_fanout(flent
, link_type
);
2409 /* Create the SRS for S/W classification if none exists */
2410 if (flent
->fe_rx_srs
[0] == NULL
) {
2411 ASSERT(flent
->fe_rx_srs_cnt
== 0);
2412 /* Setup the Rx SRS */
2413 mac_srs
= mac_srs_create(mcip
, flent
, fanout_type
| link_type
,
2414 mac_rx_deliver
, mcip
, NULL
, NULL
);
2415 mutex_enter(&flent
->fe_lock
);
2416 flent
->fe_cb_fn
= (flow_fn_t
)mac_srs
->srs_rx
.sr_lower_proc
;
2417 flent
->fe_cb_arg1
= (void *)mip
;
2418 flent
->fe_cb_arg2
= (void *)mac_srs
;
2419 mutex_exit(&flent
->fe_lock
);
2422 if (rx_group
== NULL
)
2425 * fanout for default SRS is done when default SRS are created
2426 * above. As each ring is added to the group, we setup the
2427 * SRS and fanout to it.
2429 switch (rx_group
->mrg_state
) {
2430 case MAC_GROUP_STATE_RESERVED
:
2431 for (ring
= rx_group
->mrg_rings
; ring
!= NULL
;
2432 ring
= ring
->mr_next
) {
2433 switch (ring
->mr_state
) {
2436 if (ring
->mr_srs
!= NULL
)
2438 if (ring
->mr_state
!= MR_INUSE
)
2439 (void) mac_start_ring(ring
);
2442 * Since the group is exclusively ours create
2443 * an SRS for this ring to allow the
2444 * individual SRS to dynamically poll the
2445 * ring. Do this only if the client is not
2446 * a VLAN MAC client, since for VLAN we do
2447 * s/w classification for the VID check, and
2448 * if it has a unicast address.
2450 if ((mcip
->mci_state_flags
&
2451 MCIS_NO_UNICAST_ADDR
) ||
2452 i_mac_flow_vid(mcip
->mci_flent
) !=
2456 mac_srs
= mac_srs_create(mcip
, flent
,
2457 fanout_type
| link_type
,
2458 mac_rx_deliver
, mcip
, NULL
, ring
);
2462 "srs_setup: mcip = %p "
2463 "trying to add UNKNOWN ring = %p\n",
2464 (void *)mcip
, (void *)ring
);
2469 case MAC_GROUP_STATE_SHARED
:
2471 * Set all rings of this group to software classified.
2473 * If the group is currently RESERVED, the existing mac
2474 * client (the only client on this group) is using
2475 * this group exclusively. In that case we need to
2476 * disable polling on the rings of the group (if it
2477 * was enabled), and free the SRS associated with the
2480 mac_rx_switch_grp_to_sw(rx_group
);
2489 * Set up the TX SRS.
2492 mac_tx_srs_group_setup(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
2501 * If we are opened exclusively (like aggr does for aggr_ports),
2502 * don't set up Tx SRS and Tx soft rings as they won't be used.
2503 * The same thing has to be done for Rx side also. See bug:
2506 if (mcip
->mci_state_flags
& MCIS_EXCLUSIVE
) {
2508 * If we have rings, start them here.
2510 if (flent
->fe_tx_ring_group
== NULL
)
2512 grp
= (mac_group_t
*)flent
->fe_tx_ring_group
;
2513 ringcnt
= grp
->mrg_cur_count
;
2514 ring
= grp
->mrg_rings
;
2515 for (cnt
= 0; cnt
< ringcnt
; cnt
++) {
2516 if (ring
->mr_state
!= MR_INUSE
) {
2517 (void) mac_start_ring(ring
);
2519 ring
= ring
->mr_next
;
2523 if (flent
->fe_tx_srs
== NULL
) {
2524 (void) mac_srs_create(mcip
, flent
, SRST_TX
| link_type
,
2525 NULL
, mcip
, NULL
, NULL
);
2527 mac_tx_srs_setup(mcip
, flent
);
2531 * Remove all the RX SRSs. If we want to remove only the SRSs associated
2532 * with h/w rings, leave the S/W SRS alone. This is used when we want to
2533 * move the MAC client from one group to another, so we need to teardown
2537 mac_rx_srs_group_teardown(flow_entry_t
*flent
, boolean_t hwonly
)
2539 mac_soft_ring_set_t
*mac_srs
;
2541 int count
= flent
->fe_rx_srs_cnt
;
2543 for (i
= 0; i
< count
; i
++) {
2544 if (i
== 0 && hwonly
)
2546 mac_srs
= flent
->fe_rx_srs
[i
];
2547 mac_rx_srs_quiesce(mac_srs
, SRS_CONDEMNED
);
2548 mac_srs_free(mac_srs
);
2549 flent
->fe_rx_srs
[i
] = NULL
;
2550 flent
->fe_rx_srs_cnt
--;
2552 ASSERT(!hwonly
|| flent
->fe_rx_srs_cnt
== 1);
2553 ASSERT(hwonly
|| flent
->fe_rx_srs_cnt
== 0);
2557 * Remove the TX SRS.
2560 mac_tx_srs_group_teardown(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
2563 mac_soft_ring_set_t
*tx_srs
;
2566 if ((tx_srs
= flent
->fe_tx_srs
) == NULL
)
2569 tx
= &tx_srs
->srs_tx
;
2570 switch (link_type
) {
2573 * For flows, we need to work with passed
2574 * flent to find the Rx/Tx SRS.
2576 mac_tx_srs_quiesce(tx_srs
, SRS_CONDEMNED
);
2579 mac_tx_client_condemn((mac_client_handle_t
)mcip
);
2580 if (tx
->st_arg2
!= NULL
) {
2581 ASSERT(tx_srs
->srs_type
& SRST_TX
);
2583 * The ring itself will be stopped when
2584 * we release the group or in the
2585 * mac_datapath_teardown (for the default
2595 mac_srs_free(tx_srs
);
2596 flent
->fe_tx_srs
= NULL
;
2600 * This is the group state machine.
2602 * The state of an Rx group is given by
2603 * the following table. The default group and its rings are started in
2604 * mac_start itself and the default group stays in SHARED state until
2605 * mac_stop at which time the group and rings are stopped and it
2606 * reverts to the Registered state.
2608 * Typically this function is called on a group after adding or removing a
2609 * client from it, to find out what should be the new state of the group.
2610 * If the new state is RESERVED, then the client that owns this group
2611 * exclusively is also returned. Note that adding or removing a client from
2612 * a group could also impact the default group and the caller needs to
2613 * evaluate the effect on the default group.
2615 * Group type # of clients mi_nactiveclients Group State
2618 * Non-default 0 N.A. REGISTERED
2619 * Non-default 1 N.A. RESERVED
2621 * Default 0 N.A. SHARED
2622 * Default 1 1 RESERVED
2623 * Default 1 > 1 SHARED
2624 * Default > 1 N.A. SHARED
2626 * For a TX group, the following is the state table.
2628 * Group type # of clients Group State
2631 * Non-default 0 REGISTERED
2632 * Non-default 1 RESERVED
2634 * Default 0 REGISTERED
2635 * Default 1 RESERVED
2636 * Default > 1 SHARED
2639 mac_group_next_state(mac_group_t
*grp
, mac_client_impl_t
**group_only_mcip
,
2640 mac_group_t
*defgrp
, boolean_t rx_group
)
2642 mac_impl_t
*mip
= (mac_impl_t
*)grp
->mrg_mh
;
2644 *group_only_mcip
= NULL
;
2646 /* Non-default group */
2648 if (grp
!= defgrp
) {
2649 if (MAC_GROUP_NO_CLIENT(grp
))
2650 return (MAC_GROUP_STATE_REGISTERED
);
2652 *group_only_mcip
= MAC_GROUP_ONLY_CLIENT(grp
);
2653 if (*group_only_mcip
!= NULL
)
2654 return (MAC_GROUP_STATE_RESERVED
);
2656 return (MAC_GROUP_STATE_SHARED
);
2661 if (MAC_GROUP_NO_CLIENT(grp
)) {
2663 return (MAC_GROUP_STATE_SHARED
);
2665 return (MAC_GROUP_STATE_REGISTERED
);
2667 *group_only_mcip
= MAC_GROUP_ONLY_CLIENT(grp
);
2668 if (*group_only_mcip
== NULL
)
2669 return (MAC_GROUP_STATE_SHARED
);
2671 if (rx_group
&& mip
->mi_nactiveclients
!= 1)
2672 return (MAC_GROUP_STATE_SHARED
);
2674 ASSERT(*group_only_mcip
!= NULL
);
2675 return (MAC_GROUP_STATE_RESERVED
);
2679 * OVERVIEW NOTES FOR DATAPATH
2680 * ===========================
2682 * Create an SRS and setup the corresponding flow function and args.
2683 * Add a classification rule for the flow specified by 'flent' and program
2684 * the hardware classifier when applicable.
2686 * Rx ring assignment, SRS, polling and B/W enforcement
2687 * ----------------------------------------------------
2689 * We try to use H/W classification on NIC and assign traffic to a
2690 * MAC address to a particular Rx ring. There is a 1-1 mapping
2691 * between a SRS and a Rx ring. The SRS (short for soft ring set)
2692 * dynamically switches the underlying Rx ring between interrupt
2693 * and polling mode and enforces any specified B/W control.
2695 * There is always a SRS created and tied to each H/W and S/W rule.
2696 * Whenever we create a H/W rule, we always add the same rule to
2697 * S/W classifier and tie a SRS to it.
2699 * In case a B/W control is specified, it's broken into bytes
2700 * per tick and as soon as the quota for a tick is exhausted,
2701 * the underlying Rx ring is forced into poll mode for the remaining
2702 * tick. The SRS poll thread only polls for bytes that are
2703 * allowed to come in the SRS. We typically let 4x the configured
2704 * B/W worth of packets to come in the SRS (to prevent unnecessary
2705 * drops due to bursts) but only process the specified amount.
2707 * A Link (primary NIC, VNIC, VLAN or aggr) can have 1 or more
2708 * Rx rings (and corresponding SRSs) assigned to it. The SRS
2709 * in turn can have softrings to do protocol level fanout or
2710 * softrings to do S/W based fanout or both. In case the NIC
2711 * has no Rx rings, we do S/W classification to respective SRS.
2712 * The S/W classification rule is always setup and ready. This
2713 * allows the MAC layer to reassign Rx rings whenever needed
2714 * but packets still continue to flow via the default path and
2715 * getting S/W classified to correct SRS.
2717 * In other cases where a NIC or VNIC is plumbed, our goal is to use
2718 * H/W classifier and get two Rx rings assigned for the Link. One
2719 * for TCP and one for UDP|SCTP. The respective SRS still do the
2720 * polling on the Rx ring. For Link that is plumbed for IP, there
2721 * is a TCP squeue which also does polling and can control the
2722 * Rx ring directly (where SRS is just pass through). For
2723 * the following cases, the SRS does the polling underneath.
2724 * 1) non IP based Links (Links which are not plumbed via ifconfig)
2725 * and paths which have no IP squeues (UDP & SCTP)
2726 * 2) If B/W control is specified on the Link
2727 * 3) If S/W fanout is specified
2729 * Note1: As of current implementation, we try to assign only 1 Rx
2730 * ring per Link and more than 1 Rx ring for primary Link for
2731 * H/W based fanout. We always create following softrings per SRS:
2732 * 1) TCP softring which is polled by TCP squeue where possible
2733 * (and also bypasses DLS)
2734 * 2) UDP/SCTP based which bypasses DLS
2735 * 3) OTH softring which goes via DLS (currently deal with IPv6
2736 * and non TCP/UDP/SCTP for IPv4 packets).
2738 * It is necessary to create 3 softrings since SRS has to poll
2739 * the single Rx ring underneath and enforce any link level B/W
2740 * control (we can't switch the Rx ring in poll mode just based
2741 * on TCP squeue if the same Rx ring is sharing UDP and other
2742 * traffic as well). Once polling is done and any Link level B/W
2743 * control is specified, the packets are assigned to respective
2744 * softring based on protocol. Since TCP has IP based squeue
2745 * which benefits by polling, we separate TCP packets into
2746 * its own softring which can be polled by IP squeue. We need
2747 * to separate out UDP/SCTP to UDP softring since it can bypass
2748 * the DLS layer which has heavy performance advantages and we
2749 * need a softring (OTH) for rest.
2751 * ToDo: The 3 softrings for protocol are needed only till we can
2752 * get rid of DLS from datapath, make IPv4 and IPv6 paths
2753 * symmetric (deal with mac_header_info for v6 and polling for
2754 * IPv4 TCP - ip_accept_tcp is IPv4 specific although squeues
2755 * are generic), and bring SAP based classification to MAC layer
2757 * H/W and S/W based fanout and multiple Rx rings per Link
2758 * -------------------------------------------------------
2760 * In case, fanout is requested (or determined automatically based
2761 * on Link speed and processor speed), we try to assign multiple
2762 * Rx rings per Link with their respective SRS. In this case
2763 * the NIC should be capable of fanning out incoming packets between
2764 * the assigned Rx rings (H/W based fanout). All the SRS
2765 * individually switch their Rx ring between interrupt and polling
2766 * mode but share a common B/W control counter in case of Link
2767 * level B/W is specified.
2769 * If S/W based fanout is specified in lieu of H/W based fanout,
2770 * the Link SRS creates the specified number of softrings for
2771 * each protocol (TCP, UDP, OTH). Incoming packets are fanned
2772 * out to the correct softring based on their protocol and
2773 * protocol specific hash function.
2775 * Primary and non primary MAC clients
2776 * -----------------------------------
2778 * The NICs, VNICs, Vlans, and Aggrs are typically termed as Links
2779 * and are a Layer 2 construct.
2782 * The Link that owns the primary MAC address and typically
2783 * is used as the data NIC in non virtualized cases. As such
2784 * H/W resources are preferentially given to primary NIC. As
2785 * far as code is concerned, there is no difference in the
2786 * primary NIC vs VNICs. They are all treated as Links.
2787 * At the very first call to mac_unicast_add() we program the S/W
2788 * classifier for the primary MAC address, get a soft ring set
2789 * (and soft rings based on 'ip_soft_ring_cnt')
2790 * and a Rx ring assigned for polling to get enabled.
2791 * When IP gets plumbed and negotiates polling, we can
2792 * let squeue do the polling on TCP softring.
2795 * Same as any other Link. As long as the H/W resource assignments
2796 * are equal, the data path and setup for all Links is same.
2799 * Can be configured on Links. They have their own SRS and the
2800 * S/W classifier is programmed appropriately based on the flow.
2801 * The flows typically deal with layer 3 and above and
2802 * creates a soft ring set specific to the flow. The receive
2803 * side function is switched from mac_rx_srs_process to
2804 * mac_rx_srs_subflow_process which first tries to assign the
2805 * packet to appropriate flow SRS and failing which assigns it
2806 * to link SRS. This allows us to avoid the layered approach
2807 * which gets complex.
2809 * By the time mac_datapath_setup() completes, we already have the
2810 * soft rings set, Rx rings, soft rings, etc figured out and both H/W
2811 * and S/W classifiers programmed. IP is not plumbed yet (and might
2812 * never be for Virtual Machines guest OS path). When IP is plumbed
2813 * (for both NIC and VNIC), we do a capability negotiation for polling
2814 * and upcall functions etc.
2816 * Rx ring Assignment NOTES
2817 * ------------------------
2819 * For NICs which have only 1 Rx ring (we treat NICs with no Rx rings
2820 * as NIC with a single default ring), we assign the only ring to
2821 * primary Link. The primary Link SRS can do polling on it as long as
2822 * it is the only link in use and we compare the MAC address for unicast
2823 * packets before accepting an incoming packet (there is no need for S/W
2824 * classification in this case). We disable polling on the only ring the
2825 * moment 2nd link gets created (the polling remains enabled even though
2826 * there are broadcast and multicast flows created).
2828 * If the NIC has more than 1 Rx ring, we assign the default ring (the
2829 * 1st ring) to deal with broadcast, multicast and traffic for other
2830 * NICs which needs S/W classification. We assign the primary mac
2831 * addresses to another ring by specifying a classification rule for
2832 * primary unicast MAC address to the selected ring. The primary Link
2833 * (and its SRS) can continue to poll the assigned Rx ring at all times
2836 * Note: In future, if no fanout is specified, we try to assign 2 Rx
2837 * rings for the primary Link with the primary MAC address + TCP going
2838 * to one ring and primary MAC address + UDP|SCTP going to other ring.
2839 * Any remaining traffic for primary MAC address can go to the default
2840 * Rx ring and get S/W classified. This way the respective SRSs don't
2841 * need to do proto fanout and don't need to have softrings at all and
2842 * can poll their respective Rx rings.
2844 * As an optimization, when a new NIC or VNIC is created, we can get
2845 * only one Rx ring and make it a TCP specific Rx ring and use the
2846 * H/W default Rx ring for the rest (this Rx ring is never polled).
2848 * For clients that don't have MAC address, but want to receive and
2849 * transmit packets (e.g, bpf, gvrp etc.), we need to setup the datapath.
2850 * For such clients (identified by the MCIS_NO_UNICAST_ADDR flag) we
2851 * always give the default group and use software classification (i.e.
2852 * even if this is the only client in the default group, we will
2853 * leave group as shared).
2856 mac_datapath_setup(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
2859 mac_impl_t
*mip
= mcip
->mci_mip
;
2860 mac_group_t
*rgroup
= NULL
;
2861 mac_group_t
*tgroup
= NULL
;
2862 mac_group_t
*default_rgroup
;
2863 mac_group_t
*default_tgroup
;
2866 mac_group_state_t next_state
;
2867 mac_client_impl_t
*group_only_mcip
;
2868 mac_resource_props_t
*mrp
= MCIP_RESOURCE_PROPS(mcip
);
2869 mac_resource_props_t
*emrp
= MCIP_EFFECTIVE_PROPS(mcip
);
2872 boolean_t use_default
= B_FALSE
;
2874 boolean_t no_unicast
;
2875 boolean_t isprimary
= flent
->fe_type
& FLOW_PRIMARY_MAC
;
2876 mac_client_impl_t
*reloc_pmcip
= NULL
;
2878 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mip
));
2880 switch (link_type
) {
2882 mac_srs_group_setup(mcip
, flent
, link_type
);
2886 no_unicast
= mcip
->mci_state_flags
& MCIS_NO_UNICAST_ADDR
;
2887 mac_addr
= flent
->fe_flow_desc
.fd_dst_mac
;
2889 /* Default RX group */
2890 default_rgroup
= MAC_DEFAULT_RX_GROUP(mip
);
2892 /* Default TX group */
2893 default_tgroup
= MAC_DEFAULT_TX_GROUP(mip
);
2896 rgroup
= default_rgroup
;
2897 tgroup
= default_tgroup
;
2900 rxhw
= (mrp
->mrp_mask
& MRP_RX_RINGS
) &&
2901 (mrp
->mrp_nrxrings
> 0 ||
2902 (mrp
->mrp_mask
& MRP_RXRINGS_UNSPEC
));
2903 txhw
= (mrp
->mrp_mask
& MRP_TX_RINGS
) &&
2904 (mrp
->mrp_ntxrings
> 0 ||
2905 (mrp
->mrp_mask
& MRP_TXRINGS_UNSPEC
));
2908 * By default we have given the primary all the rings
2909 * i.e. the default group. Let's see if the primary
2910 * needs to be relocated so that the addition of this
2911 * client doesn't impact the primary's performance,
2912 * i.e. if the primary is in the default group and
2913 * we add this client, the primary will lose polling.
2914 * We do this only for NICs supporting dynamic ring
2915 * grouping and only when this is the first client
2916 * after the primary (i.e. nactiveclients is 2)
2918 if (!isprimary
&& mip
->mi_nactiveclients
== 2 &&
2919 (group_only_mcip
= mac_primary_client_handle(mip
)) !=
2920 NULL
&& mip
->mi_rx_group_type
== MAC_GROUP_TYPE_DYNAMIC
) {
2921 reloc_pmcip
= mac_check_primary_relocation(
2922 group_only_mcip
, rxhw
);
2925 * Check to see if we can get an exclusive group for
2926 * this mac address or if there already exists a
2927 * group that has this mac address (case of VLANs).
2928 * If no groups are available, use the default group.
2930 rgroup
= mac_reserve_rx_group(mcip
, mac_addr
, B_FALSE
);
2931 if (rgroup
== NULL
&& rxhw
) {
2934 } else if (rgroup
== NULL
) {
2935 rgroup
= default_rgroup
;
2938 * Check to see if we can get an exclusive group for
2939 * this mac client. If no groups are available, use
2940 * the default group.
2942 tgroup
= mac_reserve_tx_group(mcip
, B_FALSE
);
2943 if (tgroup
== NULL
&& txhw
) {
2944 if (rgroup
!= NULL
&& rgroup
!= default_rgroup
)
2945 mac_release_rx_group(mcip
, rgroup
);
2948 } else if (tgroup
== NULL
) {
2949 tgroup
= default_tgroup
;
2953 * Some NICs don't support any Rx rings, so there may not
2954 * even be a default group.
2957 if (rgroup
!= NULL
) {
2958 if (rgroup
!= default_rgroup
&&
2959 MAC_GROUP_NO_CLIENT(rgroup
) &&
2960 (rxhw
|| mcip
->mci_share
!= (uintptr_t)NULL
)) {
2961 MAC_RX_GRP_RESERVED(mip
);
2962 if (mip
->mi_rx_group_type
==
2963 MAC_GROUP_TYPE_DYNAMIC
) {
2964 MAC_RX_RING_RESERVED(mip
,
2965 rgroup
->mrg_cur_count
);
2968 flent
->fe_rx_ring_group
= rgroup
;
2970 * Add the client to the group. This could cause
2971 * either this group to move to the shared state or
2972 * cause the default group to move to the shared state.
2973 * The actions on this group are done here, while the
2974 * actions on the default group are postponed to
2975 * the end of this function.
2977 mac_group_add_client(rgroup
, mcip
);
2978 next_state
= mac_group_next_state(rgroup
,
2979 &group_only_mcip
, default_rgroup
, B_TRUE
);
2980 mac_set_group_state(rgroup
, next_state
);
2983 if (tgroup
!= NULL
) {
2984 if (tgroup
!= default_tgroup
&&
2985 MAC_GROUP_NO_CLIENT(tgroup
) &&
2986 (txhw
|| mcip
->mci_share
!= (uintptr_t)NULL
)) {
2987 MAC_TX_GRP_RESERVED(mip
);
2988 if (mip
->mi_tx_group_type
==
2989 MAC_GROUP_TYPE_DYNAMIC
) {
2990 MAC_TX_RING_RESERVED(mip
,
2991 tgroup
->mrg_cur_count
);
2994 flent
->fe_tx_ring_group
= tgroup
;
2995 mac_group_add_client(tgroup
, mcip
);
2996 next_state
= mac_group_next_state(tgroup
,
2997 &group_only_mcip
, default_tgroup
, B_FALSE
);
2998 tgroup
->mrg_state
= next_state
;
3001 * Setup the Rx and Tx SRSes. If we got a pristine group
3002 * exclusively above, mac_srs_group_setup would simply create
3003 * the required SRSes. If we ended up sharing a previously
3004 * reserved group, mac_srs_group_setup would also dismantle the
3005 * SRSes of the previously exclusive group
3007 mac_srs_group_setup(mcip
, flent
, link_type
);
3009 /* We are setting up minimal datapath only */
3012 /* Program the S/W Classifier */
3013 if ((err
= mac_flow_add(mip
->mi_flow_tab
, flent
)) != 0)
3016 /* Program the H/W Classifier */
3017 if ((err
= mac_add_macaddr(mip
, rgroup
, mac_addr
,
3018 (mcip
->mci_state_flags
& MCIS_UNICAST_HW
) != 0)) != 0)
3020 mcip
->mci_unicast
= mac_find_macaddr(mip
, mac_addr
);
3021 ASSERT(mcip
->mci_unicast
!= NULL
);
3022 /* (Re)init the v6 token & local addr used by link protection */
3023 mac_protect_update_mac_token(mcip
);
3032 * All broadcast and multicast traffic is received only on the default
3033 * group. If we have setup the datapath for a non-default group above
3034 * then move the default group to shared state to allow distribution of
3035 * incoming broadcast traffic to the other groups and dismantle the
3036 * SRSes over the default group.
3038 if (rgroup
!= NULL
) {
3039 if (rgroup
!= default_rgroup
) {
3040 if (default_rgroup
->mrg_state
==
3041 MAC_GROUP_STATE_RESERVED
) {
3042 group_only_mcip
= MAC_GROUP_ONLY_CLIENT(
3044 ASSERT(group_only_mcip
!= NULL
&&
3045 mip
->mi_nactiveclients
> 1);
3047 mac_set_group_state(default_rgroup
,
3048 MAC_GROUP_STATE_SHARED
);
3049 mac_rx_srs_group_setup(group_only_mcip
,
3050 group_only_mcip
->mci_flent
, SRST_LINK
);
3052 cpupart
= mac_pset_find(mrp
, &use_default
);
3053 mac_fanout_setup(group_only_mcip
,
3054 group_only_mcip
->mci_flent
,
3055 MCIP_RESOURCE_PROPS(group_only_mcip
),
3056 mac_rx_deliver
, group_only_mcip
, NULL
,
3058 mac_set_pool_effective(use_default
, cpupart
,
3062 ASSERT(default_rgroup
->mrg_state
==
3063 MAC_GROUP_STATE_SHARED
);
3066 * If we get an exclusive group for a VLAN MAC client we
3067 * need to take the s/w path to make the additional check for
3068 * the vid. Disable polling and set it to s/w classification.
3069 * Similarly for clients that don't have a unicast address.
3071 if (rgroup
->mrg_state
== MAC_GROUP_STATE_RESERVED
&&
3072 (i_mac_flow_vid(flent
) != VLAN_ID_NONE
|| no_unicast
)) {
3073 mac_rx_switch_grp_to_sw(rgroup
);
3076 mac_set_rings_effective(mcip
);
3080 /* Switch the primary back to default group */
3081 if (reloc_pmcip
!= NULL
) {
3082 (void) mac_rx_switch_group(reloc_pmcip
,
3083 reloc_pmcip
->mci_flent
->fe_rx_ring_group
, default_rgroup
);
3085 mac_datapath_teardown(mcip
, flent
, link_type
);
/*
 * mac_datapath_teardown: undo the datapath setup of a MAC client's flow.
 *
 * The MAC perimeter must be held by the caller (asserted below).  The
 * work done is selected by link_type:
 *   - one switch arm only tears down the Rx and Tx SRS groups of the flow
 *     (Tx teardown is passed SRST_FLOW);
 *   - the other arm blocks Tx for the client, removes the client's unicast
 *     address if it has one, removes the flow from mi_flow_tab and waits
 *     for driver upcalls, tears down the Rx/Tx SRSes (Tx teardown is passed
 *     SRST_LINK), and then removes the client from its Rx and Tx ring
 *     groups.  Each group is moved to the state computed by
 *     mac_group_next_state().
 * After the switch, if a non-default Rx group was released, the default Rx
 * group is re-evaluated and may be reserved for its only client.  Finally,
 * when exactly one active client is left on a MAC with dynamic Rx grouping
 * and the flow being torn down is not the primary's, the primary client is
 * switched to the default Rx group unless it has MRP_RX_RINGS set.
 *
 * NOTE(review): the declaration of the third parameter (link_type), the
 * case labels and several closing lines are not visible in this excerpt;
 * the arm descriptions above are inferred from the SRST_FLOW / SRST_LINK
 * arguments -- confirm against the complete source.
 */
3090 mac_datapath_teardown(mac_client_impl_t
*mcip
, flow_entry_t
*flent
,
3093 mac_impl_t
*mip
= mcip
->mci_mip
;
3094 mac_group_t
*group
= NULL
;
3095 mac_client_impl_t
*grp_only_mcip
;
3096 flow_entry_t
*group_only_flent
;
3097 mac_group_t
*default_group
;
3098 boolean_t check_default_group
= B_FALSE
;
3099 mac_group_state_t next_state
;
3100 mac_resource_props_t
*mrp
= MCIP_RESOURCE_PROPS(mcip
);
3102 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mip
));
/* Select the teardown work according to the kind of datapath. */
3104 switch (link_type
) {
3106 mac_rx_srs_group_teardown(flent
, B_FALSE
);
3107 mac_tx_srs_group_teardown(mcip
, flent
, SRST_FLOW
);
3111 /* Stop sending packets */
3112 mac_tx_client_block(mcip
);
3114 /* Stop the packets coming from the H/W */
3115 if (mcip
->mci_unicast
!= NULL
) {
3117 err
= mac_remove_macaddr(mcip
->mci_unicast
);
3119 cmn_err(CE_WARN
, "%s: failed to remove a MAC"
3120 " address because of error 0x%x",
3123 mcip
->mci_unicast
= NULL
;
3126 /* Stop the packets coming from the S/W classifier */
3127 mac_flow_remove(mip
->mi_flow_tab
, flent
, B_FALSE
);
3128 mac_flow_wait(flent
, FLOW_DRIVER_UPCALL
);
3130 /* Now quiesce and destroy all SRS and soft rings */
3131 mac_rx_srs_group_teardown(flent
, B_FALSE
);
3132 mac_tx_srs_group_teardown(mcip
, flent
, SRST_LINK
);
3134 ASSERT((mcip
->mci_flent
== flent
) &&
3135 (flent
->fe_next
== NULL
));
3138 * Release our hold on the group as well. We need
3139 * to check if the shared group has only one client
3140 * left who can use it exclusively. Also, if we
3141 * were the last client, release the group.
/* Rx side: take this client off its Rx ring group, if it has one. */
3143 group
= flent
->fe_rx_ring_group
;
3144 default_group
= MAC_DEFAULT_RX_GROUP(mip
);
3145 if (group
!= NULL
) {
3146 mac_group_remove_client(group
, mcip
);
3147 next_state
= mac_group_next_state(group
,
3148 &grp_only_mcip
, default_group
, B_TRUE
);
3149 if (next_state
== MAC_GROUP_STATE_RESERVED
) {
3151 * Only one client left on this RX group.
3153 ASSERT(grp_only_mcip
!= NULL
);
3154 mac_set_group_state(group
,
3155 MAC_GROUP_STATE_RESERVED
);
3156 group_only_flent
= grp_only_mcip
->mci_flent
;
3159 * The only remaining client has exclusive
3160 * access on the group. Allow it to
3161 * dynamically poll the H/W rings etc.
3163 mac_rx_srs_group_setup(grp_only_mcip
,
3164 group_only_flent
, SRST_LINK
);
3165 mac_fanout_setup(grp_only_mcip
,
3167 MCIP_RESOURCE_PROPS(grp_only_mcip
),
3168 mac_rx_deliver
, grp_only_mcip
, NULL
, NULL
);
3169 mac_rx_group_unmark(group
, MR_INCIPIENT
);
3170 mac_set_rings_effective(grp_only_mcip
);
3171 } else if (next_state
== MAC_GROUP_STATE_REGISTERED
) {
3173 * This is a non-default group being freed up.
3174 * We need to reevaluate the default group
3175 * to see if the primary client can get
3176 * exclusive access to the default group.
3178 ASSERT(group
!= MAC_DEFAULT_RX_GROUP(mip
));
3179 if (mrp
->mrp_mask
& MRP_RX_RINGS
) {
3180 MAC_RX_GRP_RELEASED(mip
);
3181 if (mip
->mi_rx_group_type
==
3182 MAC_GROUP_TYPE_DYNAMIC
) {
3183 MAC_RX_RING_RELEASED(mip
,
3184 group
->mrg_cur_count
);
3187 mac_release_rx_group(mcip
, group
);
3188 mac_set_group_state(group
,
3189 MAC_GROUP_STATE_REGISTERED
);
3190 check_default_group
= B_TRUE
;
3192 ASSERT(next_state
== MAC_GROUP_STATE_SHARED
);
3193 mac_set_group_state(group
,
3194 MAC_GROUP_STATE_SHARED
);
3195 mac_rx_group_unmark(group
, MR_CONDEMNED
);
3197 flent
->fe_rx_ring_group
= NULL
;
3200 * Remove the client from the TX group. Additionally, if
3201 * this a non-default group, then we also need to release
/* Tx side: same bookkeeping, this time against the default Tx group. */
3204 group
= flent
->fe_tx_ring_group
;
3205 default_group
= MAC_DEFAULT_TX_GROUP(mip
);
3206 if (group
!= NULL
) {
3207 mac_group_remove_client(group
, mcip
);
3208 next_state
= mac_group_next_state(group
,
3209 &grp_only_mcip
, default_group
, B_FALSE
);
3210 if (next_state
== MAC_GROUP_STATE_REGISTERED
) {
3211 if (group
!= default_group
) {
3212 if (mrp
->mrp_mask
& MRP_TX_RINGS
) {
3213 MAC_TX_GRP_RELEASED(mip
);
3214 if (mip
->mi_tx_group_type
==
3215 MAC_GROUP_TYPE_DYNAMIC
) {
3216 MAC_TX_RING_RELEASED(
3221 mac_release_tx_group(mcip
, group
);
3223 * If the default group is reserved,
3224 * then we need to set the effective
3225 * rings as we would have given
3226 * back some rings when the group
3229 if (mip
->mi_tx_group_type
==
3230 MAC_GROUP_TYPE_DYNAMIC
&&
3231 default_group
->mrg_state
==
3232 MAC_GROUP_STATE_RESERVED
) {
3234 MAC_GROUP_ONLY_CLIENT
3236 mac_set_rings_effective(
3245 * Stop all the rings except the
/* Walk the group's ring list; mrg_cur_count gives the number of entries. */
3248 ringcnt
= group
->mrg_cur_count
;
3249 ring
= group
->mrg_rings
;
3250 for (cnt
= 0; cnt
< ringcnt
; cnt
++) {
3251 if (ring
->mr_state
==
3254 mip
->mi_default_tx_ring
) {
3255 mac_stop_ring(ring
);
3258 ring
= ring
->mr_next
;
3261 } else if (next_state
== MAC_GROUP_STATE_RESERVED
) {
3262 mac_set_rings_effective(grp_only_mcip
);
3264 flent
->fe_tx_ring_group
= NULL
;
3265 group
->mrg_state
= next_state
;
3274 * The mac client using the default group gets exclusive access to the
3275 * default group if and only if it is the sole client on the entire
3276 * mip. If so set the group state to reserved, and set up the SRSes
3277 * over the default group.
/* check_default_group is set above only when a non-default Rx group was released. */
3279 if (check_default_group
) {
3280 default_group
= MAC_DEFAULT_RX_GROUP(mip
);
3281 ASSERT(default_group
->mrg_state
== MAC_GROUP_STATE_SHARED
);
3282 next_state
= mac_group_next_state(default_group
,
3283 &grp_only_mcip
, default_group
, B_TRUE
);
3284 if (next_state
== MAC_GROUP_STATE_RESERVED
) {
3285 ASSERT(grp_only_mcip
!= NULL
&&
3286 mip
->mi_nactiveclients
== 1);
3287 mac_set_group_state(default_group
,
3288 MAC_GROUP_STATE_RESERVED
);
3289 mac_rx_srs_group_setup(grp_only_mcip
,
3290 grp_only_mcip
->mci_flent
, SRST_LINK
);
3291 mac_fanout_setup(grp_only_mcip
,
3292 grp_only_mcip
->mci_flent
,
3293 MCIP_RESOURCE_PROPS(grp_only_mcip
), mac_rx_deliver
,
3294 grp_only_mcip
, NULL
, NULL
);
3295 mac_rx_group_unmark(default_group
, MR_INCIPIENT
);
3296 mac_set_rings_effective(grp_only_mcip
);
3301 * If the primary is the only one left and the MAC supports
3302 * dynamic grouping, we need to see if the primary needs to
3303 * be moved to the default group so that it can use all the
/* Only considered when the flow torn down here is not the primary's own. */
3306 if (!(flent
->fe_type
& FLOW_PRIMARY_MAC
) &&
3307 mip
->mi_nactiveclients
== 1 &&
3308 mip
->mi_rx_group_type
== MAC_GROUP_TYPE_DYNAMIC
) {
3309 default_group
= MAC_DEFAULT_RX_GROUP(mip
);
3310 grp_only_mcip
= mac_primary_client_handle(mip
);
3311 if (grp_only_mcip
== NULL
)
3313 group_only_flent
= grp_only_mcip
->mci_flent
;
3314 mrp
= MCIP_RESOURCE_PROPS(grp_only_mcip
);
3316 * If the primary has an explicit property set, leave it
3319 if (mrp
->mrp_mask
& MRP_RX_RINGS
)
3322 * Switch the primary to the default group.
3324 (void) mac_rx_switch_group(grp_only_mcip
,
3325 group_only_flent
->fe_rx_ring_group
, default_group
);
3329 /* DATAPATH TEAR DOWN ROUTINES (SRS and FANOUT teardown) */
3332 mac_srs_fanout_list_free(mac_soft_ring_set_t
*mac_srs
)
3334 if (mac_srs
->srs_type
& SRST_TX
) {
3337 ASSERT(mac_srs
->srs_tcp_soft_rings
== NULL
);
3338 ASSERT(mac_srs
->srs_udp_soft_rings
== NULL
);
3339 ASSERT(mac_srs
->srs_oth_soft_rings
== NULL
);
3340 ASSERT(mac_srs
->srs_tx_soft_rings
!= NULL
);
3341 kmem_free(mac_srs
->srs_tx_soft_rings
,
3342 sizeof (mac_soft_ring_t
*) * MAX_RINGS_PER_GROUP
);
3343 mac_srs
->srs_tx_soft_rings
= NULL
;
3344 tx
= &mac_srs
->srs_tx
;
3345 if (tx
->st_soft_rings
!= NULL
) {
3346 kmem_free(tx
->st_soft_rings
,
3347 sizeof (mac_soft_ring_t
*) * MAX_RINGS_PER_GROUP
);
3350 ASSERT(mac_srs
->srs_tx_soft_rings
== NULL
);
3351 ASSERT(mac_srs
->srs_tcp_soft_rings
!= NULL
);
3352 kmem_free(mac_srs
->srs_tcp_soft_rings
,
3353 sizeof (mac_soft_ring_t
*) * MAX_SR_FANOUT
);
3354 mac_srs
->srs_tcp_soft_rings
= NULL
;
3355 ASSERT(mac_srs
->srs_udp_soft_rings
!= NULL
);
3356 kmem_free(mac_srs
->srs_udp_soft_rings
,
3357 sizeof (mac_soft_ring_t
*) * MAX_SR_FANOUT
);
3358 mac_srs
->srs_udp_soft_rings
= NULL
;
3359 ASSERT(mac_srs
->srs_oth_soft_rings
!= NULL
);
3360 kmem_free(mac_srs
->srs_oth_soft_rings
,
3361 sizeof (mac_soft_ring_t
*) * MAX_SR_FANOUT
);
3362 mac_srs
->srs_oth_soft_rings
= NULL
;
3367 * An RX SRS is attached to at most one mac_ring.
3368 * A TX SRS has no rings.
3371 mac_srs_ring_free(mac_soft_ring_set_t
*mac_srs
)
3373 mac_client_impl_t
*mcip
;
3375 flow_entry_t
*flent
;
3377 ring
= mac_srs
->srs_ring
;
3378 if (mac_srs
->srs_type
& SRST_TX
) {
3379 ASSERT(ring
== NULL
);
3387 * Broadcast flows don't have a client impl association, but they
3388 * use only soft rings.
3390 flent
= mac_srs
->srs_flent
;
3391 mcip
= flent
->fe_mcip
;
3392 ASSERT(mcip
!= NULL
);
3394 ring
->mr_classify_type
= MAC_NO_CLASSIFIER
;
3395 ring
->mr_srs
= NULL
;
3399 * Physical unlink and free of the data structures happen below. This is
3400 * driven from mac_flow_destroy(), on the last refrele of a flow.
3402 * Assumes Rx srs is 1-1 mapped with a ring.
3405 mac_srs_free(mac_soft_ring_set_t
*mac_srs
)
3407 ASSERT(mac_srs
->srs_mcip
== NULL
||
3408 MAC_PERIM_HELD((mac_handle_t
)mac_srs
->srs_mcip
->mci_mip
));
3409 ASSERT((mac_srs
->srs_state
& (SRS_CONDEMNED
| SRS_CONDEMNED_DONE
|
3410 SRS_PROC
| SRS_PROC_FAST
)) == (SRS_CONDEMNED
| SRS_CONDEMNED_DONE
));
3412 mac_pkt_drop(NULL
, NULL
, mac_srs
->srs_first
, B_FALSE
);
3413 mac_srs_ring_free(mac_srs
);
3414 mac_srs_soft_rings_free(mac_srs
);
3415 mac_srs_fanout_list_free(mac_srs
);
3417 mac_srs
->srs_bw
= NULL
;
3418 mac_srs_stat_delete(mac_srs
);
3419 kmem_cache_free(mac_srs_cache
, mac_srs
);
3423 mac_srs_soft_rings_quiesce(mac_soft_ring_set_t
*mac_srs
, uint_t s_ring_flag
)
3425 mac_soft_ring_t
*softring
;
3427 ASSERT(MUTEX_HELD(&mac_srs
->srs_lock
));
3429 mac_srs_soft_rings_signal(mac_srs
, s_ring_flag
);
3430 if (s_ring_flag
== S_RING_CONDEMNED
) {
3431 while (mac_srs
->srs_soft_ring_condemned_count
!=
3432 mac_srs
->srs_soft_ring_count
)
3433 cv_wait(&mac_srs
->srs_async
, &mac_srs
->srs_lock
);
3435 while (mac_srs
->srs_soft_ring_quiesced_count
!=
3436 mac_srs
->srs_soft_ring_count
)
3437 cv_wait(&mac_srs
->srs_async
, &mac_srs
->srs_lock
);
3439 mutex_exit(&mac_srs
->srs_lock
);
3441 for (softring
= mac_srs
->srs_soft_ring_head
; softring
!= NULL
;
3442 softring
= softring
->s_ring_next
) {
3443 (void) untimeout(softring
->s_ring_tid
);
3444 softring
->s_ring_tid
= NULL
;
3447 (void) untimeout(mac_srs
->srs_tid
);
3448 mac_srs
->srs_tid
= NULL
;
3450 mutex_enter(&mac_srs
->srs_lock
);
3454 * The block comment above mac_rx_classify_flow_state_change explains the
3455 * background. At this point upcalls from the driver (both hardware classified
3456 * and software classified) have been cut off. We now need to quiesce the
3457 * SRS worker, poll, and softring threads. The SRS worker thread serves as
3458 * the master controller. The steps involved are described below in the function
3461 mac_srs_worker_quiesce(mac_soft_ring_set_t
*mac_srs
)
3464 uint_t srs_poll_wait_flag
;
3466 ASSERT(MUTEX_HELD(&mac_srs
->srs_lock
));
3467 ASSERT(mac_srs
->srs_state
& (SRS_CONDEMNED
| SRS_QUIESCE
));
3469 if (mac_srs
->srs_state
& SRS_CONDEMNED
) {
3470 s_ring_flag
= S_RING_CONDEMNED
;
3471 srs_poll_wait_flag
= SRS_POLL_THR_EXITED
;
3473 s_ring_flag
= S_RING_QUIESCE
;
3474 srs_poll_wait_flag
= SRS_POLL_THR_QUIESCED
;
3478 * In the case of Rx SRS wait till the poll thread is done.
3480 if ((mac_srs
->srs_type
& SRST_TX
) == 0 &&
3481 mac_srs
->srs_poll_thr
!= NULL
) {
3482 while (!(mac_srs
->srs_state
& srs_poll_wait_flag
))
3483 cv_wait(&mac_srs
->srs_async
, &mac_srs
->srs_lock
);
3486 * Turn off polling as part of the quiesce operation.
3488 MAC_SRS_POLLING_OFF(mac_srs
);
3489 mac_srs
->srs_state
&= ~(SRS_POLLING
| SRS_GET_PKTS
);
3493 * Then signal the soft ring worker threads to quiesce or quit
3494 * as needed and then wait till that happens.
3496 mac_srs_soft_rings_quiesce(mac_srs
, s_ring_flag
);
3498 if (mac_srs
->srs_state
& SRS_CONDEMNED
)
3499 mac_srs
->srs_state
|= (SRS_QUIESCE_DONE
| SRS_CONDEMNED_DONE
);
3501 mac_srs
->srs_state
|= SRS_QUIESCE_DONE
;
3502 cv_signal(&mac_srs
->srs_quiesce_done_cv
);
3506 * Signal an SRS to start a temporary quiesce, or permanent removal, or restart
3507 * a quiesced SRS by setting the appropriate flags and signaling the SRS worker
3508 * or poll thread. This function is internal to the quiescing logic and is
3509 * called internally from the SRS quiesce or flow quiesce or client quiesce
3510 * higher level functions.
3513 mac_srs_signal(mac_soft_ring_set_t
*mac_srs
, uint_t srs_flag
)
3517 ring
= mac_srs
->srs_ring
;
3518 ASSERT(ring
== NULL
|| ring
->mr_refcnt
== 0);
3520 if (srs_flag
== SRS_CONDEMNED
) {
3522 * The SRS is going away. We need to unbind the SRS and SR
3523 * threads before removing from the global SRS list. Otherwise
3524 * there is a small window where the cpu reconfig callbacks
3525 * may miss the SRS in the list walk and DR could fail since
3526 * there are still bound threads.
3528 mac_srs_threads_unbind(mac_srs
);
3529 mac_srs_remove_glist(mac_srs
);
3532 * Wakeup the SRS worker and poll threads.
3534 mutex_enter(&mac_srs
->srs_lock
);
3535 mac_srs
->srs_state
|= srs_flag
;
3536 cv_signal(&mac_srs
->srs_async
);
3537 cv_signal(&mac_srs
->srs_cv
);
3538 mutex_exit(&mac_srs
->srs_lock
);
3542 * In the Rx side, the quiescing is done bottom up. After the Rx upcalls
3543 * from the driver are done, then the Rx SRS is quiesced and only then can
3544 * we signal the soft rings. Thus this function can't be called arbitrarily
3545 * without satisfying the prerequisites. On the Tx side, the threads from
3546 * top need to be quiesced, then the Tx SRS and only then can we signal the
3550 mac_srs_soft_rings_signal(mac_soft_ring_set_t
*mac_srs
, uint_t sr_flag
)
3552 mac_soft_ring_t
*softring
;
3554 for (softring
= mac_srs
->srs_soft_ring_head
; softring
!= NULL
;
3555 softring
= softring
->s_ring_next
)
3556 mac_soft_ring_signal(softring
, sr_flag
);
3560 * The block comment above mac_rx_classify_flow_state_change explains the
3561 * background. At this point the SRS is quiesced and we need to restart the
3562 * SRS worker, poll, and softring threads. The SRS worker thread serves as
3563 * the master controller. The steps involved are described below in the function
3566 mac_srs_worker_restart(mac_soft_ring_set_t
*mac_srs
)
3568 boolean_t iam_rx_srs
;
3569 mac_soft_ring_t
*softring
;
3571 ASSERT(MUTEX_HELD(&mac_srs
->srs_lock
));
3572 if ((mac_srs
->srs_type
& SRST_TX
) != 0) {
3573 iam_rx_srs
= B_FALSE
;
3574 ASSERT((mac_srs
->srs_state
&
3575 (SRS_POLL_THR_QUIESCED
| SRS_QUIESCE_DONE
| SRS_QUIESCE
)) ==
3576 (SRS_QUIESCE_DONE
| SRS_QUIESCE
));
3578 iam_rx_srs
= B_TRUE
;
3579 ASSERT((mac_srs
->srs_state
&
3580 (SRS_QUIESCE_DONE
| SRS_QUIESCE
)) ==
3581 (SRS_QUIESCE_DONE
| SRS_QUIESCE
));
3582 if (mac_srs
->srs_poll_thr
!= NULL
) {
3583 ASSERT((mac_srs
->srs_state
& SRS_POLL_THR_QUIESCED
) ==
3584 SRS_POLL_THR_QUIESCED
);
3589 * Signal any quiesced soft ring workers to restart and wait for the
3590 * soft ring down count to come down to zero.
3592 if (mac_srs
->srs_soft_ring_quiesced_count
!= 0) {
3593 for (softring
= mac_srs
->srs_soft_ring_head
; softring
!= NULL
;
3594 softring
= softring
->s_ring_next
) {
3595 if (!(softring
->s_ring_state
& S_RING_QUIESCE
))
3597 mac_soft_ring_signal(softring
, S_RING_RESTART
);
3599 while (mac_srs
->srs_soft_ring_quiesced_count
!= 0)
3600 cv_wait(&mac_srs
->srs_async
, &mac_srs
->srs_lock
);
3603 mac_srs
->srs_state
&= ~(SRS_QUIESCE_DONE
| SRS_QUIESCE
| SRS_RESTART
);
3604 if (iam_rx_srs
&& mac_srs
->srs_poll_thr
!= NULL
) {
3606 * Signal the poll thread and ask it to restart. Wait till it
3607 * actually restarts and the SRS_POLL_THR_QUIESCED flag gets
3610 mac_srs
->srs_state
|= SRS_POLL_THR_RESTART
;
3611 cv_signal(&mac_srs
->srs_cv
);
3612 while (mac_srs
->srs_state
& SRS_POLL_THR_QUIESCED
)
3613 cv_wait(&mac_srs
->srs_async
, &mac_srs
->srs_lock
);
3614 ASSERT(!(mac_srs
->srs_state
& SRS_POLL_THR_RESTART
));
3616 /* Wake up any waiter waiting for the restart to complete */
3617 mac_srs
->srs_state
|= SRS_RESTART_DONE
;
3618 cv_signal(&mac_srs
->srs_quiesce_done_cv
);
3622 mac_srs_worker_unbind(mac_soft_ring_set_t
*mac_srs
)
3624 mutex_enter(&mac_srs
->srs_lock
);
3625 if (!(mac_srs
->srs_state
& SRS_WORKER_BOUND
)) {
3626 ASSERT(mac_srs
->srs_worker_cpuid
== -1);
3627 mutex_exit(&mac_srs
->srs_lock
);
3631 mac_srs
->srs_worker_cpuid
= -1;
3632 mac_srs
->srs_state
&= ~SRS_WORKER_BOUND
;
3633 thread_affinity_clear(mac_srs
->srs_worker
);
3634 mutex_exit(&mac_srs
->srs_lock
);
3638 mac_srs_poll_unbind(mac_soft_ring_set_t
*mac_srs
)
3640 mutex_enter(&mac_srs
->srs_lock
);
3641 if (mac_srs
->srs_poll_thr
== NULL
||
3642 (mac_srs
->srs_state
& SRS_POLL_BOUND
) == 0) {
3643 ASSERT(mac_srs
->srs_poll_cpuid
== -1);
3644 mutex_exit(&mac_srs
->srs_lock
);
3648 mac_srs
->srs_poll_cpuid
= -1;
3649 mac_srs
->srs_state
&= ~SRS_POLL_BOUND
;
3650 thread_affinity_clear(mac_srs
->srs_poll_thr
);
3651 mutex_exit(&mac_srs
->srs_lock
);
3655 mac_srs_threads_unbind(mac_soft_ring_set_t
*mac_srs
)
3657 mac_soft_ring_t
*soft_ring
;
3659 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mac_srs
->srs_mcip
->mci_mip
));
3661 mutex_enter(&cpu_lock
);
3662 mac_srs_worker_unbind(mac_srs
);
3663 if (!(mac_srs
->srs_type
& SRST_TX
))
3664 mac_srs_poll_unbind(mac_srs
);
3666 for (soft_ring
= mac_srs
->srs_soft_ring_head
; soft_ring
!= NULL
;
3667 soft_ring
= soft_ring
->s_ring_next
) {
3668 mac_soft_ring_unbind(soft_ring
);
3670 mutex_exit(&cpu_lock
);
3674 * When a CPU is going away, unbind all MAC threads which are bound
3675 * to that CPU. The affinity of the thread to the CPU is saved to allow
3676 * the thread to be rebound to the CPU if it comes back online.
3679 mac_walk_srs_and_unbind(int cpuid
)
3681 mac_soft_ring_set_t
*mac_srs
;
3682 mac_soft_ring_t
*soft_ring
;
3684 rw_enter(&mac_srs_g_lock
, RW_READER
);
3686 if ((mac_srs
= mac_srs_g_list
) == NULL
)
3689 for (; mac_srs
!= NULL
; mac_srs
= mac_srs
->srs_next
) {
3690 if (mac_srs
->srs_worker_cpuid
== cpuid
) {
3691 mac_srs
->srs_worker_cpuid_save
= cpuid
;
3692 mac_srs_worker_unbind(mac_srs
);
3695 if (!(mac_srs
->srs_type
& SRST_TX
)) {
3696 if (mac_srs
->srs_poll_cpuid
== cpuid
) {
3697 mac_srs
->srs_poll_cpuid_save
= cpuid
;
3698 mac_srs_poll_unbind(mac_srs
);
3702 /* Next tackle the soft rings associated with the srs */
3703 mutex_enter(&mac_srs
->srs_lock
);
3704 for (soft_ring
= mac_srs
->srs_soft_ring_head
; soft_ring
!= NULL
;
3705 soft_ring
= soft_ring
->s_ring_next
) {
3706 if (soft_ring
->s_ring_cpuid
== cpuid
) {
3707 soft_ring
->s_ring_cpuid_save
= cpuid
;
3708 mac_soft_ring_unbind(soft_ring
);
3711 mutex_exit(&mac_srs
->srs_lock
);
3714 rw_exit(&mac_srs_g_lock
);
3717 /* TX SETUP and TEARDOWN ROUTINES */
3720 * XXXHIO need to make sure the two mac_tx_srs_{add,del}_ring()
3721 * handle the case where the number of rings is one. I.e. there is
3722 * a ring pointed to by mac_srs->srs_tx_arg2.
3725 mac_tx_srs_add_ring(mac_soft_ring_set_t
*mac_srs
, mac_ring_t
*tx_ring
)
3727 mac_client_impl_t
*mcip
= mac_srs
->srs_mcip
;
3728 mac_soft_ring_t
*soft_ring
;
3729 int count
= mac_srs
->srs_tx_ring_count
;
3730 uint32_t soft_ring_type
= ST_RING_TX
;
3733 ASSERT(mac_srs
->srs_state
& SRS_QUIESCE
);
3734 ring_info
= mac_hwring_getinfo((mac_ring_handle_t
)tx_ring
);
3735 if (mac_tx_serialize
|| (ring_info
& MAC_RING_TX_SERIALIZE
))
3736 soft_ring_type
|= ST_RING_WORKER_ONLY
;
3737 soft_ring
= mac_soft_ring_create(count
, 0,
3738 soft_ring_type
, maxclsyspri
, mcip
, mac_srs
, -1,
3739 NULL
, mcip
, (mac_resource_handle_t
)tx_ring
);
3740 mac_srs
->srs_tx_ring_count
++;
3741 mac_srs_update_fanout_list(mac_srs
);
3743 * put this soft ring in quiesce mode too so when we restart
3744 * all soft rings in the srs are in the same state.
3746 mac_soft_ring_signal(soft_ring
, S_RING_QUIESCE
);
3750 mac_soft_ring_remove(mac_soft_ring_set_t
*mac_srs
, mac_soft_ring_t
*softring
)
3754 mutex_enter(&mac_srs
->srs_lock
);
3755 sringcnt
= mac_srs
->srs_soft_ring_count
;
3756 ASSERT(sringcnt
> 0);
3757 mac_soft_ring_signal(softring
, S_RING_CONDEMNED
);
3759 ASSERT(mac_srs
->srs_soft_ring_condemned_count
== 0);
3760 while (mac_srs
->srs_soft_ring_condemned_count
!= 1)
3761 cv_wait(&mac_srs
->srs_async
, &mac_srs
->srs_lock
);
3763 if (softring
== mac_srs
->srs_soft_ring_head
) {
3764 mac_srs
->srs_soft_ring_head
= softring
->s_ring_next
;
3765 if (mac_srs
->srs_soft_ring_head
!= NULL
) {
3766 mac_srs
->srs_soft_ring_head
->s_ring_prev
= NULL
;
3768 mac_srs
->srs_soft_ring_tail
= NULL
;
3771 softring
->s_ring_prev
->s_ring_next
=
3772 softring
->s_ring_next
;
3773 if (softring
->s_ring_next
!= NULL
) {
3774 softring
->s_ring_next
->s_ring_prev
=
3775 softring
->s_ring_prev
;
3777 mac_srs
->srs_soft_ring_tail
=
3778 softring
->s_ring_prev
;
3781 mac_srs
->srs_soft_ring_count
--;
3783 mac_srs
->srs_soft_ring_condemned_count
--;
3784 mutex_exit(&mac_srs
->srs_lock
);
3786 mac_soft_ring_free(softring
);
3790 mac_tx_srs_del_ring(mac_soft_ring_set_t
*mac_srs
, mac_ring_t
*tx_ring
)
3793 mac_soft_ring_t
*soft_ring
, *remove_sring
;
3794 mac_client_impl_t
*mcip
= mac_srs
->srs_mcip
;
3796 mutex_enter(&mac_srs
->srs_lock
);
3797 for (i
= 0; i
< mac_srs
->srs_tx_ring_count
; i
++) {
3798 soft_ring
= mac_srs
->srs_tx_soft_rings
[i
];
3799 if (soft_ring
->s_ring_tx_arg2
== tx_ring
)
3802 mutex_exit(&mac_srs
->srs_lock
);
3803 ASSERT(i
< mac_srs
->srs_tx_ring_count
);
3804 remove_sring
= soft_ring
;
3806 * In the case of aggr, the soft ring associated with a Tx ring
3807 * is also stored in st_soft_rings[] array. That entry should
3810 if (mcip
->mci_state_flags
& MCIS_IS_AGGR
) {
3811 mac_srs_tx_t
*tx
= &mac_srs
->srs_tx
;
3813 ASSERT(tx
->st_soft_rings
[tx_ring
->mr_index
] == remove_sring
);
3814 tx
->st_soft_rings
[tx_ring
->mr_index
] = NULL
;
3816 mac_soft_ring_remove(mac_srs
, remove_sring
);
3817 mac_srs_update_fanout_list(mac_srs
);
3821 * mac_tx_srs_setup():
3822 * Used to setup Tx rings. If no free Tx ring is available, then default
/*
 * mac_tx_srs_setup: configure the Tx SRS of a flow (flent->fe_tx_srs)
 * for the Tx ring group recorded in flent->fe_tx_ring_group.
 *
 * What the visible code does:
 *   - With at most one ring and a client that is not an aggr, the ring is
 *     started if it is not MR_INUSE, stored in st_arg2, the Tx SRS stats
 *     are recreated and the mode becomes SRS_TX_BW (bandwidth control),
 *     SRS_TX_SERIALIZE (mac_tx_serialize set or the ring reports
 *     MAC_RING_TX_SERIALIZE) or SRS_TX_DEFAULT.
 *   - Otherwise one Tx soft ring is created per ring of the group, using
 *     the loop index as the soft ring id, and the fanout list is updated.
 *     The mode is one of the aggr/fanout (optionally bandwidth) modes.
 *   - st_func is looked up from the chosen mode with mac_tx_get_func().
 *
 * NOTE(review): several lines are not visible in this excerpt (local
 * declarations, the handling of a NULL group, some case labels and the
 * condition guarding the MAC_CAPAB_AGGR query).  The summary above covers
 * only what can be read here -- confirm against the complete source.
 */
3826 mac_tx_srs_setup(mac_client_impl_t
*mcip
, flow_entry_t
*flent
)
3828 mac_impl_t
*mip
= mcip
->mci_mip
;
3829 mac_soft_ring_set_t
*tx_srs
= flent
->fe_tx_srs
;
3831 int tx_ring_count
= 0;
3832 uint32_t soft_ring_type
;
3833 mac_group_t
*grp
= NULL
;
3835 mac_srs_tx_t
*tx
= &tx_srs
->srs_tx
;
3837 uint_t ring_info
= 0;
3839 is_aggr
= (mcip
->mci_state_flags
& MCIS_IS_AGGR
) != 0;
3840 grp
= flent
->fe_tx_ring_group
;
3842 ring
= (mac_ring_t
*)mip
->mi_default_tx_ring
;
3845 tx_ring_count
= grp
->mrg_cur_count
;
3846 ring
= grp
->mrg_rings
;
3848 * An attempt is made to reserve 'tx_ring_count' number
3849 * of Tx rings. If tx_ring_count is 0, default Tx ring
3850 * is used. If it is 1, an attempt is made to reserve one
3851 * Tx ring. In both the cases, the ring information is
3852 * stored in Tx SRS. If multiple Tx rings are specified,
3853 * then each Tx ring will have a Tx-side soft ring. All
3854 * these soft rings will be hang off Tx SRS.
3856 switch (grp
->mrg_state
) {
3857 case MAC_GROUP_STATE_SHARED
:
3858 case MAC_GROUP_STATE_RESERVED
:
/* Single-ring, non-aggr case: the ring is driven directly from the Tx SRS. */
3859 if (tx_ring_count
<= 1 && !is_aggr
) {
3862 ring
->mr_state
!= MR_INUSE
) {
3863 (void) mac_start_ring(ring
);
3864 ring_info
= mac_hwring_getinfo(
3865 (mac_ring_handle_t
)ring
);
3867 tx
->st_arg2
= (void *)ring
;
3868 mac_tx_srs_stat_recreate(tx_srs
, B_FALSE
);
3869 if (tx_srs
->srs_type
& SRST_BW_CONTROL
) {
3870 tx
->st_mode
= SRS_TX_BW
;
3871 } else if (mac_tx_serialize
||
3872 (ring_info
& MAC_RING_TX_SERIALIZE
)) {
3873 tx
->st_mode
= SRS_TX_SERIALIZE
;
3875 tx
->st_mode
= SRS_TX_DEFAULT
;
/* Multi-ring or aggr case: one Tx soft ring is created per ring below. */
3879 soft_ring_type
= ST_RING_TX
;
3880 if (tx_srs
->srs_type
& SRST_BW_CONTROL
) {
3881 tx
->st_mode
= is_aggr
?
3882 SRS_TX_BW_AGGR
: SRS_TX_BW_FANOUT
;
3884 tx
->st_mode
= is_aggr
? SRS_TX_AGGR
:
3887 for (i
= 0; i
< tx_ring_count
; i
++) {
3888 ASSERT(ring
!= NULL
);
3889 switch (ring
->mr_state
) {
3892 ASSERT(ring
->mr_srs
== NULL
);
3894 if (ring
->mr_state
!= MR_INUSE
)
3895 (void) mac_start_ring(ring
);
3896 ring_info
= mac_hwring_getinfo(
3897 (mac_ring_handle_t
)ring
);
3898 if (mac_tx_serialize
|| (ring_info
&
3899 MAC_RING_TX_SERIALIZE
)) {
3901 ST_RING_WORKER_ONLY
;
3903 (void) mac_soft_ring_create(i
, 0,
3904 soft_ring_type
, maxclsyspri
,
3905 mcip
, tx_srs
, -1, NULL
, mcip
,
3906 (mac_resource_handle_t
)ring
);
3910 "srs_setup: mcip = %p "
3911 "trying to add UNKNOWN ring = %p\n",
3912 (void *)mcip
, (void *)ring
);
3915 ring
= ring
->mr_next
;
3917 mac_srs_update_fanout_list(tx_srs
);
/* The transmit entry point is derived from the mode chosen above. */
3923 tx
->st_func
= mac_tx_get_func(tx
->st_mode
);
3925 VERIFY(i_mac_capab_get((mac_handle_t
)mip
,
3926 MAC_CAPAB_AGGR
, &tx
->st_capab_aggr
));
3928 DTRACE_PROBE3(tx__srs___setup__return
, mac_soft_ring_set_t
*, tx_srs
,
3929 int, tx
->st_mode
, int, tx_srs
->srs_tx_ring_count
);
3933 * Update the fanout of a client if its recorded link speed doesn't match
3934 * its current link speed.
/*
 * mac_fanout_recompute_client: redo the Rx fanout of one MAC client when
 * the link speed it reports differs from the speed stored in its flow
 * entry (fe_nic_speed).
 *
 * The MAC perimeter of the client's MAC must be held (asserted below).
 * A reported speed of 0 is ignored.  Otherwise the wanted number of soft
 * rings is computed with mac_compute_soft_ring_count(), bounded by the
 * CPU count of 'cpupart' (or by ncpus when cpupart is NULL) and raised to
 * at least 1.  mac_fanout_setup() is called only when that number differs
 * from mc_rx_fanout_cnt of the first Rx SRS of the flow.
 *
 * NOTE(review): the second argument of mac_client_stat_get() is on a line
 * that is not visible in this excerpt; presumably it selects the
 * interface speed statistic -- confirm against the complete source.
 */
3937 mac_fanout_recompute_client(mac_client_impl_t
*mcip
, cpupart_t
*cpupart
)
3939 uint64_t link_speed
;
3940 mac_resource_props_t
*mcip_mrp
;
3941 flow_entry_t
*flent
= mcip
->mci_flent
;
3942 mac_soft_ring_set_t
*rx_srs
;
3943 mac_cpus_t
*srs_cpu
;
3944 int soft_ring_count
, maxcpus
;
3946 ASSERT(MAC_PERIM_HELD((mac_handle_t
)mcip
->mci_mip
));
3948 link_speed
= mac_client_stat_get(mcip
->mci_flent
->fe_mcip
,
/* Act only on a non-zero speed that differs from the recorded one. */
3951 if ((link_speed
!= 0) &&
3952 (link_speed
!= mcip
->mci_flent
->fe_nic_speed
)) {
3953 mcip_mrp
= MCIP_RESOURCE_PROPS(mcip
);
3955 * Before calling mac_fanout_setup(), check to see if
3956 * the SRSes already have the right number of soft
3957 * rings. mac_fanout_setup() is a heavy duty operation
3958 * where new cpu bindings are done for SRS and soft
3959 * ring threads and interrupts re-targeted.
3961 maxcpus
= (cpupart
!= NULL
) ? cpupart
->cp_ncpus
: ncpus
;
3962 soft_ring_count
= mac_compute_soft_ring_count(flent
,
3963 flent
->fe_rx_srs_cnt
- 1, maxcpus
);
3965 * If soft_ring_count returned by
3966 * mac_compute_soft_ring_count() is 0, bump it
3967 * up by 1 because we always have atleast one
3968 * TCP, UDP, and OTH soft ring associated with
3971 soft_ring_count
= (soft_ring_count
== 0) ?
3972 1 : soft_ring_count
;
/* The comparison is made against the first Rx SRS of the flow only. */
3973 rx_srs
= flent
->fe_rx_srs
[0];
3974 srs_cpu
= &rx_srs
->srs_cpu
;
3975 if (soft_ring_count
!= srs_cpu
->mc_rx_fanout_cnt
) {
3976 mac_fanout_setup(mcip
, flent
, mcip_mrp
,
3977 mac_rx_deliver
, mcip
, NULL
, cpupart
);
3983 * Walk through the list of mac clients for the MAC.
3984 * For each active mac client, recompute the number of soft rings
3985 * associated with every client, only if current speed is different
3986 * from the speed that was previously used for soft ring computation.
3987 * If the cable is disconnected while the NIC is started, we would get
3988 * notification with speed set to 0. We do not recompute in that case.
/*
 * mac_fanout_recompute: recompute the Rx fanout of the clients of a MAC.
 *
 * The MAC perimeter is entered for the duration of the call.  The body
 * has an early-exit test for a MAC that is a VNIC (MIS_IS_VNIC) or whose
 * link state is not LINK_STATE_UP; in that case the perimeter is left
 * again right away.  In the client loop there is a test for clients that
 * are share bound (MCIS_SHARE_BOUND) or have no datapath set up.  For a
 * client that is processed, the CPU partition is looked up with
 * mac_pset_find(), the fanout is recomputed with
 * mac_fanout_recompute_client() and the effective pool properties are
 * updated with mac_set_pool_effective().
 *
 * NOTE(review): the statements that follow the two tests above (presumably
 * a return and a continue), the declaration of 'cpupart', and original
 * lines 4013 and 4017, which surround the mac_pset_find() sequence, are
 * not visible in this excerpt; any locking done there cannot be confirmed
 * from here -- check the complete source.
 */
3991 mac_fanout_recompute(mac_impl_t
*mip
)
3993 mac_client_impl_t
*mcip
;
3995 boolean_t use_default
;
3996 mac_resource_props_t
*mrp
, *emrp
;
3998 i_mac_perim_enter(mip
);
/* Early-exit test: the MAC is a VNIC or its link is not up. */
3999 if ((mip
->mi_state_flags
& MIS_IS_VNIC
) != 0 ||
4000 mip
->mi_linkstate
!= LINK_STATE_UP
) {
4001 i_mac_perim_exit(mip
);
4005 for (mcip
= mip
->mi_clients_list
; mcip
!= NULL
;
4006 mcip
= mcip
->mci_client_next
) {
/* Per-client test: share bound, or datapath not set up. */
4007 if ((mcip
->mci_state_flags
& MCIS_SHARE_BOUND
) != 0 ||
4008 !MCIP_DATAPATH_SETUP(mcip
))
4010 mrp
= MCIP_RESOURCE_PROPS(mcip
);
4011 emrp
= MCIP_EFFECTIVE_PROPS(mcip
);
4012 use_default
= B_FALSE
;
4014 cpupart
= mac_pset_find(mrp
, &use_default
);
4015 mac_fanout_recompute_client(mcip
, cpupart
);
4016 mac_set_pool_effective(use_default
, cpupart
, mrp
, emrp
);
4019 i_mac_perim_exit(mip
);
4023 * Given a MAC, change the polling state for all its MAC clients. 'enable' is
4024 * B_TRUE to enable polling or B_FALSE to disable. Polling is enabled by
4028 mac_poll_state_change(mac_handle_t mh
, boolean_t enable
)
4030 mac_impl_t
*mip
= (mac_impl_t
*)mh
;
4031 mac_client_impl_t
*mcip
;
4033 i_mac_perim_enter(mip
);
4035 mip
->mi_state_flags
&= ~MIS_POLL_DISABLE
;
4037 mip
->mi_state_flags
|= MIS_POLL_DISABLE
;
4038 for (mcip
= mip
->mi_clients_list
; mcip
!= NULL
;
4039 mcip
= mcip
->mci_client_next
)
4040 mac_client_update_classifier(mcip
, B_TRUE
);
4041 i_mac_perim_exit(mip
);