usr/src/uts/common/io/mac/mac_sched.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright 2011 Joyent, Inc.  All rights reserved.
  25  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  26  */
  27
  28 #include <sys/types.h>
  29 #include <sys/callb.h>
  30 #include <sys/sdt.h>
  31 #include <sys/strsubr.h>
  32 #include <sys/strsun.h>
  33 #include <sys/vlan.h>
  34 #include <sys/stack.h>
  35 #include <sys/archsystm.h>
  36 #include <inet/ipsec_impl.h>
  37 #include <inet/ip_impl.h>
  38 #include <inet/sadb.h>
  39 #include <inet/ipsecesp.h>
  40 #include <inet/ipsecah.h>
  41 #include <inet/ip6.h>
  42
  43 #include <sys/mac_impl.h>
  44 #include <sys/mac_client_impl.h>
  45 #include <sys/mac_client_priv.h>
  46 #include <sys/mac_soft_ring.h>
  47 #include <sys/mac_flow_impl.h>
  48
  49 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
  50     uintptr_t, uint16_t, mblk_t **);
  51 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
  52     uintptr_t, uint16_t, mblk_t **);
  53 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
  54     uintptr_t, uint16_t, mblk_t **);
  55 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
  56     uintptr_t, uint16_t, mblk_t **);
  57 static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
  58     uintptr_t, uint16_t, mblk_t **);
  59
  60 typedef struct mac_tx_mode_s {
  61         mac_tx_srs_mode_t       mac_tx_mode;
  62         mac_tx_func_t           mac_tx_func;
  63 } mac_tx_mode_t;
  64
  65 /*
  66  * There are seven modes of operation on the Tx side. These modes get set
  67  * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
  68  * none of the other modes are user configurable. They get selected by
  69  * the system depending upon whether the link (or flow) has multiple Tx
  70  * rings or a bandwidth configured, or if the link is an aggr, etc.
  71  *
  72  * When the Tx SRS is operating in aggr mode (st_mode) or if there are
  73  * multiple Tx rings owned by Tx SRS, then each Tx ring (pseudo or
  74  * otherwise) will have a soft ring associated with it. These soft rings
  75  * are stored in srs_tx_soft_rings[] array.
  76  *
  77  * Additionally in the case of aggr, there is the st_soft_rings[] array
  78  * in the mac_srs_tx_t structure. This array is used to store the same
  79  * set of soft rings that are present in srs_tx_soft_rings[] array but
  80  * in a different manner. The soft ring associated with the pseudo Tx
  81  * ring is saved at mr_index (of the pseudo ring) in st_soft_rings[]
  82  * array. This helps in quickly getting the soft ring associated with the
  83  * Tx ring when aggr_find_tx_ring() returns the pseudo Tx ring that is to
  84  * be used for transmit.
  85  */
  86 mac_tx_mode_t mac_tx_mode_list[] = {
  87         {SRS_TX_DEFAULT,        mac_tx_single_ring_mode},
  88         {SRS_TX_SERIALIZE,      mac_tx_serializer_mode},
  89         {SRS_TX_FANOUT,         mac_tx_fanout_mode},
  90         {SRS_TX_BW,             mac_tx_bw_mode},
  91         {SRS_TX_BW_FANOUT,      mac_tx_bw_mode},
  92         {SRS_TX_AGGR,           mac_tx_aggr_mode},
  93         {SRS_TX_BW_AGGR,        mac_tx_bw_mode}
  94 };
  95
  96 /*
  97  * Soft Ring Set (SRS) - The Run time code that deals with
  98  * dynamic polling from the hardware, bandwidth enforcement,
  99  * fanout etc.
 100  *
 101  * We try to use H/W classification on NIC and assign traffic for
 102  * a MAC address to a particular Rx ring or ring group. There is a
 103  * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
 104  * switches the underlying Rx ring between interrupt and
 105  * polling mode and enforces any specified B/W control.
 106  *
 107  * There is always a SRS created and tied to each H/W and S/W rule.
 108  * Whenever we create a H/W rule, we always add the same rule to
 109  * S/W classifier and tie a SRS to it.
 110  *
 111  * In case a B/W control is specified, it is broken into bytes
 112  * per ticks and as soon as the quota for a tick is exhausted,
 113  * the underlying Rx ring is forced into poll mode for remainder of
 114  * the tick. The SRS poll thread only polls for bytes that are
 115  * allowed to come in the SRS. We typically let 4x the configured
 116  * B/W worth of packets to come in the SRS (to prevent unnecessary
 117  * drops due to bursts) but only process the specified amount.
 118  *
 119  * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 120  * Rx rings (and corresponding SRSs) assigned to it. The SRS
 121  * in turn can have softrings to do protocol level fanout or
 122  * softrings to do S/W based fanout or both. In case the NIC
 123  * has no Rx rings, we do S/W classification to respective SRS.
 124  * The S/W classification rule is always setup and ready. This
 125  * allows the MAC layer to reassign Rx rings whenever needed
 126  * but packets still continue to flow via the default path and
 127  * getting S/W classified to correct SRS.
 128  *
 129  * The SRS's are used on both Tx and Rx side. They use the same
 130  * data structure but the processing routines have slightly different
 131  * semantics due to the fact that Rx side needs to do dynamic
 132  * polling etc.
 133  *
 134  * Dynamic Polling Notes
 135  * =====================
 136  *
 137  * Each Soft ring set is capable of switching its Rx ring between
 138  * interrupt and poll mode and actively 'polls' for packets in
 139  * poll mode. If the SRS is implementing a B/W limit, it makes
 140  * sure that only Max allowed packets are pulled in poll mode
 141  * and goes to poll mode as soon as B/W limit is exceeded. As
 142  * such, there are no overheads to implement B/W limits.
 143  *
 144  * In poll mode, it's better to keep the pipeline going where the
 145  * SRS worker thread keeps processing packets and poll thread
 146  * keeps bringing more packets (especially if they get to run
 147  * on different CPUs). This also prevents the overheads associated
 148  * by excessive signalling (on NUMA machines, this can be
 149  * pretty devastating). The exception is latency optimized case
 150  * where worker thread does no work and interrupt and poll thread
 151  * are allowed to do their own drain.
 152  *
 153  * We use the following policy to control Dynamic Polling:
 154  * 1) We switch to poll mode anytime the processing
 155  *    thread causes a backlog to build up in SRS and
 156  *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 157  * 2) As long as the backlog stays under the low water
 158  *    mark (sr_lowat), we poll the H/W for more packets.
 159  * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
 160  *    water mark, we stay in poll mode but don't poll
 161  *    the H/W for more packets.
 162  * 4) Anytime in polling mode, if we poll the H/W for
 163  *    packets and find nothing plus we have an existing
 164  *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 165  *    mode but don't poll the H/W for packets anymore
 166  *    (let the polling thread go to sleep).
 167  * 5) Once the backlog is relieved (packets are processed)
 168  *    we reenable polling (by signalling the poll thread)
 169  *    only when the backlog dips below sr_poll_thres.
 170  * 6) sr_hiwat is used exclusively when we are not
 171  *    polling capable and is used to decide when to
 172  *    drop packets so the SRS queue length doesn't grow
 173  *    infinitely.
 174  *
 175  * NOTE: Also see the block level comment on top of mac_soft_ring.c
 176  */
 177
 178 /*
 179  * mac_latency_optimize
 180  *
 181  * Controls whether the poll thread can process the packets inline
 182  * or let the SRS worker thread do the processing. This applies if
 183  * the SRS was not being processed. For latency sensitive traffic,
 184  * this needs to be true to allow inline processing. For throughput
 185  * under load, this should be false.
 186  *
 187  * This (and other similar) tunable should be rolled into a link
 188  * or flow specific workload hint that can be set using dladm
 189  * linkprop (instead of multiple such tunables).
 190  */
 191 boolean_t mac_latency_optimize = B_TRUE;	/* default: favor latency */
 192
 193 /*
 194  * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 195  *
 196  * queue a mp or chain in soft ring set and increment the
 197  * local count (srs_count) for the SRS and the shared counter
 198  * (srs_poll_pkt_cnt - shared between SRS and its soft rings
 199  * to track the total unprocessed packets for polling to work
 200  * correctly).
 201  *
 202  * The size (total bytes queued) counters are incremented only
 203  * if we are doing B/W control.
 204  */
 205 #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {         \
 206         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 207         if ((mac_srs)->srs_last != NULL)                                \
 208                 (mac_srs)->srs_last->b_next = (head);                   \
 209         else                                                            \
 210                 (mac_srs)->srs_first = (head);                          \
 211         (mac_srs)->srs_last = (tail);                                   \
 212         (mac_srs)->srs_count += count;                                  \
 213 }
 214
 215 #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {      \
 216         mac_srs_rx_t    *srs_rx = &(mac_srs)->srs_rx;                   \
 217                                                                         \
 218         MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);          \
 219         srs_rx->sr_poll_pkt_cnt += count;                               \
 220         ASSERT(srs_rx->sr_poll_pkt_cnt > 0);                            \
 221         if ((mac_srs)->srs_type & SRST_BW_CONTROL) {                    \
 222                 (mac_srs)->srs_size += (sz);                            \
 223                 mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);           \
 224                 (mac_srs)->srs_bw->mac_bw_sz += (sz);                   \
 225                 mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);            \
 226         }                                                               \
 227 }
 228
 229 #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {      \
 230         mac_srs->srs_state |= SRS_ENQUEUED;                             \
 231         MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);          \
 232         if ((mac_srs)->srs_type & SRST_BW_CONTROL) {                    \
 233                 (mac_srs)->srs_size += (sz);                            \
 234                 (mac_srs)->srs_bw->mac_bw_sz += (sz);                   \
 235         }                                                               \
 236 }
 237
 238 /*
 239  * Turn polling on routines
 240  */
 241 #define MAC_SRS_POLLING_ON(mac_srs) {                                   \
 242         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 243         if (((mac_srs)->srs_state &                                     \
 244             (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {    \
 245                 (mac_srs)->srs_state |= SRS_POLLING;                    \
 246                 (void) mac_hwring_disable_intr((mac_ring_handle_t)      \
 247                     (mac_srs)->srs_ring);                               \
 248                 (mac_srs)->srs_rx.sr_poll_on++;                         \
 249         }                                                               \
 250 }
 251
 252 #define MAC_SRS_WORKER_POLLING_ON(mac_srs) {                            \
 253         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 254         if (((mac_srs)->srs_state &                                     \
 255             (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) ==              \
 256             (SRS_POLLING_CAPAB|SRS_WORKER)) {                           \
 257                 (mac_srs)->srs_state |= SRS_POLLING;                    \
 258                 (void) mac_hwring_disable_intr((mac_ring_handle_t)      \
 259                     (mac_srs)->srs_ring);                               \
 260                 (mac_srs)->srs_rx.sr_worker_poll_on++;                  \
 261         }                                                               \
 262 }
 263
 264 /*
 265  * MAC_SRS_POLL_RING
 266  *
 267  * Signal the SRS poll thread to poll the underlying H/W ring
 268  * provided it wasn't already polling (SRS_GET_PKTS was set).
 269  *
 270  * Poll thread gets to run only from mac_rx_srs_drain() and only
 271  * if the drain was being done by the worker thread.
 272  */
 273 #define MAC_SRS_POLL_RING(mac_srs) {                                    \
 274         mac_srs_rx_t    *srs_rx = &(mac_srs)->srs_rx;                   \
 275                                                                         \
 276         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 277         srs_rx->sr_poll_thr_sig++;                                      \
 278         if (((mac_srs)->srs_state &                                     \
 279             (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==             \
 280                 (SRS_WORKER|SRS_POLLING_CAPAB)) {                       \
 281                 (mac_srs)->srs_state |= SRS_GET_PKTS;                   \
 282                 cv_signal(&(mac_srs)->srs_cv);                          \
 283         } else {                                                        \
 284                 srs_rx->sr_poll_thr_busy++;                             \
 285         }                                                               \
 286 }
 287
 288 /*
 289  * MAC_SRS_CHECK_BW_CONTROL
 290  *
 291  * Check to see if next tick has started so we can reset the
 292  * SRS_BW_ENFORCED flag and allow more packets to come in the
 293  * system.
 294  */
 295 #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) {                             \
 296         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 297         ASSERT(((mac_srs)->srs_type & SRST_TX) ||                       \
 298             MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));               \
 299         clock_t now = ddi_get_lbolt();                                  \
 300         if ((mac_srs)->srs_bw->mac_bw_curr_time != now) {               \
 301                 (mac_srs)->srs_bw->mac_bw_curr_time = now;              \
 302                 (mac_srs)->srs_bw->mac_bw_used = 0;                     \
 303                 if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)  \
 304                         (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
 305         }                                                               \
 306 }
 307
 308 /*
 309  * MAC_SRS_WORKER_WAKEUP
 310  *
 311  * Wake up the SRS worker thread to process the queue as long as
 312  * no one else is processing the queue. If we are optimizing for
 313  * latency, we wake up the worker thread immediately or else we
 314  * wait mac_srs_worker_wakeup_ticks before worker thread gets
 315  * woken up.
 316  */
 317 int mac_srs_worker_wakeup_ticks = 0;
 318 #define MAC_SRS_WORKER_WAKEUP(mac_srs) {                                \
 319         ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));                       \
 320         if (!((mac_srs)->srs_state & SRS_PROC) &&                       \
 321                 (mac_srs)->srs_tid == NULL) {                           \
 322                 if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||         \
 323                         (mac_srs_worker_wakeup_ticks == 0))             \
 324                         cv_signal(&(mac_srs)->srs_async);               \
 325                 else                                                    \
 326                         (mac_srs)->srs_tid =                            \
 327                                 timeout(mac_srs_fire, (mac_srs),        \
 328                                         mac_srs_worker_wakeup_ticks);   \
 329         }                                                               \
 330 }
 331
 332 #define TX_BANDWIDTH_MODE(mac_srs)                              \
 333         ((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||              \
 334             (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT ||    \
 335             (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
 336
 337 #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {                      \
 338         if (tx_mode == SRS_TX_BW_FANOUT)                                \
 339                 (void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
 340         else                                                            \
 341                 (void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL);  \
 342 }
 343
 344 /*
 345  * MAC_TX_SRS_BLOCK
 346  *
 347  * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 348  * will be set only if srs_tx.st_woken_up is FALSE. If
 349  * srs_tx.st_woken_up is TRUE, it indicates that the wakeup arrived
 350  * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 351  * attempt to transmit again and not setting SRS_TX_BLOCKED does
 352  * that.
 353  */
 354 #define MAC_TX_SRS_BLOCK(srs, mp)       {                       \
 355         ASSERT(MUTEX_HELD(&(srs)->srs_lock));                   \
 356         if ((srs)->srs_tx.st_woken_up) {                        \
 357                 (srs)->srs_tx.st_woken_up = B_FALSE;            \
 358         } else {                                                \
 359                 ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));   \
 360                 (srs)->srs_state |= SRS_TX_BLOCKED;             \
 361                 (srs)->srs_tx.st_stat.mts_blockcnt++;           \
 362         }                                                       \
 363 }
 364
 365 /*
 366  * MAC_TX_SRS_TEST_HIWAT
 367  *
 368  * Called before queueing a packet onto Tx SRS to test and set
 369  * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 370  */
 371 #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {         \
 372         boolean_t enqueue = 1;                                          \
 373                                                                         \
 374         if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {                \
 375                 /*                                                      \
 376                  * flow-controlled. Store srs in cookie so that it      \
 377                  * can be returned as mac_tx_cookie_t to client         \
 378                  */                                                     \
 379                 (srs)->srs_state |= SRS_TX_HIWAT;                       \
 380                 cookie = (mac_tx_cookie_t)srs;                          \
 381                 (srs)->srs_tx.st_hiwat_cnt++;                           \
 382                 if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {    \
 383                         /* increment freed stats */                     \
 384                         (srs)->srs_tx.st_stat.mts_sdrops += cnt;        \
 385                         /*                                              \
 386                          * b_prev may be set to the fanout hint         \
 387                          * hence can't use freemsg directly             \
 388                          */                                             \
 389                         mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);    \
 390                         DTRACE_PROBE1(tx_queued_hiwat,                  \
 391                             mac_soft_ring_set_t *, srs);                \
 392                         enqueue = 0;                                    \
 393                 }                                                       \
 394         }                                                               \
 395         if (enqueue)                                                    \
 396                 MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);       \
 397 }
 398
 399 /* Some utility macros */
 400 #define MAC_SRS_BW_LOCK(srs)                                            \
 401         if (!(srs->srs_type & SRST_TX))                                 \
 402                 mutex_enter(&srs->srs_bw->mac_bw_lock);
 403
 404 #define MAC_SRS_BW_UNLOCK(srs)                                          \
 405         if (!(srs->srs_type & SRST_TX))                                 \
 406                 mutex_exit(&srs->srs_bw->mac_bw_lock);
 407
 408 #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {              \
 409         mac_pkt_drop(NULL, NULL, mp, B_FALSE);                  \
 410         /* increment freed stats */                             \
 411         mac_srs->srs_tx.st_stat.mts_sdrops++;                   \
 412         cookie = (mac_tx_cookie_t)srs;                          \
 413 }
 414
 415 #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {          \
 416         mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;                     \
 417         cookie = (mac_tx_cookie_t)srs;                                  \
 418         *ret_mp = mp_chain;                                             \
 419 }
 420
 421 /*
 422  * MAC_RX_SRS_TOODEEP
 423  *
 424  * Macro called as part of receive-side processing to determine if handling
 425  * can occur in situ (in the interrupt thread) or if it should be left to a
 426  * worker thread.  Note that the constant used to make this determination is
 427  * not entirely made-up, and is a result of some empirical validation. That
 428  * said, the constant is left as a static variable to allow it to be
 429  * dynamically tuned in the field if and as needed.
 430  */
 431 static uintptr_t mac_rx_srs_stack_needed = 10240;
 432 static uint_t mac_rx_srs_stack_toodeep;
 433
 434 #ifndef STACK_GROWTH_DOWN
 435 #error Downward stack growth assumed.
 436 #endif
 437
 438 #define MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
 439         (uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
 440         ++mac_rx_srs_stack_toodeep)
 441
 442
 443 /*
 444  * Drop the rx packet and advance to the next one in the chain.
 445  */
 446 static void
 447 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
 448 {
 449         mac_srs_rx_t    *srs_rx = &srs->srs_rx;
 450
 451         ASSERT(mp->b_next == NULL);
 452         mutex_enter(&srs->srs_lock);
 453         MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
 454         MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
 455         mutex_exit(&srs->srs_lock);
 456
 457         srs_rx->sr_stat.mrs_sdrops++;
 458         freemsg(mp);
 459 }
 460
 461 /* DATAPATH RUNTIME ROUTINES */
 462
 463 /*
 464  * mac_srs_fire
 465  *
 466  * Timer callback routine for waking up the SRS worker thread.
 467  */
 468 static void
 469 mac_srs_fire(void *arg)
 470 {
 471         mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
 472
 473         mutex_enter(&mac_srs->srs_lock);
 474         if (mac_srs->srs_tid == 0) {
 475                 mutex_exit(&mac_srs->srs_lock);
 476                 return;
 477         }
 478
 479         mac_srs->srs_tid = 0;
 480         if (!(mac_srs->srs_state & SRS_PROC))
 481                 cv_signal(&mac_srs->srs_async);
 482
 483         mutex_exit(&mac_srs->srs_lock);
 484 }
 485
 486 /*
 487  * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
 488  * and it is used on the TX path.
 489  */
 490 #define HASH_HINT(hint) \
 491         ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
 492
 493
 494 /*
 495  * hash based on the src address, dst address and the port information.
 496  */
 497 #define HASH_ADDR(src, dst, ports)                                      \
 498         (ntohl((src) + (dst)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^     \
 499         ((ports) >> 8) ^ (ports))
 500
 501 #define COMPUTE_INDEX(key, sz)  (key % sz)
 502
 503 #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {       \
 504         if ((tail) != NULL) {                                           \
 505                 ASSERT((tail)->b_next == NULL);                         \
 506                 (tail)->b_next = (mp);                                  \
 507         } else {                                                        \
 508                 ASSERT((head) == NULL);                                 \
 509                 (head) = (mp);                                          \
 510         }                                                               \
 511         (tail) = (mp);                                                  \
 512         (cnt)++;                                                        \
 513         if ((bw_ctl))                                                   \
 514                 (sz) += (sz0);                                          \
 515 }
 516
 517 #define MAC_FANOUT_DEFAULT      0
 518 #define MAC_FANOUT_RND_ROBIN    1
 519 int mac_fanout_type = MAC_FANOUT_DEFAULT;
 520
 521 #define MAX_SR_TYPES    3
 522 /* fanout types for port based hashing */
 523 enum pkt_type {
 524         V4_TCP = 0,
 525         V4_UDP,
 526         OTH,
 527         UNDEF
 528 };
 529
 530 /*
 531  * Pair of local and remote ports in the transport header
 532  */
 533 #define PORTS_SIZE 4
 534
 535 /*
 536  * mac_rx_srs_proto_fanout
 537  *
 538  * This routine delivers packets destined to an SRS into one of the
 539  * protocol soft rings.
 540  *
 541  * Given a chain of packets we need to split it up into multiple sub chains
 542  * destined into TCP, UDP or OTH soft ring. Instead of entering
 543  * the soft ring one packet at a time, we want to enter it in the form of a
 544  * chain otherwise we get this start/stop behaviour where the worker thread
 545  * goes to sleep and then next packets comes in forcing it to wake up etc.
 546  */
 547 static void
 548 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 549 {
 550         struct ether_header             *ehp;
 551         struct ether_vlan_header        *evhp;
 552         uint32_t                        sap;
 553         ipha_t                          *ipha;
 554         uint8_t                         *dstaddr;
 555         size_t                          hdrsize;
 556         mblk_t                          *mp;
 557         mblk_t                          *headmp[MAX_SR_TYPES];
 558         mblk_t                          *tailmp[MAX_SR_TYPES];
 559         int                             cnt[MAX_SR_TYPES];
 560         size_t                          sz[MAX_SR_TYPES];
 561         size_t                          sz1;
 562         boolean_t                       bw_ctl;
 563         boolean_t                       hw_classified;
 564         boolean_t                       dls_bypass;
 565         boolean_t                       is_ether;
 566         boolean_t                       is_unicast;
 567         enum pkt_type                   type;
 568         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
 569
 570         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
 571         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 572
 573         /*
 574          * If we don't have a Rx ring, S/W classification would have done
 575          * its job and its a packet meant for us. If we were polling on
 576          * the default ring (i.e. there was a ring assigned to this SRS),
 577          * then we need to make sure that the mac address really belongs
 578          * to us.
 579          */
 580         hw_classified = mac_srs->srs_ring != NULL &&
 581             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
 582
 583         /*
 584          * Special clients (eg. VLAN, non ether, etc) need DLS
 585          * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 586          * such SRSs. Another way of disabling bypass is to set the
 587          * MCIS_RX_BYPASS_DISABLE flag.
 588          */
 589         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
 590             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
 591
 592         bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
 593         bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
 594         bzero(cnt, MAX_SR_TYPES * sizeof (int));
 595         bzero(sz, MAX_SR_TYPES * sizeof (size_t));
 596
 597         /*
 598          * We got a chain from SRS that we need to send to the soft rings.
 599          * Since squeues for TCP & IPv4 sap poll their soft rings (for
 600          * performance reasons), we need to separate out v4_tcp, v4_udp
 601          * and the rest goes in other.
 602          */
 603         while (head != NULL) {
 604                 mp = head;
 605                 head = head->b_next;
 606                 mp->b_next = NULL;
 607
 608                 type = OTH;
 609                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
 610
 611                 if (is_ether) {
 612                         /*
 613                          * At this point we can be sure the packet at least
 614                          * has an ether header.
 615                          */
 616                         if (sz1 < sizeof (struct ether_header)) {
 617                                 mac_rx_drop_pkt(mac_srs, mp);
 618                                 continue;
 619                         }
 620                         ehp = (struct ether_header *)mp->b_rptr;
 621
 622                         /*
 623                          * Determine if this is a VLAN or non-VLAN packet.
 624                          */
 625                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
 626                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
 627                                 sap = ntohs(evhp->ether_type);
 628                                 hdrsize = sizeof (struct ether_vlan_header);
 629                                 /*
 630                                  * Check if the VID of the packet, if any,
 631                                  * belongs to this client.
 632                                  */
 633                                 if (!mac_client_check_flow_vid(mcip,
 634                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
 635                                         mac_rx_drop_pkt(mac_srs, mp);
 636                                         continue;
 637                                 }
 638                         } else {
 639                                 hdrsize = sizeof (struct ether_header);
 640                         }
 641                         is_unicast =
 642                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
 643                         dstaddr = (uint8_t *)&ehp->ether_dhost;
 644                 } else {
 645                         mac_header_info_t               mhi;
 646
 647                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
 648                             mp, &mhi) != 0) {
 649                                 mac_rx_drop_pkt(mac_srs, mp);
 650                                 continue;
 651                         }
 652                         hdrsize = mhi.mhi_hdrsize;
 653                         sap = mhi.mhi_bindsap;
 654                         is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
 655                         dstaddr = (uint8_t *)mhi.mhi_daddr;
 656                 }
 657
 658                 if (!dls_bypass) {
 659                         FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 660                             cnt[type], bw_ctl, sz[type], sz1, mp);
 661                         continue;
 662                 }
 663
 664                 if (sap == ETHERTYPE_IP) {
 665                         /*
 666                          * If we are H/W classified, but we have promisc
 667                          * on, then we need to check for the unicast address.
 668                          */
 669                         if (hw_classified && mcip->mci_promisc_list != NULL) {
 670                                 mac_address_t           *map;
 671
 672                                 rw_enter(&mcip->mci_rw_lock, RW_READER);
 673                                 map = mcip->mci_unicast;
 674                                 if (bcmp(dstaddr, map->ma_addr,
 675                                     map->ma_len) == 0)
 676                                         type = UNDEF;
 677                                 rw_exit(&mcip->mci_rw_lock);
 678                         } else if (is_unicast) {
 679                                 type = UNDEF;
 680                         }
 681                 }
 682
 683                 /*
 684                  * This needs to become a contract with the driver for
 685                  * the fast path.
 686                  *
 687                  * In the normal case the packet will have at least the L2
 688                  * header and the IP + Transport header in the same mblk.
 689                  * This is usually the case when the NIC driver sends up
 690                  * the packet. This is also true when the stack generates
 691                  * a packet that is looped back and when the stack uses the
 692                  * fastpath mechanism. The normal case is optimized for
 693                  * performance and may bypass DLS. All other cases go through
 694                  * the 'OTH' type path without DLS bypass.
 695                  */
 696
 697                 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
 698                 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
 699                         type = OTH;
 700
 701                 if (type == OTH) {
 702                         FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
 703                             cnt[type], bw_ctl, sz[type], sz1, mp);
 704                         continue;
 705                 }
 706
 707                 ASSERT(type == UNDEF);
 708                 /*
 709                  * We look for at least 4 bytes past the IP header to get
 710                  * the port information. If we get an IP fragment, we don't
 711                  * have the port information, and we use just the protocol
 712                  * information.
 713                  */
 714                 switch (ipha->ipha_protocol) {
 715                 case IPPROTO_TCP:
 716                         type = V4_TCP;
 717                         mp->b_rptr += hdrsize;
 718                         break;
 719                 case IPPROTO_UDP:
 720                         type = V4_UDP;
 721                         mp->b_rptr += hdrsize;
 722                         break;
 723                 default:
 724                         type = OTH;
 725                         break;
 726                 }
 727
 728                 FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
 729                     bw_ctl, sz[type], sz1, mp);
 730         }
 731
 732         for (type = V4_TCP; type < UNDEF; type++) {
 733                 if (headmp[type] != NULL) {
 734                         mac_soft_ring_t                 *softring;
 735
 736                         ASSERT(tailmp[type]->b_next == NULL);
 737                         switch (type) {
 738                         case V4_TCP:
 739                                 softring = mac_srs->srs_tcp_soft_rings[0];
 740                                 break;
 741                         case V4_UDP:
 742                                 softring = mac_srs->srs_udp_soft_rings[0];
 743                                 break;
 744                         case OTH:
 745                                 softring = mac_srs->srs_oth_soft_rings[0];
 746                         }
 747                         mac_rx_soft_ring_process(mcip, softring,
 748                             headmp[type], tailmp[type], cnt[type], sz[type]);
 749                 }
 750         }
 751 }
 752
 753 int     fanout_unaligned = 0;
     /*
      * fanout_unaligned counts the packets that mac_rx_srs_long_fanout() sent
      * to soft ring index 0 because the IP header was not 32-bit aligned, did
      * not fit completely inside its mblk, or sat in a shared (DB_REF != 1)
      * mblk that was too short and therefore was not pulled up.  The increment
      * site uses a plain "++" with no lock or atomic visible there, so treat
      * the value as an approximate debugging counter.
      */
 754
 755 /*
 756  * mac_rx_srs_long_fanout
 757  *
 758  * The fanout routine for VLANs, and for anything else that isn't performing
 759  * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
 760  * malformed packet), 0 on success, with values written in *indx and *type.
      *
      * Arguments:
      *   mac_srs - SRS supplying the soft ring counts (and the srs_ind
      *             round-robin cursor) used to compute the index.
      *   mp      - the packet; b_rptr points at a MAC header of hdrsize bytes.
      *             If the first mblk holds nothing but that header, the next
      *             mblk may be pulled up in place to obtain the IP header.
      *   sap     - SAP/ethertype of the payload.
      *   hdrsize - length of the MAC header in bytes.
      *   type    - out: every successful path stores OTH here.
      *   indx    - out: index of the soft ring chosen for the packet.
      *
      * -1 is returned when a header-only first mblk has no successor, when
      * pullupmsg() fails, or when fewer than MIN_EHDR_LEN bytes are claimed
      * to follow the IP header.
      *
      * Index selection:
      *   - sap is neither ETHERTYPE_IP nor ETHERTYPE_IPV6: index 0.
      *   - IP header misaligned, not fully inside its mblk, or in a shared
      *     mblk that needed a pullup: index 0 (counted in fanout_unaligned).
      *   - IPv4 fragments, IPv6 packets for which mac_ip_hdr_length_v6()
      *     fails, protocols other than TCP/UDP/SCTP/ESP, and packets whose
      *     ports/SPI do not lie inside the current mblk: COMPUTE_INDEX() of a
      *     source/destination address hash over srs_oth_ring_count.
      *   - TCP: hash of the addresses plus the 32 bits at the start of the
      *     transport header, over srs_tcp_ring_count.
      *   - UDP/SCTP/ESP: the same hash over srs_udp_ring_count when
      *     mac_fanout_type is MAC_FANOUT_DEFAULT, otherwise round-robin on
      *     srs_ind.
 761  */
 762 static int
 763 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
 764     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
 765 {
 766         ip6_t           *ip6h;
 767         ipha_t          *ipha;
 768         uint8_t         *whereptr;
 769         uint_t          hash;
 770         uint16_t        remlen;
 771         uint8_t         nexthdr;
 772         uint16_t        hdr_len;
 773         uint32_t        src_val, dst_val;
 774         boolean_t       modifiable = B_TRUE;
 775         boolean_t       v6;
 776
 777         ASSERT(MBLKL(mp) >= hdrsize);
 778
             /*
              * Start with the minimum (option-less) header length for the
              * address family; it is refined once the header can be read.
              */
 779         if (sap == ETHERTYPE_IPV6) {
 780                 v6 = B_TRUE;
 781                 hdr_len = IPV6_HDR_LEN;
 782         } else if (sap == ETHERTYPE_IP) {
 783                 v6 = B_FALSE;
 784                 hdr_len = IP_SIMPLE_HDR_LENGTH;
 785         } else {
 786                 *indx = 0;
 787                 *type = OTH;
 788                 return (0);
 789         }
 790
             /* ip6h and ipha alias the same address; v6 says which is valid. */
 791         ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
 792         ipha = (ipha_t *)ip6h;
 793
 794         if ((uint8_t *)ip6h == mp->b_wptr) {
 795                 /*
 796                  * The first mblk_t only includes the mac header.
 797                  * Note that it is safe to change the mp pointer here,
 798                  * as the subsequent operation does not assume mp
 799                  * points to the start of the mac header.
 800                  */
 801                 mp = mp->b_cont;
 802
 803                 /*
 804                  * Make sure the IP header points to an entire one.
 805                  */
 806                 if (mp == NULL)
 807                         return (-1);
 808
 809                 if (MBLKL(mp) < hdr_len) {
                             /*
                              * Only pull up when we hold the sole reference
                              * to the data block; otherwise leave the message
                              * alone and let the check below send the packet
                              * to the default ring.
                              */
 810                         modifiable = (DB_REF(mp) == 1);
 811
 812                         if (modifiable && !pullupmsg(mp, hdr_len))
 813                                 return (-1);
 814                 }
 815
 816                 ip6h = (ip6_t *)mp->b_rptr;
 817                 ipha = (ipha_t *)ip6h;
 818         }
 819
 820         if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
 821             ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
 822                 /*
 823                  * If either the IP header is not aligned, or it does not hold
 824                  * the complete simple structure (a pullupmsg() is not an
 825                  * option since it would result in an unaligned IP header),
 826                  * fanout to the default ring.
 827                  *
 828                  * Note that this may cause packet reordering.
 829                  */
 830                 *indx = 0;
 831                 *type = OTH;
 832                 fanout_unaligned++;
 833                 return (0);
 834         }
 835
 836         /*
 837          * Extract next-header, full header length, and source-hash value
 838          * using v4/v6 specific fields.
 839          */
 840         if (v6) {
 841                 remlen = ntohs(ip6h->ip6_plen);
 842                 nexthdr = ip6h->ip6_nxt;
 843                 src_val = V4_PART_OF_V6(ip6h->ip6_src);
 844                 dst_val = V4_PART_OF_V6(ip6h->ip6_dst);
 845                 /*
 846                  * Fall back to src/dst based fanout when
 847                  * mac_ip_hdr_length_v6() fails because of malformed
 848                  * packets or because mblks need to be concatenated using
 849                  * pullupmsg().
 850                  */
 851                 if (!mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr,
 852                     NULL)) {
 853                         goto src_dst_based_fanout;
 854                 }
 855         } else {
 856                 hdr_len = IPH_HDR_LENGTH(ipha);
                     /*
                      * A total length smaller than the header length must not
                      * wrap the unsigned subtraction into a huge "remaining"
                      * value; clamp it to zero so that the MIN_EHDR_LEN check
                      * below rejects the packet as malformed.
                      */
 857                 remlen = ntohs(ipha->ipha_length);
                     remlen = (remlen < hdr_len) ? 0 : remlen - hdr_len;
 858                 nexthdr = ipha->ipha_protocol;
 859                 src_val = (uint32_t)ipha->ipha_src;
 860                 dst_val = (uint32_t)ipha->ipha_dst;
 861                 /*
 862                  * Catch IPv4 fragment case here.  IPv6 has nexthdr == FRAG
 863                  * for its equivalent case.
 864                  */
 865                 if ((ntohs(ipha->ipha_fragment_offset_and_flags) &
 866                     (IPH_MF | IPH_OFFSET)) != 0) {
 867                         goto src_dst_based_fanout;
 868                 }
 869         }
 870         if (remlen < MIN_EHDR_LEN)
 871                 return (-1);
             /* whereptr is where the transport header (ports or SPI) starts. */
 872         whereptr = (uint8_t *)ip6h + hdr_len;
 873
 874         /* If the transport is one of below, we do port/SPI based fanout */
 875         switch (nexthdr) {
 876         case IPPROTO_TCP:
 877         case IPPROTO_UDP:
 878         case IPPROTO_SCTP:
 879         case IPPROTO_ESP:
 880                 /*
 881                  * If the ports or SPI in the transport header is not part of
 882                  * the mblk, do src_based_fanout, instead of calling
 883                  * pullupmsg().
                      *
                      * The bound is enforced even when there is no b_cont:
                      * remlen only reflects what the IP header claims (and,
                      * for IPv6, still includes extension headers), so a
                      * truncated single-mblk packet could otherwise make the
                      * 32-bit load below read past b_wptr.
 884                  */
 885                 if (whereptr + PORTS_SIZE <= mp->b_wptr)
 886                         break;  /* out of switch... */
 887                 /* FALLTHRU */
 888         default:
 889                 goto src_dst_based_fanout;
 890         }
 891
             /*
              * Only TCP/UDP/SCTP/ESP with in-bounds ports/SPI reach this point;
              * every other case has already jumped to src_dst_based_fanout.
              */
 892         switch (nexthdr) {
 893         case IPPROTO_TCP:
 894                 hash = HASH_ADDR(src_val, dst_val, *(uint32_t *)whereptr);
 895                 *indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 896                 *type = OTH;
 897                 break;
 898         case IPPROTO_UDP:
 899         case IPPROTO_SCTP:
 900         case IPPROTO_ESP:
 901                 if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
 902                         hash = HASH_ADDR(src_val, dst_val,
 903                             *(uint32_t *)whereptr);
 904                         *indx = COMPUTE_INDEX(hash,
 905                             mac_srs->srs_udp_ring_count);
 906                 } else {
                             /* Round-robin across the rings using srs_ind. */
 907                         *indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
 908                         mac_srs->srs_ind++;
 909                 }
 910                 *type = OTH;
 911                 break;
 912         }
 913         return (0);
 914
 915 src_dst_based_fanout:
             /* Address-only hash: no port/SPI information is available. */
 916         hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
 917         *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
 918         *type = OTH;
 919         return (0);
 920 }
 921
 922 /*
 923  * mac_rx_srs_fanout
 924  *
 925  * This routine delivers packets destined to an SRS into a soft ring member
 926  * of the set.
 927  *
 928  * Given a chain of packets we need to split it up into multiple sub chains
 929  * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 930  * the soft ring one packet at a time, we want to enter it in the form of a
 931  * chain otherwise we get this start/stop behaviour where the worker thread
 932  * goes to sleep and then the next packet comes in forcing it to wake up etc.
 933  *
 934  * Note:
 935  * Since we know what is the maximum fanout possible, we create a 2D array
 936  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 937  * variables so that we can enter the softrings with chain. We need the
 938  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 939  * for each packet would be expensive). If we ever want to have the
 940  * ability to have unlimited fanout, we should probably declare a head,
 941  * tail, cnt, sz with each soft ring (a data struct which contains a softring
 942  * along with these members) and create an array of this uber struct so we
 943  * don't have to do kmem_alloc.
      *
      * Arguments:
      *   mac_srs - soft ring set whose soft rings receive the packets.
      *   head    - chain of packets linked through b_next.  Each packet is
      *             unlinked from the chain and is either handed to
      *             mac_rx_drop_pkt() or appended to the sub chain selected by
      *             its (type, index) pair.
      *
      * Once the whole chain has been classified, every non-empty sub chain
      * with an index below srs_tcp_ring_count is passed to
      * mac_rx_soft_ring_process() in a single call.
      *
      * Debug counters declared below (bumped with a plain "++"):
      *   fanout_oth1 - bypass candidate for which MBLK_RX_FANOUT_SLOWPATH()
      *                 was true.
      *   fanout_oth3 - bypass candidate that is an IPv4 fragment.
      *   fanout_oth4 - bypass candidate whose protocol is not TCP, UDP, SCTP
      *                 or ESP.
      *   fanout_oth2 and fanout_oth5 are not referenced by this function.
 944  */
 945 int     fanout_oth1 = 0;
 946 int     fanout_oth2 = 0;
 947 int     fanout_oth3 = 0;
 948 int     fanout_oth4 = 0;
 949 int     fanout_oth5 = 0;
 950
 951 static void
 952 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
 953 {
 954         struct ether_header             *ehp;
 955         struct ether_vlan_header        *evhp;
 956         uint32_t                        sap;
 957         ipha_t                          *ipha;
 958         uint8_t                         *dstaddr;
 959         uint_t                          indx;
 960         size_t                          ports_offset;
 961         size_t                          ipha_len;
 962         size_t                          hdrsize;
 963         uint_t                          hash;
 964         mblk_t                          *mp;
 965         mblk_t                          *headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 966         mblk_t                          *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
 967         int                             cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
 968         size_t                          sz[MAX_SR_TYPES][MAX_SR_FANOUT];
 969         size_t                          sz1;
 970         boolean_t                       bw_ctl;
 971         boolean_t                       hw_classified;
 972         boolean_t                       dls_bypass;
 973         boolean_t                       is_ether;
 974         boolean_t                       is_unicast;
 975         int                             fanout_cnt;
 976         enum pkt_type                   type;
 977         mac_client_impl_t               *mcip = mac_srs->srs_mcip;
 978
 979         is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
 980         bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
 981
 982         /*
 983          * If we don't have a Rx ring, S/W classification would have done
 984          * its job and its a packet meant for us. If we were polling on
 985          * the default ring (i.e. there was a ring assigned to this SRS),
 986          * then we need to make sure that the mac address really belongs
 987          * to us.
 988          */
 989         hw_classified = mac_srs->srs_ring != NULL &&
 990             mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
 991
 992         /*
 993          * Special clients (eg. VLAN, non ether, etc) need DLS
 994          * processing in the Rx path. SRST_DLS_BYPASS will be clear for
 995          * such SRSs. Another way of disabling bypass is to set the
 996          * MCIS_RX_BYPASS_DISABLE flag.
 997          */
 998         dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
 999             ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1000
1001         /*
1002          * Since the softrings are never destroyed and we always
1003          * create equal number of softrings for TCP, UDP and rest,
1004          * its OK to check one of them for count and use it without
1005          * any lock. In future, if soft rings get destroyed because
1006          * of reduction in fanout, we will need to ensure that happens
1007          * behind the SRS_PROC.
1008          */
1009         fanout_cnt = mac_srs->srs_tcp_ring_count;
1010
             /*
              * Start with every sub chain empty.
              * NOTE(review): the arrays are indexed with values produced by
              * COMPUTE_INDEX() and mac_rx_srs_long_fanout(); this assumes those
              * indices are always below MAX_SR_FANOUT - confirm against the
              * code that sets the ring counts.
              */
1011         bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1012         bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1013         bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1014         bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1015
1016         /*
1017          * We got a chain from SRS that we need to send to the soft rings.
1018          * Since squeues for TCP & IPv4 sap poll their soft rings (for
1019          * performance reasons), we need to separate out v4_tcp, v4_udp
1020          * and the rest goes in other.
1021          */
1022         while (head != NULL) {
                     /* Detach the first packet from the incoming chain. */
1023                 mp = head;
1024                 head = head->b_next;
1025                 mp->b_next = NULL;
1026
                     /*
                      * Every packet starts out as OTH; it is promoted to UNDEF
                      * further down only when it qualifies for the IPv4 fast
                      * path.  sz1 is the packet size passed to the enqueue
                      * macro: MBLKL() for a lone mblk, msgdsize() otherwise.
                      */
1027                 type = OTH;
1028                 sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1029
1030                 if (is_ether) {
1031                         /*
1032                          * At this point we can be sure the packet at least
1033                          * has an ether header.
                              *
                              * NOTE(review): the test below uses sz1, the size
                              * of the whole message, while the header is read
                              * through mp->b_rptr; this presumes the first mblk
                              * holds the complete header - confirm.
1034                          */
1035                         if (sz1 < sizeof (struct ether_header)) {
1036                                 mac_rx_drop_pkt(mac_srs, mp);
1037                                 continue;
1038                         }
1039                         ehp = (struct ether_header *)mp->b_rptr;
1040
1041                         /*
1042                          * Determine if this is a VLAN or non-VLAN packet.
                              *
                              * NOTE(review): the tagged branch reads
                              * ether_vlan_header fields although only
                              * sizeof (struct ether_header) bytes were checked
                              * above - verify that shorter tagged frames cannot
                              * reach this point.
1043                          */
1044                         if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1045                                 evhp = (struct ether_vlan_header *)mp->b_rptr;
1046                                 sap = ntohs(evhp->ether_type);
1047                                 hdrsize = sizeof (struct ether_vlan_header);
1048                                 /*
1049                                  * Check if the VID of the packet, if any,
1050                                  * belongs to this client.
1051                                  */
1052                                 if (!mac_client_check_flow_vid(mcip,
1053                                     VLAN_ID(ntohs(evhp->ether_tci)))) {
1054                                         mac_rx_drop_pkt(mac_srs, mp);
1055                                         continue;
1056                                 }
1057                         } else {
1058                                 hdrsize = sizeof (struct ether_header);
1059                         }
                             /* Unicast when the group bit of the first octet is 0. */
1060                         is_unicast =
1061                             ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1062                         dstaddr = (uint8_t *)&ehp->ether_dhost;
1063                 } else {
                             /* Non-ethernet media: let the MAC plugin parse. */
1064                         mac_header_info_t               mhi;
1065
1066                         if (mac_header_info((mac_handle_t)mcip->mci_mip,
1067                             mp, &mhi) != 0) {
1068                                 mac_rx_drop_pkt(mac_srs, mp);
1069                                 continue;
1070                         }
1071                         hdrsize = mhi.mhi_hdrsize;
1072                         sap = mhi.mhi_bindsap;
1073                         is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1074                         dstaddr = (uint8_t *)mhi.mhi_daddr;
1075                 }
1076
                     /*
                      * Clients without DLS bypass never take the IPv4 fast path:
                      * mac_rx_srs_long_fanout() picks the index (and reports -1
                      * for packets that must be dropped).
                      */
1077                 if (!dls_bypass) {
1078                         if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1079                             hdrsize, &type, &indx) == -1) {
1080                                 mac_rx_drop_pkt(mac_srs, mp);
1081                                 continue;
1082                         }
1083
1084                         FANOUT_ENQUEUE_MP(headmp[type][indx],
1085                             tailmp[type][indx], cnt[type][indx], bw_ctl,
1086                             sz[type][indx], sz1, mp);
1087                         continue;
1088                 }
1089
1090
1091                 /*
1092                  * If we are using the default Rx ring where H/W or S/W
1093                  * classification has not happened, we need to verify if
1094                  * this unicast packet really belongs to us.
1095                  */
1096                 if (sap == ETHERTYPE_IP) {
1097                         /*
1098                          * If we are H/W classified, but we have promisc
1099                          * on, then we need to check for the unicast address.
1100                          */
1101                         if (hw_classified && mcip->mci_promisc_list != NULL) {
1102                                 mac_address_t           *map;
1103
                                     /* mci_unicast is read under mci_rw_lock. */
1104                                 rw_enter(&mcip->mci_rw_lock, RW_READER);
1105                                 map = mcip->mci_unicast;
1106                                 if (bcmp(dstaddr, map->ma_addr,
1107                                     map->ma_len) == 0)
1108                                         type = UNDEF;
1109                                 rw_exit(&mcip->mci_rw_lock);
1110                         } else if (is_unicast) {
1111                                 type = UNDEF;
1112                         }
1113                 }
1114
1115                 /*
1116                  * This needs to become a contract with the driver for
1117                  * the fast path.
                      *
                      * ipha is computed unconditionally but is only
                      * dereferenced while type != OTH, i.e. for packets that
                      * were marked UNDEF above.  MBLK_RX_FANOUT_SLOWPATH() is
                      * defined elsewhere; presumably it rejects packets whose
                      * IP header cannot be used in place - see its definition.
1118                  */
1119
1120                 ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1121                 if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1122                         type = OTH;
1123                         fanout_oth1++;
1124                 }
1125
                     /*
                      * Fast-path candidates must be unfragmented TCP, UDP, SCTP
                      * or ESP with the first PORTS_SIZE bytes of the transport
                      * header inside this mblk; anything else is demoted to OTH.
                      * On success ports_offset is the offset of those bytes from
                      * b_rptr.
                      */
1126                 if (type != OTH) {
1127                         uint16_t        frag_offset_flags;
1128
1129                         switch (ipha->ipha_protocol) {
1130                         case IPPROTO_TCP:
1131                         case IPPROTO_UDP:
1132                         case IPPROTO_SCTP:
1133                         case IPPROTO_ESP:
1134                                 ipha_len = IPH_HDR_LENGTH(ipha);
                                     /*
                                      * NOTE(review): unlike the neighbouring
                                      * demotions, this one bumps no fanout_oth*
                                      * counter; fanout_oth2 looks like it may
                                      * have been meant for it - confirm.
                                      */
1135                                 if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1136                                     mp->b_wptr) {
1137                                         type = OTH;
1138                                         break;
1139                                 }
1140                                 frag_offset_flags =
1141                                     ntohs(ipha->ipha_fragment_offset_and_flags);
1142                                 if ((frag_offset_flags &
1143                                     (IPH_MF | IPH_OFFSET)) != 0) {
1144                                         type = OTH;
1145                                         fanout_oth3++;
1146                                         break;
1147                                 }
1148                                 ports_offset = hdrsize + ipha_len;
1149                                 break;
1150                         default:
1151                                 type = OTH;
1152                                 fanout_oth4++;
1153                                 break;
1154                         }
1155                 }
1156
                     /*
                      * Everything that is not on the fast path is indexed by
                      * mac_rx_srs_long_fanout().  b_rptr has not been advanced
                      * at this point, so it still points at the MAC header.
                      */
1157                 if (type == OTH) {
1158                         if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1159                             hdrsize, &type, &indx) == -1) {
1160                                 mac_rx_drop_pkt(mac_srs, mp);
1161                                 continue;
1162                         }
1163
1164                         FANOUT_ENQUEUE_MP(headmp[type][indx],
1165                             tailmp[type][indx], cnt[type][indx], bw_ctl,
1166                             sz[type][indx], sz1, mp);
1167                         continue;
1168                 }
1169
1170                 ASSERT(type == UNDEF);
1171
1172                 /*
1173                  * XXX-Sunay: We should hold srs_lock since ring_count
1174                  * below can change. But if we are always called from
1175                  * mac_rx_srs_drain and SRS_PROC is set, then we can
1176                  * enforce that ring_count can't be changed i.e.
1177                  * to change fanout type or ring count, the calling
1178                  * thread needs to be behind SRS_PROC.
1179                  */
1180                 switch (ipha->ipha_protocol) {
1181                 case IPPROTO_TCP:
1182                         /*
1183                          * Note that for ESP, we fanout on SPI and it is at the
1184                          * same offset as the 2x16-bit ports. So it is clumped
1185                          * along with TCP, UDP and SCTP.
                              *
                              * (ESP itself is handled by the UDP/SCTP/ESP case
                              * below, which hashes the same 32 bits found at
                              * ports_offset.)  For both fast-path types b_rptr
                              * is advanced past the MAC header before the
                              * packet is queued.
1186                          */
1187                         hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst,
1188                             *(uint32_t *)(mp->b_rptr + ports_offset));
1189                         indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1190                         type = V4_TCP;
1191                         mp->b_rptr += hdrsize;
1192                         break;
1193                 case IPPROTO_UDP:
1194                 case IPPROTO_SCTP:
1195                 case IPPROTO_ESP:
1196                         if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1197                                 hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst,
1198                                     *(uint32_t *)(mp->b_rptr + ports_offset));
1199                                 indx = COMPUTE_INDEX(hash,
1200                                     mac_srs->srs_udp_ring_count);
1201                         } else {
                                     /* Round-robin across rings using srs_ind. */
1202                                 indx = mac_srs->srs_ind %
1203                                     mac_srs->srs_udp_ring_count;
1204                                 mac_srs->srs_ind++;
1205                         }
1206                         type = V4_UDP;
1207                         mp->b_rptr += hdrsize;
1208                         break;
1209                 default:
                             /*
                              * Defensive only: every other protocol was already
                              * demoted to OTH and queued above.
                              */
1210                         indx = 0;
1211                         type = OTH;
1212                 }
1213
1214                 FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1215                     cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1216         }
1217
             /*
              * Hand each non-empty sub chain to its soft ring.  The same
              * fanout_cnt bound is used for all three types (see the comment
              * where fanout_cnt is set).
              */
1218         for (type = V4_TCP; type < UNDEF; type++) {
1219                 int     i;
1220
1221                 for (i = 0; i < fanout_cnt; i++) {
1222                         if (headmp[type][i] != NULL) {
1223                                 mac_soft_ring_t *softring;
1224
1225                                 ASSERT(tailmp[type][i]->b_next == NULL);
1226                                 switch (type) {
1227                                 case V4_TCP:
1228                                         softring =
1229                                             mac_srs->srs_tcp_soft_rings[i];
1230                                         break;
1231                                 case V4_UDP:
1232                                         softring =
1233                                             mac_srs->srs_udp_soft_rings[i];
1234                                         break;
1235                                 case OTH:
1236                                         softring =
1237                                             mac_srs->srs_oth_soft_rings[i];
1238                                         break;
1239                                 }
1240                                 mac_rx_soft_ring_process(mcip,
1241                                     softring, headmp[type][i], tailmp[type][i],
1242                                     cnt[type][i], sz[type][i]);
1243                         }
1244                 }
1245         }
1246 }
1247
1248 #define SRS_BYTES_TO_PICKUP     150000
1249 ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
     /*
      * max_bytes_to_pickup is the byte budget that mac_rx_srs_poll_ring() hands
      * to MAC_HWRING_POLL() for an SRS that is not under bandwidth control
      * (SRST_BW_CONTROL clear).  It is initialised to SRS_BYTES_TO_PICKUP.
      */
1250
1251 /*
1252  * mac_rx_srs_poll_ring
1253  *
1254  * This SRS Poll thread uses this routine to poll the underlying hardware
1255  * Rx ring to get a chain of packets. It can inline process that chain
1256  * if mac_latency_optimize is set (default) or signal the SRS worker thread
1257  * to do the remaining processing.
1258  *
1259  * Since packets come in the system via interrupt or poll path, we also
1260  * update the stats and deal with promiscuous clients here.
1261  */
1262 void
1263 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1264 {
1265         kmutex_t                *lock = &mac_srs->srs_lock;
1266         kcondvar_t              *async = &mac_srs->srs_cv;
1267         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1268         mblk_t                  *head, *tail, *mp;
1269         callb_cpr_t             cprinfo;
1270         ssize_t                 bytes_to_pickup;
1271         size_t                  sz;
1272         int                     count;
1273         mac_client_impl_t       *smcip;
1274
1275         CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1276         mutex_enter(lock);
1277
1278 start:
1279         for (;;) {
1280                 if (mac_srs->srs_state & SRS_PAUSE)
1281                         goto done;
1282
1283                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1284                 cv_wait(async, lock);
1285                 CALLB_CPR_SAFE_END(&cprinfo, lock);
1286
1287                 if (mac_srs->srs_state & SRS_PAUSE)
1288                         goto done;
1289
1290 check_again:
1291                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
1292                         /*
1293                          * We pick as many bytes as we are allowed to queue.
1294                          * Its possible that we will exceed the total
1295                          * packets queued in case this SRS is part of the
1296                          * Rx ring group since > 1 poll thread can be pulling
1297                          * upto the max allowed packets at the same time
1298                          * but that should be OK.
1299                          */
1300                         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1301                         bytes_to_pickup =
1302                             mac_srs->srs_bw->mac_bw_drop_threshold -
1303                             mac_srs->srs_bw->mac_bw_sz;
1304                         /*
1305                          * We shouldn't have been signalled if we
1306                          * have 0 or less bytes to pick but since
1307                          * some of the bytes accounting is driver
1308                          * dependant, we do the safety check.
1309                          */
1310                         if (bytes_to_pickup < 0)
1311                                 bytes_to_pickup = 0;
1312                         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1313                 } else {
1314                         /*
1315                          * ToDO: Need to change the polling API
1316                          * to add a packet count and a flag which
1317                          * tells the driver whether we want packets
1318                          * based on a count, or bytes, or all the
1319                          * packets queued in the driver/HW. This
1320                          * way, we never have to check the limits
1321                          * on poll path. We truly let only as many
1322                          * packets enter the system as we are willing
1323                          * to process or queue.
1324                          *
1325                          * Something along the lines of
1326                          * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1327                          *      mac_srs->srs_poll_pkt_cnt
1328                          */
1329
1330                         /*
1331                          * Since we are not doing B/W control, pick
1332                          * as many packets as allowed.
1333                          */
1334                         bytes_to_pickup = max_bytes_to_pickup;
1335                 }
1336
1337                 /* Poll the underlying Hardware */
1338                 mutex_exit(lock);
1339                 head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1340                 mutex_enter(lock);
1341
1342                 ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1343                     SRS_POLL_THR_OWNER);
1344
1345                 mp = tail = head;
1346                 count = 0;
1347                 sz = 0;
1348                 while (mp != NULL) {
1349                         tail = mp;
1350                         sz += msgdsize(mp);
1351                         mp = mp->b_next;
1352                         count++;
1353                 }
1354
1355                 if (head != NULL) {
1356                         tail->b_next = NULL;
1357                         smcip = mac_srs->srs_mcip;
1358
1359                         SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
1360                         SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
1361
1362                         /*
1363                          * If there are any promiscuous mode callbacks
1364                          * defined for this MAC client, pass them a copy
1365                          * if appropriate and also update the counters.
1366                          */
1367                         if (smcip != NULL) {
1368                                 if (smcip->mci_mip->mi_promisc_list != NULL) {
1369                                         mutex_exit(lock);
1370                                         mac_promisc_dispatch(smcip->mci_mip,
1371                                             head, NULL);
1372                                         mutex_enter(lock);
1373                                 }
1374                         }
1375                         if (mac_srs->srs_type & SRST_BW_CONTROL) {
1376                                 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1377                                 mac_srs->srs_bw->mac_bw_polled += sz;
1378                                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1379                         }
1380                         MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1381                             count, sz);
1382                         if (count <= 10)
1383                                 srs_rx->sr_stat.mrs_chaincntundr10++;
1384                         else if (count > 10 && count <= 50)
1385                                 srs_rx->sr_stat.mrs_chaincnt10to50++;
1386                         else
1387                                 srs_rx->sr_stat.mrs_chaincntover50++;
1388                 }
1389
1390                 /*
1391                  * We are guaranteed that SRS_PROC will be set if we
1392                  * are here. Also, poll thread gets to run only if
1393                  * the drain was being done by a worker thread although
1394                  * its possible that worker thread is still running
1395                  * and poll thread was sent down to keep the pipeline
1396                  * going instead of doing a complete drain and then
1397                  * trying to poll the NIC.
1398                  *
1399                  * So we need to check SRS_WORKER flag to make sure
1400                  * that the worker thread is not processing the queue
1401                  * in parallel to us. The flags and conditions are
1402                  * protected by the srs_lock to prevent any race. We
1403                  * ensure that we don't drop the srs_lock from now
1404                  * till the end and similarly we don't drop the srs_lock
1405                  * in mac_rx_srs_drain() till similar condition check
1406                  * are complete. The mac_rx_srs_drain() needs to ensure
1407                  * that SRS_WORKER flag remains set as long as its
1408                  * processing the queue.
1409                  */
1410                 if (!(mac_srs->srs_state & SRS_WORKER) &&
1411                     (mac_srs->srs_first != NULL)) {
1412                         /*
1413                          * We have packets to process and worker thread
1414                          * is not running. Check to see if poll thread is
1415                          * allowed to process.
1416                          */
1417                         if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1418                                 mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1419                                 if (!(mac_srs->srs_state & SRS_PAUSE) &&
1420                                     srs_rx->sr_poll_pkt_cnt <=
1421                                     srs_rx->sr_lowat) {
1422                                         srs_rx->sr_poll_again++;
1423                                         goto check_again;
1424                                 }
1425                                 /*
1426                                  * We are already above low water mark
1427                                  * so stay in the polling mode but no
1428                                  * need to poll. Once we dip below
1429                                  * the polling threshold, the processing
1430                                  * thread (soft ring) will signal us
1431                                  * to poll again (MAC_UPDATE_SRS_COUNT)
1432                                  */
1433                                 srs_rx->sr_poll_drain_no_poll++;
1434                                 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1435                                 /*
1436                                  * In B/W control case, its possible
1437                                  * that the backlog built up due to
1438                                  * B/W limit being reached and packets
1439                                  * are queued only in SRS. In this case,
1440                                  * we should schedule worker thread
1441                                  * since no one else will wake us up.
1442                                  */
1443                                 if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1444                                     (mac_srs->srs_tid == NULL)) {
1445                                         mac_srs->srs_tid =
1446                                             timeout(mac_srs_fire, mac_srs, 1);
1447                                         srs_rx->sr_poll_worker_wakeup++;
1448                                 }
1449                         } else {
1450                                 /*
1451                                  * Wakeup the worker thread for more processing.
1452                                  * We optimize for throughput in this case.
1453                                  */
1454                                 mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1455                                 MAC_SRS_WORKER_WAKEUP(mac_srs);
1456                                 srs_rx->sr_poll_sig_worker++;
1457                         }
1458                 } else if ((mac_srs->srs_first == NULL) &&
1459                     !(mac_srs->srs_state & SRS_WORKER)) {
1460                         /*
1461                          * There is nothing queued in SRS and
1462                          * no worker thread running. Plus we
1463                          * didn't get anything from the H/W
1464                          * as well (head == NULL);
1465                          */
1466                         ASSERT(head == NULL);
1467                         mac_srs->srs_state &=
1468                             ~(SRS_PROC|SRS_GET_PKTS);
1469
1470                         /*
1471                          * If we have a packets in soft ring, don't allow
1472                          * more packets to come into this SRS by keeping the
1473                          * interrupts off but not polling the H/W. The
1474                          * poll thread will get signaled as soon as
1475                          * srs_poll_pkt_cnt dips below poll threshold.
1476                          */
1477                         if (srs_rx->sr_poll_pkt_cnt == 0) {
1478                                 srs_rx->sr_poll_intr_enable++;
1479                                 MAC_SRS_POLLING_OFF(mac_srs);
1480                         } else {
1481                                 /*
1482                                  * We know nothing is queued in SRS
1483                                  * since we are here after checking
1484                                  * srs_first is NULL. The backlog
1485                                  * is entirely due to packets queued
1486                                  * in Soft ring which will wake us up
1487                                  * and get the interface out of polling
1488                                  * mode once the backlog dips below
1489                                  * sr_poll_thres.
1490                                  */
1491                                 srs_rx->sr_poll_no_poll++;
1492                         }
1493                 } else {
1494                         /*
1495                          * Worker thread is already running.
1496                          * Nothing much to do. If the polling
1497                          * was enabled, worker thread will deal
1498                          * with that.
1499                          */
1500                         mac_srs->srs_state &= ~SRS_GET_PKTS;
1501                         srs_rx->sr_poll_goto_sleep++;
1502                 }
1503         }
1504 done:
1505         mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1506         cv_signal(&mac_srs->srs_async);
1507         /*
1508          * If this is a temporary quiesce then wait for the restart signal
1509          * from the srs worker. Then clear the flags and signal the srs worker
1510          * to ensure a positive handshake and go back to start.
1511          */
1512         while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1513                 cv_wait(async, lock);
1514         if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1515                 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1516                 mac_srs->srs_state &=
1517                     ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1518                 cv_signal(&mac_srs->srs_async);
1519                 goto start;
1520         } else {
1521                 mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1522                 cv_signal(&mac_srs->srs_async);
1523                 CALLB_CPR_EXIT(&cprinfo);
1524                 thread_exit();
1525         }
1526 }
1527
1528 /*
1529  * mac_srs_pick_chain
1530  *
1531  * In Bandwidth control case, checks how many packets can be processed
1532  * and return them in a sub chain.
1533  */
1534 static mblk_t *
1535 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1536     size_t *chain_sz, int *chain_cnt)
1537 {
1538         mblk_t                  *head = NULL;
1539         mblk_t                  *tail = NULL;
1540         size_t                  sz;
1541         size_t                  tsz = 0;
1542         int                     cnt = 0;
1543         mblk_t                  *mp;
1544
1545         ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1546         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1547         if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1548             mac_srs->srs_bw->mac_bw_limit) ||
1549             (mac_srs->srs_bw->mac_bw_limit == 0)) {
1550                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1551                 head = mac_srs->srs_first;
1552                 mac_srs->srs_first = NULL;
1553                 *chain_tail = mac_srs->srs_last;
1554                 mac_srs->srs_last = NULL;
1555                 *chain_sz = mac_srs->srs_size;
1556                 *chain_cnt = mac_srs->srs_count;
1557                 mac_srs->srs_count = 0;
1558                 mac_srs->srs_size = 0;
1559                 return (head);
1560         }
1561
1562         /*
1563          * Can't clear the entire backlog.
1564          * Need to find how many packets to pick
1565          */
1566         ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1567         while ((mp = mac_srs->srs_first) != NULL) {
1568                 sz = msgdsize(mp);
1569                 if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1570                     mac_srs->srs_bw->mac_bw_limit) {
1571                         if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1572                                 mac_srs->srs_bw->mac_bw_state |=
1573                                     SRS_BW_ENFORCED;
1574                         break;
1575                 }
1576
1577                 /*
1578                  * The _size & cnt is  decremented from the softrings
1579                  * when they send up the packet for polling to work
1580                  * properly.
1581                  */
1582                 tsz += sz;
1583                 cnt++;
1584                 mac_srs->srs_count--;
1585                 mac_srs->srs_size -= sz;
1586                 if (tail != NULL)
1587                         tail->b_next = mp;
1588                 else
1589                         head = mp;
1590                 tail = mp;
1591                 mac_srs->srs_first = mac_srs->srs_first->b_next;
1592         }
1593         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1594         if (mac_srs->srs_first == NULL)
1595                 mac_srs->srs_last = NULL;
1596
1597         if (tail != NULL)
1598                 tail->b_next = NULL;
1599         *chain_tail = tail;
1600         *chain_cnt = cnt;
1601         *chain_sz = tsz;
1602
1603         return (head);
1604 }
1605
1606 /*
1607  * mac_rx_srs_drain
1608  *
1609  * The SRS drain routine. Gets to run to clear the queue. Any thread
1610  * (worker, interrupt, poll) can call this based on processing model.
1611  * The first thing we do is disable interrupts if possible and then
1612  * drain the queue. we also try to poll the underlying hardware if
1613  * there is a dedicated hardware Rx ring assigned to this SRS.
1614  *
1615  * There is a equivalent drain routine in bandwidth control mode
1616  * mac_rx_srs_drain_bw. There is some code duplication between the two
1617  * routines but they are highly performance sensitive and are easier
1618  * to read/debug if they stay separate. Any code changes here might
1619  * also apply to mac_rx_srs_drain_bw as well.
1620  */
1621 void
1622 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1623 {
1624         mblk_t                  *head;
1625         mblk_t                  *tail;
1626         timeout_id_t            tid;
1627         int                     cnt = 0;
1628         mac_client_impl_t       *mcip = mac_srs->srs_mcip;
1629         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1630
1631         ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1632         ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1633
1634         /* If we are blanked i.e. can't do upcalls, then we are done */
1635         if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1636                 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1637                     (mac_srs->srs_state & SRS_PAUSE));
1638                 goto out;
1639         }
1640
1641         if (mac_srs->srs_first == NULL)
1642                 goto out;
1643
1644         if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1645             (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1646                 /*
1647                  * In the normal case, the SRS worker thread does no
1648                  * work and we wait for a backlog to build up before
1649                  * we switch into polling mode. In case we are
1650                  * optimizing for throughput, we use the worker thread
1651                  * as well. The goal is to let worker thread process
1652                  * the queue and poll thread to feed packets into
1653                  * the queue. As such, we should signal the poll
1654                  * thread to try and get more packets.
1655                  *
1656                  * We could have pulled this check in the POLL_RING
1657                  * macro itself but keeping it explicit here makes
1658                  * the architecture more human understandable.
1659                  */
1660                 MAC_SRS_POLL_RING(mac_srs);
1661         }
1662
1663 again:
1664         head = mac_srs->srs_first;
1665         mac_srs->srs_first = NULL;
1666         tail = mac_srs->srs_last;
1667         mac_srs->srs_last = NULL;
1668         cnt = mac_srs->srs_count;
1669         mac_srs->srs_count = 0;
1670
1671         ASSERT(head != NULL);
1672         ASSERT(tail != NULL);
1673
1674         if ((tid = mac_srs->srs_tid) != 0)
1675                 mac_srs->srs_tid = 0;
1676
1677         mac_srs->srs_state |= (SRS_PROC|proc_type);
1678
1679
1680         /*
1681          * mcip is NULL for broadcast and multicast flows. The promisc
1682          * callbacks for broadcast and multicast packets are delivered from
1683          * mac_rx() and we don't need to worry about that case in this path
1684          */
1685         if (mcip != NULL) {
1686                 if (mcip->mci_promisc_list != NULL) {
1687                         mutex_exit(&mac_srs->srs_lock);
1688                         mac_promisc_client_dispatch(mcip, head);
1689                         mutex_enter(&mac_srs->srs_lock);
1690                 }
1691                 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1692                         mutex_exit(&mac_srs->srs_lock);
1693                         mac_protect_intercept_dhcp(mcip, head);
1694                         mutex_enter(&mac_srs->srs_lock);
1695                 }
1696         }
1697
1698         /*
1699          * Check if SRS itself is doing the processing
1700          * This direct path does not apply when subflows are present. In this
1701          * case, packets need to be dispatched to a soft ring according to the
1702          * flow's bandwidth and other resources contraints.
1703          */
1704         if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1705                 mac_direct_rx_t         proc;
1706                 void                    *arg1;
1707                 mac_resource_handle_t   arg2;
1708
1709                 /*
1710                  * This is the case when a Rx is directly
1711                  * assigned and we have a fully classified
1712                  * protocol chain. We can deal with it in
1713                  * one shot.
1714                  */
1715                 proc = srs_rx->sr_func;
1716                 arg1 = srs_rx->sr_arg1;
1717                 arg2 = srs_rx->sr_arg2;
1718
1719                 mac_srs->srs_state |= SRS_CLIENT_PROC;
1720                 mutex_exit(&mac_srs->srs_lock);
1721                 if (tid != 0) {
1722                         (void) untimeout(tid);
1723                         tid = 0;
1724                 }
1725
1726                 proc(arg1, arg2, head, NULL);
1727                 /*
1728                  * Decrement the size and count here itelf
1729                  * since the packet has been processed.
1730                  */
1731                 mutex_enter(&mac_srs->srs_lock);
1732                 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1733                 if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1734                         cv_signal(&mac_srs->srs_client_cv);
1735                 mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1736         } else {
1737                 /* Some kind of softrings based fanout is required */
1738                 mutex_exit(&mac_srs->srs_lock);
1739                 if (tid != 0) {
1740                         (void) untimeout(tid);
1741                         tid = 0;
1742                 }
1743
1744                 /*
1745                  * Since the fanout routines can deal with chains,
1746                  * shoot the entire chain up.
1747                  */
1748                 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1749                         mac_rx_srs_fanout(mac_srs, head);
1750                 else
1751                         mac_rx_srs_proto_fanout(mac_srs, head);
1752                 mutex_enter(&mac_srs->srs_lock);
1753         }
1754
1755         if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1756             (mac_srs->srs_first != NULL)) {
1757                 /*
1758                  * More packets arrived while we were clearing the
1759                  * SRS. This can be possible because of one of
1760                  * three conditions below:
1761                  * 1) The driver is using multiple worker threads
1762                  *    to send the packets to us.
1763                  * 2) The driver has a race in switching
1764                  *    between interrupt and polling mode or
1765                  * 3) Packets are arriving in this SRS via the
1766                  *    S/W classification as well.
1767                  *
1768                  * We should switch to polling mode and see if we
1769                  * need to send the poll thread down. Also, signal
1770                  * the worker thread to process whats just arrived.
1771                  */
1772                 MAC_SRS_POLLING_ON(mac_srs);
1773                 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1774                         srs_rx->sr_drain_poll_sig++;
1775                         MAC_SRS_POLL_RING(mac_srs);
1776                 }
1777
1778                 /*
1779                  * If we didn't signal the poll thread, we need
1780                  * to deal with the pending packets ourselves.
1781                  */
1782                 if (proc_type == SRS_WORKER) {
1783                         srs_rx->sr_drain_again++;
1784                         goto again;
1785                 } else {
1786                         srs_rx->sr_drain_worker_sig++;
1787                         cv_signal(&mac_srs->srs_async);
1788                 }
1789         }
1790
1791 out:
1792         if (mac_srs->srs_state & SRS_GET_PKTS) {
1793                 /*
1794                  * Poll thread is already running. Leave the
1795                  * SRS_RPOC set and hand over the control to
1796                  * poll thread.
1797                  */
1798                 mac_srs->srs_state &= ~proc_type;
1799                 srs_rx->sr_drain_poll_running++;
1800                 return;
1801         }
1802
1803         /*
1804          * Even if there are no packets queued in SRS, we
1805          * need to make sure that the shared counter is
1806          * clear and any associated softrings have cleared
1807          * all the backlog. Otherwise, leave the interface
1808          * in polling mode and the poll thread will get
1809          * signalled once the count goes down to zero.
1810          *
1811          * If someone is already draining the queue (SRS_PROC is
1812          * set) when the srs_poll_pkt_cnt goes down to zero,
1813          * then it means that drain is already running and we
1814          * will turn off polling at that time if there is
1815          * no backlog.
1816          *
1817          * As long as there are packets queued either
1818          * in soft ring set or its soft rings, we will leave
1819          * the interface in polling mode (even if the drain
1820          * was done being the interrupt thread). We signal
1821          * the poll thread as well if we have dipped below
1822          * low water mark.
1823          *
1824          * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1825          * since that turn polling on only for worker thread.
1826          * Its not worth turning polling on for interrupt
1827          * thread (since NIC will not issue another interrupt)
1828          * unless a backlog builds up.
1829          */
1830         if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1831             (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1832                 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1833                 srs_rx->sr_drain_keep_polling++;
1834                 MAC_SRS_POLLING_ON(mac_srs);
1835                 if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1836                         MAC_SRS_POLL_RING(mac_srs);
1837                 return;
1838         }
1839
1840         /* Nothing else to do. Get out of poll mode */
1841         MAC_SRS_POLLING_OFF(mac_srs);
1842         mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1843         srs_rx->sr_drain_finish_intr++;
1844 }
1845
/*
 * mac_rx_srs_drain_bw
 *
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode,
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain.
 */
1861 void
1862 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1863 {
1864         mblk_t                  *head;
1865         mblk_t                  *tail;
1866         timeout_id_t            tid;
1867         size_t                  sz = 0;
1868         int                     cnt = 0;
1869         mac_client_impl_t       *mcip = mac_srs->srs_mcip;
1870         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
1871         clock_t                 now;
1872
1873         ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1874         ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
1875 again:
1876         /* Check if we are doing B/W control */
1877         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1878         now = ddi_get_lbolt();
1879         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
1880                 mac_srs->srs_bw->mac_bw_curr_time = now;
1881                 mac_srs->srs_bw->mac_bw_used = 0;
1882                 if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1883                         mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1884         } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1885                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1886                 goto done;
1887         } else if (mac_srs->srs_bw->mac_bw_used >
1888             mac_srs->srs_bw->mac_bw_limit) {
1889                 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1890                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1891                 goto done;
1892         }
1893         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1894
1895         /* If we are blanked i.e. can't do upcalls, then we are done */
1896         if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1897                 ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1898                     (mac_srs->srs_state & SRS_PAUSE));
1899                 goto done;
1900         }
1901
1902         sz = 0;
1903         cnt = 0;
1904         if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1905                 /*
1906                  * We couldn't pick up a single packet.
1907                  */
1908                 mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1909                 if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1910                     (mac_srs->srs_size != 0) &&
1911                     !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1912                         /*
1913                          * Seems like configured B/W doesn't
1914                          * even allow processing of 1 packet
1915                          * per tick.
1916                          *
1917                          * XXX: raise the limit to processing
1918                          * at least 1 packet per tick.
1919                          */
1920                         mac_srs->srs_bw->mac_bw_limit +=
1921                             mac_srs->srs_bw->mac_bw_limit;
1922                         mac_srs->srs_bw->mac_bw_drop_threshold +=
1923                             mac_srs->srs_bw->mac_bw_drop_threshold;
1924                         cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
1925                             "raised B/W limit to %d since not even a "
1926                             "single packet can be processed per "
1927                             "tick %d\n", (void *)mac_srs,
1928                             (int)mac_srs->srs_bw->mac_bw_limit,
1929                             (int)msgdsize(mac_srs->srs_first));
1930                 }
1931                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1932                 goto done;
1933         }
1934
1935         ASSERT(head != NULL);
1936         ASSERT(tail != NULL);
1937
1938         /* zero bandwidth: drop all and return to interrupt mode */
1939         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1940         if (mac_srs->srs_bw->mac_bw_limit == 0) {
1941                 srs_rx->sr_stat.mrs_sdrops += cnt;
1942                 ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1943                 mac_srs->srs_bw->mac_bw_sz -= sz;
1944                 mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1945                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1946                 mac_pkt_drop(NULL, NULL, head, B_FALSE);
1947                 goto leave_poll;
1948         } else {
1949                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1950         }
1951
1952         if ((tid = mac_srs->srs_tid) != 0)
1953                 mac_srs->srs_tid = 0;
1954
1955         mac_srs->srs_state |= (SRS_PROC|proc_type);
1956         MAC_SRS_WORKER_POLLING_ON(mac_srs);
1957
1958         /*
1959          * mcip is NULL for broadcast and multicast flows. The promisc
1960          * callbacks for broadcast and multicast packets are delivered from
1961          * mac_rx() and we don't need to worry about that case in this path
1962          */
1963         if (mcip != NULL) {
1964                 if (mcip->mci_promisc_list != NULL) {
1965                         mutex_exit(&mac_srs->srs_lock);
1966                         mac_promisc_client_dispatch(mcip, head);
1967                         mutex_enter(&mac_srs->srs_lock);
1968                 }
1969                 if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1970                         mutex_exit(&mac_srs->srs_lock);
1971                         mac_protect_intercept_dhcp(mcip, head);
1972                         mutex_enter(&mac_srs->srs_lock);
1973                 }
1974         }
1975
1976         /*
1977          * Check if SRS itself is doing the processing
1978          * This direct path does not apply when subflows are present. In this
1979          * case, packets need to be dispatched to a soft ring according to the
1980          * flow's bandwidth and other resources contraints.
1981          */
1982         if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1983                 mac_direct_rx_t         proc;
1984                 void                    *arg1;
1985                 mac_resource_handle_t   arg2;
1986
1987                 /*
1988                  * This is the case when a Rx is directly
1989                  * assigned and we have a fully classified
1990                  * protocol chain. We can deal with it in
1991                  * one shot.
1992                  */
1993                 proc = srs_rx->sr_func;
1994                 arg1 = srs_rx->sr_arg1;
1995                 arg2 = srs_rx->sr_arg2;
1996
1997                 mac_srs->srs_state |= SRS_CLIENT_PROC;
1998                 mutex_exit(&mac_srs->srs_lock);
1999                 if (tid != 0) {
2000                         (void) untimeout(tid);
2001                         tid = 0;
2002                 }
2003
2004                 proc(arg1, arg2, head, NULL);
2005                 /*
2006                  * Decrement the size and count here itself
2007                  * since the packet has been processed.
2008                  */
2009                 mutex_enter(&mac_srs->srs_lock);
2010                 MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
2011                 MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
2012
2013                 if (mac_srs->srs_state & SRS_CLIENT_WAIT)
2014                         cv_signal(&mac_srs->srs_client_cv);
2015                 mac_srs->srs_state &= ~SRS_CLIENT_PROC;
2016         } else {
2017                 /* Some kind of softrings based fanout is required */
2018                 mutex_exit(&mac_srs->srs_lock);
2019                 if (tid != 0) {
2020                         (void) untimeout(tid);
2021                         tid = 0;
2022                 }
2023
2024                 /*
2025                  * Since the fanout routines can deal with chains,
2026                  * shoot the entire chain up.
2027                  */
2028                 if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
2029                         mac_rx_srs_fanout(mac_srs, head);
2030                 else
2031                         mac_rx_srs_proto_fanout(mac_srs, head);
2032                 mutex_enter(&mac_srs->srs_lock);
2033         }
2034
2035         /*
2036          * Send the poll thread to pick up any packets arrived
2037          * so far. This also serves as the last check in case
2038          * nothing else is queued in the SRS. The poll thread
2039          * is signalled only in the case the drain was done
2040          * by the worker thread and SRS_WORKER is set. The
2041          * worker thread can run in parallel as long as the
2042          * SRS_WORKER flag is set. When we have nothing else to
2043          * process, we can exit while leaving SRS_PROC set
2044          * which gives the poll thread control to process and
2045          * cleanup once it returns from the NIC.
2046          *
2047          * If we have nothing else to process, we need to
2048          * ensure that we keep holding the srs_lock till
2049          * all the checks below are done and control is
2050          * handed to the poll thread if it was running.
2051          */
2052         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2053         if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2054                 if (mac_srs->srs_first != NULL) {
2055                         if (proc_type == SRS_WORKER) {
2056                                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2057                                 if (srs_rx->sr_poll_pkt_cnt <=
2058                                     srs_rx->sr_lowat)
2059                                         MAC_SRS_POLL_RING(mac_srs);
2060                                 goto again;
2061                         } else {
2062                                 cv_signal(&mac_srs->srs_async);
2063                         }
2064                 }
2065         }
2066         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2067
2068 done:
2069
2070         if (mac_srs->srs_state & SRS_GET_PKTS) {
2071                 /*
2072                  * Poll thread is already running. Leave the
2073                  * SRS_PROC set and hand over the control to
2074                  * poll thread.
2075                  */
2076                 mac_srs->srs_state &= ~proc_type;
2077                 return;
2078         }
2079
2080         /*
2081          * If we can't process packets because we have exceeded
2082          * B/W limit for this tick, just set the timeout
2083          * and leave.
2084          *
2085          * Even if there are no packets queued in SRS, we
2086          * need to make sure that the shared counter is
2087          * clear and any associated softrings have cleared
2088          * all the backlog. Otherwise, leave the interface
2089          * in polling mode and the poll thread will get
2090          * signalled once the count goes down to zero.
2091          *
2092          * If someone is already draining the queue (SRS_PROC is
2093          * set) when the srs_poll_pkt_cnt goes down to zero,
2094          * then it means that drain is already running and we
2095          * will turn off polling at that time if there is
2096          * no backlog. As long as there are packets queued either
2097          * is soft ring set or its soft rings, we will leave
2098          * the interface in polling mode.
2099          */
2100         mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2101         if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2102             ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2103             (srs_rx->sr_poll_pkt_cnt > 0))) {
2104                 MAC_SRS_POLLING_ON(mac_srs);
2105                 mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2106                 if ((mac_srs->srs_first != NULL) &&
2107                     (mac_srs->srs_tid == NULL))
2108                         mac_srs->srs_tid = timeout(mac_srs_fire,
2109                             mac_srs, 1);
2110                 mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2111                 return;
2112         }
2113         mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2114
2115 leave_poll:
2116
2117         /* Nothing else to do. Get out of poll mode */
2118         MAC_SRS_POLLING_OFF(mac_srs);
2119         mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2120 }
2121
2122 /*
2123  * mac_srs_worker
2124  *
2125  * The SRS worker routine. Drains the queue when no one else is
2126  * processing it.
2127  */
2128 void
2129 mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2130 {
2131         kmutex_t                *lock = &mac_srs->srs_lock;
2132         kcondvar_t              *async = &mac_srs->srs_async;
2133         callb_cpr_t             cprinfo;
2134         boolean_t               bw_ctl_flag;
2135
2136         CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2137         mutex_enter(lock);
2138
2139 start:
2140         for (;;) {
2141                 bw_ctl_flag = B_FALSE;
2142                 if (mac_srs->srs_type & SRST_BW_CONTROL) {
2143                         MAC_SRS_BW_LOCK(mac_srs);
2144                         MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2145                         if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2146                                 bw_ctl_flag = B_TRUE;
2147                         MAC_SRS_BW_UNLOCK(mac_srs);
2148                 }
2149                 /*
2150                  * The SRS_BW_ENFORCED flag may change since we have dropped
2151                  * the mac_bw_lock. However the drain function can handle both
2152                  * a drainable SRS or a bandwidth controlled SRS, and the
2153                  * effect of scheduling a timeout is to wakeup the worker
2154                  * thread which in turn will call the drain function. Since
2155                  * we release the srs_lock atomically only in the cv_wait there
2156                  * isn't a fear of waiting for ever.
2157                  */
2158                 while (((mac_srs->srs_state & SRS_PROC) ||
2159                     (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2160                     (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2161                     !(mac_srs->srs_state & SRS_PAUSE)) {
2162                         /*
2163                          * If we have packets queued and we are here
2164                          * because B/W control is in place, we better
2165                          * schedule the worker wakeup after 1 tick
2166                          * to see if bandwidth control can be relaxed.
2167                          */
2168                         if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2169                                 /*
2170                                  * We need to ensure that a timer  is already
2171                                  * scheduled or we force  schedule one for
2172                                  * later so that we can continue processing
2173                                  * after this  quanta is over.
2174                                  */
2175                                 mac_srs->srs_tid = timeout(mac_srs_fire,
2176                                     mac_srs, 1);
2177                         }
2178 wait:
2179                         CALLB_CPR_SAFE_BEGIN(&cprinfo);
2180                         cv_wait(async, lock);
2181                         CALLB_CPR_SAFE_END(&cprinfo, lock);
2182
2183                         if (mac_srs->srs_state & SRS_PAUSE)
2184                                 goto done;
2185                         if (mac_srs->srs_state & SRS_PROC)
2186                                 goto wait;
2187
2188                         if (mac_srs->srs_first != NULL &&
2189                             mac_srs->srs_type & SRST_BW_CONTROL) {
2190                                 MAC_SRS_BW_LOCK(mac_srs);
2191                                 if (mac_srs->srs_bw->mac_bw_state &
2192                                     SRS_BW_ENFORCED) {
2193                                         MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2194                                 }
2195                                 bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2196                                     SRS_BW_ENFORCED;
2197                                 MAC_SRS_BW_UNLOCK(mac_srs);
2198                         }
2199                 }
2200
2201                 if (mac_srs->srs_state & SRS_PAUSE)
2202                         goto done;
2203                 mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2204         }
2205 done:
2206         /*
2207          * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2208          * from both hard and soft classifications and waits for such threads
2209          * to finish before signaling the worker. So at this point the only
2210          * thread left that could be competing with the worker is the poll
2211          * thread. In the case of Tx, there shouldn't be any thread holding
2212          * SRS_PROC at this point.
2213          */
2214         if (!(mac_srs->srs_state & SRS_PROC)) {
2215                 mac_srs->srs_state |= SRS_PROC;
2216         } else {
2217                 ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2218                 /*
2219                  * Poll thread still owns the SRS and is still running
2220                  */
2221                 ASSERT((mac_srs->srs_poll_thr == NULL) ||
2222                     ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2223                     SRS_POLL_THR_OWNER));
2224         }
2225         mac_srs_worker_quiesce(mac_srs);
2226         /*
2227          * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2228          * of the quiesce operation
2229          */
2230         while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2231                 cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2232
2233         if (mac_srs->srs_state & SRS_RESTART) {
2234                 ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2235                 mac_srs_worker_restart(mac_srs);
2236                 mac_srs->srs_state &= ~SRS_PROC;
2237                 goto start;
2238         }
2239
2240         if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2241                 mac_srs_worker_quiesce(mac_srs);
2242
2243         mac_srs->srs_state &= ~SRS_PROC;
2244         /* The macro drops the srs_lock */
2245         CALLB_CPR_EXIT(&cprinfo);
2246         thread_exit();
2247 }
2248
2249 /*
2250  * mac_rx_srs_subflow_process
2251  *
2252  * Receive side routine called from interrupt path when there are
2253  * sub flows present on this SRS.
2254  */
2255 /* ARGSUSED */
2256 void
2257 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2258     mblk_t *mp_chain, boolean_t loopback)
2259 {
2260         flow_entry_t            *flent = NULL;
2261         flow_entry_t            *prev_flent = NULL;
2262         mblk_t                  *mp = NULL;
2263         mblk_t                  *tail = NULL;
2264         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
2265         mac_client_impl_t       *mcip;
2266
2267         mcip = mac_srs->srs_mcip;
2268         ASSERT(mcip != NULL);
2269
2270         /*
2271          * We need to determine the SRS for every packet
2272          * by walking the flow table, if we don't get any,
2273          * then we proceed using the SRS we came with.
2274          */
2275         mp = tail = mp_chain;
2276         while (mp != NULL) {
2277
2278                 /*
2279                  * We will increment the stats for the mactching subflow.
2280                  * when we get the bytes/pkt count for the classified packets
2281                  * later in mac_rx_srs_process.
2282                  */
2283                 (void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2284                     FLOW_INBOUND, &flent);
2285
2286                 if (mp == mp_chain || flent == prev_flent) {
2287                         if (prev_flent != NULL)
2288                                 FLOW_REFRELE(prev_flent);
2289                         prev_flent = flent;
2290                         flent = NULL;
2291                         tail = mp;
2292                         mp = mp->b_next;
2293                         continue;
2294                 }
2295                 tail->b_next = NULL;
2296                 /*
2297                  * A null indicates, this is for the mac_srs itself.
2298                  * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
2299                  */
2300                 if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2301                         mac_rx_srs_process(arg,
2302                             (mac_resource_handle_t)mac_srs, mp_chain,
2303                             loopback);
2304                 } else {
2305                         (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2306                             prev_flent->fe_cb_arg2, mp_chain, loopback);
2307                         FLOW_REFRELE(prev_flent);
2308                 }
2309                 prev_flent = flent;
2310                 flent = NULL;
2311                 mp_chain = mp;
2312                 tail = mp;
2313                 mp = mp->b_next;
2314         }
2315         /* Last chain */
2316         ASSERT(mp_chain != NULL);
2317         if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2318                 mac_rx_srs_process(arg,
2319                     (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2320         } else {
2321                 (prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2322                     prev_flent->fe_cb_arg2, mp_chain, loopback);
2323                 FLOW_REFRELE(prev_flent);
2324         }
2325 }
2326
2327 /*
2328  * mac_rx_srs_process
2329  *
2330  * Receive side routine called from the interrupt path.
2331  *
2332  * loopback is set to force a context switch on the loopback
2333  * path between MAC clients.
2334  */
2335 /* ARGSUSED */
2336 void
2337 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2338     boolean_t loopback)
2339 {
2340         mac_soft_ring_set_t     *mac_srs = (mac_soft_ring_set_t *)srs;
2341         mblk_t                  *mp, *tail, *head;
2342         int                     count = 0;
2343         int                     count1;
2344         size_t                  sz = 0;
2345         size_t                  chain_sz, sz1;
2346         mac_bw_ctl_t            *mac_bw;
2347         mac_srs_rx_t            *srs_rx = &mac_srs->srs_rx;
2348
2349         /*
2350          * Set the tail, count and sz. We set the sz irrespective
2351          * of whether we are doing B/W control or not for the
2352          * purpose of updating the stats.
2353          */
2354         mp = tail = mp_chain;
2355         while (mp != NULL) {
2356                 tail = mp;
2357                 count++;
2358                 sz += msgdsize(mp);
2359                 mp = mp->b_next;
2360         }
2361
2362         mutex_enter(&mac_srs->srs_lock);
2363
2364         if (loopback) {
2365                 SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
2366                 SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);
2367
2368         } else {
2369                 SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
2370                 SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
2371         }
2372
2373         /*
2374          * If the SRS in already being processed; has been blanked;
2375          * can be processed by worker thread only; or the B/W limit
2376          * has been reached, then queue the chain and check if
2377          * worker thread needs to be awakend.
2378          */
2379         if (mac_srs->srs_type & SRST_BW_CONTROL) {
2380                 mac_bw = mac_srs->srs_bw;
2381                 ASSERT(mac_bw != NULL);
2382                 mutex_enter(&mac_bw->mac_bw_lock);
2383                 mac_bw->mac_bw_intr += sz;
2384                 if (mac_bw->mac_bw_limit == 0) {
2385                         /* zero bandwidth: drop all */
2386                         srs_rx->sr_stat.mrs_sdrops += count;
2387                         mac_bw->mac_bw_drop_bytes += sz;
2388                         mutex_exit(&mac_bw->mac_bw_lock);
2389                         mutex_exit(&mac_srs->srs_lock);
2390                         mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2391                         return;
2392                 } else {
2393                         if ((mac_bw->mac_bw_sz + sz) <=
2394                             mac_bw->mac_bw_drop_threshold) {
2395                                 mutex_exit(&mac_bw->mac_bw_lock);
2396                                 MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2397                                     tail, count, sz);
2398                         } else {
2399                                 mp = mp_chain;
2400                                 chain_sz = 0;
2401                                 count1 = 0;
2402                                 tail = NULL;
2403                                 head = NULL;
2404                                 while (mp != NULL) {
2405                                         sz1 = msgdsize(mp);
2406                                         if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2407                                             mac_bw->mac_bw_drop_threshold)
2408                                                 break;
2409                                         chain_sz += sz1;
2410                                         count1++;
2411                                         tail = mp;
2412                                         mp = mp->b_next;
2413                                 }
2414                                 mutex_exit(&mac_bw->mac_bw_lock);
2415                                 if (tail != NULL) {
2416                                         head = tail->b_next;
2417                                         tail->b_next = NULL;
2418                                         MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2419                                             mp_chain, tail, count1, chain_sz);
2420                                         sz -= chain_sz;
2421                                         count -= count1;
2422                                 } else {
2423                                         /* Can't pick up any */
2424                                         head = mp_chain;
2425                                 }
2426                                 if (head != NULL) {
2427                                         /* Drop any packet over the threshold */
2428                                         srs_rx->sr_stat.mrs_sdrops += count;
2429                                         mutex_enter(&mac_bw->mac_bw_lock);
2430                                         mac_bw->mac_bw_drop_bytes += sz;
2431                                         mutex_exit(&mac_bw->mac_bw_lock);
2432                                         freemsgchain(head);
2433                                 }
2434                         }
2435                         MAC_SRS_WORKER_WAKEUP(mac_srs);
2436                         mutex_exit(&mac_srs->srs_lock);
2437                         return;
2438                 }
2439         }
2440
2441         /*
2442          * If the total number of packets queued in the SRS and
2443          * its associated soft rings exceeds the max allowed,
2444          * then drop the chain. If we are polling capable, this
2445          * shouldn't be happening.
2446          */
2447         if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2448             (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2449                 mac_bw = mac_srs->srs_bw;
2450                 srs_rx->sr_stat.mrs_sdrops += count;
2451                 mutex_enter(&mac_bw->mac_bw_lock);
2452                 mac_bw->mac_bw_drop_bytes += sz;
2453                 mutex_exit(&mac_bw->mac_bw_lock);
2454                 freemsgchain(mp_chain);
2455                 mutex_exit(&mac_srs->srs_lock);
2456                 return;
2457         }
2458
2459         MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2460
2461         if (!(mac_srs->srs_state & SRS_PROC)) {
2462                 /*
2463                  * If we are coming via loopback, if we are not optimizing for
2464                  * latency, or if our stack is running deep, we should signal
2465                  * the worker thread.
2466                  */
2467                 if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) ||
2468                     MAC_RX_SRS_TOODEEP()) {
2469                         /*
2470                          * For loopback, We need to let the worker take
2471                          * over as we don't want to continue in the same
2472                          * thread even if we can. This could lead to stack
2473                          * overflows and may also end up using
2474                          * resources (cpu) incorrectly.
2475                          */
2476                         cv_signal(&mac_srs->srs_async);
2477                 } else {
2478                         /*
2479                          * Seems like no one is processing the SRS and
2480                          * there is no backlog. We also inline process
2481                          * our packet if its a single packet in non
2482                          * latency optimized case (in latency optimized
2483                          * case, we inline process chains of any size).
2484                          */
2485                         mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2486                 }
2487         }
2488         mutex_exit(&mac_srs->srs_lock);
2489 }
2490
2491 /* TX SIDE ROUTINES (RUNTIME) */
2492
2493 /*
2494  * mac_tx_srs_no_desc
2495  *
2496  * This routine is called by Tx single ring default mode
2497  * when Tx ring runs out of descs.
2498  */
2499 mac_tx_cookie_t
2500 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2501     uint16_t flag, mblk_t **ret_mp)
2502 {
2503         mac_tx_cookie_t cookie = NULL;
2504         mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2505         boolean_t wakeup_worker = B_TRUE;
2506         uint32_t tx_mode = srs_tx->st_mode;
2507         int cnt, sz;
2508         mblk_t *tail;
2509
2510         ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2511         if (flag & MAC_DROP_ON_NO_DESC) {
2512                 MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2513         } else {
2514                 if (mac_srs->srs_first != NULL)
2515                         wakeup_worker = B_FALSE;
2516                 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2517                 if (flag & MAC_TX_NO_ENQUEUE) {
2518                         /*
2519                          * If TX_QUEUED is not set, queue the
2520                          * packet and let mac_tx_srs_drain()
2521                          * set the TX_BLOCKED bit for the
2522                          * reasons explained above. Otherwise,
2523                          * return the mblks.
2524                          */
2525                         if (wakeup_worker) {
2526                                 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2527                                     mp_chain, tail, cnt, sz);
2528                         } else {
2529                                 MAC_TX_SET_NO_ENQUEUE(mac_srs,
2530                                     mp_chain, ret_mp, cookie);
2531                         }
2532                 } else {
2533                         MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2534                             tail, cnt, sz, cookie);
2535                 }
2536                 if (wakeup_worker)
2537                         cv_signal(&mac_srs->srs_async);
2538         }
2539         return (cookie);
2540 }
2541
2542 /*
2543  * mac_tx_srs_enqueue
2544  *
2545  * This routine is called when Tx SRS is operating in either serializer
2546  * or bandwidth mode. In serializer mode, a packet will get enqueued
2547  * when a thread cannot enter SRS exclusively. In bandwidth mode,
2548  * packets gets queued if allowed byte-count limit for a tick is
2549  * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
2550  * MAC_TX_NO_ENQUEUE is set is different than when operaing in either
2551  * the default mode or fanout mode. Here packets get dropped or
2552  * returned back to the caller only after hi-watermark worth of data
2553  * is queued.
2554  */
2555 static mac_tx_cookie_t
2556 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2557     uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2558 {
2559         mac_tx_cookie_t cookie = NULL;
2560         int cnt, sz;
2561         mblk_t *tail;
2562         boolean_t wakeup_worker = B_TRUE;
2563
2564         /*
2565          * Ignore fanout hint if we don't have multiple tx rings.
2566          */
2567         if (!MAC_TX_SOFT_RINGS(mac_srs))
2568                 fanout_hint = 0;
2569
2570         if (mac_srs->srs_first != NULL)
2571                 wakeup_worker = B_FALSE;
2572         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2573         if (flag & MAC_DROP_ON_NO_DESC) {
2574                 if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2575                         MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2576                 } else {
2577                         MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2578                             mp_chain, tail, cnt, sz);
2579                 }
2580         } else if (flag & MAC_TX_NO_ENQUEUE) {
2581                 if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2582                     (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2583                         MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2584                             ret_mp, cookie);
2585                 } else {
2586                         mp_chain->b_prev = (mblk_t *)fanout_hint;
2587                         MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2588                             mp_chain, tail, cnt, sz);
2589                 }
2590         } else {
2591                 /*
2592                  * If you are BW_ENFORCED, just enqueue the
2593                  * packet. srs_worker will drain it at the
2594                  * prescribed rate. Before enqueueing, save
2595                  * the fanout hint.
2596                  */
2597                 mp_chain->b_prev = (mblk_t *)fanout_hint;
2598                 MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2599                     tail, cnt, sz, cookie);
2600         }
2601         if (wakeup_worker)
2602                 cv_signal(&mac_srs->srs_async);
2603         return (cookie);
2604 }
2605
2606 /*
2607  * There are seven tx modes:
2608  *
2609  * 1) Default mode (SRS_TX_DEFAULT)
2610  * 2) Serialization mode (SRS_TX_SERIALIZE)
2611  * 3) Fanout mode (SRS_TX_FANOUT)
2612  * 4) Bandwidth mode (SRS_TX_BW)
2613  * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2614  * 6) aggr Tx mode (SRS_TX_AGGR)
2615  * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
2616  *
2617  * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2618  * based on the number of Tx rings requested for an SRS and whether
2619  * bandwidth control is requested or not.
2620  *
2621  * The default mode (i.e., no fanout/no bandwidth) is used when the
2622  * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
2623  * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
2624  * When the underlying Tx ring runs out of Tx descs, it starts queueing up
2625  * packets in SRS. When flow-control is relieved, the srs_worker drains
2626  * the queued packets and informs blocked clients to restart sending
2627  * packets.
2628  *
2629  * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
2630  * mode is used when the link has no Tx rings or only one Tx ring.
2631  *
2632  * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2633  * Tx rings. Each Tx ring will have a soft ring associated with it.
2634  * These soft rings will be hung off the Tx SRS. Queueing if it happens
2635  * due to lack of Tx desc will be in individual soft ring (and not srs)
2636  * associated with Tx ring.
2637  *
2638  * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2639  * only if bw is available. Otherwise the packets will be queued in
2640  * SRS. If fanout to multiple Tx rings is configured, the packets will
2641  * be fanned out among the soft rings associated with the Tx rings.
2642  *
2643  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
2644  * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2645  * belonging to a port on which the packet has to be sent. Aggr will
2646  * always have a pseudo Tx ring associated with it even when it is an
2647  * aggregation over a single NIC that has no Tx rings. Even in such a
2648  * case, the single pseudo Tx ring will have a soft ring associated with
2649  * it and the soft ring will hang off the SRS.
2650  *
2651  * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2652  * In this mode, the bandwidth is first applied on the outgoing packets
2653  * and later mac_tx_aggr_mode() function is called to send the packet out
2654  * of one of the pseudo Tx rings.
2655  *
2656  * Three flags are used in srs_state for indicating flow control
2657  * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2658  * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2659  * driver below.
2660  * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2661  * and flow-control pressure is applied back to clients. The clients expect
2662  * wakeup when flow-control is relieved.
2663  * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
2664  * got returned back to client either due to lack of Tx descs or due to bw
2665  * control reasons. The clients expect a wakeup when condition is relieved.
2666  *
2667  * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2668  * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2669  * MAC_TX_NO_ENQUEUE
2670  * Mac clients that do not want packets to be enqueued in the mac layer set
2671  * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2672  * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2673  * behaviour of this flag is different when the Tx is running in serializer
2674  * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packets
2675  * get dropped when Tx high watermark is reached.
2676  * There are some mac clients like vsw, aggr that want the mblks to be
2677  * returned back to clients instead of being queued in Tx SRS (or Tx soft
2678  * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2679  * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2680  * In the default and Tx fanout mode, the un-transmitted mblks will be
2681  * returned back to the clients when the driver runs out of Tx descs.
2682  * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2683  * soft ring) so that the clients can be woken up when Tx desc become
2684  * available. When running in serializer or bandwidth mode,
2685  * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2686  */
2687
2688 /*
2689  * Return the mac_tx_func member of the mac_tx_mode_list[] entry selected
2690  * by `mode'. `mode' is used as the array index as-is; no range check is
2691  * done here.
2692  */
2693 mac_tx_func_t
2694 mac_tx_get_func(uint32_t mode)
2695 {
2696         mac_tx_func_t   tx_func = mac_tx_mode_list[mode].mac_tx_func;
2697
2698         return (tx_func);
2699 }
2693
2694 /*
2695  * mac_tx_single_ring_mode
2696  *
2697  * Tx entry point used when the SRS is in SRS_TX_DEFAULT mode. The chain
2698  * is passed to mac_tx_send(). Anything that comes back unsent, or the
2699  * whole chain if the SRS already has SRS_ENQUEUED set, is handed to
2700  * mac_tx_srs_no_desc() under srs_lock and that routine's cookie is
2701  * returned. NULL is returned when the entire chain was sent.
2702  */
2703 /* ARGSUSED */
2704 static mac_tx_cookie_t
2705 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2706     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2707 {
2708         mac_srs_tx_t            *tx = &mac_srs->srs_tx;
2709         mac_tx_stats_t          tx_stats;
2710         mac_tx_cookie_t         ret;
2711         mblk_t                  *unsent;
2712
2713         ASSERT(tx->st_mode == SRS_TX_DEFAULT);
2714
2715         /*
2716          * SRS_ENQUEUED is first peeked at without the lock. Only when it
2717          * looks set is srs_lock taken and the bit tested again, so the
2718          * lock is not touched while the bit is clear.
2719          */
2720         if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2721                 mutex_enter(&mac_srs->srs_lock);
2722                 if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2723                         /* Tx descs/resources not available */
2724                         ret = mac_tx_srs_no_desc(mac_srs, mp_chain,
2725                             flag, ret_mp);
2726                         mutex_exit(&mac_srs->srs_lock);
2727                         return (ret);
2728                 }
2729                 /*
2730                  * The flow control condition got relieved before we
2731                  * took the lock. Continue with the transmission.
2732                  */
2733                 mutex_exit(&mac_srs->srs_lock);
2734         }
2735
2736         unsent = mac_tx_send(tx->st_arg1, tx->st_arg2, mp_chain, &tx_stats);
2737         if (unsent == NULL) {
2738                 SRS_TX_STATS_UPDATE(mac_srs, &tx_stats);
2739                 return (NULL);
2740         }
2741
2742         /*
2743          * Multiple threads could be here sending packets, so it is not
2744          * possible to atomically set SRS_TX_BLOCKED from this path to
2745          * flag the out of Tx desc condition. Instead the returned
2746          * packets are queued and SRS_TX_BLOCKED gets set in
2747          * mac_tx_srs_drain().
2748          */
2749         mutex_enter(&mac_srs->srs_lock);
2750         ret = mac_tx_srs_no_desc(mac_srs, unsent, flag, ret_mp);
2751         mutex_exit(&mac_srs->srs_lock);
2752         return (ret);
2753 }
2752
2753 /*
2754  * mac_tx_serializer_mode
2755  *
2756  * This is an experimental mode implemented as per the request of PAE.
2757  * All callers trying to send a packet to the NIC get serialized: the
2758  * caller that sets SRS_PROC here is the one calling mac_tx_send(), and
2759  * every other caller has its chain passed to mac_tx_srs_enqueue().
2760  * Returns NULL, or the cookie from mac_tx_srs_enqueue() when the chain
2761  * (or its unsent remainder) was passed to that routine.
2762  */
2763 /* ARGSUSED */
2764 static mac_tx_cookie_t
2765 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2766     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2767 {
2768         mac_srs_tx_t            *tx = &mac_srs->srs_tx;
2769         mac_tx_stats_t          tx_stats;
2770         mac_tx_cookie_t         ret = NULL;
2771         mblk_t                  *unsent;
2772
2773         /* Single ring, serialize below */
2774         ASSERT(tx->st_mode == SRS_TX_SERIALIZE);
2775
2776         mutex_enter(&mac_srs->srs_lock);
2777         if (mac_srs->srs_first == NULL &&
2778             (mac_srs->srs_state & SRS_PROC) == 0) {
2779                 /*
2780                  * No packets queued and nothing on proc. Fast-path:
2781                  * claim SRS_PROC and do the send inline with the lock
2782                  * dropped.
2783                  */
2784                 mac_srs->srs_state |= SRS_PROC;
2785                 mutex_exit(&mac_srs->srs_lock);
2786
2787                 unsent = mac_tx_send(tx->st_arg1, tx->st_arg2, mp_chain,
2788                     &tx_stats);
2789
2790                 mutex_enter(&mac_srs->srs_lock);
2791                 mac_srs->srs_state &= ~SRS_PROC;
2792                 if (unsent != NULL) {
2793                         ret = mac_tx_srs_enqueue(mac_srs, unsent, flag,
2794                             NULL, ret_mp);
2795                 }
2796                 /*
2797                  * If the queue is not empty, be it from our own unsent
2798                  * packets or from packets queued by others while we
2799                  * were sending, wakeup srs worker.
2800                  */
2801                 if (mac_srs->srs_first != NULL)
2802                         cv_signal(&mac_srs->srs_async);
2803                 mutex_exit(&mac_srs->srs_lock);
2804
2805                 if (ret == NULL) {
2806                         SRS_TX_STATS_UPDATE(mac_srs, &tx_stats);
2807                 }
2808                 return (ret);
2809         }
2810
2811         /*
2812          * In serialization mode, queue all packets until TX_HIWAT is
2813          * set. If drop bit is set, drop if TX_HIWAT is set. If
2814          * no_enqueue is set, still enqueue until hiwat is set and
2815          * return mblks after TX_HIWAT is set.
2816          */
2817         ret = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, NULL, ret_mp);
2818         mutex_exit(&mac_srs->srs_lock);
2819         return (ret);
2820 }
2819
2820 /*
2821  * mac_tx_fanout_mode
2822  *
2823  * In this mode, the SRS will have access to multiple Tx rings to send
2824  * the packet out. The fanout hint that is passed as an argument is
2825  * used to find an appropriate ring to fanout the traffic. Each Tx
2826  * ring, in turn,  will have a soft ring associated with it. If a Tx
2827  * ring runs out of Tx desc's the returned packet will be queued in
2828  * the soft ring associated with that Tx ring. The srs itself will not
2829  * queue any packets.
2830  */
2831
2832 /*
2833  * Pick the Tx soft ring for `hash' and hand `chain' to it. The macro
2834  * takes mac_srs, hash, flag and ret_mp from the expansion site and
2835  * overwrites index, softring and cookie there.
2836  */
2837 #define MAC_TX_SOFT_RING_PROCESS(chain) {                               \
2838         index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count);        \
2839         softring = mac_srs->srs_tx_soft_rings[index];                   \
2840         cookie = mac_tx_soft_ring_process(softring, (chain), flag,      \
2841             ret_mp);                                                    \
2842         DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);       \
2843 }
2838
2839 /*
2840  * Tx entry point for SRS_TX_FANOUT; the assertion below also admits
2841  * SRS_TX_BW_FANOUT.
2842  *
2843  * A non-zero fanout_hint sends the whole chain to the soft ring picked
2844  * from the hint and returns that soft ring's cookie. With a zero hint
2845  * the chain is cut into subchains of consecutive packets; a subchain is
2846  * closed when the next packet's hash differs from the non-zero hash of
2847  * the packet before it. Every subchain goes to the soft ring picked
2848  * from its own hash, with MAC_DROP_ON_NO_DESC forced and ret_mp
2849  * ignored, and NULL is returned.
2850  */
2851 static mac_tx_cookie_t
2852 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2853     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2854 {
2855         mac_soft_ring_t         *softring;
2856         uint64_t                hash;
2857         uint_t                  index;
2858         mac_tx_cookie_t         cookie = NULL;
2859
2860         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
2861             mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
2862         if (fanout_hint != 0) {
2863                 /*
2864                  * The hint is specified by the caller, simply pass the
2865                  * whole chain to the soft ring.
2866                  */
2867                 hash = HASH_HINT(fanout_hint);
2868                 MAC_TX_SOFT_RING_PROCESS(mp_chain);
2869         } else {
2870                 mblk_t *last_mp, *cur_mp, *sub_chain;
2871                 uint64_t cur_hash, last_hash = 0;
2872                 uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2873
2874                 /*
2875                  * Compute the hash from the contents (headers) of the
2876                  * packets of the mblk chain. Split the chains into
2877                  * subchains of the same conversation.
2878                  *
2879                  * Since there may be more than one ring used for
2880                  * sub-chains of the same call, and since the caller
2881                  * does not maintain per conversation state since it
2882                  * passed a zero hint, unsent subchains will be
2883                  * dropped.
2884                  */
2885
2886                 flag |= MAC_DROP_ON_NO_DESC;
2887                 ret_mp = NULL;
2888
2889                 sub_chain = NULL;
2890                 last_mp = NULL;
2891
2892                 for (cur_mp = mp_chain; cur_mp != NULL;
2893                     cur_mp = cur_mp->b_next) {
2894                         cur_hash = mac_pkt_hash(media, cur_mp,
2895                             MAC_PKT_HASH_L4, B_TRUE);
2896                         if (last_hash != 0 && cur_hash != last_hash) {
2897                                 /*
2898                                  * Starting a different subchain, send current
2899                                  * chain out. MAC_TX_SOFT_RING_PROCESS() picks
2900                                  * the soft ring from `hash', so load it with
2901                                  * the hash of the subchain being sent and not
2902                                  * with that of the packet that ended it.
2903                                  */
2904                                 ASSERT(last_mp != NULL);
2905                                 last_mp->b_next = NULL;
2906                                 hash = last_hash;
2907                                 MAC_TX_SOFT_RING_PROCESS(sub_chain);
2908                                 sub_chain = NULL;
2909                         }
2910
2911                         /* add packet to subchain */
2912                         if (sub_chain == NULL)
2913                                 sub_chain = cur_mp;
2914                         last_mp = cur_mp;
2915                         last_hash = cur_hash;
2916                 }
2917
2918                 if (sub_chain != NULL) {
2919                         /* send last subchain */
2920                         ASSERT(last_mp != NULL);
2921                         last_mp->b_next = NULL;
2922                         hash = last_hash;
2923                         MAC_TX_SOFT_RING_PROCESS(sub_chain);
2924                 }
2925
2926                 /* Per-subchain cookies are not reported to the caller. */
2927                 cookie = NULL;
2928         }
2929
2930         return (cookie);
2931 }
2916
2917 /*
2918  * mac_tx_bw_mode
2919  *
2920  * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2921  * only if bw is available. Otherwise the packets will be queued in
2922  * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2923  * out to the Tx rings.
      *
      * With a zero mac_bw_limit the chain is either handed back through
      * ret_mp (MAC_TX_NO_ENQUEUE) or dropped. Otherwise, when packets are
      * already queued or SRS_BW_ENFORCED is set, the chain goes to
      * mac_tx_srs_enqueue(). If mac_bw_used already exceeds mac_bw_limit
      * for the current lbolt value, SRS_BW_ENFORCED is set, the chain is
      * queued and the worker is signalled. In the remaining case the chain
      * size is added to mac_bw_used and the chain is sent according to
      * st_mode: to a soft ring (SRS_TX_BW_FANOUT), through
      * mac_tx_aggr_mode() (SRS_TX_BW_AGGR) or straight to mac_tx_send().
2924  */
2925 static mac_tx_cookie_t
2926 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2927     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2928 {
2929         int                     cnt, sz;
2930         mblk_t                  *tail;
2931         mac_tx_cookie_t         cookie = NULL;
2932         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
2933         clock_t                 now;
2934
2935         ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2936         ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2937         mutex_enter(&mac_srs->srs_lock);
2938         if (mac_srs->srs_bw->mac_bw_limit == 0) {
2939                 /*
2940                  * zero bandwidth, no traffic is sent: drop the packets,
2941                  * or return the whole chain if the caller requests all
2942                  * unsent packets back.
2943                  */
2944                 if (flag & MAC_TX_NO_ENQUEUE) {
2945                         cookie = (mac_tx_cookie_t)mac_srs;
2946                         *ret_mp = mp_chain;
2947                 } else {
2948                         MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2949                 }
2950                 mutex_exit(&mac_srs->srs_lock);
2951                 return (cookie);
2952         } else if ((mac_srs->srs_first != NULL) ||
2953             (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
                     /* Keep ordering: queue behind what is already pending. */
2954                 cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2955                     fanout_hint, ret_mp);
2956                 mutex_exit(&mac_srs->srs_lock);
2957                 return (cookie);
2958         }
2959         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
             /* A changed lbolt value restarts the mac_bw_used accounting. */
2960         now = ddi_get_lbolt();
2961         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2962                 mac_srs->srs_bw->mac_bw_curr_time = now;
2963                 mac_srs->srs_bw->mac_bw_used = 0;
2964         } else if (mac_srs->srs_bw->mac_bw_used >
2965             mac_srs->srs_bw->mac_bw_limit) {
2966                 mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2967                 MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2968                     mp_chain, tail, cnt, sz);
2969                 /*
2970                  * Wakeup worker thread. Note that worker
2971                  * thread has to be woken up so that it
2972                  * can fire up the timer to be woken up
2973                  * on the next tick. Also once
2974                  * BW_ENFORCED is set, it can only be
2975                  * reset by srs_worker thread. Until then
2976                  * all packets will get queued up in SRS
2977                  * and hence this code path won't be
2978                  * entered until BW_ENFORCED is reset.
2979                  */
                     /*
                      * NOTE(review): `flag' is not looked at on this path and
                      * no statement visible here assigns `cookie', so NULL is
                      * returned. Confirm that this is what
                      * MAC_TX_NO_ENQUEUE and MAC_DROP_ON_NO_DESC callers
                      * should get.
                      */
2980                 cv_signal(&mac_srs->srs_async);
2981                 mutex_exit(&mac_srs->srs_lock);
2982                 return (cookie);
2983         }
2984
             /* Charge the chain to this window, then send with the lock dropped. */
2985         mac_srs->srs_bw->mac_bw_used += sz;
2986         mutex_exit(&mac_srs->srs_lock);
2987
2988         if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2989                 mac_soft_ring_t *softring;
2990                 uint_t indx, hash;
2991
                     /*
                      * NOTE(review): hash is a uint_t here while
                      * mac_tx_fanout_mode() keeps it in a uint64_t. Confirm
                      * that both pick the same soft ring for a given hint.
                      */
2992                 hash = HASH_HINT(fanout_hint);
2993                 indx = COMPUTE_INDEX(hash,
2994                     mac_srs->srs_tx_ring_count);
2995                 softring = mac_srs->srs_tx_soft_rings[indx];
2996                 return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2997                     ret_mp));
2998         } else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
2999                 return (mac_tx_aggr_mode(mac_srs, mp_chain,
3000                     fanout_hint, flag, ret_mp));
3001         } else {
3002                 mac_tx_stats_t          stats;
3003
3004                 mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3005                     mp_chain, &stats);
3006
3007                 if (mp_chain != NULL) {
                             /*
                              * Part of the chain came back. MAC_COUNT_CHAIN()
                              * presumably recomputes sz for what is left (TODO
                              * confirm); that amount is taken off mac_bw_used
                              * again, not going below 0, before queueing.
                              */
3008                         mutex_enter(&mac_srs->srs_lock);
3009                         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3010                         if (mac_srs->srs_bw->mac_bw_used > sz)
3011                                 mac_srs->srs_bw->mac_bw_used -= sz;
3012                         else
3013                                 mac_srs->srs_bw->mac_bw_used = 0;
3014                         cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
3015                             fanout_hint, ret_mp);
3016                         mutex_exit(&mac_srs->srs_lock);
3017                         return (cookie);
3018                 }
3019                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3020
3021                 return (NULL);
3022         }
3023 }
3024
3025 /*
3026  * mac_tx_aggr_mode
3027  *
3028  * Tx entry point for an aggregation. It calls the aggr function,
3029  * aggr_find_tx_ring(), to get the (pseudo) Tx ring of the port on
3030  * which the packet has to go out. aggr_find_tx_ring() first picks the
3031  * outgoing port based on the L2/L3/L4 policy and then uses the
3032  * fanout_hint passed to it to pick a Tx ring from that port. The chain
3033  * is then handed to the soft ring of the Tx ring that was picked.
3034  *
3035  * A port can be deleted from the aggregation. When that happens the
3036  * aggregation layer first separates the port from the rest of the
3037  * ports, which makes sure that the port (and thus any Tx ring that
3038  * belongs to it) can no longer be selected by aggr_find_tx_ring().
3039  * Only afterwards is mac_group_rem_ring() called for the pseudo Tx
3040  * ring handles, one by one, which in turn quiesces the Tx SRS and
3041  * removes the soft ring associated with the pseudo Tx ring. The Rx
3042  * side uses a cookie to protect against mac_rx_ring() calls on rings
3043  * that have been removed. No such cookie is needed on the Tx side,
3044  * because the pseudo Tx ring is not available to aggr_find_tx_ring()
3045  * anymore once the port has been removed.
3046  */
3047 static mac_tx_cookie_t
3048 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
3049     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
3050 {
3051         mac_srs_tx_t            *tx = &mac_srs->srs_tx;
3052         mac_tx_ring_fn_t        find_ring =
3053             tx->st_capab_aggr.mca_find_tx_ring_fn;
3054         void                    *find_arg = tx->st_capab_aggr.mca_arg;
3055         mac_ring_handle_t       ring_hdl = NULL;
3056         mac_ring_t              *tx_ring;
3057
3058         /* A NULL return from the lookup ends the send with a NULL cookie. */
3059         if (find_ring(find_arg, mp_chain, fanout_hint, &ring_hdl) == NULL)
3060                 return (NULL);
3061
3062         /* The ring's mr_index selects its soft ring in st_soft_rings[]. */
3063         tx_ring = (mac_ring_t *)ring_hdl;
3064         return (mac_tx_soft_ring_process(tx->st_soft_rings[tx_ring->mr_index],
3065             mp_chain, flag, ret_mp));
3066 }
3063
3064 /*
3065  * Call every Tx notify callback on mcip->mci_tx_notify_cb_list, passing
3066  * each one its own mtnf_arg and `cookie'. The walk is bracketed by
3067  * MAC_CALLBACK_WALKER_INC() and MAC_CALLBACK_WALKER_DCR() on
3068  * mci_tx_notify_cb_info.
3069  */
3070 void
3071 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
3072 {
3073         mac_cb_t                *cb;
3074         mac_tx_notify_cb_t      *notify;
3075
3076         MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3077
3078         /* Wakeup callback registered clients */
3079         cb = mcip->mci_tx_notify_cb_list;
3080         while (cb != NULL) {
3081                 notify = (mac_tx_notify_cb_t *)cb->mcb_objp;
3082                 notify->mtnf_fn(notify->mtnf_arg, cookie);
3083                 cb = cb->mcb_nextp;
3084         }
3085
3086         MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3087             &mcip->mci_tx_notify_cb_list);
3088 }
3080
3081 /*
      * mac_tx_srs_drain
      *
      * Send out packets queued on the Tx SRS. Must be entered with srs_lock
      * held and SRS_PROC clear (both asserted). SRS_PROC is set for the
      * duration of the call. srs_lock is dropped around mac_tx_send(),
      * TX_SRS_TO_SOFT_RING() and the client wakeup calls, and is held
      * again on return. proc_type is not used.
      *
      * What is dequeued depends on st_mode:
      *  - SRS_TX_DEFAULT, SRS_TX_SERIALIZE: the whole queue is detached and
      *    sent. Packets that mac_tx_send() returns are put back at the
      *    head of the queue and MAC_TX_SRS_BLOCK() is invoked.
      *  - SRS_TX_BW: packets are dequeued until mac_bw_used reaches
      *    mac_bw_limit within the current lbolt value, at which point
      *    SRS_BW_ENFORCED is set. What was dequeued is sent; returned
      *    packets are put back as above.
      *  - SRS_TX_BW_FANOUT, SRS_TX_BW_AGGR: packets are dequeued under the
      *    same limit check, but are passed on with TX_SRS_TO_SOFT_RING()
      *    in runs that share the hint found in b_prev.
      *
      * When srs_count is 0 afterwards, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT
      * and SRS_ENQUEUED are cleared and, if either of the first two was
      * set, mac_tx_invoke_callbacks() and mac_tx_notify() are called.
      */
     /* ARGSUSED */
3082 void
3083 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
3084 {
3085         mblk_t                  *head, *tail;
3086         size_t                  sz;
3087         uint32_t                tx_mode;
3088         uint_t                  saved_pkt_count;
3089         mac_tx_stats_t          stats;
3090         mac_srs_tx_t            *srs_tx = &mac_srs->srs_tx;
3091         clock_t                 now;
3092
3093         saved_pkt_count = 0;
3094         ASSERT(mutex_owned(&mac_srs->srs_lock));
3095         ASSERT(!(mac_srs->srs_state & SRS_PROC));
3096
3097         mac_srs->srs_state |= SRS_PROC;
3098
3099         tx_mode = srs_tx->st_mode;
3100         if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
3101                 if (mac_srs->srs_first != NULL) {
                             /*
                              * Take the whole queue off the SRS and send it
                              * with srs_lock dropped.
                              */
3102                         head = mac_srs->srs_first;
3103                         tail = mac_srs->srs_last;
3104                         saved_pkt_count = mac_srs->srs_count;
3105                         mac_srs->srs_first = NULL;
3106                         mac_srs->srs_last = NULL;
3107                         mac_srs->srs_count = 0;
3108                         mutex_exit(&mac_srs->srs_lock);
3109
3110                         head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3111                             head, &stats);
3112
3113                         mutex_enter(&mac_srs->srs_lock);
3114                         if (head != NULL) {
3115                                 /* Device out of tx desc, set block */
3116                                 if (head->b_next == NULL)
3117                                         VERIFY(head == tail);
                                     /*
                                      * Put what came back in front of anything
                                      * that was queued while the lock was
                                      * dropped.
                                      */
3118                                 tail->b_next = mac_srs->srs_first;
3119                                 mac_srs->srs_first = head;
3120                                 mac_srs->srs_count +=
3121                                     (saved_pkt_count - stats.mts_opackets);
3122                                 if (mac_srs->srs_last == NULL)
3123                                         mac_srs->srs_last = tail;
3124                                 MAC_TX_SRS_BLOCK(mac_srs, head);
3125                         } else {
3126                                 srs_tx->st_woken_up = B_FALSE;
3127                                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3128                         }
3129                 }
3130         } else if (tx_mode == SRS_TX_BW) {
3131                 /*
3132                  * We are here because the timer fired and we have some data
3133                  * to transmit. Also mac_tx_srs_worker should have reset
3134                  * SRS_BW_ENFORCED flag
3135                  */
3136                 ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
3137                 head = tail = mac_srs->srs_first;
3138                 while (mac_srs->srs_first != NULL) {
3139                         tail = mac_srs->srs_first;
3140                         tail->b_prev = NULL;
3141                         mac_srs->srs_first = tail->b_next;
3142                         if (mac_srs->srs_first == NULL)
3143                                 mac_srs->srs_last = NULL;
3144                         mac_srs->srs_count--;
3145                         sz = msgdsize(tail);
3146                         mac_srs->srs_size -= sz;
3147                         saved_pkt_count++;
3148                         MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3149
3150                         if (mac_srs->srs_bw->mac_bw_used <
3151                             mac_srs->srs_bw->mac_bw_limit)
3152                                 continue;
3153
                             /*
                              * Limit reached. If lbolt has moved on, restart
                              * the accounting with this packet; otherwise stop
                              * dequeueing and mark the SRS as enforced.
                              */
3154                         now = ddi_get_lbolt();
3155                         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3156                                 mac_srs->srs_bw->mac_bw_curr_time = now;
3157                                 mac_srs->srs_bw->mac_bw_used = sz;
3158                                 continue;
3159                         }
3160                         mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3161                         break;
3162                 }
3163
3164                 ASSERT((head == NULL && tail == NULL) ||
3165                     (head != NULL && tail != NULL));
3166                 if (tail != NULL) {
3167                         tail->b_next = NULL;
3168                         mutex_exit(&mac_srs->srs_lock);
3169
3170                         head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3171                             head, &stats);
3172
3173                         mutex_enter(&mac_srs->srs_lock);
3174                         if (head != NULL) {
3175                                 uint_t size_sent;
3176
3177                                 /* Device out of tx desc, set block */
3178                                 if (head->b_next == NULL)
3179                                         VERIFY(head == tail);
3180                                 tail->b_next = mac_srs->srs_first;
3181                                 mac_srs->srs_first = head;
3182                                 mac_srs->srs_count +=
3183                                     (saved_pkt_count - stats.mts_opackets);
3184                                 if (mac_srs->srs_last == NULL)
3185                                         mac_srs->srs_last = tail;
                                     /*
                                      * NOTE(review): sz was last assigned in
                                      * the dequeue loop above and holds
                                      * msgdsize() of the final packet taken
                                      * off the queue, not the total that was
                                      * dequeued, while mts_obytes is filled in
                                      * by mac_tx_send() for the chain
                                      * (presumably the bytes sent). Confirm
                                      * that this difference is the amount
                                      * meant to be given back to srs_size,
                                      * mac_bw_sz and mac_bw_used.
                                      */
3186                                 size_sent = sz - stats.mts_obytes;
3187                                 mac_srs->srs_size += size_sent;
3188                                 mac_srs->srs_bw->mac_bw_sz += size_sent;
3189                                 if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3190                                         mac_srs->srs_bw->mac_bw_used -=
3191                                             size_sent;
3192                                 } else {
3193                                         mac_srs->srs_bw->mac_bw_used = 0;
3194                                 }
3195                                 MAC_TX_SRS_BLOCK(mac_srs, head);
3196                         } else {
3197                                 srs_tx->st_woken_up = B_FALSE;
3198                                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
3199                         }
3200                 }
3201         } else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
3202                 mblk_t *prev;
3203                 uint64_t hint;
3204
3205                 /*
3206                  * We are here because the timer fired and we
3207                  * have some quota to transmit.
3208                  */
3209                 prev = NULL;
3210                 head = tail = mac_srs->srs_first;
3211                 while (mac_srs->srs_first != NULL) {
3212                         tail = mac_srs->srs_first;
3213                         mac_srs->srs_first = tail->b_next;
3214                         if (mac_srs->srs_first == NULL)
3215                                 mac_srs->srs_last = NULL;
3216                         mac_srs->srs_count--;
3217                         sz = msgdsize(tail);
3218                         mac_srs->srs_size -= sz;
3219                         mac_srs->srs_bw->mac_bw_used += sz;
                             /*
                              * b_prev is read as the packet's hint (presumably
                              * stored there when the packet was queued - TODO
                              * confirm). Packets are collected while the hint
                              * stays the same; when it changes, the run
                              * collected so far is sent with srs_lock dropped
                              * and a new run starts at this packet.
                              */
3220                         if (prev == NULL)
3221                                 hint = (ulong_t)tail->b_prev;
3222                         if (hint != (ulong_t)tail->b_prev) {
3223                                 prev->b_next = NULL;
3224                                 mutex_exit(&mac_srs->srs_lock);
3225                                 TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3226                                 head = tail;
3227                                 hint = (ulong_t)tail->b_prev;
3228                                 mutex_enter(&mac_srs->srs_lock);
3229                         }
3230
3231                         prev = tail;
3232                         tail->b_prev = NULL;
3233                         if (mac_srs->srs_bw->mac_bw_used <
3234                             mac_srs->srs_bw->mac_bw_limit)
3235                                 continue;
3236
3237                         now = ddi_get_lbolt();
3238                         if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3239                                 mac_srs->srs_bw->mac_bw_curr_time = now;
3240                                 mac_srs->srs_bw->mac_bw_used = 0;
3241                                 continue;
3242                         }
3243                         mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3244                         break;
3245                 }
3246                 ASSERT((head == NULL && tail == NULL) ||
3247                     (head != NULL && tail != NULL));
3248                 if (tail != NULL) {
                             /* Send the last run that was collected. */
3249                         tail->b_next = NULL;
3250                         mutex_exit(&mac_srs->srs_lock);
3251                         TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3252                         mutex_enter(&mac_srs->srs_lock);
3253                 }
3254         }
3255         /*
3256          * SRS_TX_FANOUT case not considered here because packets
3257          * won't be queued in the SRS for this case. Packets will
3258          * be sent directly to soft rings underneath and if there
3259          * is any queueing at all, it would be in Tx side soft
3260          * rings.
3261          */
3262
3263         /*
3264          * When srs_count becomes 0, reset SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT
3265          * and SRS_ENQUEUED and wakeup registered clients.
3266          */
3267         if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3268             (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3269                 mac_client_impl_t *mcip = mac_srs->srs_mcip;
3270                 boolean_t wakeup_required = B_FALSE;
3271
                     /* Clients are only notified for HIWAT / WAKEUP_CLIENT. */
3272                 if (mac_srs->srs_state &
3273                     (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3274                         wakeup_required = B_TRUE;
3275                 }
3276                 mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3277                     SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
                     /* The notifications are made without srs_lock held. */
3278                 mutex_exit(&mac_srs->srs_lock);
3279                 if (wakeup_required) {
3280                         mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
3281                         /*
3282                          * If the client is not the primary MAC client, then we
3283                          * need to send the notification to the clients upper
3284                          * MAC, i.e. mci_upper_mip.
3285                          */
3286                         mac_tx_notify(mcip->mci_upper_mip != NULL ?
3287                             mcip->mci_upper_mip : mcip->mci_mip);
3288                 }
3289                 mutex_enter(&mac_srs->srs_lock);
3290         }
3291         mac_srs->srs_state &= ~SRS_PROC;
3292 }
3293
3294 /*
3295  * Given a packet, get the flow_entry that identifies the flow to which
3296  * that packet belongs; the flow_entry will contain the transmit
3297  * function to be used to send the packet. A NULL return means the
3298  * packet should be sent using the underlying NIC. NULL is returned
3299  * when the outbound flow lookup fails or when a hold on the MAC
3300  * client's mci_flent cannot be obtained.
3301  */
3302 static flow_entry_t *
3303 mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3304 {
3305         flow_entry_t            *match = NULL;
3306         mac_client_impl_t       *client;
3307         int                     rc;
3308
3309         /* Do classification on the packet. */
3310         rc = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &match);
3311         if (rc != 0)
3312                 return (NULL);
3313
3314         client = match->fe_mcip;
3315         if (client == NULL || client->mci_flent == match)
3316                 return (match);
3317
3318         /*
3319          * The flent found is just an additional one on the MAC client,
3320          * i.e. for classification purposes (different fdesc). The
3321          * resources, SRS et. al., are in the mci_flent, so trade the
3322          * reference for one on the mci_flent.
3323          */
3324         FLOW_REFRELE(match);
3325         match = client->mci_flent;
3326         FLOW_TRY_REFHOLD(match, rc);
3327         if (rc != 0)
3328                 return (NULL);
3329
3330         return (match);
3331 }
3331
3332 /*
3333  * This macro is only meant to be used by mac_tx_send().
      *
      * It relies on the following names from the expansion site: vid_check,
      * add_tag, src_mcip, vid, next and oerrors. Because it uses `continue',
      * it has to be expanded inside a loop.
      *
      * When vid_check is set and MAC_VID_CHECK() reports an error, the
      * message is freed. When add_tag is set, (mp) is replaced by the
      * result of mac_add_vlan_tag(); a NULL result is not freed here
      * (presumably mac_add_vlan_tag() has disposed of the message by then -
      * TODO confirm). In both failure cases oerrors is incremented, (mp) is
      * set to next and the loop moves on to its next iteration.
3334  */
3335 #define CHECK_VID_AND_ADD_TAG(mp) {                     \
3336         if (vid_check) {                                \
3337                 int err = 0;                            \
3338                                                         \
3339                 MAC_VID_CHECK(src_mcip, (mp), err);     \
3340                 if (err != 0) {                         \
3341                         freemsg((mp));                  \
3342                         (mp) = next;                    \
3343                         oerrors++;                      \
3344                         continue;                       \
3345                 }                                       \
3346         }                                               \
3347         if (add_tag) {                                  \
3348                 (mp) = mac_add_vlan_tag((mp), 0, vid);  \
3349                 if ((mp) == NULL) {                     \
3350                         (mp) = next;                    \
3351                         oerrors++;                      \
3352                         continue;                       \
3353                 }                                       \
3354         }                                               \
3355 }
3356
3357 mblk_t *
3358 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3359     mac_tx_stats_t *stats)
3360 {
3361         mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3362         mac_impl_t *mip = src_mcip->mci_mip;
3363         uint_t obytes = 0, opackets = 0, oerrors = 0;
3364         mblk_t *mp = NULL, *next;
3365         boolean_t vid_check, add_tag;
3366         uint16_t vid = 0;
3367
3368         if (mip->mi_nclients > 1) {
3369                 vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3370                 add_tag = MAC_TAG_NEEDED(src_mcip);
3371                 if (add_tag)
3372                         vid = mac_client_vid(mch);
3373         } else {
3374                 ASSERT(mip->mi_nclients == 1);
3375                 vid_check = add_tag = B_FALSE;
3376         }
3377
3378         /*
3379          * Fastpath: if there's only one client, we simply send
3380          * the packet down to the underlying NIC.
3381          */
3382         if (mip->mi_nactiveclients == 1) {
3383                 DTRACE_PROBE2(fastpath,
3384                     mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3385
3386                 mp = mp_chain;
3387                 while (mp != NULL) {
3388                         next = mp->b_next;
3389                         mp->b_next = NULL;
3390                         opackets++;
3391                         obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3392                             msgdsize(mp));
3393
3394                         CHECK_VID_AND_ADD_TAG(mp);
3395                         MAC_TX(mip, ring, mp, src_mcip);
3396
3397                         /*
3398                          * If the driver is out of descriptors and does a
3399                          * partial send it will return a chain of unsent
3400                          * mblks. Adjust the accounting stats.
3401                          */
3402                         if (mp != NULL) {
3403                                 opackets--;
3404                                 obytes -= msgdsize(mp);
3405                                 mp->b_next = next;
3406                                 break;
3407                         }
3408                         mp = next;
3409                 }
3410                 goto done;
3411         }
3412
3413         /*
3414          * No fastpath, we either have more than one MAC client
3415          * defined on top of the same MAC, or one or more MAC
3416          * client promiscuous callbacks.
3417          */
3418         DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3419             src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3420
3421         mp = mp_chain;
3422         while (mp != NULL) {
3423                 flow_entry_t *dst_flow_ent;
3424                 void *flow_cookie;
3425                 size_t  pkt_size;
3426                 mblk_t *mp1;
3427
3428                 next = mp->b_next;
3429                 mp->b_next = NULL;
3430                 opackets++;
3431                 pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3432                 obytes += pkt_size;
3433                 CHECK_VID_AND_ADD_TAG(mp);
3434
3435                 /*
3436                  * Find the destination.
3437                  */
3438                 dst_flow_ent = mac_tx_classify(mip, mp);
3439
3440                 if (dst_flow_ent != NULL) {
3441                         size_t  hdrsize;
3442                         int     err = 0;
3443
3444                         if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3445                                 struct ether_vlan_header *evhp =
3446                                     (struct ether_vlan_header *)mp->b_rptr;
3447
3448                                 if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3449                                         hdrsize = sizeof (*evhp);
3450                                 else
3451                                         hdrsize = sizeof (struct ether_header);
3452                         } else {
3453                                 mac_header_info_t       mhi;
3454
3455                                 err = mac_header_info((mac_handle_t)mip,
3456                                     mp, &mhi);
3457                                 if (err == 0)
3458                                         hdrsize = mhi.mhi_hdrsize;
3459                         }
3460
3461                         /*
3462                          * Got a matching flow. It's either another
3463                          * MAC client, or a broadcast/multicast flow.
3464                          * Make sure the packet size is within the
3465                          * allowed size. If not drop the packet and
3466                          * move to next packet.
3467                          */
3468                         if (err != 0 ||
3469                             (pkt_size - hdrsize) > mip->mi_sdu_max) {
3470                                 oerrors++;
3471                                 DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3472                                     mblk_t *, mp);
3473                                 freemsg(mp);
3474                                 mp = next;
3475                                 FLOW_REFRELE(dst_flow_ent);
3476                                 continue;
3477                         }
3478                         flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3479                         if (flow_cookie != NULL) {
3480                                 /*
3481                                  * The vnic_bcast_send function expects
3482                                  * to receive the sender MAC client
3483                                  * as value for arg2.
3484                                  */
3485                                 mac_bcast_send(flow_cookie, src_mcip, mp,
3486                                     B_TRUE);
3487                         } else {
3488                                 /*
3489                                  * loopback the packet to a local MAC
3490                                  * client. We force a context switch
3491                                  * if both source and destination MAC
3492                                  * clients are used by IP, i.e.
3493                                  * bypass is set.
3494                                  */
3495                                 boolean_t do_switch;
3496                                 mac_client_impl_t *dst_mcip =
3497                                     dst_flow_ent->fe_mcip;
3498
3499                                 /*
3500                                  * Check if there are promiscuous mode
3501                                  * callbacks defined. This check is
3502                                  * done here in the 'else' case and
3503                                  * not in other cases because this
3504                                  * path is for local loopback
3505                                  * communication which does not go
3506                                  * through MAC_TX(). For paths that go
3507                                  * through MAC_TX(), the promisc_list
3508                                  * check is done inside the MAC_TX()
3509                                  * macro.
3510                                  */
3511                                 if (mip->mi_promisc_list != NULL)
3512                                         mac_promisc_dispatch(mip, mp, src_mcip);
3513
3514                                 do_switch = ((src_mcip->mci_state_flags &
3515                                     dst_mcip->mci_state_flags &
3516                                     MCIS_CLIENT_POLL_CAPABLE) != 0);
3517
3518                                 if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3519                                         (dst_flow_ent->fe_cb_fn)(
3520                                             dst_flow_ent->fe_cb_arg1,
3521                                             dst_flow_ent->fe_cb_arg2,
3522                                             mp1, do_switch);
3523                                 }
3524                         }
3525                         FLOW_REFRELE(dst_flow_ent);
3526                 } else {
3527                         /*
3528                          * Unknown destination, send via the underlying
3529                          * NIC.
3530                          */
3531                         MAC_TX(mip, ring, mp, src_mcip);
3532                         if (mp != NULL) {
3533                                 /*
3534                                  * Adjust for the last packet that
3535                                  * could not be transmitted
3536                                  */
3537                                 opackets--;
3538                                 obytes -= pkt_size;
3539                                 mp->b_next = next;
3540                                 break;
3541                         }
3542                 }
3543                 mp = next;
3544         }
3545
3546 done:
3547         stats->mts_obytes = obytes;
3548         stats->mts_opackets = opackets;
3549         stats->mts_oerrors = oerrors;
3550         return (mp);
3551 }
3552
3553 /*
3554  * mac_tx_srs_ring_present
3555  *
3556  * Returns whether the specified ring is part of the specified SRS.
3557  */
3558 boolean_t
3559 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3560 {
3561         int i;
3562         mac_soft_ring_t *soft_ring;
3563
3564         if (srs->srs_tx.st_arg2 == tx_ring)
3565                 return (B_TRUE);
3566
3567         for (i = 0; i < srs->srs_tx_ring_count; i++) {
3568                 soft_ring =  srs->srs_tx_soft_rings[i];
3569                 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3570                         return (B_TRUE);
3571         }
3572
3573         return (B_FALSE);
3574 }
3575
3576 /*
3577  * mac_tx_srs_get_soft_ring
3578  *
3579  * Returns the TX soft ring associated with the given ring, if present.
3580  */
3581 mac_soft_ring_t *
3582 mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3583 {
3584         int             i;
3585         mac_soft_ring_t *soft_ring;
3586
3587         if (srs->srs_tx.st_arg2 == tx_ring)
3588                 return (NULL);
3589
3590         for (i = 0; i < srs->srs_tx_ring_count; i++) {
3591                 soft_ring =  srs->srs_tx_soft_rings[i];
3592                 if (soft_ring->s_ring_tx_arg2 == tx_ring)
3593                         return (soft_ring);
3594         }
3595
3596         return (NULL);
3597 }
3598
3599 /*
3600  * mac_tx_srs_wakeup
3601  *
3602  * Called when Tx desc become available. Wakeup the appropriate worker
3603  * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
3604  * state field.
3605  */
3606 void
3607 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3608 {
3609         int i;
3610         mac_soft_ring_t *sringp;
3611         mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3612
3613         mutex_enter(&mac_srs->srs_lock);
3614         /*
3615          * srs_tx_ring_count == 0 is the single ring mode case. In
3616          * this mode, there will not be Tx soft rings associated
3617          * with the SRS.
3618          */
3619         if (!MAC_TX_SOFT_RINGS(mac_srs)) {
3620                 if (srs_tx->st_arg2 == ring &&
3621                     mac_srs->srs_state & SRS_TX_BLOCKED) {
3622                         mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3623                         srs_tx->st_stat.mts_unblockcnt++;
3624                         cv_signal(&mac_srs->srs_async);
3625                 }
3626                 /*
3627                  * A wakeup can come before tx_srs_drain() could
3628                  * grab srs lock and set SRS_TX_BLOCKED. So
3629                  * always set woken_up flag when we come here.
3630                  */
3631                 srs_tx->st_woken_up = B_TRUE;
3632                 mutex_exit(&mac_srs->srs_lock);
3633                 return;
3634         }
3635
3636         /*
3637          * If you are here, it is for FANOUT, BW_FANOUT,
3638          * AGGR_MODE or AGGR_BW_MODE case
3639          */
3640         for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3641                 sringp = mac_srs->srs_tx_soft_rings[i];
3642                 mutex_enter(&sringp->s_ring_lock);
3643                 if (sringp->s_ring_tx_arg2 == ring) {
3644                         if (sringp->s_ring_state & S_RING_BLOCK) {
3645                                 sringp->s_ring_state &= ~S_RING_BLOCK;
3646                                 sringp->s_st_stat.mts_unblockcnt++;
3647                                 cv_signal(&sringp->s_ring_async);
3648                         }
3649                         sringp->s_ring_tx_woken_up = B_TRUE;
3650                 }
3651                 mutex_exit(&sringp->s_ring_lock);
3652         }
3653         mutex_exit(&mac_srs->srs_lock);
3654 }
3655
3656 /*
3657  * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3658  * the blocked clients again.
3659  */
3660 void
3661 mac_tx_notify(mac_impl_t *mip)
3662 {
3663         i_mac_notify(mip, MAC_NOTE_TX);
3664 }
3665
/*
 * RX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are kept here
 * only for a short period.
 */
3672
/*
 * Append the chain (mp .. tail), holding cnt packets, to the queue of
 * soft ring ringp. The caller must hold s_ring_lock. The queued byte
 * count s_ring_size is bumped by sz only for rings that have
 * ST_RING_BW_CTL set in s_ring_type.
 */
#define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
									\
	/* Empty queue: the chain becomes the queue; else link it on. */ \
	if ((ringp)->s_ring_last == NULL)				\
		(ringp)->s_ring_first = (mp);				\
	else								\
		(ringp)->s_ring_last->b_next = (mp);			\
	(ringp)->s_ring_last = (tail);					\
	(ringp)->s_ring_count += (cnt);					\
	ASSERT((ringp)->s_ring_count > 0);				\
	if (((ringp)->s_ring_type & ST_RING_BW_CTL) != 0)		\
		(ringp)->s_ring_size += (sz);				\
}
3690
3691 /*
3692  * Default entry point to deliver a packet chain to a MAC client.
3693  * If the MAC client has flows, do the classification with these
3694  * flows as well.
3695  */
3696 /* ARGSUSED */
3697 void
3698 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3699     mac_header_info_t *arg3)
3700 {
3701         mac_client_impl_t *mcip = arg1;
3702
3703         if (mcip->mci_nvids == 1 &&
3704             !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3705                 /*
3706                  * If the client has exactly one VID associated with it
3707                  * and striping of VLAN header is not disabled,
3708                  * remove the VLAN tag from the packet before
3709                  * passing it on to the client's receive callback.
3710                  * Note that this needs to be done after we dispatch
3711                  * the packet to the promiscuous listeners of the
3712                  * client, since they expect to see the whole
3713                  * frame including the VLAN headers.
3714                  */
3715                 mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3716         }
3717
3718         mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3719 }
3720
3721 /*
3722  * mac_rx_soft_ring_process
3723  *
3724  * process a chain for a given soft ring. The number of packets queued
3725  * in the SRS and its associated soft rings (including this one) is
3726  * very small (tracked by srs_poll_pkt_cnt), then allow the entering
3727  * thread (interrupt or poll thread) to do inline processing. This
3728  * helps keep the latency down under low load.
3729  *
3730  * The proc and arg for each mblk is already stored in the mblk in
3731  * appropriate places.
3732  */
3733 /* ARGSUSED */
3734 void
3735 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3736     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3737 {
3738         mac_direct_rx_t         proc;
3739         void                    *arg1;
3740         mac_resource_handle_t   arg2;
3741         mac_soft_ring_set_t     *mac_srs = ringp->s_ring_set;
3742
3743         ASSERT(ringp != NULL);
3744         ASSERT(mp_chain != NULL);
3745         ASSERT(tail != NULL);
3746         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3747
3748         mutex_enter(&ringp->s_ring_lock);
3749         ringp->s_ring_total_inpkt += cnt;
3750         ringp->s_ring_total_rbytes += sz;
3751         if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3752             !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3753                 /* If on processor or blanking on, then enqueue and return */
3754                 if (ringp->s_ring_state & S_RING_BLANK ||
3755                     ringp->s_ring_state & S_RING_PROC) {
3756                         SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3757                         mutex_exit(&ringp->s_ring_lock);
3758                         return;
3759                 }
3760                 proc = ringp->s_ring_rx_func;
3761                 arg1 = ringp->s_ring_rx_arg1;
3762                 arg2 = ringp->s_ring_rx_arg2;
3763                 /*
3764                  * See if anything is already queued. If we are the
3765                  * first packet, do inline processing else queue the
3766                  * packet and do the drain.
3767                  */
3768                 if (ringp->s_ring_first == NULL) {
3769                         /*
3770                          * Fast-path, ok to process and nothing queued.
3771                          */
3772                         ringp->s_ring_run = curthread;
3773                         ringp->s_ring_state |= (S_RING_PROC);
3774
3775                         mutex_exit(&ringp->s_ring_lock);
3776
3777                         /*
3778                          * We are the chain of 1 packet so
3779                          * go through this fast path.
3780                          */
3781                         ASSERT(mp_chain->b_next == NULL);
3782
3783                         (*proc)(arg1, arg2, mp_chain, NULL);
3784
3785                         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3786                         /*
3787                          * If we have a soft ring set which is doing
3788                          * bandwidth control, we need to decrement
3789                          * srs_size and count so it the SRS can have a
3790                          * accurate idea of what is the real data
3791                          * queued between SRS and its soft rings. We
3792                          * decrement the counters only when the packet
3793                          * gets processed by both SRS and the soft ring.
3794                          */
3795                         mutex_enter(&mac_srs->srs_lock);
3796                         MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3797                         MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3798                         mutex_exit(&mac_srs->srs_lock);
3799
3800                         mutex_enter(&ringp->s_ring_lock);
3801                         ringp->s_ring_run = NULL;
3802                         ringp->s_ring_state &= ~S_RING_PROC;
3803                         if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3804                                 cv_signal(&ringp->s_ring_client_cv);
3805
3806                         if ((ringp->s_ring_first == NULL) ||
3807                             (ringp->s_ring_state & S_RING_BLANK)) {
3808                                 /*
3809                                  * We processed inline our packet and
3810                                  * nothing new has arrived or our
3811                                  * receiver doesn't want to receive
3812                                  * any packets. We are done.
3813                                  */
3814                                 mutex_exit(&ringp->s_ring_lock);
3815                                 return;
3816                         }
3817                 } else {
3818                         SOFT_RING_ENQUEUE_CHAIN(ringp,
3819                             mp_chain, tail, cnt, sz);
3820                 }
3821
3822                 /*
3823                  * We are here because either we couldn't do inline
3824                  * processing (because something was already
3825                  * queued), or we had a chain of more than one
3826                  * packet, or something else arrived after we were
3827                  * done with inline processing.
3828                  */
3829                 ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3830                 ASSERT(ringp->s_ring_first != NULL);
3831
3832                 ringp->s_ring_drain_func(ringp);
3833                 mutex_exit(&ringp->s_ring_lock);
3834                 return;
3835         } else {
3836                 /* ST_RING_WORKER_ONLY case */
3837                 SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3838                 mac_soft_ring_worker_wakeup(ringp);
3839                 mutex_exit(&ringp->s_ring_lock);
3840         }
3841 }
3842
/*
 * TX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and are kept here
 * only for a short period.
 */
3849
/*
 * Queue the chain (mp .. tail), holding cnt packets and sz bytes, on the
 * Tx soft ring ringp and flag the ring as S_RING_ENQUEUED. The caller
 * must hold s_ring_lock.
 *
 * The chain that gets queued is the macro's own `mp' argument; the macro
 * does not rely on the caller having a variable named mp_chain.
 */
#define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
	(ringp)->s_ring_state |= S_RING_ENQUEUED;			\
	SOFT_RING_ENQUEUE_CHAIN((ringp), (mp), tail, cnt, sz);		\
}
3855
3856 /*
3857  * mac_tx_sring_queued
3858  *
3859  * When we are out of transmit descriptors and we already have a
3860  * queue that exceeds hiwat (or the client called us with
3861  * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3862  * soft ring pointer as the opaque cookie for the client enable
3863  * flow control.
3864  */
3865 static mac_tx_cookie_t
3866 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3867     mblk_t **ret_mp)
3868 {
3869         int cnt;
3870         size_t sz;
3871         mblk_t *tail;
3872         mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3873         mac_tx_cookie_t cookie = NULL;
3874         boolean_t wakeup_worker = B_TRUE;
3875
3876         ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3877         MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3878         if (flag & MAC_DROP_ON_NO_DESC) {
3879                 mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3880                 /* increment freed stats */
3881                 ringp->s_ring_drops += cnt;
3882                 cookie = (mac_tx_cookie_t)ringp;
3883         } else {
3884                 if (ringp->s_ring_first != NULL)
3885                         wakeup_worker = B_FALSE;
3886
3887                 if (flag & MAC_TX_NO_ENQUEUE) {
3888                         /*
3889                          * If QUEUED is not set, queue the packet
3890                          * and let mac_tx_soft_ring_drain() set
3891                          * the TX_BLOCKED bit for the reasons
3892                          * explained above. Otherwise, return the
3893                          * mblks.
3894                          */
3895                         if (wakeup_worker) {
3896                                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3897                                     mp_chain, tail, cnt, sz);
3898                         } else {
3899                                 ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3900                                 cookie = (mac_tx_cookie_t)ringp;
3901                                 *ret_mp = mp_chain;
3902                         }
3903                 } else {
3904                         boolean_t enqueue = B_TRUE;
3905
3906                         if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3907                                 /*
3908                                  * flow-controlled. Store ringp in cookie
3909                                  * so that it can be returned as
3910                                  * mac_tx_cookie_t to client
3911                                  */
3912                                 ringp->s_ring_state |= S_RING_TX_HIWAT;
3913                                 cookie = (mac_tx_cookie_t)ringp;
3914                                 ringp->s_ring_hiwat_cnt++;
3915                                 if (ringp->s_ring_count >
3916                                     ringp->s_ring_tx_max_q_cnt) {
3917                                         /* increment freed stats */
3918                                         ringp->s_ring_drops += cnt;
3919                                         /*
3920                                          * b_prev may be set to the fanout hint
3921                                          * hence can't use freemsg directly
3922                                          */
3923                                         mac_pkt_drop(NULL, NULL,
3924                                             mp_chain, B_FALSE);
3925                                         DTRACE_PROBE1(tx_queued_hiwat,
3926                                             mac_soft_ring_t *, ringp);
3927                                         enqueue = B_FALSE;
3928                                 }
3929                         }
3930                         if (enqueue) {
3931                                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3932                                     tail, cnt, sz);
3933                         }
3934                 }
3935                 if (wakeup_worker)
3936                         cv_signal(&ringp->s_ring_async);
3937         }
3938         return (cookie);
3939 }
3940
3941
3942 /*
3943  * mac_tx_soft_ring_process
3944  *
3945  * This routine is called when fanning out outgoing traffic among
3946  * multipe Tx rings.
3947  * Note that a soft ring is associated with a h/w Tx ring.
3948  */
3949 mac_tx_cookie_t
3950 mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3951     uint16_t flag, mblk_t **ret_mp)
3952 {
3953         mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3954         int     cnt;
3955         size_t  sz;
3956         mblk_t  *tail;
3957         mac_tx_cookie_t cookie = NULL;
3958
3959         ASSERT(ringp != NULL);
3960         ASSERT(mp_chain != NULL);
3961         ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3962         /*
3963          * The following modes can come here: SRS_TX_BW_FANOUT,
3964          * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
3965          */
3966         ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
3967         ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3968             mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
3969             mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
3970             mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
3971
3972         if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3973                 /* Serialization mode */
3974
3975                 mutex_enter(&ringp->s_ring_lock);
3976                 if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3977                         cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3978                             flag, ret_mp);
3979                         mutex_exit(&ringp->s_ring_lock);
3980                         return (cookie);
3981                 }
3982                 MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3983                 TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3984                 if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3985                         /*
3986                          * If ring is blocked due to lack of Tx
3987                          * descs, just return. Worker thread
3988                          * will get scheduled when Tx desc's
3989                          * become available.
3990                          */
3991                         mutex_exit(&ringp->s_ring_lock);
3992                         return (cookie);
3993                 }
3994                 mac_soft_ring_worker_wakeup(ringp);
3995                 mutex_exit(&ringp->s_ring_lock);
3996                 return (cookie);
3997         } else {
3998                 /* Default fanout mode */
3999                 /*
4000                  * S_RING_BLOCKED is set when underlying NIC runs
4001                  * out of Tx descs and messages start getting
4002                  * queued. It won't get reset until
4003                  * tx_srs_drain() completely drains out the
4004                  * messages.
4005                  */
4006                 mac_tx_stats_t          stats;
4007
4008                 if (ringp->s_ring_state & S_RING_ENQUEUED) {
4009                         /* Tx descs/resources not available */
4010                         mutex_enter(&ringp->s_ring_lock);
4011                         if (ringp->s_ring_state & S_RING_ENQUEUED) {
4012                                 cookie = mac_tx_sring_enqueue(ringp, mp_chain,
4013                                     flag, ret_mp);
4014                                 mutex_exit(&ringp->s_ring_lock);
4015                                 return (cookie);
4016                         }
4017                         /*
4018                          * While we were computing mblk count, the
4019                          * flow control condition got relieved.
4020                          * Continue with the transmission.
4021                          */
4022                         mutex_exit(&ringp->s_ring_lock);
4023                 }
4024
4025                 mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
4026                     ringp->s_ring_tx_arg2, mp_chain, &stats);
4027
4028                 /*
4029                  * Multiple threads could be here sending packets.
4030                  * Under such conditions, it is not possible to
4031                  * automically set S_RING_BLOCKED bit to indicate
4032                  * out of tx desc condition. To atomically set
4033                  * this, we queue the returned packet and do
4034                  * the setting of S_RING_BLOCKED in
4035                  * mac_tx_soft_ring_drain().
4036                  */
4037                 if (mp_chain != NULL) {
4038                         mutex_enter(&ringp->s_ring_lock);
4039                         cookie =
4040                             mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
4041                         mutex_exit(&ringp->s_ring_lock);
4042                         return (cookie);
4043                 }
4044                 SRS_TX_STATS_UPDATE(mac_srs, &stats);
4045                 SOFTRING_TX_STATS_UPDATE(ringp, &stats);
4046
4047                 return (NULL);
4048         }
4049 }