4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
27 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
32 static boolean_t
igb_tx(igb_tx_ring_t
*, mblk_t
*);
33 static int igb_tx_copy(igb_tx_ring_t
*, tx_control_block_t
*, mblk_t
*,
35 static int igb_tx_bind(igb_tx_ring_t
*, tx_control_block_t
*, mblk_t
*,
37 static int igb_tx_fill_ring(igb_tx_ring_t
*, link_list_t
*, tx_context_t
*,
39 static void igb_save_desc(tx_control_block_t
*, uint64_t, size_t);
40 static tx_control_block_t
*igb_get_free_list(igb_tx_ring_t
*);
41 static int igb_get_tx_context(mblk_t
*, tx_context_t
*);
42 static boolean_t
igb_check_tx_context(igb_tx_ring_t
*, tx_context_t
*);
43 static void igb_fill_tx_context(struct e1000_adv_tx_context_desc
*,
44 tx_context_t
*, uint32_t);
47 #pragma inline(igb_save_desc)
48 #pragma inline(igb_get_tx_context)
49 #pragma inline(igb_check_tx_context)
50 #pragma inline(igb_fill_tx_context)
54 igb_tx_ring_send(void *arg
, mblk_t
*mp
)
56 igb_tx_ring_t
*tx_ring
= (igb_tx_ring_t
*)arg
;
59 ASSERT(tx_ring
!= NULL
);
63 if ((igb
->igb_state
& IGB_SUSPENDED
) ||
64 (igb
->igb_state
& IGB_ERROR
) ||
65 !(igb
->igb_state
& IGB_STARTED
) ||
66 igb
->link_state
!= LINK_STATE_UP
) {
71 return ((igb_tx(tx_ring
, mp
)) ? NULL
: mp
);
75 * igb_tx - Main transmit processing
77 * Called from igb_m_tx with an mblk ready to transmit. this
78 * routine sets up the transmit descriptors and sends data to
81 * One mblk can consist of several fragments, each fragment
82 * will be processed with different methods based on the size.
83 * For the fragments with size less than the bcopy threshold,
84 * they will be processed by using bcopy; otherwise, they will
85 * be processed by using DMA binding.
87 * To process the mblk, a tx control block is got from the
88 * free list. One tx control block contains one tx buffer, which
89 * is used to copy mblk fragments' data; and one tx DMA handle,
90 * which is used to bind a mblk fragment with DMA resource.
92 * Several small mblk fragments can be copied into one tx control
93 * block's buffer, and then the buffer will be transmitted with
96 * A large fragment only binds with one tx control block's DMA
97 * handle, and it can span several tx descriptors for transmitting.
99 * So to transmit a packet (mblk), several tx control blocks can
100 * be used. After the processing, those tx control blocks will
101 * be put to the work list.
104 igb_tx(igb_tx_ring_t
*tx_ring
, mblk_t
*mp
)
106 igb_t
*igb
= tx_ring
->igb
;
107 tx_type_t current_flag
, next_flag
;
108 uint32_t current_len
, next_len
;
112 boolean_t copy_done
, eop
;
113 mblk_t
*current_mp
, *next_mp
, *nmp
;
114 tx_control_block_t
*tcb
;
115 tx_context_t tx_context
, *ctx
;
116 link_list_t pending_list
;
117 mblk_t
*hdr_new_mp
= NULL
;
118 mblk_t
*hdr_previous_mp
= NULL
;
119 mblk_t
*hdr_current_mp
= NULL
;
120 uint32_t hdr_frag_len
;
121 uint32_t hdr_len
, len
;
122 uint32_t copy_thresh
;
124 copy_thresh
= igb
->tx_copy_thresh
;
126 /* Get the mblk size */
128 for (nmp
= mp
; nmp
!= NULL
; nmp
= nmp
->b_cont
) {
129 mbsize
+= MBLKL(nmp
);
132 if (igb
->tx_hcksum_enable
) {
135 * Retrieve offloading context information from the mblk
136 * that will be used to decide whether/how to fill the
137 * context descriptor.
139 if (igb_get_tx_context(mp
, ctx
) != TX_CXT_SUCCESS
) {
144 if ((ctx
->lso_flag
&&
145 (mbsize
> (ctx
->mac_hdr_len
+ IGB_LSO_MAXLEN
))) ||
147 (mbsize
> (igb
->max_frame_size
- ETHERFCSL
)))) {
149 igb_log(igb
, IGB_LOG_INFO
, "igb_tx: packet oversize");
154 if (mbsize
> (igb
->max_frame_size
- ETHERFCSL
)) {
156 igb_log(igb
, IGB_LOG_INFO
, "igb_tx: packet oversize");
162 * Check and recycle tx descriptors.
163 * The recycle threshold here should be selected carefully
165 if (tx_ring
->tbd_free
< igb
->tx_recycle_thresh
)
166 tx_ring
->tx_recycle(tx_ring
);
169 * After the recycling, if the tbd_free is less than the
170 * tx_overload_threshold, assert overload, return B_FALSE;
171 * and we need to re-schedule the tx again.
173 if (tx_ring
->tbd_free
< igb
->tx_overload_thresh
) {
174 tx_ring
->reschedule
= B_TRUE
;
175 IGB_DEBUG_STAT(tx_ring
->stat_overload
);
180 * The software should guarantee LSO packet header(MAC+IP+TCP)
181 * to be within one descriptor - this is required by h/w.
182 * Here will reallocate and refill the header if
183 * the headers(MAC+IP+TCP) is physical memory non-contiguous.
185 if (ctx
&& ctx
->lso_flag
) {
186 hdr_len
= ctx
->mac_hdr_len
+ ctx
->ip_hdr_len
+ ctx
->l4_hdr_len
;
189 while (len
< hdr_len
) {
190 hdr_previous_mp
= hdr_current_mp
;
191 hdr_current_mp
= hdr_current_mp
->b_cont
;
192 len
+= MBLKL(hdr_current_mp
);
195 * If the header and the payload are in different mblks,
196 * we simply force the header to be copied into pre-allocated
197 * page-aligned buffer.
200 goto adjust_threshold
;
202 hdr_frag_len
= hdr_len
- (len
- MBLKL(hdr_current_mp
));
204 * There are two cases we will reallocate
205 * a mblk for the last header fragment.
206 * 1. the header is in multiple mblks and
207 * the last fragment shares the same mblk
209 * 2. the header is in a single mblk shared
210 * with the payload but the header crosses
213 if ((hdr_current_mp
!= mp
) ||
214 (P2NPHASE((uintptr_t)hdr_current_mp
->b_rptr
, igb
->page_size
)
217 * reallocate the mblk for the last header fragment,
218 * expect it to be copied into pre-allocated
219 * page-aligned buffer
221 hdr_new_mp
= allocb(hdr_frag_len
, 0);
226 /* link the new header fragment with the other parts */
227 bcopy(hdr_current_mp
->b_rptr
,
228 hdr_new_mp
->b_rptr
, hdr_frag_len
);
229 hdr_new_mp
->b_wptr
= hdr_new_mp
->b_rptr
+ hdr_frag_len
;
230 hdr_new_mp
->b_cont
= hdr_current_mp
;
232 hdr_previous_mp
->b_cont
= hdr_new_mp
;
235 hdr_current_mp
->b_rptr
+= hdr_frag_len
;
239 * adjust the bcopy threshhold to guarantee
240 * the header to use bcopy way
242 if (copy_thresh
< hdr_len
)
243 copy_thresh
= hdr_len
;
247 * The pending_list is a linked list that is used to save
248 * the tx control blocks that have packet data processed
249 * but have not put the data to the tx descriptor ring.
250 * It is used to reduce the lock contention of the tx_lock.
252 LINK_LIST_INIT(&pending_list
);
257 current_len
= MBLKL(current_mp
);
259 * Decide which method to use for the first fragment
261 current_flag
= (current_len
<= copy_thresh
) ?
264 * If the mblk includes several contiguous small fragments,
265 * they may be copied into one buffer. This flag is used to
266 * indicate whether there are pending fragments that need to
267 * be copied to the current tx buffer.
269 * If this flag is B_TRUE, it indicates that a new tx control
270 * block is needed to process the next fragment using either
271 * copy or DMA binding.
273 * Otherwise, it indicates that the next fragment will be
274 * copied to the current tx buffer that is maintained by the
275 * current tx control block. No new tx control block is needed.
279 next_mp
= current_mp
->b_cont
;
280 eop
= (next_mp
== NULL
); /* Last fragment of the packet? */
281 next_len
= eop
? 0: MBLKL(next_mp
);
284 * When the current fragment is an empty fragment, if
285 * the next fragment will still be copied to the current
286 * tx buffer, we cannot skip this fragment here. Because
287 * the copy processing is pending for completion. We have
288 * to process this empty fragment in the tx_copy routine.
290 * If the copy processing is completed or a DMA binding
291 * processing is just completed, we can just skip this
294 if ((current_len
== 0) && (copy_done
)) {
295 current_mp
= next_mp
;
296 current_len
= next_len
;
297 current_flag
= (current_len
<= copy_thresh
) ?
304 * Get a new tx control block from the free list
306 tcb
= igb_get_free_list(tx_ring
);
309 IGB_DEBUG_STAT(tx_ring
->stat_fail_no_tcb
);
314 * Push the tx control block to the pending list
315 * to avoid using lock too early
317 LIST_PUSH_TAIL(&pending_list
, &tcb
->link
);
320 if (current_flag
== USE_COPY
) {
322 * Check whether to use bcopy or DMA binding to process
323 * the next fragment, and if using bcopy, whether we
324 * need to continue copying the next fragment into the
327 ASSERT((tcb
->tx_buf
.len
+ current_len
) <=
332 * This is the last fragment of the packet, so
333 * the copy processing will be completed with
336 next_flag
= USE_NONE
;
338 } else if ((tcb
->tx_buf
.len
+ current_len
+ next_len
) >
341 * If the next fragment is too large to be
342 * copied to the current tx buffer, we need
343 * to complete the current copy processing.
345 next_flag
= (next_len
> copy_thresh
) ?
348 } else if (next_len
> copy_thresh
) {
350 * The next fragment needs to be processed with
351 * DMA binding. So the copy prcessing will be
352 * completed with the current fragment.
358 * Continue to copy the next fragment to the
361 next_flag
= USE_COPY
;
365 desc_num
= igb_tx_copy(tx_ring
, tcb
, current_mp
,
366 current_len
, copy_done
);
369 * Check whether to use bcopy or DMA binding to process
372 next_flag
= (next_len
> copy_thresh
) ?
374 ASSERT(copy_done
== B_TRUE
);
376 desc_num
= igb_tx_bind(tx_ring
, tcb
, current_mp
,
381 desc_total
+= desc_num
;
382 else if (desc_num
< 0)
385 current_mp
= next_mp
;
386 current_len
= next_len
;
387 current_flag
= next_flag
;
391 * Attach the mblk to the last tx control block
394 ASSERT(tcb
->mp
== NULL
);
398 * Before fill the tx descriptor ring with the data, we need to
399 * ensure there are adequate free descriptors for transmit
400 * (including one context descriptor).
401 * Do not use up all the tx descriptors.
402 * Otherwise tx recycle will fail and cause false hang.
404 if (tx_ring
->tbd_free
<= (desc_total
+ 1)) {
405 tx_ring
->tx_recycle(tx_ring
);
408 mutex_enter(&tx_ring
->tx_lock
);
411 * If the number of free tx descriptors is not enough for transmit
412 * then return failure.
414 * Note: we must put this check under the mutex protection to
415 * ensure the correctness when multiple threads access it in
418 if (tx_ring
->tbd_free
<= (desc_total
+ 1)) {
419 IGB_DEBUG_STAT(tx_ring
->stat_fail_no_tbd
);
420 mutex_exit(&tx_ring
->tx_lock
);
424 desc_num
= igb_tx_fill_ring(tx_ring
, &pending_list
, ctx
, mbsize
);
426 ASSERT((desc_num
== desc_total
) || (desc_num
== (desc_total
+ 1)));
428 /* Update per-ring tx statistics */
430 tx_ring
->tx_bytes
+= mbsize
;
432 mutex_exit(&tx_ring
->tx_lock
);
438 * If new mblk has been allocted for the last header
439 * fragment of a LSO packet, we should restore the
443 hdr_new_mp
->b_cont
= NULL
;
445 hdr_current_mp
->b_rptr
-= hdr_frag_len
;
447 hdr_previous_mp
->b_cont
= hdr_current_mp
;
453 * Discard the mblk and free the used resources
455 tcb
= (tx_control_block_t
*)LIST_GET_HEAD(&pending_list
);
461 tcb
= (tx_control_block_t
*)
462 LIST_GET_NEXT(&pending_list
, &tcb
->link
);
466 * Return the tx control blocks in the pending list to the free list.
468 igb_put_free_list(tx_ring
, &pending_list
);
470 /* Transmit failed, do not drop the mblk, rechedule the transmit */
471 tx_ring
->reschedule
= B_TRUE
;
479 * Copy the mblk fragment to the pre-allocated tx buffer
482 igb_tx_copy(igb_tx_ring_t
*tx_ring
, tx_control_block_t
*tcb
, mblk_t
*mp
,
483 uint32_t len
, boolean_t copy_done
)
485 dma_buffer_t
*tx_buf
;
487 _NOTE(ARGUNUSED(tx_ring
));
489 tx_buf
= &tcb
->tx_buf
;
492 * Copy the packet data of the mblk fragment into the
493 * pre-allocated tx buffer, which is maintained by the
496 * Several mblk fragments can be copied into one tx buffer.
497 * The destination address of the current copied fragment in
498 * the tx buffer is next to the end of the previous copied
502 bcopy(mp
->b_rptr
, tx_buf
->address
+ tx_buf
->len
, len
);
511 * If it is the last fragment copied to the current tx buffer,
512 * in other words, if there's no remaining fragment or the remaining
513 * fragment requires a new tx control block to process, we need to
514 * complete the current copy processing by syncing up the current
515 * DMA buffer and saving the descriptor data.
519 * Sync the DMA buffer of the packet data
521 DMA_SYNC(tx_buf
, DDI_DMA_SYNC_FORDEV
);
523 tcb
->tx_type
= USE_COPY
;
526 * Save the address and length to the private data structure
527 * of the tx control block, which will be used to fill the
528 * tx descriptor ring after all the fragments are processed.
530 igb_save_desc(tcb
, tx_buf
->dma_address
, tx_buf
->len
);
540 * Bind the mblk fragment with DMA
543 igb_tx_bind(igb_tx_ring_t
*tx_ring
, tx_control_block_t
*tcb
, mblk_t
*mp
,
547 ddi_dma_cookie_t dma_cookie
;
552 * Use DMA binding to process the mblk fragment
554 status
= ddi_dma_addr_bind_handle(tcb
->tx_dma_handle
, NULL
,
555 (caddr_t
)mp
->b_rptr
, len
,
556 DDI_DMA_WRITE
| DDI_DMA_STREAMING
, DDI_DMA_DONTWAIT
,
557 0, &dma_cookie
, &ncookies
);
559 if (status
!= DDI_DMA_MAPPED
) {
560 IGB_DEBUG_STAT(tx_ring
->stat_fail_dma_bind
);
565 tcb
->tx_type
= USE_DMA
;
567 * Each fragment can span several cookies. One cookie will have
568 * one tx descriptor to transmit.
571 for (i
= ncookies
; i
> 0; i
--) {
573 * Save the address and length to the private data structure
574 * of the tx control block, which will be used to fill the
575 * tx descriptor ring after all the fragments are processed.
578 dma_cookie
.dmac_laddress
,
579 dma_cookie
.dmac_size
);
584 ddi_dma_nextcookie(tcb
->tx_dma_handle
, &dma_cookie
);
593 * Get the tx context information from the mblk
596 igb_get_tx_context(mblk_t
*mp
, tx_context_t
*ctx
)
608 uint32_t mac_hdr_len
;
614 mac_hcksum_get(mp
, &start
, NULL
, NULL
, NULL
, &flags
);
615 bzero(ctx
, sizeof (tx_context_t
));
617 ctx
->hcksum_flags
= flags
;
620 return (TX_CXT_SUCCESS
);
622 mac_lso_get(mp
, &mss
, &lso_flag
);
624 ctx
->lso_flag
= (lso_flag
== HW_LSO
);
631 * Firstly get the position of the ether_type/ether_tpid.
632 * Here we don't assume the ether (VLAN) header is fully included
633 * in one mblk fragment, so we go thourgh the fragments to parse
636 size
= len
= MBLKL(mp
);
637 offset
= offsetof(struct ether_header
, ether_type
);
638 while (size
<= offset
) {
644 pos
= mp
->b_rptr
+ offset
+ len
- size
;
646 etype
= ntohs(*(ushort_t
*)(uintptr_t)pos
);
647 if (etype
== ETHERTYPE_VLAN
) {
649 * Get the position of the ether_type in VLAN header
651 offset
= offsetof(struct ether_vlan_header
, ether_type
);
652 while (size
<= offset
) {
658 pos
= mp
->b_rptr
+ offset
+ len
- size
;
660 etype
= ntohs(*(ushort_t
*)(uintptr_t)pos
);
661 mac_hdr_len
= sizeof (struct ether_vlan_header
);
663 mac_hdr_len
= sizeof (struct ether_header
);
667 * Here we assume the IP(V6) header is fully included in one
670 lso_cksum
= HCK_PARTIALCKSUM
;
671 ctx
->l3_proto
= etype
;
674 offset
= mac_hdr_len
;
675 while (size
<= offset
) {
681 pos
= mp
->b_rptr
+ offset
+ len
- size
;
684 *((uint16_t *)(uintptr_t)(pos
+ offsetof(ipha_t
,
688 * To utilize igb LSO, here need to fill
689 * the tcp checksum field of the packet with the
690 * following pseudo-header checksum:
691 * (ip_source_addr, ip_destination_addr, l4_proto)
692 * and also need to fill the ip header checksum
693 * with zero. Currently the tcp/ip stack has done
696 lso_cksum
|= HCK_IPV4_HDRCKSUM
;
699 l4_proto
= *(uint8_t *)(pos
+ offsetof(ipha_t
, ipha_protocol
));
703 * We need to zero out the length in the header.
706 offset
= offsetof(ip6_t
, ip6_plen
) + mac_hdr_len
;
707 while (size
<= offset
) {
713 pos
= mp
->b_rptr
+ offset
+ len
- size
;
714 *((uint16_t *)(uintptr_t)(pos
)) = 0;
717 offset
= offsetof(ip6_t
, ip6_nxt
) + mac_hdr_len
;
718 while (size
<= offset
) {
724 pos
= mp
->b_rptr
+ offset
+ len
- size
;
726 l4_proto
= *(uint8_t *)pos
;
729 /* Unrecoverable error */
730 igb_log(NULL
, IGB_LOG_INFO
, "Ethernet type field error with "
731 "tx hcksum flag set");
732 return (TX_CXT_E_ETHER_TYPE
);
737 * LSO relies on tx h/w checksum, so here the packet will be
738 * dropped if the h/w checksum flags are not set.
740 if ((ctx
->hcksum_flags
& lso_cksum
) != lso_cksum
) {
741 igb_log(NULL
, IGB_LOG_INFO
, "igb_tx: h/w "
742 "checksum flags are not set for LSO, found "
743 "0x%x, needed bits 0x%x", ctx
->hcksum_flags
,
745 return (TX_CXT_E_LSO_CSUM
);
748 offset
= mac_hdr_len
+ start
;
749 while (size
<= offset
) {
755 pos
= mp
->b_rptr
+ offset
+ len
- size
;
757 l4_hdr_len
= TCP_HDR_LENGTH((tcph_t
*)pos
);
760 * l4 header length is only required for LSO
765 ctx
->mac_hdr_len
= mac_hdr_len
;
766 ctx
->ip_hdr_len
= start
;
767 ctx
->l4_proto
= l4_proto
;
768 ctx
->l4_hdr_len
= l4_hdr_len
;
770 return (TX_CXT_SUCCESS
);
774 * igb_check_tx_context
776 * Check if a new context descriptor is needed
779 igb_check_tx_context(igb_tx_ring_t
*tx_ring
, tx_context_t
*ctx
)
787 * Compare the context data retrieved from the mblk and the
788 * stored context data of the last context descriptor. The data
789 * need to be checked are:
793 * mss (only check for LSO)
794 * l4_hdr_len (only check for LSO)
797 * Either one of the above data is changed, a new context descriptor
800 last
= &tx_ring
->tx_context
;
802 if (ctx
->hcksum_flags
!= 0) {
803 if ((ctx
->hcksum_flags
!= last
->hcksum_flags
) ||
804 (ctx
->l4_proto
!= last
->l4_proto
) ||
805 (ctx
->l3_proto
!= last
->l3_proto
) ||
806 (ctx
->lso_flag
&& ((ctx
->mss
!= last
->mss
) ||
807 (ctx
->l4_hdr_len
!= last
->l4_hdr_len
))) ||
808 (ctx
->ip_hdr_len
!= last
->ip_hdr_len
) ||
809 (ctx
->mac_hdr_len
!= last
->mac_hdr_len
)) {
818 * igb_fill_tx_context
820 * Fill the context descriptor with hardware checksum informations
823 igb_fill_tx_context(struct e1000_adv_tx_context_desc
*ctx_tbd
,
824 tx_context_t
*ctx
, uint32_t ring_index
)
827 * Fill the context descriptor with the checksum
828 * context information we've got
830 ctx_tbd
->vlan_macip_lens
= ctx
->ip_hdr_len
;
831 ctx_tbd
->vlan_macip_lens
|= ctx
->mac_hdr_len
<<
832 E1000_ADVTXD_MACLEN_SHIFT
;
834 ctx_tbd
->type_tucmd_mlhl
=
835 E1000_ADVTXD_DCMD_DEXT
| E1000_ADVTXD_DTYP_CTXT
;
838 * When we have a TX context set up, we enforce that the ethertype is
839 * either IPv4 or IPv6 in igb_get_tx_context().
841 if (ctx
->lso_flag
|| ctx
->hcksum_flags
& HCK_IPV4_HDRCKSUM
) {
842 if (ctx
->l3_proto
== ETHERTYPE_IP
) {
843 ctx_tbd
->type_tucmd_mlhl
|= E1000_ADVTXD_TUCMD_IPV4
;
845 ctx_tbd
->type_tucmd_mlhl
|= E1000_ADVTXD_TUCMD_IPV6
;
849 if (ctx
->lso_flag
|| ctx
->hcksum_flags
& HCK_PARTIALCKSUM
) {
850 switch (ctx
->l4_proto
) {
852 ctx_tbd
->type_tucmd_mlhl
|= E1000_ADVTXD_TUCMD_L4T_TCP
;
856 * We don't have to explicitly set:
857 * ctx_tbd->type_tucmd_mlhl |=
858 * E1000_ADVTXD_TUCMD_L4T_UDP;
859 * Because E1000_ADVTXD_TUCMD_L4T_UDP == 0b
863 /* Unrecoverable error */
864 igb_log(NULL
, IGB_LOG_INFO
,
865 "L4 type error with tx hcksum");
870 ctx_tbd
->seqnum_seed
= 0;
871 ctx_tbd
->mss_l4len_idx
= ring_index
<< 4;
873 ctx_tbd
->mss_l4len_idx
|=
874 (ctx
->l4_hdr_len
<< E1000_ADVTXD_L4LEN_SHIFT
) |
875 (ctx
->mss
<< E1000_ADVTXD_MSS_SHIFT
);
882 * Fill the tx descriptor ring with the data
885 igb_tx_fill_ring(igb_tx_ring_t
*tx_ring
, link_list_t
*pending_list
,
886 tx_context_t
*ctx
, size_t mbsize
)
888 struct e1000_hw
*hw
= &tx_ring
->igb
->hw
;
889 boolean_t load_context
;
890 uint32_t index
, tcb_index
, desc_num
;
891 union e1000_adv_tx_desc
*tbd
, *first_tbd
;
892 tx_control_block_t
*tcb
, *first_tcb
;
893 uint32_t hcksum_flags
;
895 igb_t
*igb
= tx_ring
->igb
;
897 ASSERT(mutex_owned(&tx_ring
->tx_lock
));
904 load_context
= B_FALSE
;
907 * Get the index of the first tx descriptor that will be filled,
908 * and the index of the first work list item that will be attached
909 * with the first used tx control block in the pending list.
910 * Note: the two indexes are the same.
912 index
= tx_ring
->tbd_tail
;
913 tcb_index
= tx_ring
->tbd_tail
;
916 hcksum_flags
= ctx
->hcksum_flags
;
919 * Check if a new context descriptor is needed for this packet
921 load_context
= igb_check_tx_context(tx_ring
, ctx
);
923 tbd
= &tx_ring
->tbd_ring
[index
];
926 * Fill the context descriptor with the
927 * hardware checksum offload informations.
930 (struct e1000_adv_tx_context_desc
*)tbd
,
931 ctx
, tx_ring
->index
);
933 index
= NEXT_INDEX(index
, 1, tx_ring
->ring_size
);
937 * Store the checksum context data if
938 * a new context descriptor is added
940 tx_ring
->tx_context
= *ctx
;
944 first_tbd
= &tx_ring
->tbd_ring
[index
];
947 * Fill tx data descriptors with the data saved in the pending list.
948 * The tx control blocks in the pending list are added to the work list
951 * The work list is strictly 1:1 corresponding to the descriptor ring.
952 * One item of the work list corresponds to one tx descriptor. Because
953 * one tx control block can span multiple tx descriptors, the tx
954 * control block will be added to the first work list item that
955 * corresponds to the first tx descriptor generated from that tx
958 tcb
= (tx_control_block_t
*)LIST_POP_HEAD(pending_list
);
960 while (tcb
!= NULL
) {
962 for (i
= 0; i
< tcb
->desc_num
; i
++) {
963 tbd
= &tx_ring
->tbd_ring
[index
];
965 tbd
->read
.buffer_addr
= tcb
->desc
[i
].address
;
966 tbd
->read
.cmd_type_len
= tcb
->desc
[i
].length
;
968 tbd
->read
.cmd_type_len
|= E1000_ADVTXD_DCMD_RS
|
969 E1000_ADVTXD_DCMD_DEXT
| E1000_ADVTXD_DTYP_DATA
|
970 E1000_ADVTXD_DCMD_IFCS
;
972 tbd
->read
.olinfo_status
= 0;
974 index
= NEXT_INDEX(index
, 1, tx_ring
->ring_size
);
979 * Add the tx control block to the work list
981 ASSERT(tx_ring
->work_list
[tcb_index
] == NULL
);
982 tx_ring
->work_list
[tcb_index
] = tcb
;
985 tcb
= (tx_control_block_t
*)LIST_POP_HEAD(pending_list
);
990 * Count the checksum context descriptor for
991 * the first tx control block.
993 first_tcb
->desc_num
++;
995 first_tcb
->last_index
= PREV_INDEX(index
, 1, tx_ring
->ring_size
);
998 * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
999 * valid in the first descriptor of the packet.
1000 * 82576 also requires the payload length setting even without LSO
1002 ASSERT(first_tbd
!= NULL
);
1003 first_tbd
->read
.cmd_type_len
|= E1000_ADVTXD_DCMD_IFCS
;
1004 if (ctx
!= NULL
&& ctx
->lso_flag
) {
1005 first_tbd
->read
.cmd_type_len
|= E1000_ADVTXD_DCMD_TSE
;
1006 first_tbd
->read
.olinfo_status
|=
1007 (mbsize
- ctx
->mac_hdr_len
- ctx
->ip_hdr_len
1008 - ctx
->l4_hdr_len
) << E1000_ADVTXD_PAYLEN_SHIFT
;
1010 if (hw
->mac
.type
>= e1000_82576
) {
1011 first_tbd
->read
.olinfo_status
|=
1012 (mbsize
<< E1000_ADVTXD_PAYLEN_SHIFT
);
1016 /* Set hardware checksum bits */
1017 if (hcksum_flags
!= 0) {
1018 if (hcksum_flags
& HCK_IPV4_HDRCKSUM
)
1019 first_tbd
->read
.olinfo_status
|=
1020 E1000_TXD_POPTS_IXSM
<< 8;
1021 if (hcksum_flags
& HCK_PARTIALCKSUM
)
1022 first_tbd
->read
.olinfo_status
|=
1023 E1000_TXD_POPTS_TXSM
<< 8;
1024 first_tbd
->read
.olinfo_status
|= tx_ring
->index
<< 4;
1028 * The last descriptor of packet needs End Of Packet (EOP),
1029 * and Report Status (RS) bits set
1031 ASSERT(tbd
!= NULL
);
1032 tbd
->read
.cmd_type_len
|=
1033 E1000_ADVTXD_DCMD_EOP
| E1000_ADVTXD_DCMD_RS
;
1035 IGB_DEBUG_STAT(tx_ring
->stat_pkt_cnt
);
1038 * Sync the DMA buffer of the tx descriptor ring
1040 DMA_SYNC(&tx_ring
->tbd_area
, DDI_DMA_SYNC_FORDEV
);
1043 * Update the number of the free tx descriptors.
1044 * The mutual exclusion between the transmission and the recycling
1045 * (for the tx descriptor ring and the work list) is implemented
1046 * with the atomic operation on the number of the free tx descriptors.
1048 * Note: we should always decrement the counter tbd_free before
1049 * advancing the hardware TDT pointer to avoid the race condition -
1050 * before the counter tbd_free is decremented, the transmit of the
1051 * tx descriptors has done and the counter tbd_free is increased by
1054 i
= igb_atomic_reserve(&tx_ring
->tbd_free
, desc_num
);
1057 tx_ring
->tbd_tail
= index
;
1060 * Advance the hardware TDT pointer of the tx descriptor ring
1062 E1000_WRITE_REG(hw
, E1000_TDT(tx_ring
->index
), index
);
1064 if (igb_check_acc_handle(igb
->osdep
.reg_handle
) != DDI_FM_OK
) {
1065 ddi_fm_service_impact(igb
->dip
, DDI_SERVICE_DEGRADED
);
1066 atomic_or_32(&igb
->igb_state
, IGB_ERROR
);
1075 * Save the address/length pair to the private array
1076 * of the tx control block. The address/length pairs
1077 * will be filled into the tx descriptor ring later.
1080 igb_save_desc(tx_control_block_t
*tcb
, uint64_t address
, size_t length
)
1084 desc
= &tcb
->desc
[tcb
->desc_num
];
1085 desc
->address
= address
;
1086 desc
->length
= length
;
1092 * igb_tx_recycle_legacy
1094 * Recycle the tx descriptors and tx control blocks.
1096 * The work list is traversed to check if the corresponding
1097 * tx descriptors have been transmitted. If so, the resources
1098 * bound to the tx control blocks will be freed, and those
1099 * tx control blocks will be returned to the free list.
1102 igb_tx_recycle_legacy(igb_tx_ring_t
*tx_ring
)
1104 uint32_t index
, last_index
, next_index
;
1106 boolean_t desc_done
;
1107 tx_control_block_t
*tcb
;
1108 link_list_t pending_list
;
1109 igb_t
*igb
= tx_ring
->igb
;
1112 * The mutex_tryenter() is used to avoid unnecessary
1115 if (mutex_tryenter(&tx_ring
->recycle_lock
) == 0)
1118 ASSERT(tx_ring
->tbd_free
<= tx_ring
->ring_size
);
1120 if (tx_ring
->tbd_free
== tx_ring
->ring_size
) {
1121 tx_ring
->recycle_fail
= 0;
1122 tx_ring
->stall_watchdog
= 0;
1123 mutex_exit(&tx_ring
->recycle_lock
);
1128 * Sync the DMA buffer of the tx descriptor ring
1130 DMA_SYNC(&tx_ring
->tbd_area
, DDI_DMA_SYNC_FORKERNEL
);
1132 if (igb_check_dma_handle(
1133 tx_ring
->tbd_area
.dma_handle
) != DDI_FM_OK
) {
1134 mutex_exit(&tx_ring
->recycle_lock
);
1135 ddi_fm_service_impact(igb
->dip
, DDI_SERVICE_DEGRADED
);
1136 atomic_or_32(&igb
->igb_state
, IGB_ERROR
);
1140 LINK_LIST_INIT(&pending_list
);
1142 index
= tx_ring
->tbd_head
; /* Index of next tbd/tcb to recycle */
1144 tcb
= tx_ring
->work_list
[index
];
1145 ASSERT(tcb
!= NULL
);
1147 while (tcb
!= NULL
) {
1150 * Get the last tx descriptor of this packet.
1151 * If the last tx descriptor is done, then
1152 * we can recycle all descriptors of a packet
1153 * which usually includes several tx control blocks.
1154 * For some chips, LSO descriptors can not be recycled
1155 * unless the whole packet's transmission is done.
1156 * That's why packet level recycling is used here.
1158 last_index
= tcb
->last_index
;
1160 * MAX_TX_RING_SIZE is used to judge whether
1161 * the index is a valid value or not.
1163 if (last_index
== MAX_TX_RING_SIZE
)
1166 next_index
= NEXT_INDEX(last_index
, 1, tx_ring
->ring_size
);
1169 * Check if the Descriptor Done bit is set
1171 desc_done
= tx_ring
->tbd_ring
[last_index
].wb
.status
&
1174 while (tcb
!= NULL
) {
1176 * Strip off the tx control block from the work
1177 * list, and add it to the pending list.
1179 tx_ring
->work_list
[index
] = NULL
;
1180 LIST_PUSH_TAIL(&pending_list
, &tcb
->link
);
1183 * Count the total number of the tx descriptors
1186 desc_num
+= tcb
->desc_num
;
1189 * Advance the index of the tx descriptor ring
1191 index
= NEXT_INDEX(index
, tcb
->desc_num
,
1192 tx_ring
->ring_size
);
1194 tcb
= tx_ring
->work_list
[index
];
1195 if (index
== next_index
)
1204 * If no tx descriptors are recycled, no need to do more processing
1206 if (desc_num
== 0) {
1207 tx_ring
->recycle_fail
++;
1208 mutex_exit(&tx_ring
->recycle_lock
);
1212 tx_ring
->recycle_fail
= 0;
1213 tx_ring
->stall_watchdog
= 0;
1216 * Update the head index of the tx descriptor ring
1218 tx_ring
->tbd_head
= index
;
1221 * Update the number of the free tx descriptors with atomic operations
1223 atomic_add_32(&tx_ring
->tbd_free
, desc_num
);
1225 mutex_exit(&tx_ring
->recycle_lock
);
1228 * Free the resources used by the tx control blocks
1229 * in the pending list
1231 tcb
= (tx_control_block_t
*)LIST_GET_HEAD(&pending_list
);
1232 while (tcb
!= NULL
) {
1234 * Release the resources occupied by the tx control block
1238 tcb
= (tx_control_block_t
*)
1239 LIST_GET_NEXT(&pending_list
, &tcb
->link
);
1243 * Add the tx control blocks in the pending list to the free list.
1245 igb_put_free_list(tx_ring
, &pending_list
);
1251 * igb_tx_recycle_head_wb
1253 * Check the head write-back, and recycle all the transmitted
1254 * tx descriptors and tx control blocks.
1257 igb_tx_recycle_head_wb(igb_tx_ring_t
*tx_ring
)
1262 tx_control_block_t
*tcb
;
1263 link_list_t pending_list
;
1264 igb_t
*igb
= tx_ring
->igb
;
1267 * The mutex_tryenter() is used to avoid unnecessary
1270 if (mutex_tryenter(&tx_ring
->recycle_lock
) == 0)
1273 ASSERT(tx_ring
->tbd_free
<= tx_ring
->ring_size
);
1275 if (tx_ring
->tbd_free
== tx_ring
->ring_size
) {
1276 tx_ring
->recycle_fail
= 0;
1277 tx_ring
->stall_watchdog
= 0;
1278 mutex_exit(&tx_ring
->recycle_lock
);
1283 * Sync the DMA buffer of the tx descriptor ring
1285 * Note: For head write-back mode, the tx descriptors will not
1286 * be written back, but the head write-back value is stored at
1287 * the last extra tbd at the end of the DMA area, we still need
1288 * to sync the head write-back value for kernel.
1290 * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
1292 (void) ddi_dma_sync(tx_ring
->tbd_area
.dma_handle
,
1293 sizeof (union e1000_adv_tx_desc
) * tx_ring
->ring_size
,
1295 DDI_DMA_SYNC_FORKERNEL
);
1297 if (igb_check_dma_handle(
1298 tx_ring
->tbd_area
.dma_handle
) != DDI_FM_OK
) {
1299 mutex_exit(&tx_ring
->recycle_lock
);
1300 ddi_fm_service_impact(igb
->dip
, DDI_SERVICE_DEGRADED
);
1301 atomic_or_32(&igb
->igb_state
, IGB_ERROR
);
1305 LINK_LIST_INIT(&pending_list
);
1307 index
= tx_ring
->tbd_head
; /* Next index to clean */
1310 * Get the value of head write-back
1312 head_wb
= *tx_ring
->tbd_head_wb
;
1313 while (index
!= head_wb
) {
1314 tcb
= tx_ring
->work_list
[index
];
1315 ASSERT(tcb
!= NULL
);
1317 if (OFFSET(index
, head_wb
, tx_ring
->ring_size
) <
1320 * The current tx control block is not
1321 * completely transmitted, stop recycling
1327 * Strip off the tx control block from the work list,
1328 * and add it to the pending list.
1330 tx_ring
->work_list
[index
] = NULL
;
1331 LIST_PUSH_TAIL(&pending_list
, &tcb
->link
);
1334 * Advance the index of the tx descriptor ring
1336 index
= NEXT_INDEX(index
, tcb
->desc_num
, tx_ring
->ring_size
);
1339 * Count the total number of the tx descriptors recycled
1341 desc_num
+= tcb
->desc_num
;
1345 * If no tx descriptors are recycled, no need to do more processing
1347 if (desc_num
== 0) {
1348 tx_ring
->recycle_fail
++;
1349 mutex_exit(&tx_ring
->recycle_lock
);
1353 tx_ring
->recycle_fail
= 0;
1354 tx_ring
->stall_watchdog
= 0;
1357 * Update the head index of the tx descriptor ring
1359 tx_ring
->tbd_head
= index
;
1362 * Update the number of the free tx descriptors with atomic operations
1364 atomic_add_32(&tx_ring
->tbd_free
, desc_num
);
1366 mutex_exit(&tx_ring
->recycle_lock
);
1369 * Free the resources used by the tx control blocks
1370 * in the pending list
1372 tcb
= (tx_control_block_t
*)LIST_GET_HEAD(&pending_list
);
1375 * Release the resources occupied by the tx control block
1379 tcb
= (tx_control_block_t
*)
1380 LIST_GET_NEXT(&pending_list
, &tcb
->link
);
1384 * Add the tx control blocks in the pending list to the free list.
1386 igb_put_free_list(tx_ring
, &pending_list
);
1392 * igb_free_tcb - free up the tx control block
1394 * Free the resources of the tx control block, including
1395 * unbind the previously bound DMA handle, and reset other
1399 igb_free_tcb(tx_control_block_t
*tcb
)
1401 switch (tcb
->tx_type
) {
1404 * Reset the buffer length that is used for copy
1406 tcb
->tx_buf
.len
= 0;
1410 * Release the DMA resource that is used for
1413 (void) ddi_dma_unbind_handle(tcb
->tx_dma_handle
);
1422 if (tcb
->mp
!= NULL
) {
1427 tcb
->tx_type
= USE_NONE
;
1428 tcb
->last_index
= MAX_TX_RING_SIZE
;
1434 * igb_get_free_list - Get a free tx control block from the free list
1436 * The atomic operation on the number of the available tx control block
1437 * in the free list is used to keep this routine mutual exclusive with
1438 * the routine igb_put_check_list.
1440 static tx_control_block_t
*
1441 igb_get_free_list(igb_tx_ring_t
*tx_ring
)
1443 tx_control_block_t
*tcb
;
1446 * Check and update the number of the free tx control block
1449 if (igb_atomic_reserve(&tx_ring
->tcb_free
, 1) < 0)
1452 mutex_enter(&tx_ring
->tcb_head_lock
);
1454 tcb
= tx_ring
->free_list
[tx_ring
->tcb_head
];
1455 ASSERT(tcb
!= NULL
);
1456 tx_ring
->free_list
[tx_ring
->tcb_head
] = NULL
;
1457 tx_ring
->tcb_head
= NEXT_INDEX(tx_ring
->tcb_head
, 1,
1458 tx_ring
->free_list_size
);
1460 mutex_exit(&tx_ring
->tcb_head_lock
);
1468 * Put a list of used tx control blocks back to the free list
1470 * A mutex is used here to ensure the serialization. The mutual exclusion
1471 * between igb_get_free_list and igb_put_free_list is implemented with
1472 * the atomic operation on the counter tcb_free.
1475 igb_put_free_list(igb_tx_ring_t
*tx_ring
, link_list_t
*pending_list
)
1479 tx_control_block_t
*tcb
;
1481 mutex_enter(&tx_ring
->tcb_tail_lock
);
1483 index
= tx_ring
->tcb_tail
;
1486 tcb
= (tx_control_block_t
*)LIST_POP_HEAD(pending_list
);
1487 while (tcb
!= NULL
) {
1488 ASSERT(tx_ring
->free_list
[index
] == NULL
);
1489 tx_ring
->free_list
[index
] = tcb
;
1493 index
= NEXT_INDEX(index
, 1, tx_ring
->free_list_size
);
1495 tcb
= (tx_control_block_t
*)LIST_POP_HEAD(pending_list
);
1498 tx_ring
->tcb_tail
= index
;
1501 * Update the number of the free tx control block
1502 * in the free list. This operation must be placed
1503 * under the protection of the lock.
1505 atomic_add_32(&tx_ring
->tcb_free
, tcb_num
);
1507 mutex_exit(&tx_ring
->tcb_tail_lock
);