/*
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 */

/*
 * Copyright(c) 2007-2010 Intel Corporation. All rights reserved.
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2016 OmniTI Computer Consulting, Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */
static int ixgbe_tx_copy(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t, boolean_t);
static int ixgbe_tx_bind(ixgbe_tx_ring_t *, tx_control_block_t *, mblk_t *,
    uint32_t);
static int ixgbe_tx_fill_ring(ixgbe_tx_ring_t *, link_list_t *,
    ixgbe_tx_context_t *, size_t);
static void ixgbe_save_desc(tx_control_block_t *, uint64_t, size_t);
static tx_control_block_t *ixgbe_get_free_list(ixgbe_tx_ring_t *);

static int ixgbe_get_context(mblk_t *, ixgbe_tx_context_t *);
static boolean_t ixgbe_check_context(ixgbe_tx_ring_t *,
    ixgbe_tx_context_t *);
static void ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *,
    ixgbe_tx_context_t *);

#pragma inline(ixgbe_save_desc)
#pragma inline(ixgbe_get_context)
#pragma inline(ixgbe_check_context)
#pragma inline(ixgbe_fill_context)
/*
 * ixgbe_ring_tx
 *
 * To transmit one mblk through one specified ring.
 *
 * One mblk can consist of several fragments, and each fragment
 * will be processed with a different method based on its size.
 * Fragments smaller than the bcopy threshold will be processed
 * by using bcopy; otherwise, they will be processed by using
 * DMA binding.
 *
 * To process the mblk, a tx control block is taken from the
 * free list. One tx control block contains one tx buffer, which
 * is used to copy mblk fragments' data, and one tx DMA handle,
 * which is used to bind a mblk fragment with DMA resource.
 *
 * Several small mblk fragments can be copied into one tx control
 * block's buffer, and then the buffer will be transmitted with
 * one tx descriptor.
 *
 * A large fragment only binds with one tx control block's DMA
 * handle, and it can span several tx descriptors for transmitting.
 *
 * So to transmit a packet (mblk), several tx control blocks can
 * be used. After the processing, those tx control blocks will
 * be put to the work list.
 */
mblk_t *
ixgbe_ring_tx(void *arg, mblk_t *mp)
{
        ixgbe_tx_ring_t *tx_ring = (ixgbe_tx_ring_t *)arg;
        ixgbe_t *ixgbe = tx_ring->ixgbe;
        tx_type_t current_flag, next_flag;
        uint32_t current_len, next_len;
        uint32_t desc_total;
        size_t mbsize;
        int desc_num;
        boolean_t copy_done, eop;
        mblk_t *current_mp, *next_mp, *nmp, *pull_mp = NULL;
        tx_control_block_t *tcb;
        ixgbe_tx_context_t tx_context, *ctx;
        link_list_t pending_list;
        uint32_t len, hdr_frag_len, hdr_len;
        uint32_t copy_thresh;
        mblk_t *hdr_new_mp = NULL;
        mblk_t *hdr_pre_mp = NULL;
        mblk_t *hdr_nmp = NULL;

        ASSERT(mp->b_next == NULL);
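
        /*
         * Drop the packet if the adapter is suspended, in an error or
         * over-temperature state, not yet started, or the link is down.
         */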
        if ((ixgbe->ixgbe_state & IXGBE_SUSPENDED) ||
            (ixgbe->ixgbe_state & IXGBE_ERROR) ||
            (ixgbe->ixgbe_state & IXGBE_OVERTEMP) ||
            !(ixgbe->ixgbe_state & IXGBE_STARTED) ||
            ixgbe->link_state != LINK_STATE_UP) {
                freemsg(mp);
                return (NULL);
        }
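
        /*
         * Fragments no larger than the copy threshold are bcopy'd into the
         * tx control block's pre-allocated buffer; larger fragments are
         * bound with DMA (see the per-fragment loop below).
         */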
        copy_thresh = ixgbe->tx_copy_thresh;

        /* Get the mblk size */
        mbsize = 0;
        for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
                mbsize += MBLKL(nmp);
        }
        if (ixgbe->tx_hcksum_enable) {
                /*
                 * Retrieve checksum context information from the mblk
                 * that will be used to decide whether/how to fill the
                 * context descriptor.
                 */
                ctx = &tx_context;
                if (ixgbe_get_context(mp, ctx) < 0) {
                        freemsg(mp);
                        return (NULL);
                }

                /*
                 * If the mblk size exceeds the max size ixgbe could
                 * process, then discard this mblk, and return NULL.
                 */
                if ((ctx->lso_flag &&
                    ((mbsize - ctx->mac_hdr_len) > IXGBE_LSO_MAXLEN)) ||
                    (!ctx->lso_flag &&
                    (mbsize > (ixgbe->max_frame_size - ETHERFCSL)))) {
                        freemsg(mp);
                        IXGBE_DEBUGLOG_0(ixgbe, "ixgbe_tx: packet oversize");
                        return (NULL);
                }
        } else {
                ctx = NULL;
        }
        /*
         * Check and recycle tx descriptors.
         * The recycle threshold here should be selected carefully.
         */
        if (tx_ring->tbd_free < ixgbe->tx_recycle_thresh) {
                tx_ring->tx_recycle(tx_ring);
        }

        /*
         * After the recycling, if the tbd_free is less than the
         * overload_threshold, assert overload and return mp;
         * the transmit then needs to be re-scheduled.
         */
        if (tx_ring->tbd_free < ixgbe->tx_overload_thresh) {
                tx_ring->reschedule = B_TRUE;
                tx_ring->stat_overload++;
                return (mp);
        }
        /*
         * The pending_list is a linked list that is used to save
         * the tx control blocks that have packet data processed
         * but have not put the data to the tx descriptor ring.
         * It is used to reduce the lock contention of the tx_lock.
         */
        LINK_LIST_INIT(&pending_list);
        desc_num = 0;
        desc_total = 0;
        /*
         * The software should guarantee the LSO packet header (MAC+IP+TCP)
         * to be within one descriptor. Here we reallocate and refill the
         * header if it is physically non-contiguous.
         */
        if ((ctx != NULL) && ctx->lso_flag) {
                /* find the last fragment of the header */
                len = MBLKL(mp);
                hdr_nmp = mp;
                hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len + ctx->l4_hdr_len;
                while (len < hdr_len) {
                        hdr_pre_mp = hdr_nmp;
                        hdr_nmp = hdr_nmp->b_cont;
                        len += MBLKL(hdr_nmp);
                }

                /*
                 * If the header and the payload are in different mblks,
                 * we simply force the header to be copied into the
                 * pre-allocated page-aligned buffer.
                 */
                if (len == hdr_len)
                        goto adjust_threshold;

                hdr_frag_len = hdr_len - (len - MBLKL(hdr_nmp));

                /*
                 * There are two cases where we need to reallocate a mblk
                 * for the last header fragment:
                 * 1. the header is in multiple mblks and the last fragment
                 *    shares the same mblk with the payload
                 * 2. the header is in a single mblk shared with the payload
                 *    and the header is physically non-contiguous
                 */
                if ((hdr_nmp != mp) ||
                    (P2NPHASE((uintptr_t)hdr_nmp->b_rptr, ixgbe->sys_page_size)
                    < hdr_len)) {
                        IXGBE_DEBUG_STAT(tx_ring->stat_lso_header_fail);

                        /*
                         * reallocate the mblk for the last header fragment,
                         * expect to bcopy into the pre-allocated page-aligned
                         * buffer
                         */
                        hdr_new_mp = allocb(hdr_frag_len, 0);
                        if (hdr_new_mp == NULL)
                                goto tx_failure;

                        bcopy(hdr_nmp->b_rptr, hdr_new_mp->b_rptr,
                            hdr_frag_len);

                        /* link the new header fragment with the other parts */
                        hdr_new_mp->b_wptr = hdr_new_mp->b_rptr + hdr_frag_len;
                        hdr_new_mp->b_cont = hdr_nmp;
                        if (hdr_pre_mp != NULL)
                                hdr_pre_mp->b_cont = hdr_new_mp;
                        else
                                mp = hdr_new_mp;
                        hdr_nmp->b_rptr += hdr_frag_len;
                }
adjust_threshold:
                /*
                 * adjust the bcopy threshold to guarantee
                 * the header uses the bcopy way
                 */
                if (copy_thresh < hdr_len)
                        copy_thresh = hdr_len;
        }
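
        /*
         * Walk the fragment chain of the mblk. Each fragment is either
         * appended to the current tx buffer by bcopy or bound with DMA,
         * according to the flags computed below.
         */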
        current_mp = mp;
        current_len = MBLKL(current_mp);
        /*
         * Decide which method to use for the first fragment
         */
        current_flag = (current_len <= copy_thresh) ?
            USE_COPY : USE_DMA;
        /*
         * If the mblk includes several contiguous small fragments,
         * they may be copied into one buffer. This flag is used to
         * indicate whether there are pending fragments that need to
         * be copied to the current tx buffer.
         *
         * If this flag is B_TRUE, it indicates that a new tx control
         * block is needed to process the next fragment using either
         * copy or DMA binding.
         *
         * Otherwise, it indicates that the next fragment will be
         * copied to the current tx buffer that is maintained by the
         * current tx control block. No new tx control block is needed.
         */
        copy_done = B_TRUE;
        while (current_mp) {
                next_mp = current_mp->b_cont;
                eop = (next_mp == NULL); /* Last fragment of the packet? */
                next_len = eop ? 0 : MBLKL(next_mp);
                /*
                 * When the current fragment is an empty fragment, if
                 * the next fragment will still be copied to the current
                 * tx buffer, we cannot skip this fragment here, because
                 * the copy processing is still pending for completion. We
                 * have to process this empty fragment in the tx_copy routine.
                 *
                 * If the copy processing is completed or a DMA binding
                 * processing is just completed, we can just skip this
                 * empty fragment.
                 */
                if ((current_len == 0) && (copy_done)) {
                        current_mp = next_mp;
                        current_len = next_len;
                        current_flag = (current_len <= copy_thresh) ?
                            USE_COPY : USE_DMA;
                        continue;
                }
                if (copy_done) {
                        /*
                         * Get a new tx control block from the free list
                         */
                        tcb = ixgbe_get_free_list(tx_ring);

                        if (tcb == NULL) {
                                tx_ring->stat_fail_no_tcb++;
                                goto tx_failure;
                        }

                        /*
                         * Push the tx control block to the pending list
                         * to avoid using lock too early
                         */
                        LIST_PUSH_TAIL(&pending_list, &tcb->link);
                }
                if (current_flag == USE_COPY) {
                        /*
                         * Check whether to use bcopy or DMA binding to process
                         * the next fragment, and if using bcopy, whether we
                         * need to continue copying the next fragment into the
                         * current tx buffer.
                         */
                        ASSERT((tcb->tx_buf.len + current_len) <=
                            tcb->tx_buf.size);

                        if (eop) {
                                /*
                                 * This is the last fragment of the packet, so
                                 * the copy processing will be completed with
                                 * this fragment.
                                 */
                                next_flag = USE_NONE;
                                copy_done = B_TRUE;
                        } else if ((tcb->tx_buf.len + current_len + next_len) >
                            tcb->tx_buf.size) {
                                /*
                                 * If the next fragment is too large to be
                                 * copied to the current tx buffer, we need
                                 * to complete the current copy processing.
                                 */
                                next_flag = (next_len > copy_thresh) ?
                                    USE_DMA : USE_COPY;
                                copy_done = B_TRUE;
                        } else if (next_len > copy_thresh) {
                                /*
                                 * The next fragment needs to be processed with
                                 * DMA binding. So the copy processing will be
                                 * completed with the current fragment.
                                 */
                                next_flag = USE_DMA;
                                copy_done = B_TRUE;
                        } else {
                                /*
                                 * Continue to copy the next fragment to the
                                 * current tx buffer.
                                 */
                                next_flag = USE_COPY;
                                copy_done = B_FALSE;
                        }

                        desc_num = ixgbe_tx_copy(tx_ring, tcb, current_mp,
                            current_len, copy_done);
                } else {
                        /*
                         * Check whether to use bcopy or DMA binding to process
                         * the next fragment.
                         */
                        next_flag = (next_len > copy_thresh) ?
                            USE_DMA : USE_COPY;
                        ASSERT(copy_done == B_TRUE);

                        desc_num = ixgbe_tx_bind(tx_ring, tcb, current_mp,
                            current_len);
                }
                if (desc_num > 0)
                        desc_total += desc_num;
                else if (desc_num < 0)
                        goto tx_failure;

                current_mp = next_mp;
                current_len = next_len;
                current_flag = next_flag;
        }
        /*
         * Attach the mblk to the last tx control block
         */
        ASSERT(tcb->mp == NULL);
        tcb->mp = mp;
        /*
         * The 82598/82599 chipset has a limitation that no more than 32 tx
         * descriptors can be transmitted out at one time.
         *
         * Here is a workaround for it: pull up the mblk then send it
         * out with the bind way. By doing so, no more than MAX_COOKIE (18)
         * descriptors are needed.
         */
        if (desc_total + 1 > IXGBE_TX_DESC_LIMIT) {
                IXGBE_DEBUG_STAT(tx_ring->stat_break_tbd_limit);

                /*
                 * Discard the mblk and free the used resources
                 */
                tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
                while (tcb != NULL) {
                        tcb->mp = NULL;
                        ixgbe_free_tcb(tcb);
                        tcb = (tx_control_block_t *)
                            LIST_GET_NEXT(&pending_list, &tcb->link);
                }

                /*
                 * Return the tx control blocks in the pending list to
                 * the free list.
                 */
                ixgbe_put_free_list(tx_ring, &pending_list);
                /*
                 * pull up the mblk and send it out with the bind way
                 */
                if ((pull_mp = msgpullup(mp, -1)) == NULL) {
                        tx_ring->reschedule = B_TRUE;

                        /*
                         * If a new mblk has been allocated for the last header
                         * fragment of a LSO packet, we should restore the
                         * modified mp.
                         */
                        if (hdr_new_mp) {
                                hdr_new_mp->b_cont = NULL;
                                freeb(hdr_new_mp);
                                hdr_nmp->b_rptr -= hdr_frag_len;
                                if (hdr_pre_mp)
                                        hdr_pre_mp->b_cont = hdr_nmp;
                                else
                                        mp = hdr_nmp;
                        }
                        return (mp);
                }

                LINK_LIST_INIT(&pending_list);
                desc_total = 0;
                desc_num = 0;
                /*
                 * if the packet is a LSO packet, we simply
                 * transmit the header in one descriptor using the copy way
                 */
                if ((ctx != NULL) && ctx->lso_flag) {
                        hdr_len = ctx->ip_hdr_len + ctx->mac_hdr_len +
                            ctx->l4_hdr_len;

                        tcb = ixgbe_get_free_list(tx_ring);
                        if (tcb == NULL) {
                                tx_ring->stat_fail_no_tcb++;
                                goto tx_failure;
                        }

                        desc_num = ixgbe_tx_copy(tx_ring, tcb, pull_mp,
                            hdr_len, B_TRUE);

                        LIST_PUSH_TAIL(&pending_list, &tcb->link);
                        desc_total += desc_num;

                        pull_mp->b_rptr += hdr_len;
                }
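
                /*
                 * The rest of the pulled-up mblk (the payload, when the LSO
                 * header was already copied above) is bound with DMA below.
                 */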
                tcb = ixgbe_get_free_list(tx_ring);
                if (tcb == NULL) {
                        tx_ring->stat_fail_no_tcb++;
                        goto tx_failure;
                }

                if ((ctx != NULL) && ctx->lso_flag) {
                        desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
                            mbsize - hdr_len);
                } else {
                        desc_num = ixgbe_tx_bind(tx_ring, tcb, pull_mp,
                            mbsize);
                }

                LIST_PUSH_TAIL(&pending_list, &tcb->link);

                desc_total += desc_num;
                tcb->mp = pull_mp;
        }
        /*
         * Before filling the tx descriptor ring with the data, we need to
         * ensure there are adequate free descriptors for transmit
         * (including one context descriptor).
         * Do not use up all the tx descriptors.
         * Otherwise tx recycle will fail and cause a false hang.
         */
        if (tx_ring->tbd_free <= (desc_total + 1)) {
                tx_ring->tx_recycle(tx_ring);
        }

        mutex_enter(&tx_ring->tx_lock);
        /*
         * If the number of free tx descriptors is not enough for transmit
         * then return mp.
         *
         * Note: we must put this check under the mutex protection to
         * ensure the correctness when multiple threads access it in
         * parallel.
         */
        if (tx_ring->tbd_free <= (desc_total + 1)) {
                tx_ring->stat_fail_no_tbd++;
                mutex_exit(&tx_ring->tx_lock);
                goto tx_failure;
        }

        desc_num = ixgbe_tx_fill_ring(tx_ring, &pending_list, ctx,
            mbsize);

        ASSERT((desc_num == desc_total) || (desc_num == (desc_total + 1)));

        tx_ring->stat_obytes += mbsize;
        tx_ring->stat_opackets++;

        mutex_exit(&tx_ring->tx_lock);
        /*
         * now that the transmission succeeds, need to free the original
         * mp if we used the pulled-up mblk for transmission.
         */
        if (pull_mp) {
                freemsg(mp);
        }

        return (NULL);

tx_failure:
        /*
         * If transmission fails, need to free the pulled-up mblk.
         */
        if (pull_mp) {
                freemsg(pull_mp);
        }

        /*
         * If a new mblk has been allocated for the last header
         * fragment of a LSO packet, we should restore the
         * modified mp.
         */
        if (hdr_new_mp) {
                hdr_new_mp->b_cont = NULL;
                freeb(hdr_new_mp);
                hdr_nmp->b_rptr -= hdr_frag_len;
                if (hdr_pre_mp)
                        hdr_pre_mp->b_cont = hdr_nmp;
                else
                        mp = hdr_nmp;
        }
        /*
         * Discard the mblk and free the used resources
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb != NULL) {
                tcb->mp = NULL;

                ixgbe_free_tcb(tcb);

                tcb = (tx_control_block_t *)
                    LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Return the tx control blocks in the pending list to the free list.
         */
        ixgbe_put_free_list(tx_ring, &pending_list);

        /* Transmit failed, do not drop the mblk, reschedule the transmit */
        tx_ring->reschedule = B_TRUE;

        return (mp);
}
/*
 * ixgbe_tx_copy
 *
 * Copy the mblk fragment to the pre-allocated tx buffer.
 */
static int
ixgbe_tx_copy(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len, boolean_t copy_done)
{
        dma_buffer_t *tx_buf;
        int desc_num;
        _NOTE(ARGUNUSED(tx_ring));

        tx_buf = &tcb->tx_buf;

        /*
         * Copy the packet data of the mblk fragment into the
         * pre-allocated tx buffer, which is maintained by the
         * tx control block.
         *
         * Several mblk fragments can be copied into one tx buffer.
         * The destination address of the current copied fragment in
         * the tx buffer is next to the end of the previous copied
         * fragment.
         */
        if (len > 0) {
                bcopy(mp->b_rptr, tx_buf->address + tx_buf->len, len);

                tx_buf->len += len;
        }

        desc_num = 0;

        /*
         * If it is the last fragment copied to the current tx buffer,
         * in other words, if there's no remaining fragment or the remaining
         * fragment requires a new tx control block to process, we need to
         * complete the current copy processing by syncing up the current
         * DMA buffer and saving the descriptor data.
         */
        if (copy_done) {
                /*
                 * Sync the DMA buffer of the packet data
                 */
                DMA_SYNC(tx_buf, DDI_DMA_SYNC_FORDEV);

                tcb->tx_type = USE_COPY;

                /*
                 * Save the address and length to the private data structure
                 * of the tx control block, which will be used to fill the
                 * tx descriptor ring after all the fragments are processed.
                 */
                ixgbe_save_desc(tcb, tx_buf->dma_address, tx_buf->len);

                desc_num++;
        }

        return (desc_num);
}
/*
 * ixgbe_tx_bind
 *
 * Bind the mblk fragment with DMA
 */
static int
ixgbe_tx_bind(ixgbe_tx_ring_t *tx_ring, tx_control_block_t *tcb, mblk_t *mp,
    uint32_t len)
{
        int status, i;
        ddi_dma_cookie_t dma_cookie;
        uint_t ncookies;
        int desc_num;

        /*
         * Use DMA binding to process the mblk fragment
         */
        status = ddi_dma_addr_bind_handle(tcb->tx_dma_handle, NULL,
            (caddr_t)mp->b_rptr, len,
            DDI_DMA_WRITE | DDI_DMA_STREAMING, DDI_DMA_DONTWAIT,
            0, &dma_cookie, &ncookies);

        if (status != DDI_DMA_MAPPED) {
                tx_ring->stat_fail_dma_bind++;
                return (-1);
        }

        tcb->tx_type = USE_DMA;

        /*
         * Each fragment can span several cookies. One cookie will have
         * one tx descriptor to transmit.
         */
        desc_num = 0;
        for (i = ncookies; i > 0; i--) {
                /*
                 * Save the address and length to the private data structure
                 * of the tx control block, which will be used to fill the
                 * tx descriptor ring after all the fragments are processed.
                 */
                ixgbe_save_desc(tcb,
                    dma_cookie.dmac_laddress,
                    dma_cookie.dmac_size);

                desc_num++;
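
                /* Move on to the next DMA cookie of this binding, if any */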
                if (i > 1)
                        ddi_dma_nextcookie(tcb->tx_dma_handle, &dma_cookie);
        }

        return (desc_num);
}
/*
 * ixgbe_get_context
 *
 * Get the context information from the mblk
 */
static int
ixgbe_get_context(mblk_t *mp, ixgbe_tx_context_t *ctx)
{
        uint32_t start;
        uint32_t hckflags;
        uint32_t lsoflags;
        uint32_t mss;
        uint32_t len, size, offset;
        unsigned char *pos;
        ushort_t etype;
        uint32_t mac_hdr_len;
        uint32_t l4_proto;
        uint32_t l4_hdr_len;

        ASSERT(mp != NULL);

        mac_hcksum_get(mp, &start, NULL, NULL, NULL, &hckflags);
        bzero(ctx, sizeof (ixgbe_tx_context_t));

        if (hckflags == 0) {
                return (0);
        }

        ctx->hcksum_flags = hckflags;

        mac_lso_get(mp, &mss, &lsoflags);
        ctx->mss = mss;
        ctx->lso_flag = (lsoflags == HW_LSO);

        /*
         * LSO relies on tx h/w checksum, so here we will drop the packet
         * if the h/w checksum flags are not declared.
         */
        if (ctx->lso_flag) {
                if (!((ctx->hcksum_flags & HCK_PARTIALCKSUM) &&
                    (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM))) {
                        IXGBE_DEBUGLOG_0(NULL, "ixgbe_tx: h/w "
                            "checksum flags are not specified when doing LSO");
                        return (-1);
                }
        }

        /*
         * Firstly get the position of the ether_type/ether_tpid.
         * Here we don't assume the ether (VLAN) header is fully included
         * in one mblk fragment, so we go through the fragments to parse
         * the ether type.
         */
        size = len = MBLKL(mp);
        offset = offsetof(struct ether_header, ether_type);
        while (size <= offset) {
                mp = mp->b_cont;
                ASSERT(mp != NULL);
                len = MBLKL(mp);
                size += len;
        }
        pos = mp->b_rptr + offset + len - size;

        etype = ntohs(*(ushort_t *)(uintptr_t)pos);
        if (etype == ETHERTYPE_VLAN) {
                /*
                 * Get the position of the ether_type in the VLAN header
                 */
                offset = offsetof(struct ether_vlan_header, ether_type);
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                etype = ntohs(*(ushort_t *)(uintptr_t)pos);
                mac_hdr_len = sizeof (struct ether_vlan_header);
        } else {
                mac_hdr_len = sizeof (struct ether_header);
        }

        /*
         * Here we don't assume the IP(V6) header is fully included in
         * one mblk fragment.
         */
        switch (etype) {
        case ETHERTYPE_IP:
                if (ctx->lso_flag) {
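                        /*
                         * For LSO, zero the IP total-length and header-
                         * checksum fields of the template header below;
                         * the hardware regenerates both for each segment.
                         */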
                        offset = offsetof(ipha_t, ipha_length) + mac_hdr_len;
                        while (size <= offset) {
                                mp = mp->b_cont;
                                ASSERT(mp != NULL);
                                len = MBLKL(mp);
                                size += len;
                        }
                        pos = mp->b_rptr + offset + len - size;
                        *((uint16_t *)(uintptr_t)(pos)) = 0;

                        offset = offsetof(ipha_t, ipha_hdr_checksum) +
                            mac_hdr_len;
                        while (size <= offset) {
                                mp = mp->b_cont;
                                ASSERT(mp != NULL);
                                len = MBLKL(mp);
                                size += len;
                        }
                        pos = mp->b_rptr + offset + len - size;
                        *((uint16_t *)(uintptr_t)(pos)) = 0;

                        /*
                         * To perform ixgbe LSO, here we also need to fill
                         * the tcp checksum field of the packet with the
                         * following pseudo-header checksum:
                         * (ip_source_addr, ip_destination_addr, l4_proto)
                         * Currently the tcp/ip stack has done it.
                         */
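                        /*
                         * A sketch of that pseudo-header sum, assuming
                         * IPv4/TCP: the one's-complement sum of ip_src,
                         * ip_dst and htons(IPPROTO_TCP), excluding the TCP
                         * length, which the hardware adds per segment.
                         */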
                }

                offset = offsetof(ipha_t, ipha_protocol) + mac_hdr_len;
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                l4_proto = *(uint8_t *)pos;
                break;
        case ETHERTYPE_IPV6:
                offset = offsetof(ip6_t, ip6_nxt) + mac_hdr_len;
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                l4_proto = *(uint8_t *)pos;
                break;
        default:
                /* Unrecoverable error */
                IXGBE_DEBUGLOG_0(NULL, "Ether type error with tx hcksum");
                return (-1);
        }

        if (ctx->lso_flag) {
                offset = mac_hdr_len + start;
                while (size <= offset) {
                        mp = mp->b_cont;
                        ASSERT(mp != NULL);
                        len = MBLKL(mp);
                        size += len;
                }
                pos = mp->b_rptr + offset + len - size;

                l4_hdr_len = TCP_HDR_LENGTH((tcph_t *)pos);
        } else {
                /*
                 * l4 header length is only required for LSO
                 */
                l4_hdr_len = 0;
        }

        ctx->mac_hdr_len = mac_hdr_len;
        ctx->ip_hdr_len = start;
        ctx->l4_proto = l4_proto;
        ctx->l4_hdr_len = l4_hdr_len;

        return (0);
}
/*
 * ixgbe_check_context
 *
 * Check if a new context descriptor is needed
 */
static boolean_t
ixgbe_check_context(ixgbe_tx_ring_t *tx_ring, ixgbe_tx_context_t *ctx)
{
        ixgbe_tx_context_t *last;

        if (ctx == NULL)
                return (B_FALSE);

        /*
         * Compare the context data retrieved from the mblk and the
         * stored data of the last context descriptor. The data to
         * be checked are:
         *      hcksum_flags
         *      l4_proto
         *      mac_hdr_len
         *      ip_hdr_len
         *      lso_flag
         *      mss (only checked for LSO)
         *      l4_hdr_len (only checked for LSO)
         * If any of the above data is changed, a new context descriptor
         * will be needed.
         */
        last = &tx_ring->tx_context;

        if ((ctx->hcksum_flags != last->hcksum_flags) ||
            (ctx->l4_proto != last->l4_proto) ||
            (ctx->mac_hdr_len != last->mac_hdr_len) ||
            (ctx->ip_hdr_len != last->ip_hdr_len) ||
            (ctx->lso_flag != last->lso_flag) ||
            (ctx->lso_flag && ((ctx->mss != last->mss) ||
            (ctx->l4_hdr_len != last->l4_hdr_len)))) {
                return (B_TRUE);
        }

        return (B_FALSE);
}
/*
 * ixgbe_fill_context
 *
 * Fill the context descriptor with hardware checksum information
 */
static void
ixgbe_fill_context(struct ixgbe_adv_tx_context_desc *ctx_tbd,
    ixgbe_tx_context_t *ctx)
{
        /*
         * Fill the context descriptor with the checksum
         * context information we've got.
         */
        ctx_tbd->vlan_macip_lens = ctx->ip_hdr_len;
        ctx_tbd->vlan_macip_lens |= ctx->mac_hdr_len <<
            IXGBE_ADVTXD_MACLEN_SHIFT;

        ctx_tbd->type_tucmd_mlhl =
            IXGBE_ADVTXD_DCMD_DEXT | IXGBE_ADVTXD_DTYP_CTXT;

        if (ctx->hcksum_flags & HCK_IPV4_HDRCKSUM)
                ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_IPV4;

        if (ctx->hcksum_flags & HCK_PARTIALCKSUM) {
                switch (ctx->l4_proto) {
                case IPPROTO_TCP:
                        ctx_tbd->type_tucmd_mlhl |= IXGBE_ADVTXD_TUCMD_L4T_TCP;
                        break;
                case IPPROTO_UDP:
                        /*
                         * We don't have to explicitly set:
                         *      ctx_tbd->type_tucmd_mlhl |=
                         *          IXGBE_ADVTXD_TUCMD_L4T_UDP;
                         * Because IXGBE_ADVTXD_TUCMD_L4T_UDP == 0b
                         */
                        break;
                default:
                        /* Unrecoverable error */
                        IXGBE_DEBUGLOG_0(NULL, "L4 type error with tx hcksum");
                        break;
                }
        }

        ctx_tbd->seqnum_seed = 0;

        if (ctx->lso_flag) {
                ctx_tbd->mss_l4len_idx =
                    (ctx->l4_hdr_len << IXGBE_ADVTXD_L4LEN_SHIFT) |
                    (ctx->mss << IXGBE_ADVTXD_MSS_SHIFT);
        } else {
                ctx_tbd->mss_l4len_idx = 0;
        }
}
/*
 * ixgbe_tx_fill_ring
 *
 * Fill the tx descriptor ring with the data
 */
static int
ixgbe_tx_fill_ring(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list,
    ixgbe_tx_context_t *ctx, size_t mbsize)
{
        struct ixgbe_hw *hw = &tx_ring->ixgbe->hw;
        boolean_t load_context;
        uint32_t index, tcb_index, desc_num;
        union ixgbe_adv_tx_desc *tbd, *first_tbd;
        tx_control_block_t *tcb, *first_tcb;
        uint32_t hcksum_flags;
        int i;

        ASSERT(mutex_owned(&tx_ring->tx_lock));

        tbd = NULL;
        first_tbd = NULL;
        first_tcb = NULL;
        desc_num = 0;
        hcksum_flags = 0;
        load_context = B_FALSE;

        /*
         * Get the index of the first tx descriptor that will be filled,
         * and the index of the first work list item that will be attached
         * with the first used tx control block in the pending list.
         * Note: the two indexes are the same.
         */
        index = tx_ring->tbd_tail;
        tcb_index = tx_ring->tbd_tail;

        if (ctx != NULL) {
                hcksum_flags = ctx->hcksum_flags;

                /*
                 * Check if a new context descriptor is needed for this packet
                 */
                load_context = ixgbe_check_context(tx_ring, ctx);

                if (load_context) {
                        tbd = &tx_ring->tbd_ring[index];

                        /*
                         * Fill the context descriptor with the
                         * hardware checksum offload information.
                         */
                        ixgbe_fill_context(
                            (struct ixgbe_adv_tx_context_desc *)tbd, ctx);

                        index = NEXT_INDEX(index, 1, tx_ring->ring_size);
                        desc_num++;

                        /*
                         * Store the checksum context data if
                         * a new context descriptor is added
                         */
                        tx_ring->tx_context = *ctx;
                }
        }

        first_tbd = &tx_ring->tbd_ring[index];

        /*
         * Fill tx data descriptors with the data saved in the pending list.
         * The tx control blocks in the pending list are added to the work
         * list at the same time.
         *
         * The work list is strictly 1:1 corresponding to the descriptor ring.
         * One item of the work list corresponds to one tx descriptor. Because
         * one tx control block can span multiple tx descriptors, the tx
         * control block will be added to the first work list item that
         * corresponds to the first tx descriptor generated from that tx
         * control block.
         */
        tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        first_tcb = tcb;
        while (tcb != NULL) {
                for (i = 0; i < tcb->desc_num; i++) {
                        tbd = &tx_ring->tbd_ring[index];

                        tbd->read.buffer_addr = tcb->desc[i].address;
                        tbd->read.cmd_type_len = tcb->desc[i].length;

                        tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_DEXT
                            | IXGBE_ADVTXD_DTYP_DATA;

                        tbd->read.olinfo_status = 0;

                        index = NEXT_INDEX(index, 1, tx_ring->ring_size);
                        desc_num++;
                }

                /*
                 * Add the tx control block to the work list
                 */
                ASSERT(tx_ring->work_list[tcb_index] == NULL);
                tx_ring->work_list[tcb_index] = tcb;

                tcb_index = index;
                tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        }

        if (load_context) {
                /*
                 * Count the context descriptor for
                 * the first tx control block.
                 */
                first_tcb->desc_num++;
        }
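
        /*
         * Remember the index of the descriptor that ends this packet; the
         * recycle routines use last_index to tell when the whole packet
         * has been transmitted.
         */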
        first_tcb->last_index = PREV_INDEX(index, 1, tx_ring->ring_size);

        /*
         * The Insert Ethernet CRC (IFCS) bit and the checksum fields are only
         * valid in the first descriptor of the packet.
         * Setting paylen in every first_tbd for all parts.
         * 82599, X540 and X550 require the packet length in the paylen field
         * with or without LSO, and 82598 will ignore it in non-LSO mode.
         */
        ASSERT(first_tbd != NULL);
        first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_IFCS;

        switch (hw->mac.type) {
        case ixgbe_mac_82598EB:
                if (ctx != NULL && ctx->lso_flag) {
                        first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
                        first_tbd->read.olinfo_status |=
                            (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
                            - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
                }
                break;

        case ixgbe_mac_82599EB:
        case ixgbe_mac_X540:
        case ixgbe_mac_X550:
        case ixgbe_mac_X550EM_x:
                if (ctx != NULL && ctx->lso_flag) {
                        first_tbd->read.cmd_type_len |= IXGBE_ADVTXD_DCMD_TSE;
                        first_tbd->read.olinfo_status |=
                            (mbsize - ctx->mac_hdr_len - ctx->ip_hdr_len
                            - ctx->l4_hdr_len) << IXGBE_ADVTXD_PAYLEN_SHIFT;
                } else {
                        first_tbd->read.olinfo_status |=
                            (mbsize << IXGBE_ADVTXD_PAYLEN_SHIFT);
                }
                break;

        default:
                break;
        }

        /* Set hardware checksum bits */
        if (hcksum_flags != 0) {
                if (hcksum_flags & HCK_IPV4_HDRCKSUM)
                        first_tbd->read.olinfo_status |=
                            IXGBE_ADVTXD_POPTS_IXSM;
                if (hcksum_flags & HCK_PARTIALCKSUM)
                        first_tbd->read.olinfo_status |=
                            IXGBE_ADVTXD_POPTS_TXSM;
        }

        /*
         * The last descriptor of the packet needs the End Of Packet (EOP)
         * and Report Status (RS) bits set.
         */
        ASSERT(tbd != NULL);
        tbd->read.cmd_type_len |=
            IXGBE_ADVTXD_DCMD_EOP | IXGBE_ADVTXD_DCMD_RS;

        /*
         * Sync the DMA buffer of the tx descriptor ring
         */
        DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORDEV);

        /*
         * Update the number of the free tx descriptors.
         * The mutual exclusion between the transmission and the recycling
         * (for the tx descriptor ring and the work list) is implemented
         * with the atomic operation on the number of the free tx descriptors.
         *
         * Note: we should always decrement the counter tbd_free before
         * advancing the hardware TDT pointer to avoid the race condition -
         * before the counter tbd_free is decremented, the transmit of the
         * tx descriptors could already be done and the counter tbd_free
         * increased again by the tx recycling.
         */
        i = ixgbe_atomic_reserve(&tx_ring->tbd_free, desc_num);
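
        /*
         * The reservation should not fail here: the caller has already
         * verified, under tx_lock, that enough descriptors are free.
         */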
        ASSERT(i >= 0);

        tx_ring->tbd_tail = index;

        /*
         * Advance the hardware TDT pointer of the tx descriptor ring
         */
        IXGBE_WRITE_REG(hw, IXGBE_TDT(tx_ring->index), index);

        if (ixgbe_check_acc_handle(tx_ring->ixgbe->osdep.reg_handle) !=
            DDI_FM_OK) {
                ddi_fm_service_impact(tx_ring->ixgbe->dip,
                    DDI_SERVICE_DEGRADED);
                atomic_or_32(&tx_ring->ixgbe->ixgbe_state, IXGBE_ERROR);
        }

        return (desc_num);
}
/*
 * ixgbe_save_desc
 *
 * Save the address/length pair to the private array
 * of the tx control block. The address/length pairs
 * will be filled into the tx descriptor ring later.
 */
static void
ixgbe_save_desc(tx_control_block_t *tcb, uint64_t address, size_t length)
{
        sw_desc_t *desc;

        desc = &tcb->desc[tcb->desc_num];
        desc->address = address;
        desc->length = length;

        tcb->desc_num++;
}
/*
 * ixgbe_tx_recycle_legacy
 *
 * Recycle the tx descriptors and tx control blocks.
 *
 * The work list is traversed to check if the corresponding
 * tx descriptors have been transmitted. If so, the resources
 * bound to the tx control blocks will be freed, and those
 * tx control blocks will be returned to the free list.
 */
uint32_t
ixgbe_tx_recycle_legacy(ixgbe_tx_ring_t *tx_ring)
{
        uint32_t index, last_index, prev_index;
        int desc_num;
        boolean_t desc_done;
        tx_control_block_t *tcb;
        link_list_t pending_list;
        ixgbe_t *ixgbe = tx_ring->ixgbe;

        mutex_enter(&tx_ring->recycle_lock);

        ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

        if (tx_ring->tbd_free == tx_ring->ring_size) {
                tx_ring->recycle_fail = 0;
                tx_ring->stall_watchdog = 0;
                if (tx_ring->reschedule) {
                        tx_ring->reschedule = B_FALSE;
                        mac_tx_ring_update(ixgbe->mac_hdl,
                            tx_ring->ring_handle);
                }
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        /*
         * Sync the DMA buffer of the tx descriptor ring
         */
        DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);

        if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
                mutex_exit(&tx_ring->recycle_lock);
                ddi_fm_service_impact(ixgbe->dip, DDI_SERVICE_DEGRADED);
                atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
                return (0);
        }

        LINK_LIST_INIT(&pending_list);
        desc_num = 0;
        index = tx_ring->tbd_head;      /* Index of next tbd/tcb to recycle */

        tcb = tx_ring->work_list[index];
        ASSERT(tcb != NULL);

        while (tcb != NULL) {
                /*
                 * Get the last tx descriptor of this packet.
                 * If the last tx descriptor is done, then
                 * we can recycle all descriptors of a packet
                 * which usually includes several tx control blocks.
                 * For 82599, LSO descriptors can not be recycled
                 * unless the whole packet's transmission is done.
                 * That's why packet level recycling is used here.
                 * For 82598, there's no such limit.
                 */
                last_index = tcb->last_index;
                /*
                 * MAX_TX_RING_SIZE is used to judge whether
                 * the index is a valid value or not.
                 */
                if (last_index == MAX_TX_RING_SIZE)
                        break;

                /*
                 * Check if the Descriptor Done bit is set
                 */
                desc_done = tx_ring->tbd_ring[last_index].wb.status &
                    IXGBE_TXD_STAT_DD;
                if (!desc_done)
                        break;

                /*
                 * recycle all descriptors of the packet
                 */
                while (tcb != NULL) {
                        /*
                         * Strip off the tx control block from
                         * the work list, and add it to the
                         * pending list.
                         */
                        tx_ring->work_list[index] = NULL;
                        LIST_PUSH_TAIL(&pending_list, &tcb->link);

                        /*
                         * Count the total number of the tx
                         * descriptors recycled
                         */
                        desc_num += tcb->desc_num;

                        index = NEXT_INDEX(index, tcb->desc_num,
                            tx_ring->ring_size);

                        tcb = tx_ring->work_list[index];
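
                        /*
                         * Stop once the previous descriptor index matches
                         * the packet's last_index: all tx control blocks of
                         * this packet have then been collected.
                         */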
                        prev_index = PREV_INDEX(index, 1,
                            tx_ring->ring_size);
                        if (prev_index == last_index)
                                break;
                }
        }

        /*
         * If no tx descriptors are recycled, no need to do more processing
         */
        if (desc_num == 0) {
                tx_ring->recycle_fail++;
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        tx_ring->recycle_fail = 0;
        tx_ring->stall_watchdog = 0;

        /*
         * Update the head index of the tx descriptor ring
         */
        tx_ring->tbd_head = index;

        /*
         * Update the number of the free tx descriptors with atomic operations
         */
        atomic_add_32(&tx_ring->tbd_free, desc_num);

        if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
            (tx_ring->reschedule)) {
                tx_ring->reschedule = B_FALSE;
                mac_tx_ring_update(ixgbe->mac_hdl,
                    tx_ring->ring_handle);
        }
        mutex_exit(&tx_ring->recycle_lock);

        /*
         * Free the resources used by the tx control blocks
         * in the pending list
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb != NULL) {
                /*
                 * Release the resources occupied by the tx control block
                 */
                ixgbe_free_tcb(tcb);

                tcb = (tx_control_block_t *)
                    LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Add the tx control blocks in the pending list to the free list.
         */
        ixgbe_put_free_list(tx_ring, &pending_list);

        return (desc_num);
}
/*
 * ixgbe_tx_recycle_head_wb
 *
 * Check the head write-back, and recycle all the transmitted
 * tx descriptors and tx control blocks.
 */
uint32_t
ixgbe_tx_recycle_head_wb(ixgbe_tx_ring_t *tx_ring)
{
        uint32_t index;
        uint32_t head_wb;
        int desc_num;
        tx_control_block_t *tcb;
        link_list_t pending_list;
        ixgbe_t *ixgbe = tx_ring->ixgbe;

        mutex_enter(&tx_ring->recycle_lock);

        ASSERT(tx_ring->tbd_free <= tx_ring->ring_size);

        if (tx_ring->tbd_free == tx_ring->ring_size) {
                tx_ring->recycle_fail = 0;
                tx_ring->stall_watchdog = 0;
                if (tx_ring->reschedule) {
                        tx_ring->reschedule = B_FALSE;
                        mac_tx_ring_update(ixgbe->mac_hdl,
                            tx_ring->ring_handle);
                }
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        /*
         * Sync the DMA buffer of the tx descriptor ring
         *
         * Note: For head write-back mode, the tx descriptors will not
         * be written back, but the head write-back value is stored at
         * the last extra tbd at the end of the DMA area, so we still need
         * to sync the head write-back value for the kernel.
         *
         * DMA_SYNC(&tx_ring->tbd_area, DDI_DMA_SYNC_FORKERNEL);
         */
        (void) ddi_dma_sync(tx_ring->tbd_area.dma_handle,
            sizeof (union ixgbe_adv_tx_desc) * tx_ring->ring_size,
            sizeof (uint32_t),
            DDI_DMA_SYNC_FORKERNEL);

        if (ixgbe_check_dma_handle(tx_ring->tbd_area.dma_handle) != DDI_FM_OK) {
                mutex_exit(&tx_ring->recycle_lock);
                ddi_fm_service_impact(ixgbe->dip,
                    DDI_SERVICE_DEGRADED);
                atomic_or_32(&ixgbe->ixgbe_state, IXGBE_ERROR);
                return (0);
        }

        LINK_LIST_INIT(&pending_list);
        desc_num = 0;
        index = tx_ring->tbd_head;      /* Next index to clean */

        /*
         * Get the value of head write-back
         */
        head_wb = *tx_ring->tbd_head_wb;
        while (index != head_wb) {
                tcb = tx_ring->work_list[index];
                ASSERT(tcb != NULL);

                if (OFFSET(index, head_wb, tx_ring->ring_size) <
                    tcb->desc_num) {
                        /*
                         * The current tx control block is not
                         * completely transmitted, stop recycling
                         */
                        break;
                }

                /*
                 * Strip off the tx control block from the work list,
                 * and add it to the pending list.
                 */
                tx_ring->work_list[index] = NULL;
                LIST_PUSH_TAIL(&pending_list, &tcb->link);

                /*
                 * Advance the index of the tx descriptor ring
                 */
                index = NEXT_INDEX(index, tcb->desc_num, tx_ring->ring_size);

                /*
                 * Count the total number of the tx descriptors recycled
                 */
                desc_num += tcb->desc_num;
        }

        /*
         * If no tx descriptors are recycled, no need to do more processing
         */
        if (desc_num == 0) {
                tx_ring->recycle_fail++;
                mutex_exit(&tx_ring->recycle_lock);
                return (0);
        }

        tx_ring->recycle_fail = 0;
        tx_ring->stall_watchdog = 0;

        /*
         * Update the head index of the tx descriptor ring
         */
        tx_ring->tbd_head = index;

        /*
         * Update the number of the free tx descriptors with atomic operations
         */
        atomic_add_32(&tx_ring->tbd_free, desc_num);

        if ((tx_ring->tbd_free >= ixgbe->tx_resched_thresh) &&
            (tx_ring->reschedule)) {
                tx_ring->reschedule = B_FALSE;
                mac_tx_ring_update(ixgbe->mac_hdl,
                    tx_ring->ring_handle);
        }
        mutex_exit(&tx_ring->recycle_lock);

        /*
         * Free the resources used by the tx control blocks
         * in the pending list
         */
        tcb = (tx_control_block_t *)LIST_GET_HEAD(&pending_list);
        while (tcb != NULL) {
                /*
                 * Release the resources occupied by the tx control block
                 */
                ixgbe_free_tcb(tcb);

                tcb = (tx_control_block_t *)
                    LIST_GET_NEXT(&pending_list, &tcb->link);
        }

        /*
         * Add the tx control blocks in the pending list to the free list.
         */
        ixgbe_put_free_list(tx_ring, &pending_list);

        return (desc_num);
}
/*
 * ixgbe_free_tcb - free up the tx control block
 *
 * Free the resources of the tx control block, including
 * unbinding the previously bound DMA handle, and resetting
 * other fields of the tx control block.
 */
void
ixgbe_free_tcb(tx_control_block_t *tcb)
{
        switch (tcb->tx_type) {
        case USE_COPY:
                /*
                 * Reset the buffer length that is used for copy
                 */
                tcb->tx_buf.len = 0;
                break;
        case USE_DMA:
                /*
                 * Release the DMA resource that is used for
                 * DMA binding.
                 */
                (void) ddi_dma_unbind_handle(tcb->tx_dma_handle);
                break;
        default:
                break;
        }

        /*
         * Free the mblk
         */
        if (tcb->mp != NULL) {
                freemsg(tcb->mp);
                tcb->mp = NULL;
        }

        tcb->tx_type = USE_NONE;
        tcb->last_index = MAX_TX_RING_SIZE;
        tcb->desc_num = 0;
}
/*
 * ixgbe_get_free_list - Get a free tx control block from the free list
 *
 * The atomic operation on the number of the available tx control blocks
 * in the free list is used to keep this routine mutually exclusive with
 * the routine ixgbe_put_free_list.
 */
static tx_control_block_t *
ixgbe_get_free_list(ixgbe_tx_ring_t *tx_ring)
{
        tx_control_block_t *tcb;

        /*
         * Check and update the number of the free tx control blocks
         * in the free list
         */
        if (ixgbe_atomic_reserve(&tx_ring->tcb_free, 1) < 0)
                return (NULL);

        mutex_enter(&tx_ring->tcb_head_lock);

        tcb = tx_ring->free_list[tx_ring->tcb_head];
        ASSERT(tcb != NULL);
        tx_ring->free_list[tx_ring->tcb_head] = NULL;
        tx_ring->tcb_head = NEXT_INDEX(tx_ring->tcb_head, 1,
            tx_ring->free_list_size);

        mutex_exit(&tx_ring->tcb_head_lock);

        return (tcb);
}
/*
 * ixgbe_put_free_list
 *
 * Put a list of used tx control blocks back to the free list
 *
 * A mutex is used here to ensure the serialization. The mutual exclusion
 * between ixgbe_get_free_list and ixgbe_put_free_list is implemented with
 * the atomic operation on the counter tcb_free.
 */
void
ixgbe_put_free_list(ixgbe_tx_ring_t *tx_ring, link_list_t *pending_list)
{
        uint32_t index;
        int tcb_num;
        tx_control_block_t *tcb;

        mutex_enter(&tx_ring->tcb_tail_lock);

        index = tx_ring->tcb_tail;

        tcb_num = 0;
        tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        while (tcb != NULL) {
                ASSERT(tx_ring->free_list[index] == NULL);
                tx_ring->free_list[index] = tcb;

                tcb_num++;

                index = NEXT_INDEX(index, 1, tx_ring->free_list_size);

                tcb = (tx_control_block_t *)LIST_POP_HEAD(pending_list);
        }

        tx_ring->tcb_tail = index;

        /*
         * Update the number of the free tx control blocks
         * in the free list. This operation must be placed
         * under the protection of the lock.
         */
        atomic_add_32(&tx_ring->tcb_free, tcb_num);

        mutex_exit(&tx_ring->tcb_tail_lock);
}