9075 Improve ZFS pool import/load process and corrupted pool recovery
[unleashed.git] / usr / src / uts / common / io / e1000g / e1000g_tx.c
blob80ba93aef9a29b280deadf104f09ef879008c132
1 /*
2 * This file is provided under a CDDLv1 license. When using or
3 * redistributing this file, you may do so under this license.
4 * In redistributing this file this license must be included
5 * and no other modification of this header file is permitted.
7 * CDDL LICENSE SUMMARY
9 * Copyright(c) 1999 - 2009 Intel Corporation. All rights reserved.
11 * The contents of this file are subject to the terms of Version
12 * 1.0 of the Common Development and Distribution License (the "License").
14 * You should have received a copy of the License with this software.
15 * You can obtain a copy of the License at
16 * http://www.opensolaris.org/os/licensing.
17 * See the License for the specific language governing permissions
18 * and limitations under the License.
22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright 2016 Joyent, Inc.
31 * **********************************************************************
32 * *
33 * Module Name: *
34 * e1000g_tx.c *
35 * *
36 * Abstract: *
37 * This file contains some routines that take care of Transmit, *
38 * make the hardware to send the data pointed by the packet out *
39 * on to the physical medium. *
40 * *
41 * **********************************************************************
44 #include "e1000g_sw.h"
45 #include "e1000g_debug.h"
47 static boolean_t e1000g_send(struct e1000g *, mblk_t *);
48 static int e1000g_tx_copy(e1000g_tx_ring_t *,
49 p_tx_sw_packet_t, mblk_t *, boolean_t);
50 static int e1000g_tx_bind(e1000g_tx_ring_t *,
51 p_tx_sw_packet_t, mblk_t *);
52 static boolean_t e1000g_retrieve_context(mblk_t *, context_data_t *, size_t);
53 static boolean_t e1000g_check_context(e1000g_tx_ring_t *, context_data_t *);
54 static int e1000g_fill_tx_ring(e1000g_tx_ring_t *, LIST_DESCRIBER *,
55 context_data_t *);
56 static void e1000g_fill_context_descriptor(context_data_t *,
57 struct e1000_context_desc *);
58 static int e1000g_fill_tx_desc(e1000g_tx_ring_t *,
59 p_tx_sw_packet_t, uint64_t, size_t);
60 static uint32_t e1000g_fill_82544_desc(uint64_t Address, size_t Length,
61 p_desc_array_t desc_array);
62 static int e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t, uint64_t, size_t);
63 static int e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t, uint64_t, size_t);
64 static void e1000g_82547_timeout(void *);
65 static void e1000g_82547_tx_move_tail(e1000g_tx_ring_t *);
66 static void e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *);
68 #ifndef E1000G_DEBUG
69 #pragma inline(e1000g_tx_copy)
70 #pragma inline(e1000g_tx_bind)
71 #pragma inline(e1000g_retrieve_context)
72 #pragma inline(e1000g_check_context)
73 #pragma inline(e1000g_fill_tx_ring)
74 #pragma inline(e1000g_fill_context_descriptor)
75 #pragma inline(e1000g_fill_tx_desc)
76 #pragma inline(e1000g_fill_82544_desc)
77 #pragma inline(e1000g_tx_workaround_PCIX_82544)
78 #pragma inline(e1000g_tx_workaround_jumbo_82544)
79 #pragma inline(e1000g_free_tx_swpkt)
80 #endif
83 * e1000g_free_tx_swpkt - free up the tx sw packet
85 * Unbind the previously bound DMA handle for a given
86 * transmit sw packet. And reset the sw packet data.
88 void
89 e1000g_free_tx_swpkt(register p_tx_sw_packet_t packet)
91 switch (packet->data_transfer_type) {
92 case USE_BCOPY:
93 packet->tx_buf->len = 0;
94 break;
95 #ifdef __sparc
96 case USE_DVMA:
97 dvma_unload(packet->tx_dma_handle, 0, -1);
98 break;
99 #endif
100 case USE_DMA:
101 (void) ddi_dma_unbind_handle(packet->tx_dma_handle);
102 break;
103 default:
104 break;
108 * The mblk has been stripped off the sw packet
109 * and will be freed in a triggered soft intr.
111 ASSERT(packet->mp == NULL);
113 packet->data_transfer_type = USE_NONE;
114 packet->num_mblk_frag = 0;
115 packet->num_desc = 0;
118 mblk_t *
119 e1000g_m_tx(void *arg, mblk_t *mp)
121 struct e1000g *Adapter = (struct e1000g *)arg;
122 mblk_t *next;
124 rw_enter(&Adapter->chip_lock, RW_READER);
126 if ((Adapter->e1000g_state & E1000G_SUSPENDED) ||
127 !(Adapter->e1000g_state & E1000G_STARTED) ||
128 (Adapter->link_state != LINK_STATE_UP)) {
129 freemsgchain(mp);
130 mp = NULL;
133 while (mp != NULL) {
134 next = mp->b_next;
135 mp->b_next = NULL;
137 if (!e1000g_send(Adapter, mp)) {
138 mp->b_next = next;
139 break;
142 mp = next;
145 rw_exit(&Adapter->chip_lock);
146 return (mp);
150 * e1000g_send - send packets onto the wire
152 * Called from e1000g_m_tx with an mblk ready to send. this
153 * routine sets up the transmit descriptors and sends data to
154 * the wire. It also pushes the just transmitted packet to
155 * the used tx sw packet list.
157 static boolean_t
158 e1000g_send(struct e1000g *Adapter, mblk_t *mp)
160 p_tx_sw_packet_t packet;
161 LIST_DESCRIBER pending_list;
162 size_t len;
163 size_t msg_size;
164 uint32_t frag_count;
165 int desc_count;
166 uint32_t desc_total;
167 uint32_t bcopy_thresh;
168 uint32_t hdr_frag_len;
169 boolean_t tx_undersize_flag;
170 mblk_t *nmp;
171 mblk_t *tmp;
172 mblk_t *new_mp;
173 mblk_t *pre_mp;
174 mblk_t *next_mp;
175 e1000g_tx_ring_t *tx_ring;
176 context_data_t cur_context;
178 tx_ring = Adapter->tx_ring;
179 bcopy_thresh = Adapter->tx_bcopy_thresh;
181 /* Get the total size and frags number of the message */
182 tx_undersize_flag = B_FALSE;
183 frag_count = 0;
184 msg_size = 0;
185 for (nmp = mp; nmp; nmp = nmp->b_cont) {
186 frag_count++;
187 msg_size += MBLKL(nmp);
190 /* retrieve and compute information for context descriptor */
191 if (!e1000g_retrieve_context(mp, &cur_context, msg_size)) {
192 freemsg(mp);
193 return (B_TRUE);
197 * Make sure the packet is less than the allowed size
199 if (!cur_context.lso_flag &&
200 (msg_size > Adapter->max_frame_size - ETHERFCSL)) {
202 * For the over size packet, we'll just drop it.
203 * So we return B_TRUE here.
205 E1000G_DEBUGLOG_1(Adapter, E1000G_WARN_LEVEL,
206 "Tx packet out of bound. length = %d \n", msg_size);
207 E1000G_STAT(tx_ring->stat_over_size);
208 freemsg(mp);
209 return (B_TRUE);
213 * Check and reclaim tx descriptors.
214 * This low water mark check should be done all the time as
215 * Transmit interrupt delay can produce Transmit interrupts little
216 * late and that may cause few problems related to reaping Tx
217 * Descriptors... As you may run short of them before getting any
218 * transmit interrupt...
220 if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
221 (void) e1000g_recycle(tx_ring);
222 E1000G_DEBUG_STAT(tx_ring->stat_recycle);
224 if (tx_ring->tbd_avail < DEFAULT_TX_NO_RESOURCE) {
225 E1000G_DEBUG_STAT(tx_ring->stat_lack_desc);
226 goto tx_no_resource;
231 * If the message size is less than the minimum ethernet packet size,
232 * we'll use bcopy to send it, and padd it to 60 bytes later.
234 if (msg_size < ETHERMIN) {
235 E1000G_DEBUG_STAT(tx_ring->stat_under_size);
236 tx_undersize_flag = B_TRUE;
239 /* Initialize variables */
240 desc_count = 1; /* The initial value should be greater than 0 */
241 desc_total = 0;
242 new_mp = NULL;
243 QUEUE_INIT_LIST(&pending_list);
245 /* Process each mblk fragment and fill tx descriptors */
247 * The software should guarantee LSO packet header(MAC+IP+TCP)
248 * to be within one descriptor. Here we reallocate and refill the
249 * the header if it's physical memory non-contiguous.
251 if (cur_context.lso_flag) {
252 /* find the last fragment of the header */
253 len = MBLKL(mp);
254 ASSERT(len > 0);
255 next_mp = mp;
256 pre_mp = NULL;
257 while (len < cur_context.hdr_len) {
258 pre_mp = next_mp;
259 next_mp = next_mp->b_cont;
260 len += MBLKL(next_mp);
263 * If the header and the payload are in different mblks,
264 * we simply force the header to be copied into pre-allocated
265 * page-aligned buffer.
267 if (len == cur_context.hdr_len)
268 goto adjust_threshold;
270 hdr_frag_len = cur_context.hdr_len - (len - MBLKL(next_mp));
272 * There are three cases we need to reallocate a mblk for the
273 * last header fragment:
275 * 1. the header is in multiple mblks and the last fragment
276 * share the same mblk with the payload
278 * 2. the header is in a single mblk shared with the payload
279 * and the header is physical memory non-contiguous
281 * 3. there is 4 KB boundary within the header and 64 bytes
282 * following the end of the header bytes. The case may cause
283 * TCP data corruption issue.
285 * The workaround for the case #2 and case #3 is:
286 * Assuming standard Ethernet/IP/TCP headers of 54 bytes,
287 * this means that the buffer(containing the headers) should
288 * not start -118 bytes before a 4 KB boundary. For example,
289 * 128-byte alignment for this buffer could be used to fulfill
290 * this condition.
292 if ((next_mp != mp) ||
293 (P2NPHASE((uintptr_t)next_mp->b_rptr,
294 E1000_LSO_FIRST_DESC_ALIGNMENT_BOUNDARY_4K)
295 < E1000_LSO_FIRST_DESC_ALIGNMENT)) {
296 E1000G_DEBUG_STAT(tx_ring->stat_lso_header_fail);
298 * reallocate the mblk for the last header fragment,
299 * expect to bcopy into pre-allocated page-aligned
300 * buffer
302 new_mp = allocb(hdr_frag_len, NULL);
303 if (!new_mp)
304 return (B_FALSE);
305 bcopy(next_mp->b_rptr, new_mp->b_rptr, hdr_frag_len);
306 /* link the new header fragment with the other parts */
307 new_mp->b_wptr = new_mp->b_rptr + hdr_frag_len;
308 new_mp->b_cont = next_mp;
309 if (pre_mp)
310 pre_mp->b_cont = new_mp;
311 else
312 mp = new_mp;
313 next_mp->b_rptr += hdr_frag_len;
314 frag_count++;
316 adjust_threshold:
318 * adjust the bcopy threshhold to guarantee
319 * the header to use bcopy way
321 if (bcopy_thresh < cur_context.hdr_len)
322 bcopy_thresh = cur_context.hdr_len;
325 packet = NULL;
326 nmp = mp;
327 while (nmp) {
328 tmp = nmp->b_cont;
330 len = MBLKL(nmp);
331 /* Check zero length mblks */
332 if (len == 0) {
333 E1000G_DEBUG_STAT(tx_ring->stat_empty_frags);
335 * If there're no packet buffers have been used,
336 * or we just completed processing a buffer, then
337 * skip the empty mblk fragment.
338 * Otherwise, there's still a pending buffer that
339 * needs to be processed (tx_copy).
341 if (desc_count > 0) {
342 nmp = tmp;
343 continue;
348 * Get a new TxSwPacket to process mblk buffers.
350 if (desc_count > 0) {
351 mutex_enter(&tx_ring->freelist_lock);
352 packet = (p_tx_sw_packet_t)
353 QUEUE_POP_HEAD(&tx_ring->free_list);
354 mutex_exit(&tx_ring->freelist_lock);
356 if (packet == NULL) {
357 E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
358 "No Tx SwPacket available\n");
359 E1000G_STAT(tx_ring->stat_no_swpkt);
360 goto tx_send_failed;
362 QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
365 ASSERT(packet);
367 * If the size of the fragment is less than the tx_bcopy_thresh
368 * we'll use bcopy; Otherwise, we'll use DMA binding.
370 if ((len <= bcopy_thresh) || tx_undersize_flag) {
371 desc_count =
372 e1000g_tx_copy(tx_ring, packet, nmp,
373 tx_undersize_flag);
374 E1000G_DEBUG_STAT(tx_ring->stat_copy);
375 } else {
376 desc_count =
377 e1000g_tx_bind(tx_ring, packet, nmp);
378 E1000G_DEBUG_STAT(tx_ring->stat_bind);
381 if (desc_count > 0)
382 desc_total += desc_count;
383 else if (desc_count < 0)
384 goto tx_send_failed;
386 nmp = tmp;
389 /* Assign the message to the last sw packet */
390 ASSERT(packet);
391 ASSERT(packet->mp == NULL);
392 packet->mp = mp;
394 /* Try to recycle the tx descriptors again */
395 if (tx_ring->tbd_avail < (desc_total + 3)) {
396 E1000G_DEBUG_STAT(tx_ring->stat_recycle_retry);
397 (void) e1000g_recycle(tx_ring);
400 mutex_enter(&tx_ring->tx_lock);
403 * If the number of available tx descriptors is not enough for transmit
404 * (one redundant descriptor and one hw checksum context descriptor are
405 * included), then return failure.
407 if (tx_ring->tbd_avail < (desc_total + 3)) {
408 E1000G_DEBUGLOG_0(Adapter, E1000G_INFO_LEVEL,
409 "No Enough Tx descriptors\n");
410 E1000G_STAT(tx_ring->stat_no_desc);
411 mutex_exit(&tx_ring->tx_lock);
412 goto tx_send_failed;
415 desc_count = e1000g_fill_tx_ring(tx_ring, &pending_list, &cur_context);
417 mutex_exit(&tx_ring->tx_lock);
419 ASSERT(desc_count > 0);
421 /* Send successful */
422 return (B_TRUE);
424 tx_send_failed:
425 /* Restore mp to original */
426 if (new_mp) {
427 if (pre_mp) {
428 pre_mp->b_cont = next_mp;
430 new_mp->b_cont = NULL;
431 freemsg(new_mp);
433 next_mp->b_rptr -= hdr_frag_len;
437 * Enable Transmit interrupts, so that the interrupt routine can
438 * call mac_tx_update() when transmit descriptors become available.
440 tx_ring->resched_timestamp = ddi_get_lbolt();
441 tx_ring->resched_needed = B_TRUE;
442 if (!Adapter->tx_intr_enable)
443 e1000g_mask_tx_interrupt(Adapter);
445 /* Free pending TxSwPackets */
446 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
447 while (packet) {
448 packet->mp = NULL;
449 e1000g_free_tx_swpkt(packet);
450 packet = (p_tx_sw_packet_t)
451 QUEUE_GET_NEXT(&pending_list, &packet->Link);
454 /* Return pending TxSwPackets to the "Free" list */
455 mutex_enter(&tx_ring->freelist_lock);
456 QUEUE_APPEND(&tx_ring->free_list, &pending_list);
457 mutex_exit(&tx_ring->freelist_lock);
459 E1000G_STAT(tx_ring->stat_send_fail);
461 /* Message will be scheduled for re-transmit */
462 return (B_FALSE);
464 tx_no_resource:
466 * Enable Transmit interrupts, so that the interrupt routine can
467 * call mac_tx_update() when transmit descriptors become available.
469 tx_ring->resched_timestamp = ddi_get_lbolt();
470 tx_ring->resched_needed = B_TRUE;
471 if (!Adapter->tx_intr_enable)
472 e1000g_mask_tx_interrupt(Adapter);
474 /* Message will be scheduled for re-transmit */
475 return (B_FALSE);
478 static boolean_t
479 e1000g_retrieve_context(mblk_t *mp, context_data_t *cur_context,
480 size_t msg_size)
482 uintptr_t ip_start;
483 uintptr_t tcp_start;
484 mblk_t *nmp;
485 uint32_t lsoflags;
486 uint32_t mss;
488 bzero(cur_context, sizeof (context_data_t));
490 /* first check lso information */
491 mac_lso_get(mp, &mss, &lsoflags);
493 /* retrieve checksum info */
494 mac_hcksum_get(mp, &cur_context->cksum_start,
495 &cur_context->cksum_stuff, NULL, NULL, &cur_context->cksum_flags);
496 /* retrieve ethernet header size */
497 if (((struct ether_vlan_header *)(uintptr_t)mp->b_rptr)->ether_tpid ==
498 htons(ETHERTYPE_VLAN))
499 cur_context->ether_header_size =
500 sizeof (struct ether_vlan_header);
501 else
502 cur_context->ether_header_size =
503 sizeof (struct ether_header);
505 if (lsoflags & HW_LSO) {
506 ASSERT(mss != 0);
508 /* free the invalid packet */
509 if (mss == 0 ||
510 !((cur_context->cksum_flags & HCK_PARTIALCKSUM) &&
511 (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM))) {
512 return (B_FALSE);
514 cur_context->mss = (uint16_t)mss;
515 cur_context->lso_flag = B_TRUE;
518 * Some fields are cleared for the hardware to fill
519 * in. We don't assume Ethernet header, IP header and
520 * TCP header are always in the same mblk fragment,
521 * while we assume each header is always within one
522 * mblk fragment and Ethernet header is always in the
523 * first mblk fragment.
525 nmp = mp;
526 ip_start = (uintptr_t)(nmp->b_rptr)
527 + cur_context->ether_header_size;
528 if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
529 ip_start = (uintptr_t)nmp->b_cont->b_rptr
530 + (ip_start - (uintptr_t)(nmp->b_wptr));
531 nmp = nmp->b_cont;
533 tcp_start = ip_start +
534 IPH_HDR_LENGTH((ipha_t *)ip_start);
535 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
536 tcp_start = (uintptr_t)nmp->b_cont->b_rptr
537 + (tcp_start - (uintptr_t)(nmp->b_wptr));
538 nmp = nmp->b_cont;
540 cur_context->hdr_len = cur_context->ether_header_size
541 + IPH_HDR_LENGTH((ipha_t *)ip_start)
542 + TCP_HDR_LENGTH((tcph_t *)tcp_start);
543 ((ipha_t *)ip_start)->ipha_length = 0;
544 ((ipha_t *)ip_start)->ipha_hdr_checksum = 0;
545 /* calculate the TCP packet payload length */
546 cur_context->pay_len = msg_size - cur_context->hdr_len;
548 return (B_TRUE);
551 static boolean_t
552 e1000g_check_context(e1000g_tx_ring_t *tx_ring, context_data_t *cur_context)
554 boolean_t context_reload;
555 context_data_t *pre_context;
556 struct e1000g *Adapter;
558 context_reload = B_FALSE;
559 pre_context = &tx_ring->pre_context;
560 Adapter = tx_ring->adapter;
563 * The following code determine if the context descriptor is
564 * needed to be reloaded. The sequence of the conditions is
565 * made by their possibilities of changing.
568 * workaround for 82546EB, context descriptor must be reloaded
569 * per LSO/hw_cksum packet if LSO is enabled.
571 if (Adapter->lso_premature_issue &&
572 Adapter->lso_enable &&
573 (cur_context->cksum_flags != 0)) {
575 context_reload = B_TRUE;
576 } else if (cur_context->lso_flag) {
577 if ((cur_context->lso_flag != pre_context->lso_flag) ||
578 (cur_context->cksum_flags != pre_context->cksum_flags) ||
579 (cur_context->pay_len != pre_context->pay_len) ||
580 (cur_context->mss != pre_context->mss) ||
581 (cur_context->hdr_len != pre_context->hdr_len) ||
582 (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
583 (cur_context->cksum_start != pre_context->cksum_start) ||
584 (cur_context->ether_header_size !=
585 pre_context->ether_header_size)) {
587 context_reload = B_TRUE;
589 } else if (cur_context->cksum_flags != 0) {
590 if ((cur_context->lso_flag != pre_context->lso_flag) ||
591 (cur_context->cksum_flags != pre_context->cksum_flags) ||
592 (cur_context->cksum_stuff != pre_context->cksum_stuff) ||
593 (cur_context->cksum_start != pre_context->cksum_start) ||
594 (cur_context->ether_header_size !=
595 pre_context->ether_header_size)) {
597 context_reload = B_TRUE;
601 return (context_reload);
604 static int
605 e1000g_fill_tx_ring(e1000g_tx_ring_t *tx_ring, LIST_DESCRIBER *pending_list,
606 context_data_t *cur_context)
608 struct e1000g *Adapter;
609 struct e1000_hw *hw;
610 p_tx_sw_packet_t first_packet;
611 p_tx_sw_packet_t packet;
612 p_tx_sw_packet_t previous_packet;
613 boolean_t context_reload;
614 struct e1000_tx_desc *first_data_desc;
615 struct e1000_tx_desc *next_desc;
616 struct e1000_tx_desc *descriptor;
617 struct e1000_data_desc zeroed;
618 int desc_count;
619 boolean_t buff_overrun_flag;
620 int i;
622 Adapter = tx_ring->adapter;
623 hw = &Adapter->shared;
625 desc_count = 0;
626 first_packet = NULL;
627 first_data_desc = NULL;
628 descriptor = NULL;
629 first_packet = NULL;
630 packet = NULL;
631 buff_overrun_flag = B_FALSE;
632 zeroed.upper.data = 0;
634 next_desc = tx_ring->tbd_next;
636 /* Context descriptor reload check */
637 context_reload = e1000g_check_context(tx_ring, cur_context);
639 if (context_reload) {
640 first_packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
642 descriptor = next_desc;
644 e1000g_fill_context_descriptor(cur_context,
645 (struct e1000_context_desc *)descriptor);
647 /* Check the wrap-around case */
648 if (descriptor == tx_ring->tbd_last)
649 next_desc = tx_ring->tbd_first;
650 else
651 next_desc++;
653 desc_count++;
656 first_data_desc = next_desc;
659 * According to the documentation, the packet options field (POPTS) is
660 * "ignored except on the first data descriptor of a packet." However,
661 * there is a bug in QEMU (638955) whereby the POPTS field within a
662 * given data descriptor is used to interpret that data descriptor --
663 * regardless of whether or not the descriptor is the first in a packet
664 * or not. For a packet that spans multiple descriptors, the (virtual)
665 * HW checksum (either TCP/UDP or IP or both) will therefore _not_ be
666 * performed on descriptors after the first, resulting in incorrect
667 * checksums and mysteriously dropped/retransmitted packets. Other
668 * drivers do not have this issue because they (harmlessly) set the
669 * POPTS field on every data descriptor to be the intended options for
670 * the entire packet. To circumvent this QEMU bug, we engage in this
671 * same behavior iff the subsystem vendor and device IDs indicate that
672 * this is an emulated QEMU device (1af4,1100).
674 if (hw->subsystem_vendor_id == 0x1af4 &&
675 hw->subsystem_device_id == 0x1100 &&
676 cur_context->cksum_flags) {
677 if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
678 zeroed.upper.fields.popts |= E1000_TXD_POPTS_IXSM;
680 if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
681 zeroed.upper.fields.popts |= E1000_TXD_POPTS_TXSM;
684 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(pending_list);
685 while (packet) {
686 ASSERT(packet->num_desc);
688 for (i = 0; i < packet->num_desc; i++) {
689 ASSERT(tx_ring->tbd_avail > 0);
691 descriptor = next_desc;
692 descriptor->buffer_addr =
693 packet->desc[i].address;
694 descriptor->lower.data =
695 packet->desc[i].length;
697 /* Zero out status */
698 descriptor->upper.data = zeroed.upper.data;
700 descriptor->lower.data |=
701 E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
702 /* must set RS on every outgoing descriptor */
703 descriptor->lower.data |=
704 E1000_TXD_CMD_RS;
706 if (cur_context->lso_flag)
707 descriptor->lower.data |= E1000_TXD_CMD_TSE;
709 /* Check the wrap-around case */
710 if (descriptor == tx_ring->tbd_last)
711 next_desc = tx_ring->tbd_first;
712 else
713 next_desc++;
715 desc_count++;
718 * workaround for 82546EB errata 33, hang in PCI-X
719 * systems due to 2k Buffer Overrun during Transmit
720 * Operation. The workaround applies to all the Intel
721 * PCI-X chips.
723 if (hw->bus.type == e1000_bus_type_pcix &&
724 descriptor == first_data_desc &&
725 ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK)
726 > E1000_TX_BUFFER_OEVRRUN_THRESHOLD)) {
727 /* modified the first descriptor */
728 descriptor->lower.data &=
729 ~E1000G_TBD_LENGTH_MASK;
730 descriptor->lower.flags.length =
731 E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
733 /* insert a new descriptor */
734 ASSERT(tx_ring->tbd_avail > 0);
735 next_desc->buffer_addr =
736 packet->desc[0].address +
737 E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
738 next_desc->lower.data =
739 packet->desc[0].length -
740 E1000_TX_BUFFER_OEVRRUN_THRESHOLD;
742 /* Zero out status */
743 next_desc->upper.data = zeroed.upper.data;
745 next_desc->lower.data |=
746 E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D;
747 /* must set RS on every outgoing descriptor */
748 next_desc->lower.data |=
749 E1000_TXD_CMD_RS;
751 if (cur_context->lso_flag)
752 next_desc->lower.data |=
753 E1000_TXD_CMD_TSE;
755 descriptor = next_desc;
757 /* Check the wrap-around case */
758 if (next_desc == tx_ring->tbd_last)
759 next_desc = tx_ring->tbd_first;
760 else
761 next_desc++;
763 desc_count++;
764 buff_overrun_flag = B_TRUE;
768 if (buff_overrun_flag) {
769 packet->num_desc++;
770 buff_overrun_flag = B_FALSE;
773 if (first_packet != NULL) {
775 * Count the checksum context descriptor for
776 * the first SwPacket.
778 first_packet->num_desc++;
779 first_packet = NULL;
782 packet->tickstamp = ddi_get_lbolt64();
784 previous_packet = packet;
785 packet = (p_tx_sw_packet_t)
786 QUEUE_GET_NEXT(pending_list, &packet->Link);
790 * workaround for 82546EB errata 21, LSO Premature Descriptor Write Back
792 if (Adapter->lso_premature_issue && cur_context->lso_flag &&
793 ((descriptor->lower.data & E1000G_TBD_LENGTH_MASK) > 8)) {
794 /* modified the previous descriptor */
795 descriptor->lower.data -= 4;
797 /* insert a new descriptor */
798 ASSERT(tx_ring->tbd_avail > 0);
799 /* the lower 20 bits of lower.data is the length field */
800 next_desc->buffer_addr =
801 descriptor->buffer_addr +
802 (descriptor->lower.data & E1000G_TBD_LENGTH_MASK);
803 next_desc->lower.data = 4;
805 /* Zero out status */
806 next_desc->upper.data = zeroed.upper.data;
807 /* It must be part of a LSO packet */
808 next_desc->lower.data |=
809 E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D |
810 E1000_TXD_CMD_RS | E1000_TXD_CMD_TSE;
812 descriptor = next_desc;
814 /* Check the wrap-around case */
815 if (descriptor == tx_ring->tbd_last)
816 next_desc = tx_ring->tbd_first;
817 else
818 next_desc++;
820 desc_count++;
821 /* update the number of descriptors */
822 previous_packet->num_desc++;
825 ASSERT(descriptor);
827 if (cur_context->cksum_flags) {
828 if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM)
829 ((struct e1000_data_desc *)first_data_desc)->
830 upper.fields.popts |= E1000_TXD_POPTS_IXSM;
831 if (cur_context->cksum_flags & HCK_PARTIALCKSUM)
832 ((struct e1000_data_desc *)first_data_desc)->
833 upper.fields.popts |= E1000_TXD_POPTS_TXSM;
837 * Last Descriptor of Packet needs End Of Packet (EOP), Report
838 * Status (RS) set.
840 if (Adapter->tx_intr_delay) {
841 descriptor->lower.data |= E1000_TXD_CMD_IDE |
842 E1000_TXD_CMD_EOP;
843 } else {
844 descriptor->lower.data |= E1000_TXD_CMD_EOP;
847 /* Set append Ethernet CRC (IFCS) bits */
848 if (cur_context->lso_flag) {
849 first_data_desc->lower.data |= E1000_TXD_CMD_IFCS;
850 } else {
851 descriptor->lower.data |= E1000_TXD_CMD_IFCS;
855 * Sync the Tx descriptors DMA buffer
857 (void) ddi_dma_sync(tx_ring->tbd_dma_handle,
858 0, 0, DDI_DMA_SYNC_FORDEV);
860 tx_ring->tbd_next = next_desc;
863 * Advance the Transmit Descriptor Tail (Tdt), this tells the
864 * FX1000 that this frame is available to transmit.
866 if (hw->mac.type == e1000_82547)
867 e1000g_82547_tx_move_tail(tx_ring);
868 else
869 E1000_WRITE_REG(hw, E1000_TDT(0),
870 (uint32_t)(next_desc - tx_ring->tbd_first));
872 if (e1000g_check_acc_handle(Adapter->osdep.reg_handle) != DDI_FM_OK) {
873 ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
874 Adapter->e1000g_state |= E1000G_ERROR;
877 /* Put the pending SwPackets to the "Used" list */
878 mutex_enter(&tx_ring->usedlist_lock);
879 QUEUE_APPEND(&tx_ring->used_list, pending_list);
880 tx_ring->tbd_avail -= desc_count;
881 mutex_exit(&tx_ring->usedlist_lock);
883 /* update LSO related data */
884 if (context_reload)
885 tx_ring->pre_context = *cur_context;
887 return (desc_count);
891 * e1000g_tx_setup - setup tx data structures
893 * This routine initializes all of the transmit related
894 * structures. This includes the Transmit descriptors,
895 * and the tx_sw_packet structures.
897 void
898 e1000g_tx_setup(struct e1000g *Adapter)
900 struct e1000_hw *hw;
901 p_tx_sw_packet_t packet;
902 uint32_t i;
903 uint32_t buf_high;
904 uint32_t buf_low;
905 uint32_t reg_tipg;
906 uint32_t reg_tctl;
907 int size;
908 e1000g_tx_ring_t *tx_ring;
910 hw = &Adapter->shared;
911 tx_ring = Adapter->tx_ring;
913 /* init the lists */
915 * Here we don't need to protect the lists using the
916 * usedlist_lock and freelist_lock, for they have
917 * been protected by the chip_lock.
919 QUEUE_INIT_LIST(&tx_ring->used_list);
920 QUEUE_INIT_LIST(&tx_ring->free_list);
922 /* Go through and set up each SW_Packet */
923 packet = tx_ring->packet_area;
924 for (i = 0; i < Adapter->tx_freelist_num; i++, packet++) {
925 /* Initialize this tx_sw_apcket area */
926 e1000g_free_tx_swpkt(packet);
927 /* Add this tx_sw_packet to the free list */
928 QUEUE_PUSH_TAIL(&tx_ring->free_list,
929 &packet->Link);
932 /* Setup TX descriptor pointers */
933 tx_ring->tbd_next = tx_ring->tbd_first;
934 tx_ring->tbd_oldest = tx_ring->tbd_first;
937 * Setup Hardware TX Registers
939 /* Setup the Transmit Control Register (TCTL). */
940 reg_tctl = E1000_READ_REG(hw, E1000_TCTL);
941 reg_tctl |= E1000_TCTL_PSP | E1000_TCTL_EN |
942 (E1000_COLLISION_THRESHOLD << E1000_CT_SHIFT) |
943 (E1000_COLLISION_DISTANCE << E1000_COLD_SHIFT) |
944 E1000_TCTL_RTLC;
946 /* Enable the MULR bit */
947 if (hw->bus.type == e1000_bus_type_pci_express)
948 reg_tctl |= E1000_TCTL_MULR;
950 E1000_WRITE_REG(hw, E1000_TCTL, reg_tctl);
952 /* Setup HW Base and Length of Tx descriptor area */
953 size = (Adapter->tx_desc_num * sizeof (struct e1000_tx_desc));
954 E1000_WRITE_REG(hw, E1000_TDLEN(0), size);
955 size = E1000_READ_REG(hw, E1000_TDLEN(0));
957 buf_low = (uint32_t)tx_ring->tbd_dma_addr;
958 buf_high = (uint32_t)(tx_ring->tbd_dma_addr >> 32);
961 * Write the highest location first and work backward to the lowest.
962 * This is necessary for some adapter types to
963 * prevent write combining from occurring.
965 E1000_WRITE_REG(hw, E1000_TDBAH(0), buf_high);
966 E1000_WRITE_REG(hw, E1000_TDBAL(0), buf_low);
968 /* Setup our HW Tx Head & Tail descriptor pointers */
969 E1000_WRITE_REG(hw, E1000_TDH(0), 0);
970 E1000_WRITE_REG(hw, E1000_TDT(0), 0);
972 /* Set the default values for the Tx Inter Packet Gap timer */
973 if ((hw->mac.type == e1000_82542) &&
974 ((hw->revision_id == E1000_REVISION_2) ||
975 (hw->revision_id == E1000_REVISION_3))) {
976 reg_tipg = DEFAULT_82542_TIPG_IPGT;
977 reg_tipg |=
978 DEFAULT_82542_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
979 reg_tipg |=
980 DEFAULT_82542_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
981 } else if (hw->mac.type == e1000_80003es2lan) {
982 reg_tipg = DEFAULT_82543_TIPG_IPGR1;
983 reg_tipg |= DEFAULT_80003ES2LAN_TIPG_IPGR2 <<
984 E1000_TIPG_IPGR2_SHIFT;
985 } else {
986 if (hw->phy.media_type == e1000_media_type_fiber)
987 reg_tipg = DEFAULT_82543_TIPG_IPGT_FIBER;
988 else
989 reg_tipg = DEFAULT_82543_TIPG_IPGT_COPPER;
990 reg_tipg |=
991 DEFAULT_82543_TIPG_IPGR1 << E1000_TIPG_IPGR1_SHIFT;
992 reg_tipg |=
993 DEFAULT_82543_TIPG_IPGR2 << E1000_TIPG_IPGR2_SHIFT;
995 E1000_WRITE_REG(hw, E1000_TIPG, reg_tipg);
997 /* Setup Transmit Interrupt Delay Value */
998 E1000_WRITE_REG(hw, E1000_TIDV, Adapter->tx_intr_delay);
999 E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
1000 "E1000_TIDV: 0x%x\n", Adapter->tx_intr_delay);
1002 if (hw->mac.type >= e1000_82540) {
1003 E1000_WRITE_REG(&Adapter->shared, E1000_TADV,
1004 Adapter->tx_intr_abs_delay);
1005 E1000G_DEBUGLOG_1(Adapter, E1000G_INFO_LEVEL,
1006 "E1000_TADV: 0x%x\n", Adapter->tx_intr_abs_delay);
1009 tx_ring->tbd_avail = Adapter->tx_desc_num;
1011 /* Initialize stored context information */
1012 bzero(&(tx_ring->pre_context), sizeof (context_data_t));
1016 * e1000g_recycle - recycle the tx descriptors and tx sw packets
1019 e1000g_recycle(e1000g_tx_ring_t *tx_ring)
1021 struct e1000g *Adapter;
1022 LIST_DESCRIBER pending_list;
1023 p_tx_sw_packet_t packet;
1024 mblk_t *mp;
1025 mblk_t *nmp;
1026 struct e1000_tx_desc *descriptor;
1027 int desc_count;
1028 int64_t delta;
1031 * This function will examine each TxSwPacket in the 'used' queue
1032 * if the e1000g is done with it then the associated resources (Tx
1033 * Descriptors) will be "freed" and the TxSwPacket will be
1034 * returned to the 'free' queue.
1036 Adapter = tx_ring->adapter;
1037 delta = 0;
1039 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list);
1040 if (packet == NULL) {
1041 Adapter->stall_flag = B_FALSE;
1042 return (0);
1045 desc_count = 0;
1046 QUEUE_INIT_LIST(&pending_list);
1048 /* Sync the Tx descriptor DMA buffer */
1049 (void) ddi_dma_sync(tx_ring->tbd_dma_handle,
1050 0, 0, DDI_DMA_SYNC_FORKERNEL);
1051 if (e1000g_check_dma_handle(
1052 tx_ring->tbd_dma_handle) != DDI_FM_OK) {
1053 ddi_fm_service_impact(Adapter->dip, DDI_SERVICE_DEGRADED);
1054 Adapter->e1000g_state |= E1000G_ERROR;
1055 return (0);
1059 * While there are still TxSwPackets in the used queue check them
1061 mutex_enter(&tx_ring->usedlist_lock);
1062 while ((packet =
1063 (p_tx_sw_packet_t)QUEUE_GET_HEAD(&tx_ring->used_list)) != NULL) {
1066 * Get hold of the next descriptor that the e1000g will
1067 * report status back to (this will be the last descriptor
1068 * of a given sw packet). We only want to free the
1069 * sw packet (and it resources) if the e1000g is done
1070 * with ALL of the descriptors. If the e1000g is done
1071 * with the last one then it is done with all of them.
1073 ASSERT(packet->num_desc);
1074 descriptor = tx_ring->tbd_oldest + (packet->num_desc - 1);
1076 /* Check for wrap case */
1077 if (descriptor > tx_ring->tbd_last)
1078 descriptor -= Adapter->tx_desc_num;
1081 * If the descriptor done bit is set free TxSwPacket and
1082 * associated resources
1084 if (descriptor->upper.fields.status & E1000_TXD_STAT_DD) {
1085 QUEUE_POP_HEAD(&tx_ring->used_list);
1086 QUEUE_PUSH_TAIL(&pending_list, &packet->Link);
1088 if (descriptor == tx_ring->tbd_last)
1089 tx_ring->tbd_oldest =
1090 tx_ring->tbd_first;
1091 else
1092 tx_ring->tbd_oldest =
1093 descriptor + 1;
1095 desc_count += packet->num_desc;
1096 } else {
1098 * Found a sw packet that the e1000g is not done
1099 * with then there is no reason to check the rest
1100 * of the queue.
1102 delta = ddi_get_lbolt64() - packet->tickstamp;
1103 break;
1107 tx_ring->tbd_avail += desc_count;
1108 Adapter->tx_pkt_cnt += desc_count;
1110 mutex_exit(&tx_ring->usedlist_lock);
1112 if (desc_count == 0) {
1113 E1000G_DEBUG_STAT(tx_ring->stat_recycle_none);
1115 * If the packet hasn't been sent out for seconds and
1116 * the transmitter is not under paused flowctrl condition,
1117 * the transmitter is considered to be stalled.
1119 if ((delta > Adapter->stall_threshold) &&
1120 !(E1000_READ_REG(&Adapter->shared,
1121 E1000_STATUS) & E1000_STATUS_TXOFF)) {
1122 Adapter->stall_flag = B_TRUE;
1124 return (0);
1127 Adapter->stall_flag = B_FALSE;
1129 mp = NULL;
1130 nmp = NULL;
1131 packet = (p_tx_sw_packet_t)QUEUE_GET_HEAD(&pending_list);
1132 ASSERT(packet != NULL);
1133 while (packet != NULL) {
1134 if (packet->mp != NULL) {
1135 ASSERT(packet->mp->b_next == NULL);
1136 /* Assemble the message chain */
1137 if (mp == NULL) {
1138 mp = packet->mp;
1139 nmp = packet->mp;
1140 } else {
1141 nmp->b_next = packet->mp;
1142 nmp = packet->mp;
1144 /* Disconnect the message from the sw packet */
1145 packet->mp = NULL;
1148 /* Free the TxSwPackets */
1149 e1000g_free_tx_swpkt(packet);
1151 packet = (p_tx_sw_packet_t)
1152 QUEUE_GET_NEXT(&pending_list, &packet->Link);
1155 /* Return the TxSwPackets back to the FreeList */
1156 mutex_enter(&tx_ring->freelist_lock);
1157 QUEUE_APPEND(&tx_ring->free_list, &pending_list);
1158 mutex_exit(&tx_ring->freelist_lock);
1160 if (mp != NULL)
1161 freemsgchain(mp);
1163 return (desc_count);
1166 * 82544 Coexistence issue workaround:
1167 * There are 2 issues.
1168 * 1. If a 32 bit split completion happens from P64H2 and another
1169 * agent drives a 64 bit request/split completion after ONLY
1170 * 1 idle clock (BRCM/Emulex/Adaptec fiber channel cards) then
1171 * 82544 has a problem where in to clock all the data in, it
1172 * looks at REQ64# signal and since it has changed so fast (i.e. 1
1173 * idle clock turn around), it will fail to clock all the data in.
1174 * Data coming from certain ending addresses has exposure to this issue.
1176 * To detect this issue, following equation can be used...
1177 * SIZE[3:0] + ADDR[2:0] = SUM[3:0].
1178 * If SUM[3:0] is in between 1 to 4, we will have this issue.
1180 * ROOT CAUSE:
1181 * The erratum involves the 82544 PCIX elasticity FIFO implementations as
1182 * 64-bit FIFO's and flushing of the final partial-bytes corresponding
1183 * to the end of a requested read burst. Under a specific burst condition
1184 * of ending-data alignment and 32-byte split-completions, the final
1185 * byte(s) of split-completion data require an extra clock cycle to flush
1186 * into 64-bit FIFO orientation. An incorrect logic dependency on the
1187 * REQ64# signal occurring during this clock cycle may cause the
1188 * residual byte(s) to be lost, thereby rendering the internal DMA client
1189 * forever awaiting the final byte(s) for an outbound data-fetch. The
1190 * erratum is confirmed to *only* occur if certain subsequent external
1191 * 64-bit PCIX bus transactions occur immediately (minimum possible bus
1192 * turn-around) following the odd-aligned 32-bit split-completion
1193 * containing the final byte(s). Intel has confirmed that this has been
1194 * seen only with chipset/bridges which have the capability to provide
1195 * 32-bit split-completion data, and in the presence of newer PCIX bus
1196 * agents which fully-optimize the inter-transaction turn-around (zero
1197 * additional initiator latency when pre-granted bus ownership).
1199 * This issue does not exist in PCI bus mode, when any agent is operating
1200 * in 32 bit only mode or on chipsets that do not do 32 bit split
1201 * completions for 64 bit read requests (Serverworks chipsets). P64H2 does
1202 * 32 bit split completions for any read request that has bit 2 set to 1
1203 * for the requested address and read request size is more than 8 bytes.
1205 * 2. Another issue is related to 82544 driving DACs under the similar
1206 * scenario (32 bit split completion followed by 64 bit transaction with
1207 * only 1 cycle turnaround). This issue is still being root caused. We
1208 * think that both of these issues can be avoided if following workaround
1209 * is implemented. It seems the DAC issue is related to ending addresses being
1210 * 0x9, 0xA, 0xB, 0xC and hence ending up at odd boundaries in elasticity
1211 * FIFO which does not get flushed due to REQ64# dependency. We will only
1212 * know the full story after it has been simulated successfully by HW team.
1214 * WORKAROUND:
1215 * Make sure we do not have ending address as 1,2,3,4(Hang) or 9,a,b,c(DAC)
1217 static uint32_t
1218 e1000g_fill_82544_desc(uint64_t address,
1219 size_t length, p_desc_array_t desc_array)
1222 * Since issue is sensitive to length and address.
1223 * Let us first check the address...
1225 uint32_t safe_terminator;
1227 if (length <= 4) {
1228 desc_array->descriptor[0].address = address;
1229 desc_array->descriptor[0].length = (uint32_t)length;
1230 desc_array->elements = 1;
1231 return (desc_array->elements);
1233 safe_terminator =
1234 (uint32_t)((((uint32_t)address & 0x7) +
1235 (length & 0xF)) & 0xF);
1237 * if it does not fall between 0x1 to 0x4 and 0x9 to 0xC then
1238 * return
1240 if (safe_terminator == 0 ||
1241 (safe_terminator > 4 && safe_terminator < 9) ||
1242 (safe_terminator > 0xC && safe_terminator <= 0xF)) {
1243 desc_array->descriptor[0].address = address;
1244 desc_array->descriptor[0].length = (uint32_t)length;
1245 desc_array->elements = 1;
1246 return (desc_array->elements);
1249 desc_array->descriptor[0].address = address;
1250 desc_array->descriptor[0].length = length - 4;
1251 desc_array->descriptor[1].address = address + (length - 4);
1252 desc_array->descriptor[1].length = 4;
1253 desc_array->elements = 2;
1254 return (desc_array->elements);
1257 static int
1258 e1000g_tx_copy(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet,
1259 mblk_t *mp, boolean_t tx_undersize_flag)
1261 size_t len;
1262 size_t len1;
1263 dma_buffer_t *tx_buf;
1264 mblk_t *nmp;
1265 boolean_t finished;
1266 int desc_count;
1268 desc_count = 0;
1269 tx_buf = packet->tx_buf;
1270 len = MBLKL(mp);
1272 ASSERT((tx_buf->len + len) <= tx_buf->size);
1274 if (len > 0) {
1275 bcopy(mp->b_rptr,
1276 tx_buf->address + tx_buf->len,
1277 len);
1278 tx_buf->len += len;
1280 packet->num_mblk_frag++;
1283 nmp = mp->b_cont;
1284 if (nmp == NULL) {
1285 finished = B_TRUE;
1286 } else {
1287 len1 = MBLKL(nmp);
1288 if ((tx_buf->len + len1) > tx_buf->size)
1289 finished = B_TRUE;
1290 else if (tx_undersize_flag)
1291 finished = B_FALSE;
1292 else if (len1 > tx_ring->adapter->tx_bcopy_thresh)
1293 finished = B_TRUE;
1294 else
1295 finished = B_FALSE;
1298 if (finished) {
1299 E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_copy,
1300 (tx_buf->len > len));
1303 * If the packet is smaller than 64 bytes, which is the
1304 * minimum ethernet packet size, pad the packet to make
1305 * it at least 60 bytes. The hardware will add 4 bytes
1306 * for CRC.
1308 if (tx_undersize_flag) {
1309 ASSERT(tx_buf->len < ETHERMIN);
1311 bzero(tx_buf->address + tx_buf->len,
1312 ETHERMIN - tx_buf->len);
1313 tx_buf->len = ETHERMIN;
1316 #ifdef __sparc
1317 if (packet->dma_type == USE_DVMA)
1318 dvma_sync(tx_buf->dma_handle, 0, DDI_DMA_SYNC_FORDEV);
1319 else
1320 (void) ddi_dma_sync(tx_buf->dma_handle, 0,
1321 tx_buf->len, DDI_DMA_SYNC_FORDEV);
1322 #else
1323 (void) ddi_dma_sync(tx_buf->dma_handle, 0,
1324 tx_buf->len, DDI_DMA_SYNC_FORDEV);
1325 #endif
1327 packet->data_transfer_type = USE_BCOPY;
1329 desc_count = e1000g_fill_tx_desc(tx_ring,
1330 packet,
1331 tx_buf->dma_address,
1332 tx_buf->len);
1334 if (desc_count <= 0)
1335 return (-1);
1338 return (desc_count);
1341 static int
1342 e1000g_tx_bind(e1000g_tx_ring_t *tx_ring, p_tx_sw_packet_t packet, mblk_t *mp)
1344 int j;
1345 int mystat;
1346 size_t len;
1347 ddi_dma_cookie_t dma_cookie;
1348 uint_t ncookies;
1349 int desc_count;
1350 uint32_t desc_total;
1352 desc_total = 0;
1353 len = MBLKL(mp);
1356 * ddi_dma_addr_bind_handle() allocates DMA resources for a
1357 * memory object such that a device can perform DMA to or from
1358 * the object. DMA resources are allocated considering the
1359 * device's DMA attributes as expressed by ddi_dma_attr(9S)
1360 * (see ddi_dma_alloc_handle(9F)).
1362 * ddi_dma_addr_bind_handle() fills in the first DMA cookie
1363 * pointed to by cookiep with the appropriate address, length,
1364 * and bus type. *ccountp is set to the number of DMA cookies
1365 * representing this DMA object. Subsequent DMA cookies must be
1366 * retrieved by calling ddi_dma_nextcookie(9F) the number of
1367 * times specified by *countp - 1.
1369 switch (packet->dma_type) {
1370 #ifdef __sparc
1371 case USE_DVMA:
1372 dvma_kaddr_load(packet->tx_dma_handle,
1373 (caddr_t)mp->b_rptr, len, 0, &dma_cookie);
1375 dvma_sync(packet->tx_dma_handle, 0,
1376 DDI_DMA_SYNC_FORDEV);
1378 ncookies = 1;
1379 packet->data_transfer_type = USE_DVMA;
1380 break;
1381 #endif
1382 case USE_DMA:
1383 if ((mystat = ddi_dma_addr_bind_handle(
1384 packet->tx_dma_handle, NULL,
1385 (caddr_t)mp->b_rptr, len,
1386 DDI_DMA_WRITE | DDI_DMA_STREAMING,
1387 DDI_DMA_DONTWAIT, 0, &dma_cookie,
1388 &ncookies)) != DDI_DMA_MAPPED) {
1390 e1000g_log(tx_ring->adapter, CE_WARN,
1391 "Couldn't bind mblk buffer to Tx DMA handle: "
1392 "return: %X, Pkt: %X\n",
1393 mystat, packet);
1394 return (-1);
1398 * An implicit ddi_dma_sync() is done when the
1399 * ddi_dma_addr_bind_handle() is called. So we
1400 * don't need to explicitly call ddi_dma_sync()
1401 * here any more.
1403 ASSERT(ncookies);
1404 E1000G_DEBUG_STAT_COND(tx_ring->stat_multi_cookie,
1405 (ncookies > 1));
1408 * The data_transfer_type value must be set after the handle
1409 * has been bound, for it will be used in e1000g_free_tx_swpkt()
1410 * to decide whether we need to unbind the handle.
1412 packet->data_transfer_type = USE_DMA;
1413 break;
1414 default:
1415 ASSERT(B_FALSE);
1416 break;
1419 packet->num_mblk_frag++;
1422 * Each address could span thru multpile cookie..
1423 * Each cookie will have one descriptor
1425 for (j = ncookies; j != 0; j--) {
1427 desc_count = e1000g_fill_tx_desc(tx_ring,
1428 packet,
1429 dma_cookie.dmac_laddress,
1430 dma_cookie.dmac_size);
1432 if (desc_count <= 0)
1433 return (-1);
1435 desc_total += desc_count;
1438 * ddi_dma_nextcookie() retrieves subsequent DMA
1439 * cookies for a DMA object.
1440 * ddi_dma_nextcookie() fills in the
1441 * ddi_dma_cookie(9S) structure pointed to by
1442 * cookiep. The ddi_dma_cookie(9S) structure
1443 * must be allocated prior to calling
1444 * ddi_dma_nextcookie(). The DMA cookie count
1445 * returned by ddi_dma_buf_bind_handle(9F),
1446 * ddi_dma_addr_bind_handle(9F), or
1447 * ddi_dma_getwin(9F) indicates the number of DMA
1448 * cookies a DMA object consists of. If the
1449 * resulting cookie count, N, is larger than 1,
1450 * ddi_dma_nextcookie() must be called N-1 times
1451 * to retrieve all DMA cookies.
1453 if (j > 1) {
1454 ddi_dma_nextcookie(packet->tx_dma_handle,
1455 &dma_cookie);
1459 return (desc_total);
1462 static void
1463 e1000g_fill_context_descriptor(context_data_t *cur_context,
1464 struct e1000_context_desc *context_desc)
1466 if (cur_context->cksum_flags & HCK_IPV4_HDRCKSUM) {
1467 context_desc->lower_setup.ip_fields.ipcss =
1468 cur_context->ether_header_size;
1469 context_desc->lower_setup.ip_fields.ipcso =
1470 cur_context->ether_header_size +
1471 offsetof(struct ip, ip_sum);
1472 context_desc->lower_setup.ip_fields.ipcse =
1473 cur_context->ether_header_size +
1474 cur_context->cksum_start - 1;
1475 } else
1476 context_desc->lower_setup.ip_config = 0;
1478 if (cur_context->cksum_flags & HCK_PARTIALCKSUM) {
1480 * The packet with same protocol has the following
1481 * stuff and start offset:
1482 * | Protocol | Stuff | Start | Checksum
1483 * | | Offset | Offset | Enable
1484 * | IPv4 + TCP | 0x24 | 0x14 | Yes
1485 * | IPv4 + UDP | 0x1A | 0x14 | Yes
1486 * | IPv6 + TCP | 0x20 | 0x10 | No
1487 * | IPv6 + UDP | 0x14 | 0x10 | No
1489 context_desc->upper_setup.tcp_fields.tucss =
1490 cur_context->cksum_start + cur_context->ether_header_size;
1491 context_desc->upper_setup.tcp_fields.tucso =
1492 cur_context->cksum_stuff + cur_context->ether_header_size;
1493 context_desc->upper_setup.tcp_fields.tucse = 0;
1494 } else
1495 context_desc->upper_setup.tcp_config = 0;
1497 if (cur_context->lso_flag) {
1498 context_desc->tcp_seg_setup.fields.mss = cur_context->mss;
1499 context_desc->tcp_seg_setup.fields.hdr_len =
1500 cur_context->hdr_len;
1502 * workaround for 82546EB errata 23, status-writeback
1503 * reporting (RS) should not be set on context or
1504 * Null descriptors
1506 context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1507 | E1000_TXD_CMD_TSE | E1000_TXD_CMD_IP | E1000_TXD_CMD_TCP
1508 | E1000_TXD_DTYP_C | cur_context->pay_len;
1509 } else {
1510 context_desc->cmd_and_length = E1000_TXD_CMD_DEXT
1511 | E1000_TXD_DTYP_C;
1513 * Zero out the options for TCP Segmentation Offload
1515 context_desc->tcp_seg_setup.data = 0;
1519 static int
1520 e1000g_fill_tx_desc(e1000g_tx_ring_t *tx_ring,
1521 p_tx_sw_packet_t packet, uint64_t address, size_t size)
1523 struct e1000_hw *hw = &tx_ring->adapter->shared;
1524 p_sw_desc_t desc;
1526 if (hw->mac.type == e1000_82544) {
1527 if (hw->bus.type == e1000_bus_type_pcix)
1528 return (e1000g_tx_workaround_PCIX_82544(packet,
1529 address, size));
1531 if (size > JUMBO_FRAG_LENGTH)
1532 return (e1000g_tx_workaround_jumbo_82544(packet,
1533 address, size));
1536 ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1538 desc = &packet->desc[packet->num_desc];
1539 desc->address = address;
1540 desc->length = (uint32_t)size;
1542 packet->num_desc++;
1544 return (1);
1547 static int
1548 e1000g_tx_workaround_PCIX_82544(p_tx_sw_packet_t packet,
1549 uint64_t address, size_t size)
1551 p_sw_desc_t desc;
1552 int desc_count;
1553 long size_left;
1554 size_t len;
1555 uint32_t counter;
1556 uint32_t array_elements;
1557 desc_array_t desc_array;
1560 * Coexist Workaround for cordova: RP: 07/04/03
1562 * RP: ERRATA: Workaround ISSUE:
1563 * 8kb_buffer_Lockup CONTROLLER: Cordova Breakup
1564 * Eachbuffer in to 8kb pieces until the
1565 * remainder is < 8kb
1567 size_left = size;
1568 desc_count = 0;
1570 while (size_left > 0) {
1571 if (size_left > MAX_TX_BUF_SIZE)
1572 len = MAX_TX_BUF_SIZE;
1573 else
1574 len = size_left;
1576 array_elements = e1000g_fill_82544_desc(address,
1577 len, &desc_array);
1579 for (counter = 0; counter < array_elements; counter++) {
1580 ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1582 * Put in the buffer address
1584 desc = &packet->desc[packet->num_desc];
1586 desc->address =
1587 desc_array.descriptor[counter].address;
1588 desc->length =
1589 desc_array.descriptor[counter].length;
1591 packet->num_desc++;
1592 desc_count++;
1593 } /* for */
1596 * Update the buffer address and length
1598 address += MAX_TX_BUF_SIZE;
1599 size_left -= MAX_TX_BUF_SIZE;
1600 } /* while */
1602 return (desc_count);
1605 static int
1606 e1000g_tx_workaround_jumbo_82544(p_tx_sw_packet_t packet,
1607 uint64_t address, size_t size)
1609 p_sw_desc_t desc;
1610 int desc_count;
1611 long size_left;
1612 uint32_t offset;
1615 * Workaround for Jumbo Frames on Cordova
1616 * PSD 06/01/2001
1618 size_left = size;
1619 desc_count = 0;
1620 offset = 0;
1621 while (size_left > 0) {
1622 ASSERT(packet->num_desc < MAX_TX_DESC_PER_PACKET);
1624 desc = &packet->desc[packet->num_desc];
1626 desc->address = address + offset;
1628 if (size_left > JUMBO_FRAG_LENGTH)
1629 desc->length = JUMBO_FRAG_LENGTH;
1630 else
1631 desc->length = (uint32_t)size_left;
1633 packet->num_desc++;
1634 desc_count++;
1636 offset += desc->length;
1637 size_left -= JUMBO_FRAG_LENGTH;
1640 return (desc_count);
1643 #pragma inline(e1000g_82547_tx_move_tail_work)
1645 static void
1646 e1000g_82547_tx_move_tail_work(e1000g_tx_ring_t *tx_ring)
1648 struct e1000_hw *hw;
1649 uint16_t hw_tdt;
1650 uint16_t sw_tdt;
1651 struct e1000_tx_desc *tx_desc;
1652 uint16_t length = 0;
1653 boolean_t eop = B_FALSE;
1654 struct e1000g *Adapter;
1656 Adapter = tx_ring->adapter;
1657 hw = &Adapter->shared;
1659 hw_tdt = E1000_READ_REG(hw, E1000_TDT(0));
1660 sw_tdt = tx_ring->tbd_next - tx_ring->tbd_first;
1662 while (hw_tdt != sw_tdt) {
1663 tx_desc = &(tx_ring->tbd_first[hw_tdt]);
1664 length += tx_desc->lower.flags.length;
1665 eop = tx_desc->lower.data & E1000_TXD_CMD_EOP;
1666 if (++hw_tdt == Adapter->tx_desc_num)
1667 hw_tdt = 0;
1669 if (eop) {
1670 if ((Adapter->link_duplex == HALF_DUPLEX) &&
1671 (e1000_fifo_workaround_82547(hw, length)
1672 != E1000_SUCCESS)) {
1673 if (tx_ring->timer_enable_82547) {
1674 ASSERT(tx_ring->timer_id_82547 == 0);
1675 tx_ring->timer_id_82547 =
1676 timeout(e1000g_82547_timeout,
1677 (void *)tx_ring,
1678 drv_usectohz(10000));
1680 return;
1682 } else {
1683 E1000_WRITE_REG(hw, E1000_TDT(0), hw_tdt);
1684 e1000_update_tx_fifo_head_82547(hw, length);
1685 length = 0;
1691 static void
1692 e1000g_82547_timeout(void *arg)
1694 e1000g_tx_ring_t *tx_ring;
1696 tx_ring = (e1000g_tx_ring_t *)arg;
1698 mutex_enter(&tx_ring->tx_lock);
1700 tx_ring->timer_id_82547 = 0;
1701 e1000g_82547_tx_move_tail_work(tx_ring);
1703 mutex_exit(&tx_ring->tx_lock);
1706 static void
1707 e1000g_82547_tx_move_tail(e1000g_tx_ring_t *tx_ring)
1709 timeout_id_t tid;
1711 ASSERT(MUTEX_HELD(&tx_ring->tx_lock));
1713 tid = tx_ring->timer_id_82547;
1714 tx_ring->timer_id_82547 = 0;
1715 if (tid != 0) {
1716 tx_ring->timer_enable_82547 = B_FALSE;
1717 mutex_exit(&tx_ring->tx_lock);
1719 (void) untimeout(tid);
1721 mutex_enter(&tx_ring->tx_lock);
1723 tx_ring->timer_enable_82547 = B_TRUE;
1724 e1000g_82547_tx_move_tail_work(tx_ring);
1728 * This is part of a workaround for the I219, see e1000g_flush_desc_rings() for
1729 * more information.
1731 * We need to clear any potential pending descriptors from the tx_ring. As
1732 * we're about to reset the device, we don't care about the data that we give it
1733 * itself.
1735 void
1736 e1000g_flush_tx_ring(struct e1000g *Adapter)
1738 struct e1000_hw *hw = &Adapter->shared;
1739 e1000g_tx_ring_t *tx_ring = &Adapter->tx_ring[0];
1740 uint32_t tctl, txd_lower = E1000_TXD_CMD_IFCS;
1741 uint16_t size = 512;
1742 struct e1000_tx_desc *desc;
1744 tctl = E1000_READ_REG(hw, E1000_TCTL);
1745 E1000_WRITE_REG(hw, E1000_TCTL, tctl | E1000_TCTL_EN);
1747 desc = tx_ring->tbd_next;
1748 if (tx_ring->tbd_next == tx_ring->tbd_last)
1749 tx_ring->tbd_next = tx_ring->tbd_first;
1750 else
1751 tx_ring->tbd_next++;
1753 /* We just need to set any valid address, so we use the ring itself */
1754 desc->buffer_addr = tx_ring->tbd_dma_addr;
1755 desc->lower.data = LE_32(txd_lower | size);
1756 desc->upper.data = 0;
1758 (void) ddi_dma_sync(tx_ring->tbd_dma_handle,
1759 0, 0, DDI_DMA_SYNC_FORDEV);
1760 E1000_WRITE_REG(hw, E1000_TDT(0),
1761 (uint32_t)(tx_ring->tbd_next - tx_ring->tbd_first));
1762 (void) E1000_READ_REG(hw, E1000_STATUS);
1763 usec_delay(250);