From c84d324c770dc81acebc1042163da33c8ded2364 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 16 Nov 2010 19:27:12 -0800 Subject: [PATCH] ixgbe: rework Tx hang detection to fix reoccurring false Tx hangs The Tx hang logic has been known to detect false hangs when the device is receiving pause frames or has delayed processing for some other reason. This patch makes the logic more robust and resolves these known issues. The old logic checked to see if the device was paused by querying the HW then the hang logic was aborted if the device was currently paused. This check was racy because the device could have been in the pause state any time up to this check. The other operation of the hang logic is to verify the Tx ring is still advancing the old logic checked the EOP timestamp. This is not sufficient to determine the ring is not advancing but only infers that it may be moving slowly. Here we add logic to track the number of completed Tx descriptors and use the adapter stats to check if any pause frames have been received since the previous Tx hang check. This way we avoid racing with the HW register and do not detect false hangs if the ring is advancing slowly. This patch is primarily the work of Jesse Brandeburg. I clean it up some and fixed the PFC checking. Signed-off-by: John Fastabend Tested-by: Ross Brattain Signed-off-by: Jeff Kirsher --- drivers/net/ixgbe/ixgbe.h | 4 + drivers/net/ixgbe/ixgbe_main.c | 250 ++++++++++++++++++++++++++++------------- 2 files changed, 175 insertions(+), 79 deletions(-) diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h index ce43c935268..2b8cbb3a81f 100644 --- a/drivers/net/ixgbe/ixgbe.h +++ b/drivers/net/ixgbe/ixgbe.h @@ -149,6 +149,8 @@ struct ixgbe_queue_stats { struct ixgbe_tx_queue_stats { u64 restart_queue; u64 tx_busy; + u64 completed; + u64 tx_done_old; }; struct ixgbe_rx_queue_stats { @@ -162,6 +164,7 @@ struct ixgbe_rx_queue_stats { enum ixbge_ring_state_t { __IXGBE_TX_FDIR_INIT_DONE, __IXGBE_TX_DETECT_HANG, + __IXGBE_HANG_CHECK_ARMED, __IXGBE_RX_PS_ENABLED, __IXGBE_RX_RSC_ENABLED, }; @@ -514,6 +517,7 @@ extern void ixgbe_unmap_and_free_tx_resource(struct ixgbe_ring *, extern void ixgbe_alloc_rx_buffers(struct ixgbe_ring *, u16); extern void ixgbe_write_eitr(struct ixgbe_q_vector *); extern int ethtool_ioctl(struct ifreq *ifr); +extern u8 ixgbe_dcb_txq_to_tc(struct ixgbe_adapter *adapter, u8 index); extern s32 ixgbe_reinit_fdir_tables_82599(struct ixgbe_hw *hw); extern s32 ixgbe_init_fdir_signature_82599(struct ixgbe_hw *hw, u32 pballoc); extern s32 ixgbe_init_fdir_perfect_82599(struct ixgbe_hw *hw, u32 pballoc); diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index def5c6e047c..6e56f7b7c8f 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -630,93 +630,166 @@ void ixgbe_unmap_and_free_tx_resource(struct ixgbe_ring *tx_ring, } /** - * ixgbe_tx_xon_state - check the tx ring xon state - * @adapter: the ixgbe adapter - * @tx_ring: the corresponding tx_ring + * ixgbe_dcb_txq_to_tc - convert a reg index to a traffic class + * @adapter: driver private struct + * @index: reg idx of queue to query (0-127) * - * If not in DCB mode, checks TFCS.TXOFF, otherwise, find out the - * corresponding TC of this tx_ring when checking TFCS. + * Helper function to determine the traffic index for a paticular + * register index. * - * Returns : true if in xon state (currently not paused) + * Returns : a tc index for use in range 0-7, or 0-3 */ -static inline bool ixgbe_tx_xon_state(struct ixgbe_adapter *adapter, - struct ixgbe_ring *tx_ring) +u8 ixgbe_dcb_txq_to_tc(struct ixgbe_adapter *adapter, u8 reg_idx) { - u32 txoff = IXGBE_TFCS_TXOFF; + int tc = -1; + int dcb_i = adapter->ring_feature[RING_F_DCB].indices; -#ifdef CONFIG_IXGBE_DCB - if (adapter->dcb_cfg.pfc_mode_enable) { - int tc; - int dcb_i = adapter->ring_feature[RING_F_DCB].indices; - u8 reg_idx = tx_ring->reg_idx; + /* if DCB is not enabled the queues have no TC */ + if (!(adapter->flags & IXGBE_FLAG_DCB_ENABLED)) + return tc; - switch (adapter->hw.mac.type) { - case ixgbe_mac_82598EB: - tc = reg_idx >> 2; - txoff = IXGBE_TFCS_TXOFF0; + /* check valid range */ + if (reg_idx >= adapter->hw.mac.max_tx_queues) + return tc; + + switch (adapter->hw.mac.type) { + case ixgbe_mac_82598EB: + tc = reg_idx >> 2; + break; + default: + if (dcb_i != 4 && dcb_i != 8) break; - case ixgbe_mac_82599EB: - tc = 0; - txoff = IXGBE_TFCS_TXOFF; - if (dcb_i == 8) { - /* TC0, TC1 */ - tc = reg_idx >> 5; - if (tc == 2) /* TC2, TC3 */ - tc += (reg_idx - 64) >> 4; - else if (tc == 3) /* TC4, TC5, TC6, TC7 */ - tc += 1 + ((reg_idx - 96) >> 3); - } else if (dcb_i == 4) { - /* TC0, TC1 */ - tc = reg_idx >> 6; - if (tc == 1) { - tc += (reg_idx - 64) >> 5; - if (tc == 2) /* TC2, TC3 */ - tc += (reg_idx - 96) >> 4; - } - } + + /* if VMDq is enabled the lowest order bits determine TC */ + if (adapter->flags & (IXGBE_FLAG_SRIOV_ENABLED | + IXGBE_FLAG_VMDQ_ENABLED)) { + tc = reg_idx & (dcb_i - 1); + break; + } + + /* + * Convert the reg_idx into the correct TC. This bitmask + * targets the last full 32 ring traffic class and assigns + * it a value of 1. From there the rest of the rings are + * based on shifting the mask further up to include the + * reg_idx / 16 and then reg_idx / 8. It assumes dcB_i + * will only ever be 8 or 4 and that reg_idx will never + * be greater then 128. The code without the power of 2 + * optimizations would be: + * (((reg_idx % 32) + 32) * dcb_i) >> (9 - reg_idx / 32) + */ + tc = ((reg_idx & 0X1F) + 0x20) * dcb_i; + tc >>= 9 - (reg_idx >> 5); + } + + return tc; +} + +static void ixgbe_update_xoff_received(struct ixgbe_adapter *adapter) +{ + struct ixgbe_hw *hw = &adapter->hw; + struct ixgbe_hw_stats *hwstats = &adapter->stats; + u32 data = 0; + u32 xoff[8] = {0}; + int i; + + if ((hw->fc.current_mode == ixgbe_fc_full) || + (hw->fc.current_mode == ixgbe_fc_rx_pause)) { + switch (hw->mac.type) { + case ixgbe_mac_82598EB: + data = IXGBE_READ_REG(hw, IXGBE_LXOFFRXC); break; default: - tc = 0; + data = IXGBE_READ_REG(hw, IXGBE_LXOFFRXCNT); + } + hwstats->lxoffrxc += data; + + /* refill credits (no tx hang) if we received xoff */ + if (!data) + return; + + for (i = 0; i < adapter->num_tx_queues; i++) + clear_bit(__IXGBE_HANG_CHECK_ARMED, + &adapter->tx_ring[i]->state); + return; + } else if (!(adapter->dcb_cfg.pfc_mode_enable)) + return; + + /* update stats for each tc, only valid with PFC enabled */ + for (i = 0; i < MAX_TX_PACKET_BUFFERS; i++) { + switch (hw->mac.type) { + case ixgbe_mac_82598EB: + xoff[i] = IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(i)); break; + default: + xoff[i] = IXGBE_READ_REG(hw, IXGBE_PXOFFRXCNT(i)); } - txoff <<= tc; + hwstats->pxoffrxc[i] += xoff[i]; + } + + /* disarm tx queues that have received xoff frames */ + for (i = 0; i < adapter->num_tx_queues; i++) { + struct ixgbe_ring *tx_ring = adapter->tx_ring[i]; + u32 tc = ixgbe_dcb_txq_to_tc(adapter, tx_ring->reg_idx); + + if (xoff[tc]) + clear_bit(__IXGBE_HANG_CHECK_ARMED, &tx_ring->state); } -#endif - return IXGBE_READ_REG(&adapter->hw, IXGBE_TFCS) & txoff; } -static inline bool ixgbe_check_tx_hang(struct ixgbe_adapter *adapter, - struct ixgbe_ring *tx_ring, - unsigned int eop) +static u64 ixgbe_get_tx_completed(struct ixgbe_ring *ring) { + return ring->tx_stats.completed; +} + +static u64 ixgbe_get_tx_pending(struct ixgbe_ring *ring) +{ + struct ixgbe_adapter *adapter = netdev_priv(ring->netdev); struct ixgbe_hw *hw = &adapter->hw; - /* Detect a transmit hang in hardware, this serializes the - * check with the clearing of time_stamp and movement of eop */ + u32 head = IXGBE_READ_REG(hw, IXGBE_TDH(ring->reg_idx)); + u32 tail = IXGBE_READ_REG(hw, IXGBE_TDT(ring->reg_idx)); + + if (head != tail) + return (head < tail) ? + tail - head : (tail + ring->count - head); + + return 0; +} + +static inline bool ixgbe_check_tx_hang(struct ixgbe_ring *tx_ring) +{ + u32 tx_done = ixgbe_get_tx_completed(tx_ring); + u32 tx_done_old = tx_ring->tx_stats.tx_done_old; + u32 tx_pending = ixgbe_get_tx_pending(tx_ring); + bool ret = false; + clear_check_for_tx_hang(tx_ring); - if (tx_ring->tx_buffer_info[eop].time_stamp && - time_after(jiffies, tx_ring->tx_buffer_info[eop].time_stamp + HZ) && - ixgbe_tx_xon_state(adapter, tx_ring)) { - /* detected Tx unit hang */ - union ixgbe_adv_tx_desc *tx_desc; - tx_desc = IXGBE_TX_DESC_ADV(tx_ring, eop); - e_err(drv, "Detected Tx Unit Hang\n" - " Tx Queue <%d>\n" - " TDH, TDT <%x>, <%x>\n" - " next_to_use <%x>\n" - " next_to_clean <%x>\n" - "tx_buffer_info[next_to_clean]\n" - " time_stamp <%lx>\n" - " jiffies <%lx>\n", - tx_ring->queue_index, - IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)), - IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)), - tx_ring->next_to_use, eop, - tx_ring->tx_buffer_info[eop].time_stamp, jiffies); - return true; + + /* + * Check for a hung queue, but be thorough. This verifies + * that a transmit has been completed since the previous + * check AND there is at least one packet pending. The + * ARMED bit is set to indicate a potential hang. The + * bit is cleared if a pause frame is received to remove + * false hang detection due to PFC or 802.3x frames. By + * requiring this to fail twice we avoid races with + * pfc clearing the ARMED bit and conditions where we + * run the check_tx_hang logic with a transmit completion + * pending but without time to complete it yet. + */ + if ((tx_done_old == tx_done) && tx_pending) { + /* make sure it is true for two checks in a row */ + ret = test_and_set_bit(__IXGBE_HANG_CHECK_ARMED, + &tx_ring->state); + } else { + /* update completed stats and continue */ + tx_ring->tx_stats.tx_done_old = tx_done; + /* reset the countdown */ + clear_bit(__IXGBE_HANG_CHECK_ARMED, &tx_ring->state); } - return false; + return ret; } #define IXGBE_MAX_TXD_PWR 14 @@ -772,6 +845,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, tx_buffer_info); } + tx_ring->tx_stats.completed++; eop = tx_ring->tx_buffer_info[i].next_to_watch; eop_desc = IXGBE_TX_DESC_ADV(tx_ring, eop); } @@ -784,11 +858,31 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector, tx_ring->stats.bytes += total_bytes; u64_stats_update_end(&tx_ring->syncp); - if (check_for_tx_hang(tx_ring) && - ixgbe_check_tx_hang(adapter, tx_ring, i)) { + if (check_for_tx_hang(tx_ring) && ixgbe_check_tx_hang(tx_ring)) { + /* schedule immediate reset if we believe we hung */ + struct ixgbe_hw *hw = &adapter->hw; + tx_desc = IXGBE_TX_DESC_ADV(tx_ring, eop); + e_err(drv, "Detected Tx Unit Hang\n" + " Tx Queue <%d>\n" + " TDH, TDT <%x>, <%x>\n" + " next_to_use <%x>\n" + " next_to_clean <%x>\n" + "tx_buffer_info[next_to_clean]\n" + " time_stamp <%lx>\n" + " jiffies <%lx>\n", + tx_ring->queue_index, + IXGBE_READ_REG(hw, IXGBE_TDH(tx_ring->reg_idx)), + IXGBE_READ_REG(hw, IXGBE_TDT(tx_ring->reg_idx)), + tx_ring->next_to_use, eop, + tx_ring->tx_buffer_info[eop].time_stamp, jiffies); + + netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index); + + e_info(probe, + "tx hang %d detected on queue %d, resetting adapter\n", + adapter->tx_timeout_count + 1, tx_ring->queue_index); + /* schedule immediate reset if we believe we hung */ - e_info(probe, "tx hang %d detected, resetting " - "adapter\n", adapter->tx_timeout_count + 1); ixgbe_tx_timeout(adapter->netdev); /* the adapter is about to reset, no point in enabling stuff */ @@ -2599,6 +2693,8 @@ void ixgbe_configure_tx_ring(struct ixgbe_adapter *adapter, ring->atr_sample_rate = 0; } + clear_bit(__IXGBE_HANG_CHECK_ARMED, &ring->state); + /* enable queue */ txdctl |= IXGBE_TXDCTL_ENABLE; IXGBE_WRITE_REG(hw, IXGBE_TXDCTL(reg_idx), txdctl); @@ -4034,6 +4130,8 @@ static void ixgbe_tx_timeout(struct net_device *netdev) { struct ixgbe_adapter *adapter = netdev_priv(netdev); + adapter->tx_timeout_count++; + /* Do the reset outside of interrupt context */ schedule_work(&adapter->reset_task); } @@ -4048,8 +4146,6 @@ static void ixgbe_reset_task(struct work_struct *work) test_bit(__IXGBE_RESETTING, &adapter->state)) return; - adapter->tx_timeout_count++; - ixgbe_dump(adapter); netdev_err(adapter->netdev, "Reset adapter\n"); ixgbe_reinit_locked(adapter); @@ -5597,14 +5693,10 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) case ixgbe_mac_82598EB: hwstats->pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXC(i)); - hwstats->pxoffrxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXOFFRXC(i)); break; case ixgbe_mac_82599EB: hwstats->pxonrxc[i] += IXGBE_READ_REG(hw, IXGBE_PXONRXCNT(i)); - hwstats->pxoffrxc[i] += - IXGBE_READ_REG(hw, IXGBE_PXOFFRXCNT(i)); break; default: break; @@ -5616,11 +5708,12 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) /* work around hardware counting issue */ hwstats->gprc -= missed_rx; + ixgbe_update_xoff_received(adapter); + /* 82598 hardware only has a 32 bit counter in the high register */ switch (hw->mac.type) { case ixgbe_mac_82598EB: hwstats->lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXC); - hwstats->lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXC); hwstats->gorc += IXGBE_READ_REG(hw, IXGBE_GORCH); hwstats->gotc += IXGBE_READ_REG(hw, IXGBE_GOTCH); hwstats->tor += IXGBE_READ_REG(hw, IXGBE_TORH); @@ -5633,7 +5726,6 @@ void ixgbe_update_stats(struct ixgbe_adapter *adapter) hwstats->tor += IXGBE_READ_REG(hw, IXGBE_TORL); IXGBE_READ_REG(hw, IXGBE_TORH); /* to clear */ hwstats->lxonrxc += IXGBE_READ_REG(hw, IXGBE_LXONRXCNT); - hwstats->lxoffrxc += IXGBE_READ_REG(hw, IXGBE_LXOFFRXCNT); hwstats->fdirmatch += IXGBE_READ_REG(hw, IXGBE_FDIRMATCH); hwstats->fdirmiss += IXGBE_READ_REG(hw, IXGBE_FDIRMISS); #ifdef IXGBE_FCOE -- 2.11.4.GIT