kernel/2.6.29.6-aldebaran-rt/drivers/net/cxgb3/sge.c

   1 /*
   2  * Copyright (c) 2005-2008 Chelsio, Inc. All rights reserved.
   3  *
   4  * This software is available to you under a choice of one of two
   5  * licenses.  You may choose to be licensed under the terms of the GNU
   6  * General Public License (GPL) Version 2, available from the file
   7  * COPYING in the main directory of this source tree, or the
   8  * OpenIB.org BSD license below:
   9  *
  10  *     Redistribution and use in source and binary forms, with or
  11  *     without modification, are permitted provided that the following
  12  *     conditions are met:
  13  *
  14  *      - Redistributions of source code must retain the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer.
  17  *
  18  *      - Redistributions in binary form must reproduce the above
  19  *        copyright notice, this list of conditions and the following
  20  *        disclaimer in the documentation and/or other materials
  21  *        provided with the distribution.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30  * SOFTWARE.
  31  */
  32 #include <linux/skbuff.h>
  33 #include <linux/netdevice.h>
  34 #include <linux/etherdevice.h>
  35 #include <linux/if_vlan.h>
  36 #include <linux/ip.h>
  37 #include <linux/tcp.h>
  38 #include <linux/dma-mapping.h>
  39 #include <net/arp.h>
  40 #include "common.h"
  41 #include "regs.h"
  42 #include "sge_defs.h"
  43 #include "t3_cpl.h"
  44 #include "firmware_exports.h"
  45
   46 #define USE_GTS 0   /* NOTE(review): not referenced in the visible part of this file - confirm its use */
   47
   48 #define SGE_RX_SM_BUF_SIZE 1536   /* presumably the small Rx buffer size in bytes - confirm against users */
   49
   50 #define SGE_RX_COPY_THRES  256   /* packets of at most this many bytes are copied to a new skb and the Rx buffer recycled */
   51 #define SGE_RX_PULL_LEN    128   /* bytes copied to the skb linear area when the rest is attached as a page fragment */
   52
   53 /*
   54  * Page chunk size for FL0 buffers if FL0 is to be populated with page chunks.
   55  * It must be a divisor of PAGE_SIZE.  If set to 0 FL0 will use sk_buffs
   56  * directly.
   57  */
   58 #define FL0_PG_CHUNK_SIZE  2048
   59 #define FL0_PG_ORDER 0   /* NOTE(review): presumably the page allocation order used for FL0 - confirm */
   60 #define FL1_PG_CHUNK_SIZE (PAGE_SIZE > 8192 ? 16384 : 8192)   /* 16384 when PAGE_SIZE > 8192, else 8192 */
   61 #define FL1_PG_ORDER (PAGE_SIZE > 8192 ? 0 : 1)   /* 0 when PAGE_SIZE > 8192, else 1 */
   62
   63 #define SGE_RX_DROP_THRES 16   /* presumably the drop_thres handed to get_packet()/get_packet_pg() - confirm */
   64
   65 /*
   66  * Period of the Tx buffer reclaim timer.  This timer does not need to run
   67  * frequently as Tx buffers are usually reclaimed by new Tx packets.
   68  */
   69 #define TX_RECLAIM_PERIOD (HZ / 4)   /* a quarter of HZ, expressed in jiffies */
   70
   71 /* WR size in bytes */
   72 #define WR_LEN (WR_FLITS * 8)   /* 8 bytes per flit */
  73
  74 /*
  75  * Types of Tx queues in each queue set.  Order here matters, do not change.
  76  */
   77 enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };   /* used as indices into a queue set's txq[] array */
  78
  79 /* Values for sge_txq.flags */
   80 enum {   /* each value occupies its own bit */
   81         TXQ_RUNNING = 1 << 0,   /* fetch engine is running */
   82         TXQ_LAST_PKT_DB = 1 << 1,       /* last packet rang the doorbell */
   83 };
  84
   85 struct tx_desc {   /* Tx descriptor: TX_DESC_FLITS big-endian 64-bit flits */
   86         __be64 flit[TX_DESC_FLITS];   /* unmap_skb() reads SGL entries straight out of these flits */
   87 };
  88
   89 struct rx_desc {   /* free-list descriptor; every field is big-endian */
   90         __be32 addr_lo;   /* low 32 bits of the buffer's DMA address */
   91         __be32 len_gen;   /* written with V_FLD_GEN1(gen) after the address fields */
   92         __be32 gen2;      /* written with V_FLD_GEN2(gen) */
   93         __be32 addr_hi;   /* high 32 bits of the buffer's DMA address */
   94 };
  95
   96 struct tx_sw_desc {             /* SW state per Tx descriptor */
   97         struct sk_buff *skb;   /* packet for this descriptor; NULL when no SGL is present */
   98         u8 eop;       /* set if last descriptor for packet */
   99         u8 addr_idx;  /* buffer index of first SGL entry in descriptor */
  100         u8 fragidx;   /* first page fragment associated with descriptor */
  101         s8 sflit;     /* start flit of first SGL entry in descriptor */
  102 };
 103
  104 struct rx_sw_desc {                /* SW state per Rx descriptor */
  105         union {   /* the free list's use_pages flag selects which member is in use */
  106                 struct sk_buff *skb;   /* buffer is an sk_buff */
  107                 struct fl_pg_chunk pg_chunk;   /* buffer is a chunk of a page */
  108         };
  109         DECLARE_PCI_UNMAP_ADDR(dma_addr);   /* DMA address of the buffer, kept for sync/unmap */
  110 };
 111
  112 struct rsp_desc {               /* response queue descriptor */
  113         struct rss_header rss_hdr;
  114         __be32 flags;
  115         __be32 len_cq;
  116         u8 imm_data[47];   /* NOTE(review): presumably immediate packet data - confirm against get_imm_packet() */
  117         u8 intr_gen;       /* NOTE(review): presumably carries the generation bit - confirm */
  118 };
 119
 120 /*
 121  * Holds unmapping information for Tx packets that need deferred unmapping.
 122  * This structure lives at skb->head and must be allocated by callers.
 123  */
  124 struct deferred_unmap_info {
  125         struct pci_dev *pdev;   /* NOTE(review): presumably the device the buffers were mapped for - confirm */
  126         dma_addr_t addr[MAX_SKB_FRAGS + 1];   /* NOTE(review): presumably one slot for the linear data plus one per page fragment - confirm */
  127 };
 128
 129 /*
 130  * Maps a number of flits to the number of Tx descriptors that can hold them.
 131  * The formula is
 132  *
 133  * desc = 1 + (flits - 2) / (WR_FLITS - 1).
 134  *
 135  * HW allows up to 4 descriptors to be combined into a WR.
 136  */
 137 static u8 flit_desc_map[] = {
 138         0,
 139 #if SGE_NUM_GENBITS == 1
 140         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 141         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 142         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 143         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
 144 #elif SGE_NUM_GENBITS == 2
 145         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 146         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 147         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 148         4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 149 #else
 150 # error "SGE_NUM_GENBITS must be 1 or 2"
 151 #endif
 152 };
 153
 154 static inline struct sge_qset *fl_to_qset(const struct sge_fl *q, int qidx)
 155 {
 156         return container_of(q, struct sge_qset, fl[qidx]);
 157 }
 158
 159 static inline struct sge_qset *rspq_to_qset(const struct sge_rspq *q)
 160 {
 161         return container_of(q, struct sge_qset, rspq);
 162 }
 163
 164 static inline struct sge_qset *txq_to_qset(const struct sge_txq *q, int qidx)
 165 {
 166         return container_of(q, struct sge_qset, txq[qidx]);
 167 }
 168
 169 /**
 170  *      refill_rspq - replenish an SGE response queue
 171  *      @adapter: the adapter
 172  *      @q: the response queue to replenish
 173  *      @credits: how many new responses to make available
 174  *
 175  *      Replenishes a response queue by making the supplied number of responses
 176  *      available to HW.
 177  */
 178 static inline void refill_rspq(struct adapter *adapter,
 179                                const struct sge_rspq *q, unsigned int credits)
 180 {
 181         rmb();
 182         t3_write_reg(adapter, A_SG_RSPQ_CREDIT_RETURN,
 183                      V_RSPQ(q->cntxt_id) | V_CREDITS(credits));
 184 }
 185
 186 /**
 187  *      need_skb_unmap - does the platform need unmapping of sk_buffs?
 188  *
 189  *      Returns true if the platfrom needs sk_buff unmapping.  The compiler
 190  *      optimizes away unecessary code if this returns true.
 191  */
 192 static inline int need_skb_unmap(void)
 193 {
 194         /*
 195          * This structure is used to tell if the platfrom needs buffer
 196          * unmapping by checking if DECLARE_PCI_UNMAP_ADDR defines anything.
 197          */
 198         struct dummy {
 199                 DECLARE_PCI_UNMAP_ADDR(addr);
 200         };
 201
 202         return sizeof(struct dummy) != 0;
 203 }
 204
 205 /**
 206  *      unmap_skb - unmap a packet main body and its page fragments
 207  *      @skb: the packet
 208  *      @q: the Tx queue containing Tx descriptors for the packet
 209  *      @cidx: index of Tx descriptor
 210  *      @pdev: the PCI device
 211  *
 212  *      Unmap the main body of an sk_buff and its page fragments, if any.
 213  *      Because of the fairly complicated structure of our SGLs and the desire
 214  *      to conserve space for metadata, the information necessary to unmap an
 215  *      sk_buff is spread across the sk_buff itself (buffer lengths), the HW Tx
 216  *      descriptors (the physical addresses of the various data buffers), and
 217  *      the SW descriptor state (assorted indices).  The send functions
 218  *      initialize the indices for the first packet descriptor so we can unmap
 219  *      the buffers held in the first Tx descriptor here, and we have enough
 220  *      information at this point to set the state for the next Tx descriptor.
 221  *
 222  *      Note that it is possible to clean up the first descriptor of a packet
 223  *      before the send routines have written the next descriptors, but this
 224  *      race does not cause any problem.  We just end up writing the unmapping
 225  *      info for the descriptor first.
 226  */
 227 static inline void unmap_skb(struct sk_buff *skb, struct sge_txq *q,
 228                              unsigned int cidx, struct pci_dev *pdev)
 229 {
 230         const struct sg_ent *sgp;
 231         struct tx_sw_desc *d = &q->sdesc[cidx];
 232         int nfrags, frag_idx, curflit, j = d->addr_idx;
 233
 234         sgp = (struct sg_ent *)&q->desc[cidx].flit[d->sflit];
 235         frag_idx = d->fragidx;
 236
 237         if (frag_idx == 0 && skb_headlen(skb)) {
 238                 pci_unmap_single(pdev, be64_to_cpu(sgp->addr[0]),
 239                                  skb_headlen(skb), PCI_DMA_TODEVICE);
 240                 j = 1;
 241         }
 242
 243         curflit = d->sflit + 1 + j;
 244         nfrags = skb_shinfo(skb)->nr_frags;
 245
 246         while (frag_idx < nfrags && curflit < WR_FLITS) {
 247                 pci_unmap_page(pdev, be64_to_cpu(sgp->addr[j]),
 248                                skb_shinfo(skb)->frags[frag_idx].size,
 249                                PCI_DMA_TODEVICE);
 250                 j ^= 1;
 251                 if (j == 0) {
 252                         sgp++;
 253                         curflit++;
 254                 }
 255                 curflit++;
 256                 frag_idx++;
 257         }
 258
 259         if (frag_idx < nfrags) {   /* SGL continues into next Tx descriptor */
 260                 d = cidx + 1 == q->size ? q->sdesc : d + 1;
 261                 d->fragidx = frag_idx;
 262                 d->addr_idx = j;
 263                 d->sflit = curflit - WR_FLITS - j; /* sflit can be -1 */
 264         }
 265 }
 266
 267 /**
 268  *      free_tx_desc - reclaims Tx descriptors and their buffers
 269  *      @adapter: the adapter
 270  *      @q: the Tx queue to reclaim descriptors from
 271  *      @n: the number of descriptors to reclaim
 272  *
 273  *      Reclaims Tx descriptors from an SGE Tx queue and frees the associated
 274  *      Tx buffers.  Called with the Tx queue lock held.
 275  */
 276 static void free_tx_desc(struct adapter *adapter, struct sge_txq *q,
 277                          unsigned int n)
 278 {
 279         struct tx_sw_desc *d;
 280         struct pci_dev *pdev = adapter->pdev;
 281         unsigned int cidx = q->cidx;
 282
 283         const int need_unmap = need_skb_unmap() &&
 284                                q->cntxt_id >= FW_TUNNEL_SGEEC_START;
 285
 286         d = &q->sdesc[cidx];
 287         while (n--) {
 288                 if (d->skb) {   /* an SGL is present */
 289                         if (need_unmap)
 290                                 unmap_skb(d->skb, q, cidx, pdev);
 291                         if (d->eop)
 292                                 kfree_skb(d->skb);
 293                 }
 294                 ++d;
 295                 if (++cidx == q->size) {
 296                         cidx = 0;
 297                         d = q->sdesc;
 298                 }
 299         }
 300         q->cidx = cidx;
 301 }
 302
 303 /**
 304  *      reclaim_completed_tx - reclaims completed Tx descriptors
 305  *      @adapter: the adapter
 306  *      @q: the Tx queue to reclaim completed descriptors from
 307  *
 308  *      Reclaims Tx descriptors that the SGE has indicated it has processed,
 309  *      and frees the associated buffers if possible.  Called with the Tx
 310  *      queue's lock held.
 311  */
 312 static inline void reclaim_completed_tx(struct adapter *adapter,
 313                                         struct sge_txq *q)
 314 {
 315         unsigned int reclaim = q->processed - q->cleaned;
 316
 317         if (reclaim) {
 318                 free_tx_desc(adapter, q, reclaim);
 319                 q->cleaned += reclaim;
 320                 q->in_use -= reclaim;
 321         }
 322 }
 323
 324 /**
 325  *      should_restart_tx - are there enough resources to restart a Tx queue?
 326  *      @q: the Tx queue
 327  *
 328  *      Checks if there are enough descriptors to restart a suspended Tx queue.
 329  */
 330 static inline int should_restart_tx(const struct sge_txq *q)
 331 {
 332         unsigned int r = q->processed - q->cleaned;
 333
 334         return q->in_use - r < (q->size >> 1);
 335 }
 336
 337 /**
 338  *      free_rx_bufs - free the Rx buffers on an SGE free list
 339  *      @pdev: the PCI device associated with the adapter
 340  *      @rxq: the SGE free list to clean up
 341  *
 342  *      Release the buffers on an SGE free-buffer Rx queue.  HW fetching from
 343  *      this queue should be stopped before calling this function.
 344  */
 345 static void free_rx_bufs(struct pci_dev *pdev, struct sge_fl *q)
 346 {
 347         unsigned int cidx = q->cidx;
 348
 349         while (q->credits--) {
 350                 struct rx_sw_desc *d = &q->sdesc[cidx];
 351
 352                 pci_unmap_single(pdev, pci_unmap_addr(d, dma_addr),
 353                                  q->buf_size, PCI_DMA_FROMDEVICE);
 354                 if (q->use_pages) {
 355                         if (d->pg_chunk.page)
 356                                 put_page(d->pg_chunk.page);
 357                         d->pg_chunk.page = NULL;
 358                 } else {
 359                         kfree_skb(d->skb);
 360                         d->skb = NULL;
 361                 }
 362                 if (++cidx == q->size)
 363                         cidx = 0;
 364         }
 365
 366         if (q->pg_chunk.page) {
 367                 __free_pages(q->pg_chunk.page, q->order);
 368                 q->pg_chunk.page = NULL;
 369         }
 370 }
 371
 372 /**
 373  *      add_one_rx_buf - add a packet buffer to a free-buffer list
 374  *      @va:  buffer start VA
 375  *      @len: the buffer length
 376  *      @d: the HW Rx descriptor to write
 377  *      @sd: the SW Rx descriptor to write
 378  *      @gen: the generation bit value
 379  *      @pdev: the PCI device associated with the adapter
 380  *
 381  *      Add a buffer of the given length to the supplied HW and SW Rx
 382  *      descriptors.
 383  */
 384 static inline int add_one_rx_buf(void *va, unsigned int len,
 385                                  struct rx_desc *d, struct rx_sw_desc *sd,
 386                                  unsigned int gen, struct pci_dev *pdev)
 387 {
 388         dma_addr_t mapping;
 389
 390         mapping = pci_map_single(pdev, va, len, PCI_DMA_FROMDEVICE);
 391         if (unlikely(pci_dma_mapping_error(pdev, mapping)))
 392                 return -ENOMEM;
 393
 394         pci_unmap_addr_set(sd, dma_addr, mapping);
 395
 396         d->addr_lo = cpu_to_be32(mapping);
 397         d->addr_hi = cpu_to_be32((u64) mapping >> 32);
 398         wmb();
 399         d->len_gen = cpu_to_be32(V_FLD_GEN1(gen));
 400         d->gen2 = cpu_to_be32(V_FLD_GEN2(gen));
 401         return 0;
 402 }
 403
 404 static int alloc_pg_chunk(struct sge_fl *q, struct rx_sw_desc *sd, gfp_t gfp,
 405                           unsigned int order)
 406 {
 407         if (!q->pg_chunk.page) {
 408                 q->pg_chunk.page = alloc_pages(gfp, order);
 409                 if (unlikely(!q->pg_chunk.page))
 410                         return -ENOMEM;
 411                 q->pg_chunk.va = page_address(q->pg_chunk.page);
 412                 q->pg_chunk.offset = 0;
 413         }
 414         sd->pg_chunk = q->pg_chunk;
 415
 416         q->pg_chunk.offset += q->buf_size;
 417         if (q->pg_chunk.offset == (PAGE_SIZE << order))
 418                 q->pg_chunk.page = NULL;
 419         else {
 420                 q->pg_chunk.va += q->buf_size;
 421                 get_page(q->pg_chunk.page);
 422         }
 423         return 0;
 424 }
 425
 426 /**
 427  *      refill_fl - refill an SGE free-buffer list
 428  *      @adapter: the adapter
 429  *      @q: the free-list to refill
 430  *      @n: the number of new buffers to allocate
 431  *      @gfp: the gfp flags for allocating new buffers
 432  *
 433  *      (Re)populate an SGE free-buffer list with up to @n new packet buffers,
 434  *      allocated with the supplied gfp flags.  The caller must assure that
 435  *      @n does not exceed the queue's capacity.
 436  */
 437 static int refill_fl(struct adapter *adap, struct sge_fl *q, int n, gfp_t gfp)
 438 {
 439         void *buf_start;
 440         struct rx_sw_desc *sd = &q->sdesc[q->pidx];
 441         struct rx_desc *d = &q->desc[q->pidx];
 442         unsigned int count = 0;
 443
 444         while (n--) {
 445                 int err;
 446
 447                 if (q->use_pages) {
 448                         if (unlikely(alloc_pg_chunk(q, sd, gfp, q->order))) {
 449 nomem:                          q->alloc_failed++;
 450                                 break;
 451                         }
 452                         buf_start = sd->pg_chunk.va;
 453                 } else {
 454                         struct sk_buff *skb = alloc_skb(q->buf_size, gfp);
 455
 456                         if (!skb)
 457                                 goto nomem;
 458
 459                         sd->skb = skb;
 460                         buf_start = skb->data;
 461                 }
 462
 463                 err = add_one_rx_buf(buf_start, q->buf_size, d, sd, q->gen,
 464                                      adap->pdev);
 465                 if (unlikely(err)) {
 466                         if (!q->use_pages) {
 467                                 kfree_skb(sd->skb);
 468                                 sd->skb = NULL;
 469                         }
 470                         break;
 471                 }
 472
 473                 d++;
 474                 sd++;
 475                 if (++q->pidx == q->size) {
 476                         q->pidx = 0;
 477                         q->gen ^= 1;
 478                         sd = q->sdesc;
 479                         d = q->desc;
 480                 }
 481                 q->credits++;
 482                 count++;
 483         }
 484         wmb();
 485         if (likely(count))
 486                 t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
 487
 488         return count;
 489 }
 490
 491 static inline void __refill_fl(struct adapter *adap, struct sge_fl *fl)
 492 {
 493         refill_fl(adap, fl, min(16U, fl->size - fl->credits),
 494                   GFP_ATOMIC | __GFP_COMP);
 495 }
 496
 497 /**
 498  *      recycle_rx_buf - recycle a receive buffer
 499  *      @adapter: the adapter
 500  *      @q: the SGE free list
 501  *      @idx: index of buffer to recycle
 502  *
 503  *      Recycles the specified buffer on the given free list by adding it at
 504  *      the next available slot on the list.
 505  */
 506 static void recycle_rx_buf(struct adapter *adap, struct sge_fl *q,
 507                            unsigned int idx)
 508 {
 509         struct rx_desc *from = &q->desc[idx];
 510         struct rx_desc *to = &q->desc[q->pidx];
 511
 512         q->sdesc[q->pidx] = q->sdesc[idx];
 513         to->addr_lo = from->addr_lo;    /* already big endian */
 514         to->addr_hi = from->addr_hi;    /* likewise */
 515         wmb();
 516         to->len_gen = cpu_to_be32(V_FLD_GEN1(q->gen));
 517         to->gen2 = cpu_to_be32(V_FLD_GEN2(q->gen));
 518         q->credits++;
 519
 520         if (++q->pidx == q->size) {
 521                 q->pidx = 0;
 522                 q->gen ^= 1;
 523         }
 524         t3_write_reg(adap, A_SG_KDOORBELL, V_EGRCNTX(q->cntxt_id));
 525 }
 526
 527 /**
 528  *      alloc_ring - allocate resources for an SGE descriptor ring
 529  *      @pdev: the PCI device
 530  *      @nelem: the number of descriptors
 531  *      @elem_size: the size of each descriptor
 532  *      @sw_size: the size of the SW state associated with each ring element
 533  *      @phys: the physical address of the allocated ring
 534  *      @metadata: address of the array holding the SW state for the ring
 535  *
 536  *      Allocates resources for an SGE descriptor ring, such as Tx queues,
 537  *      free buffer lists, or response queues.  Each SGE ring requires
 538  *      space for its HW descriptors plus, optionally, space for the SW state
 539  *      associated with each HW entry (the metadata).  The function returns
 540  *      three values: the virtual address for the HW ring (the return value
 541  *      of the function), the physical address of the HW ring, and the address
 542  *      of the SW ring.
 543  */
 544 static void *alloc_ring(struct pci_dev *pdev, size_t nelem, size_t elem_size,
 545                         size_t sw_size, dma_addr_t * phys, void *metadata)
 546 {
 547         size_t len = nelem * elem_size;
 548         void *s = NULL;
 549         void *p = dma_alloc_coherent(&pdev->dev, len, phys, GFP_KERNEL);
 550
 551         if (!p)
 552                 return NULL;
 553         if (sw_size && metadata) {
 554                 s = kcalloc(nelem, sw_size, GFP_KERNEL);
 555
 556                 if (!s) {
 557                         dma_free_coherent(&pdev->dev, len, p, *phys);
 558                         return NULL;
 559                 }
 560                 *(void **)metadata = s;
 561         }
 562         memset(p, 0, len);
 563         return p;
 564 }
 565
 566 /**
 567  *      t3_reset_qset - reset a sge qset
 568  *      @q: the queue set
 569  *
 570  *      Reset the qset structure.
 571  *      the NAPI structure is preserved in the event of
 572  *      the qset's reincarnation, for example during EEH recovery.
 573  */
 574 static void t3_reset_qset(struct sge_qset *q)
 575 {
 576         if (q->adap &&
 577             !(q->adap->flags & NAPI_INIT)) {
 578                 memset(q, 0, sizeof(*q));
 579                 return;
 580         }
 581
 582         q->adap = NULL;
 583         memset(&q->rspq, 0, sizeof(q->rspq));
 584         memset(q->fl, 0, sizeof(struct sge_fl) * SGE_RXQ_PER_SET);
 585         memset(q->txq, 0, sizeof(struct sge_txq) * SGE_TXQ_PER_SET);
 586         q->txq_stopped = 0;
 587         q->tx_reclaim_timer.function = NULL; /* for t3_stop_sge_timers() */
 588         kfree(q->lro_frag_tbl);
 589         q->lro_nfrags = q->lro_frag_len = 0;
 590 }
 591
 592
 593 /**
 594  *      free_qset - free the resources of an SGE queue set
 595  *      @adapter: the adapter owning the queue set
 596  *      @q: the queue set
 597  *
 598  *      Release the HW and SW resources associated with an SGE queue set, such
 599  *      as HW contexts, packet buffers, and descriptor rings.  Traffic to the
 600  *      queue set must be quiesced prior to calling this.
 601  */
 602 static void t3_free_qset(struct adapter *adapter, struct sge_qset *q)
 603 {
 604         int i;
 605         struct pci_dev *pdev = adapter->pdev;
 606
 607         for (i = 0; i < SGE_RXQ_PER_SET; ++i)
 608                 if (q->fl[i].desc) {
 609                         spin_lock_irq(&adapter->sge.reg_lock);
 610                         t3_sge_disable_fl(adapter, q->fl[i].cntxt_id);
 611                         spin_unlock_irq(&adapter->sge.reg_lock);
 612                         free_rx_bufs(pdev, &q->fl[i]);
 613                         kfree(q->fl[i].sdesc);
 614                         dma_free_coherent(&pdev->dev,
 615                                           q->fl[i].size *
 616                                           sizeof(struct rx_desc), q->fl[i].desc,
 617                                           q->fl[i].phys_addr);
 618                 }
 619
 620         for (i = 0; i < SGE_TXQ_PER_SET; ++i)
 621                 if (q->txq[i].desc) {
 622                         spin_lock_irq(&adapter->sge.reg_lock);
 623                         t3_sge_enable_ecntxt(adapter, q->txq[i].cntxt_id, 0);
 624                         spin_unlock_irq(&adapter->sge.reg_lock);
 625                         if (q->txq[i].sdesc) {
 626                                 free_tx_desc(adapter, &q->txq[i],
 627                                              q->txq[i].in_use);
 628                                 kfree(q->txq[i].sdesc);
 629                         }
 630                         dma_free_coherent(&pdev->dev,
 631                                           q->txq[i].size *
 632                                           sizeof(struct tx_desc),
 633                                           q->txq[i].desc, q->txq[i].phys_addr);
 634                         __skb_queue_purge(&q->txq[i].sendq);
 635                 }
 636
 637         if (q->rspq.desc) {
 638                 spin_lock_irq(&adapter->sge.reg_lock);
 639                 t3_sge_disable_rspcntxt(adapter, q->rspq.cntxt_id);
 640                 spin_unlock_irq(&adapter->sge.reg_lock);
 641                 dma_free_coherent(&pdev->dev,
 642                                   q->rspq.size * sizeof(struct rsp_desc),
 643                                   q->rspq.desc, q->rspq.phys_addr);
 644         }
 645
 646         t3_reset_qset(q);
 647 }
 648
 649 /**
 650  *      init_qset_cntxt - initialize an SGE queue set context info
 651  *      @qs: the queue set
 652  *      @id: the queue set id
 653  *
 654  *      Initializes the TIDs and context ids for the queues of a queue set.
 655  */
 656 static void init_qset_cntxt(struct sge_qset *qs, unsigned int id)
 657 {
 658         qs->rspq.cntxt_id = id;
 659         qs->fl[0].cntxt_id = 2 * id;
 660         qs->fl[1].cntxt_id = 2 * id + 1;
 661         qs->txq[TXQ_ETH].cntxt_id = FW_TUNNEL_SGEEC_START + id;
 662         qs->txq[TXQ_ETH].token = FW_TUNNEL_TID_START + id;
 663         qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
 664         qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
 665         qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
 666 }
 667
 668 /**
 669  *      sgl_len - calculates the size of an SGL of the given capacity
 670  *      @n: the number of SGL entries
 671  *
 672  *      Calculates the number of flits needed for a scatter/gather list that
 673  *      can hold the given number of entries.
 674  */
 675 static inline unsigned int sgl_len(unsigned int n)
 676 {
 677         /* alternatively: 3 * (n / 2) + 2 * (n & 1) */
 678         return (3 * n) / 2 + (n & 1);
 679 }
 680
 681 /**
 682  *      flits_to_desc - returns the num of Tx descriptors for the given flits
 683  *      @n: the number of flits
 684  *
 685  *      Calculates the number of Tx descriptors needed for the supplied number
 686  *      of flits.
 687  */
 688 static inline unsigned int flits_to_desc(unsigned int n)
 689 {
 690         BUG_ON(n >= ARRAY_SIZE(flit_desc_map));
 691         return flit_desc_map[n];
 692 }
 693
 694 /**
 695  *      get_packet - return the next ingress packet buffer from a free list
 696  *      @adap: the adapter that received the packet
 697  *      @fl: the SGE free list holding the packet
 698  *      @len: the packet length including any SGE padding
 699  *      @drop_thres: # of remaining buffers before we start dropping packets
 700  *
 701  *      Get the next packet from a free list and complete setup of the
 702  *      sk_buff.  If the packet is small we make a copy and recycle the
 703  *      original buffer, otherwise we use the original buffer itself.  If a
 704  *      positive drop threshold is supplied packets are dropped and their
 705  *      buffers recycled if (a) the number of remaining buffers is under the
 706  *      threshold and the packet is too big to copy, or (b) the packet should
 707  *      be copied but there is no memory for the copy.
 708  */
 709 static struct sk_buff *get_packet(struct adapter *adap, struct sge_fl *fl,
 710                                   unsigned int len, unsigned int drop_thres)
 711 {
 712         struct sk_buff *skb = NULL;
 713         struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
 714
 715         prefetch(sd->skb->data);
 716         fl->credits--;
 717
 718         if (len <= SGE_RX_COPY_THRES) {
 719                 skb = alloc_skb(len, GFP_ATOMIC);
 720                 if (likely(skb != NULL)) {
 721                         __skb_put(skb, len);
 722                         pci_dma_sync_single_for_cpu(adap->pdev,
 723                                             pci_unmap_addr(sd, dma_addr), len,
 724                                             PCI_DMA_FROMDEVICE);
 725                         memcpy(skb->data, sd->skb->data, len);
 726                         pci_dma_sync_single_for_device(adap->pdev,
 727                                             pci_unmap_addr(sd, dma_addr), len,
 728                                             PCI_DMA_FROMDEVICE);
 729                 } else if (!drop_thres)
 730                         goto use_orig_buf;
 731 recycle:
 732                 recycle_rx_buf(adap, fl, fl->cidx);
 733                 return skb;
 734         }
 735
 736         if (unlikely(fl->credits < drop_thres))
 737                 goto recycle;
 738
 739 use_orig_buf:
 740         pci_unmap_single(adap->pdev, pci_unmap_addr(sd, dma_addr),
 741                          fl->buf_size, PCI_DMA_FROMDEVICE);
 742         skb = sd->skb;
 743         skb_put(skb, len);
 744         __refill_fl(adap, fl);
 745         return skb;
 746 }
 747
 748 /**
 749  *      get_packet_pg - return the next ingress packet buffer from a free list
 750  *      @adap: the adapter that received the packet
 751  *      @fl: the SGE free list holding the packet
 752  *      @len: the packet length including any SGE padding
 753  *      @drop_thres: # of remaining buffers before we start dropping packets
 754  *
 755  *      Get the next packet from a free list populated with page chunks.
 756  *      If the packet is small we make a copy and recycle the original buffer,
 757  *      otherwise we attach the original buffer as a page fragment to a fresh
 758  *      sk_buff.  If a positive drop threshold is supplied packets are dropped
 759  *      and their buffers recycled if (a) the number of remaining buffers is
 760  *      under the threshold and the packet is too big to copy, or (b) there's
 761  *      no system memory.
 762  *
 763  *      Note: this function is similar to @get_packet but deals with Rx buffers
 764  *      that are page chunks rather than sk_buffs.
 765  */
 766 static struct sk_buff *get_packet_pg(struct adapter *adap, struct sge_fl *fl,
 767                                      struct sge_rspq *q, unsigned int len,
 768                                      unsigned int drop_thres)
 769 {
 770         struct sk_buff *newskb, *skb;
 771         struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
 772
 773         newskb = skb = q->pg_skb;
 774
 775         if (!skb && (len <= SGE_RX_COPY_THRES)) {
 776                 newskb = alloc_skb(len, GFP_ATOMIC);
 777                 if (likely(newskb != NULL)) {
 778                         __skb_put(newskb, len);
 779                         pci_dma_sync_single_for_cpu(adap->pdev,
 780                                             pci_unmap_addr(sd, dma_addr), len,
 781                                             PCI_DMA_FROMDEVICE);
 782                         memcpy(newskb->data, sd->pg_chunk.va, len);
 783                         pci_dma_sync_single_for_device(adap->pdev,
 784                                             pci_unmap_addr(sd, dma_addr), len,
 785                                             PCI_DMA_FROMDEVICE);
 786                 } else if (!drop_thres)
 787                         return NULL;
 788 recycle:
 789                 fl->credits--;
 790                 recycle_rx_buf(adap, fl, fl->cidx);
 791                 q->rx_recycle_buf++;
 792                 return newskb;
 793         }
 794
 795         if (unlikely(q->rx_recycle_buf || (!skb && fl->credits <= drop_thres)))
 796                 goto recycle;
 797
 798         if (!skb)
 799                 newskb = alloc_skb(SGE_RX_PULL_LEN, GFP_ATOMIC);
 800         if (unlikely(!newskb)) {
 801                 if (!drop_thres)
 802                         return NULL;
 803                 goto recycle;
 804         }
 805
 806         pci_unmap_single(adap->pdev, pci_unmap_addr(sd, dma_addr),
 807                          fl->buf_size, PCI_DMA_FROMDEVICE);
 808         if (!skb) {
 809                 __skb_put(newskb, SGE_RX_PULL_LEN);
 810                 memcpy(newskb->data, sd->pg_chunk.va, SGE_RX_PULL_LEN);
 811                 skb_fill_page_desc(newskb, 0, sd->pg_chunk.page,
 812                                    sd->pg_chunk.offset + SGE_RX_PULL_LEN,
 813                                    len - SGE_RX_PULL_LEN);
 814                 newskb->len = len;
 815                 newskb->data_len = len - SGE_RX_PULL_LEN;
 816         } else {
 817                 skb_fill_page_desc(newskb, skb_shinfo(newskb)->nr_frags,
 818                                    sd->pg_chunk.page,
 819                                    sd->pg_chunk.offset, len);
 820                 newskb->len += len;
 821                 newskb->data_len += len;
 822         }
 823         newskb->truesize += newskb->data_len;
 824
 825         fl->credits--;
 826         /*
 827          * We do not refill FLs here, we let the caller do it to overlap a
 828          * prefetch.
 829          */
 830         return newskb;
 831 }
 832
 833 /**
 834  *      get_imm_packet - return the next ingress packet buffer from a response
 835  *      @resp: the response descriptor containing the packet data
 836  *
 837  *      Return a packet containing the immediate data of the given response.
 838  */
 839 static inline struct sk_buff *get_imm_packet(const struct rsp_desc *resp)
 840 {
 841         struct sk_buff *skb = alloc_skb(IMMED_PKT_SIZE, GFP_ATOMIC);
 842
 843         if (skb) {
 844                 __skb_put(skb, IMMED_PKT_SIZE);
 845                 skb_copy_to_linear_data(skb, resp->imm_data, IMMED_PKT_SIZE);
 846         }
 847         return skb;
 848 }
 849
 850 /**
 851  *      calc_tx_descs - calculate the number of Tx descriptors for a packet
 852  *      @skb: the packet
 853  *
 854  *      Returns the number of Tx descriptors needed for the given Ethernet
 855  *      packet.  Ethernet packets require addition of WR and CPL headers.
 856  */
 857 static inline unsigned int calc_tx_descs(const struct sk_buff *skb)
 858 {
 859         unsigned int flits;
 860
 861         if (skb->len <= WR_LEN - sizeof(struct cpl_tx_pkt))
 862                 return 1;
 863
 864         flits = sgl_len(skb_shinfo(skb)->nr_frags + 1) + 2;
 865         if (skb_shinfo(skb)->gso_size)
 866                 flits++;
 867         return flits_to_desc(flits);
 868 }
 869
 870 /**
 871  *      make_sgl - populate a scatter/gather list for a packet
 872  *      @skb: the packet
 873  *      @sgp: the SGL to populate
 874  *      @start: start address of skb main body data to include in the SGL
 875  *      @len: length of skb main body data to include in the SGL
 876  *      @pdev: the PCI device
 877  *
 878  *      Generates a scatter/gather list for the buffers that make up a packet
 879  *      and returns the SGL size in 8-byte words.  The caller must size the SGL
 880  *      appropriately.
 881  */
 882 static inline unsigned int make_sgl(const struct sk_buff *skb,
 883                                     struct sg_ent *sgp, unsigned char *start,
 884                                     unsigned int len, struct pci_dev *pdev)
 885 {
 886         dma_addr_t mapping;
 887         unsigned int i, j = 0, nfrags;
 888
 889         if (len) {
 890                 mapping = pci_map_single(pdev, start, len, PCI_DMA_TODEVICE);
 891                 sgp->len[0] = cpu_to_be32(len);
 892                 sgp->addr[0] = cpu_to_be64(mapping);
 893                 j = 1;
 894         }
 895
 896         nfrags = skb_shinfo(skb)->nr_frags;
 897         for (i = 0; i < nfrags; i++) {
 898                 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 899
 900                 mapping = pci_map_page(pdev, frag->page, frag->page_offset,
 901                                        frag->size, PCI_DMA_TODEVICE);
 902                 sgp->len[j] = cpu_to_be32(frag->size);
 903                 sgp->addr[j] = cpu_to_be64(mapping);
 904                 j ^= 1;
 905                 if (j == 0)
 906                         ++sgp;
 907         }
 908         if (j)
 909                 sgp->len[j] = 0;
 910         return ((nfrags + (len != 0)) * 3) / 2 + j;
 911 }
 912
 913 /**
 914  *      check_ring_tx_db - check and potentially ring a Tx queue's doorbell
 915  *      @adap: the adapter
 916  *      @q: the Tx queue
 917  *
 918  *      Ring the doorbel if a Tx queue is asleep.  There is a natural race,
 919  *      where the HW is going to sleep just after we checked, however,
 920  *      then the interrupt handler will detect the outstanding TX packet
 921  *      and ring the doorbell for us.
 922  *
 923  *      When GTS is disabled we unconditionally ring the doorbell.
 924  */
 925 static inline void check_ring_tx_db(struct adapter *adap, struct sge_txq *q)
 926 {
 927 #if USE_GTS
 928         clear_bit(TXQ_LAST_PKT_DB, &q->flags);
 929         if (test_and_set_bit(TXQ_RUNNING, &q->flags) == 0) {
 930                 set_bit(TXQ_LAST_PKT_DB, &q->flags);
 931                 t3_write_reg(adap, A_SG_KDOORBELL,
 932                              F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 933         }
 934 #else
 935         wmb();                  /* write descriptors before telling HW */
 936         t3_write_reg(adap, A_SG_KDOORBELL,
 937                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
 938 #endif
 939 }
 940
 941 static inline void wr_gen2(struct tx_desc *d, unsigned int gen)
 942 {
 943 #if SGE_NUM_GENBITS == 2
 944         d->flit[TX_DESC_FLITS - 1] = cpu_to_be64(gen);
 945 #endif
 946 }
 947
 948 /**
 949  *      write_wr_hdr_sgl - write a WR header and, optionally, SGL
 950  *      @ndesc: number of Tx descriptors spanned by the SGL
 951  *      @skb: the packet corresponding to the WR
 952  *      @d: first Tx descriptor to be written
 953  *      @pidx: index of above descriptors
 954  *      @q: the SGE Tx queue
 955  *      @sgl: the SGL
 956  *      @flits: number of flits to the start of the SGL in the first descriptor
 957  *      @sgl_flits: the SGL size in flits
 958  *      @gen: the Tx descriptor generation
 959  *      @wr_hi: top 32 bits of WR header based on WR type (big endian)
 960  *      @wr_lo: low 32 bits of WR header based on WR type (big endian)
 961  *
 962  *      Write a work request header and an associated SGL.  If the SGL is
 963  *      small enough to fit into one Tx descriptor it has already been written
 964  *      and we just need to write the WR header.  Otherwise we distribute the
 965  *      SGL across the number of descriptors it spans.
 966  */
 967 static void write_wr_hdr_sgl(unsigned int ndesc, struct sk_buff *skb,
 968                              struct tx_desc *d, unsigned int pidx,
 969                              const struct sge_txq *q,
 970                              const struct sg_ent *sgl,
 971                              unsigned int flits, unsigned int sgl_flits,
 972                              unsigned int gen, __be32 wr_hi,
 973                              __be32 wr_lo)
 974 {
 975         struct work_request_hdr *wrp = (struct work_request_hdr *)d;
 976         struct tx_sw_desc *sd = &q->sdesc[pidx];
 977
 978         sd->skb = skb;
 979         if (need_skb_unmap()) {
 980                 sd->fragidx = 0;
 981                 sd->addr_idx = 0;
 982                 sd->sflit = flits;
 983         }
 984
 985         if (likely(ndesc == 1)) {
 986                 sd->eop = 1;
 987                 wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
 988                                    V_WR_SGLSFLT(flits)) | wr_hi;
 989                 wmb();
 990                 wrp->wr_lo = htonl(V_WR_LEN(flits + sgl_flits) |
 991                                    V_WR_GEN(gen)) | wr_lo;
 992                 wr_gen2(d, gen);
 993         } else {
 994                 unsigned int ogen = gen;
 995                 const u64 *fp = (const u64 *)sgl;
 996                 struct work_request_hdr *wp = wrp;
 997
 998                 wrp->wr_hi = htonl(F_WR_SOP | V_WR_DATATYPE(1) |
 999                                    V_WR_SGLSFLT(flits)) | wr_hi;
1000
1001                 while (sgl_flits) {
1002                         unsigned int avail = WR_FLITS - flits;
1003
1004                         if (avail > sgl_flits)
1005                                 avail = sgl_flits;
1006                         memcpy(&d->flit[flits], fp, avail * sizeof(*fp));
1007                         sgl_flits -= avail;
1008                         ndesc--;
1009                         if (!sgl_flits)
1010                                 break;
1011
1012                         fp += avail;
1013                         d++;
1014                         sd->eop = 0;
1015                         sd++;
1016                         if (++pidx == q->size) {
1017                                 pidx = 0;
1018                                 gen ^= 1;
1019                                 d = q->desc;
1020                                 sd = q->sdesc;
1021                         }
1022
1023                         sd->skb = skb;
1024                         wrp = (struct work_request_hdr *)d;
1025                         wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
1026                                            V_WR_SGLSFLT(1)) | wr_hi;
1027                         wrp->wr_lo = htonl(V_WR_LEN(min(WR_FLITS,
1028                                                         sgl_flits + 1)) |
1029                                            V_WR_GEN(gen)) | wr_lo;
1030                         wr_gen2(d, gen);
1031                         flits = 1;
1032                 }
1033                 sd->eop = 1;
1034                 wrp->wr_hi |= htonl(F_WR_EOP);
1035                 wmb();
1036                 wp->wr_lo = htonl(V_WR_LEN(WR_FLITS) | V_WR_GEN(ogen)) | wr_lo;
1037                 wr_gen2((struct tx_desc *)wp, ogen);
1038                 WARN_ON(ndesc != 0);
1039         }
1040 }
1041
1042 /**
1043  *      write_tx_pkt_wr - write a TX_PKT work request
1044  *      @adap: the adapter
1045  *      @skb: the packet to send
1046  *      @pi: the egress interface
1047  *      @pidx: index of the first Tx descriptor to write
1048  *      @gen: the generation value to use
1049  *      @q: the Tx queue
1050  *      @ndesc: number of descriptors the packet will occupy
1051  *      @compl: the value of the COMPL bit to use
1052  *
1053  *      Generate a TX_PKT work request to send the supplied packet.
1054  */
1055 static void write_tx_pkt_wr(struct adapter *adap, struct sk_buff *skb,
1056                             const struct port_info *pi,
1057                             unsigned int pidx, unsigned int gen,
1058                             struct sge_txq *q, unsigned int ndesc,
1059                             unsigned int compl)
1060 {
1061         unsigned int flits, sgl_flits, cntrl, tso_info;
1062         struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1063         struct tx_desc *d = &q->desc[pidx];
1064         struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)d;
1065
1066         cpl->len = htonl(skb->len | 0x80000000);
1067         cntrl = V_TXPKT_INTF(pi->port_id);
1068
1069         if (vlan_tx_tag_present(skb) && pi->vlan_grp)
1070                 cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(vlan_tx_tag_get(skb));
1071
1072         tso_info = V_LSO_MSS(skb_shinfo(skb)->gso_size);
1073         if (tso_info) {
1074                 int eth_type;
1075                 struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)cpl;
1076
1077                 d->flit[2] = 0;
1078                 cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
1079                 hdr->cntrl = htonl(cntrl);
1080                 eth_type = skb_network_offset(skb) == ETH_HLEN ?
1081                     CPL_ETH_II : CPL_ETH_II_VLAN;
1082                 tso_info |= V_LSO_ETH_TYPE(eth_type) |
1083                     V_LSO_IPHDR_WORDS(ip_hdr(skb)->ihl) |
1084                     V_LSO_TCPHDR_WORDS(tcp_hdr(skb)->doff);
1085                 hdr->lso_info = htonl(tso_info);
1086                 flits = 3;
1087         } else {
1088                 cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
1089                 cntrl |= F_TXPKT_IPCSUM_DIS;    /* SW calculates IP csum */
1090                 cntrl |= V_TXPKT_L4CSUM_DIS(skb->ip_summed != CHECKSUM_PARTIAL);
1091                 cpl->cntrl = htonl(cntrl);
1092
1093                 if (skb->len <= WR_LEN - sizeof(*cpl)) {
1094                         q->sdesc[pidx].skb = NULL;
1095                         if (!skb->data_len)
1096                                 skb_copy_from_linear_data(skb, &d->flit[2],
1097                                                           skb->len);
1098                         else
1099                                 skb_copy_bits(skb, 0, &d->flit[2], skb->len);
1100
1101                         flits = (skb->len + 7) / 8 + 2;
1102                         cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(skb->len & 7) |
1103                                               V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT)
1104                                               | F_WR_SOP | F_WR_EOP | compl);
1105                         wmb();
1106                         cpl->wr.wr_lo = htonl(V_WR_LEN(flits) | V_WR_GEN(gen) |
1107                                               V_WR_TID(q->token));
1108                         wr_gen2(d, gen);
1109                         kfree_skb(skb);
1110                         return;
1111                 }
1112
1113                 flits = 2;
1114         }
1115
1116         sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1117         sgl_flits = make_sgl(skb, sgp, skb->data, skb_headlen(skb), adap->pdev);
1118
1119         write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits, gen,
1120                          htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | compl),
1121                          htonl(V_WR_TID(q->token)));
1122 }
1123
1124 static inline void t3_stop_tx_queue(struct netdev_queue *txq,
1125                                     struct sge_qset *qs, struct sge_txq *q)
1126 {
1127         netif_tx_stop_queue(txq);
1128         set_bit(TXQ_ETH, &qs->txq_stopped);
1129         q->stops++;
1130 }
1131
1132 /**
1133  *      eth_xmit - add a packet to the Ethernet Tx queue
1134  *      @skb: the packet
1135  *      @dev: the egress net device
1136  *
1137  *      Add a packet to an SGE Tx queue.  Runs with softirqs disabled.
1138  */
1139 int t3_eth_xmit(struct sk_buff *skb, struct net_device *dev)
1140 {
1141         int qidx;
1142         unsigned int ndesc, pidx, credits, gen, compl;
1143         const struct port_info *pi = netdev_priv(dev);
1144         struct adapter *adap = pi->adapter;
1145         struct netdev_queue *txq;
1146         struct sge_qset *qs;
1147         struct sge_txq *q;
1148
1149         /*
1150          * The chip min packet length is 9 octets but play safe and reject
1151          * anything shorter than an Ethernet header.
1152          */
1153         if (unlikely(skb->len < ETH_HLEN)) {
1154                 dev_kfree_skb(skb);
1155                 return NETDEV_TX_OK;
1156         }
1157
1158         qidx = skb_get_queue_mapping(skb);
1159         qs = &pi->qs[qidx];
1160         q = &qs->txq[TXQ_ETH];
1161         txq = netdev_get_tx_queue(dev, qidx);
1162
1163         spin_lock(&q->lock);
1164         reclaim_completed_tx(adap, q);
1165
1166         credits = q->size - q->in_use;
1167         ndesc = calc_tx_descs(skb);
1168
1169         if (unlikely(credits < ndesc)) {
1170                 t3_stop_tx_queue(txq, qs, q);
1171                 dev_err(&adap->pdev->dev,
1172                         "%s: Tx ring %u full while queue awake!\n",
1173                         dev->name, q->cntxt_id & 7);
1174                 spin_unlock(&q->lock);
1175                 return NETDEV_TX_BUSY;
1176         }
1177
1178         q->in_use += ndesc;
1179         if (unlikely(credits - ndesc < q->stop_thres)) {
1180                 t3_stop_tx_queue(txq, qs, q);
1181
1182                 if (should_restart_tx(q) &&
1183                     test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1184                         q->restarts++;
1185                         netif_tx_wake_queue(txq);
1186                 }
1187         }
1188
1189         gen = q->gen;
1190         q->unacked += ndesc;
1191         compl = (q->unacked & 8) << (S_WR_COMPL - 3);
1192         q->unacked &= 7;
1193         pidx = q->pidx;
1194         q->pidx += ndesc;
1195         if (q->pidx >= q->size) {
1196                 q->pidx -= q->size;
1197                 q->gen ^= 1;
1198         }
1199
1200         /* update port statistics */
1201         if (skb->ip_summed == CHECKSUM_COMPLETE)
1202                 qs->port_stats[SGE_PSTAT_TX_CSUM]++;
1203         if (skb_shinfo(skb)->gso_size)
1204                 qs->port_stats[SGE_PSTAT_TSO]++;
1205         if (vlan_tx_tag_present(skb) && pi->vlan_grp)
1206                 qs->port_stats[SGE_PSTAT_VLANINS]++;
1207
1208         dev->trans_start = jiffies;
1209         spin_unlock(&q->lock);
1210
1211         /*
1212          * We do not use Tx completion interrupts to free DMAd Tx packets.
1213          * This is good for performamce but means that we rely on new Tx
1214          * packets arriving to run the destructors of completed packets,
1215          * which open up space in their sockets' send queues.  Sometimes
1216          * we do not get such new packets causing Tx to stall.  A single
1217          * UDP transmitter is a good example of this situation.  We have
1218          * a clean up timer that periodically reclaims completed packets
1219          * but it doesn't run often enough (nor do we want it to) to prevent
1220          * lengthy stalls.  A solution to this problem is to run the
1221          * destructor early, after the packet is queued but before it's DMAd.
1222          * A cons is that we lie to socket memory accounting, but the amount
1223          * of extra memory is reasonable (limited by the number of Tx
1224          * descriptors), the packets do actually get freed quickly by new
1225          * packets almost always, and for protocols like TCP that wait for
1226          * acks to really free up the data the extra memory is even less.
1227          * On the positive side we run the destructors on the sending CPU
1228          * rather than on a potentially different completing CPU, usually a
1229          * good thing.  We also run them without holding our Tx queue lock,
1230          * unlike what reclaim_completed_tx() would otherwise do.
1231          *
1232          * Run the destructor before telling the DMA engine about the packet
1233          * to make sure it doesn't complete and get freed prematurely.
1234          */
1235         if (likely(!skb_shared(skb)))
1236                 skb_orphan(skb);
1237
1238         write_tx_pkt_wr(adap, skb, pi, pidx, gen, q, ndesc, compl);
1239         check_ring_tx_db(adap, q);
1240         return NETDEV_TX_OK;
1241 }
1242
1243 /**
1244  *      write_imm - write a packet into a Tx descriptor as immediate data
1245  *      @d: the Tx descriptor to write
1246  *      @skb: the packet
1247  *      @len: the length of packet data to write as immediate data
1248  *      @gen: the generation bit value to write
1249  *
1250  *      Writes a packet as immediate data into a Tx descriptor.  The packet
1251  *      contains a work request at its beginning.  We must write the packet
1252  *      carefully so the SGE doesn't read it accidentally before it's written
1253  *      in its entirety.
1254  */
1255 static inline void write_imm(struct tx_desc *d, struct sk_buff *skb,
1256                              unsigned int len, unsigned int gen)
1257 {
1258         struct work_request_hdr *from = (struct work_request_hdr *)skb->data;
1259         struct work_request_hdr *to = (struct work_request_hdr *)d;
1260
1261         if (likely(!skb->data_len))
1262                 memcpy(&to[1], &from[1], len - sizeof(*from));
1263         else
1264                 skb_copy_bits(skb, sizeof(*from), &to[1], len - sizeof(*from));
1265
1266         to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
1267                                         V_WR_BCNTLFLT(len & 7));
1268         wmb();
1269         to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
1270                                         V_WR_LEN((len + 7) / 8));
1271         wr_gen2(d, gen);
1272         kfree_skb(skb);
1273 }
1274
1275 /**
1276  *      check_desc_avail - check descriptor availability on a send queue
1277  *      @adap: the adapter
1278  *      @q: the send queue
1279  *      @skb: the packet needing the descriptors
1280  *      @ndesc: the number of Tx descriptors needed
1281  *      @qid: the Tx queue number in its queue set (TXQ_OFLD or TXQ_CTRL)
1282  *
1283  *      Checks if the requested number of Tx descriptors is available on an
1284  *      SGE send queue.  If the queue is already suspended or not enough
1285  *      descriptors are available the packet is queued for later transmission.
1286  *      Must be called with the Tx queue locked.
1287  *
1288  *      Returns 0 if enough descriptors are available, 1 if there aren't
1289  *      enough descriptors and the packet has been queued, and 2 if the caller
1290  *      needs to retry because there weren't enough descriptors at the
1291  *      beginning of the call but some freed up in the mean time.
1292  */
1293 static inline int check_desc_avail(struct adapter *adap, struct sge_txq *q,
1294                                    struct sk_buff *skb, unsigned int ndesc,
1295                                    unsigned int qid)
1296 {
1297         if (unlikely(!skb_queue_empty(&q->sendq))) {
1298               addq_exit:__skb_queue_tail(&q->sendq, skb);
1299                 return 1;
1300         }
1301         if (unlikely(q->size - q->in_use < ndesc)) {
1302                 struct sge_qset *qs = txq_to_qset(q, qid);
1303
1304                 set_bit(qid, &qs->txq_stopped);
1305                 smp_mb__after_clear_bit();
1306
1307                 if (should_restart_tx(q) &&
1308                     test_and_clear_bit(qid, &qs->txq_stopped))
1309                         return 2;
1310
1311                 q->stops++;
1312                 goto addq_exit;
1313         }
1314         return 0;
1315 }
1316
1317 /**
1318  *      reclaim_completed_tx_imm - reclaim completed control-queue Tx descs
1319  *      @q: the SGE control Tx queue
1320  *
1321  *      This is a variant of reclaim_completed_tx() that is used for Tx queues
1322  *      that send only immediate data (presently just the control queues) and
1323  *      thus do not have any sk_buffs to release.
1324  */
1325 static inline void reclaim_completed_tx_imm(struct sge_txq *q)
1326 {
1327         unsigned int reclaim = q->processed - q->cleaned;
1328
1329         q->in_use -= reclaim;
1330         q->cleaned += reclaim;
1331 }
1332
1333 static inline int immediate(const struct sk_buff *skb)
1334 {
1335         return skb->len <= WR_LEN;
1336 }
1337
1338 /**
1339  *      ctrl_xmit - send a packet through an SGE control Tx queue
1340  *      @adap: the adapter
1341  *      @q: the control queue
1342  *      @skb: the packet
1343  *
1344  *      Send a packet through an SGE control Tx queue.  Packets sent through
1345  *      a control queue must fit entirely as immediate data in a single Tx
1346  *      descriptor and have no page fragments.
1347  */
1348 static int ctrl_xmit(struct adapter *adap, struct sge_txq *q,
1349                      struct sk_buff *skb)
1350 {
1351         int ret;
1352         struct work_request_hdr *wrp = (struct work_request_hdr *)skb->data;
1353
1354         if (unlikely(!immediate(skb))) {
1355                 WARN_ON(1);
1356                 dev_kfree_skb(skb);
1357                 return NET_XMIT_SUCCESS;
1358         }
1359
1360         wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
1361         wrp->wr_lo = htonl(V_WR_TID(q->token));
1362
1363         spin_lock(&q->lock);
1364       again:reclaim_completed_tx_imm(q);
1365
1366         ret = check_desc_avail(adap, q, skb, 1, TXQ_CTRL);
1367         if (unlikely(ret)) {
1368                 if (ret == 1) {
1369                         spin_unlock(&q->lock);
1370                         return NET_XMIT_CN;
1371                 }
1372                 goto again;
1373         }
1374
1375         write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1376
1377         q->in_use++;
1378         if (++q->pidx >= q->size) {
1379                 q->pidx = 0;
1380                 q->gen ^= 1;
1381         }
1382         spin_unlock(&q->lock);
1383         wmb();
1384         t3_write_reg(adap, A_SG_KDOORBELL,
1385                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1386         return NET_XMIT_SUCCESS;
1387 }
1388
1389 /**
1390  *      restart_ctrlq - restart a suspended control queue
1391  *      @qs: the queue set cotaining the control queue
1392  *
1393  *      Resumes transmission on a suspended Tx control queue.
1394  */
1395 static void restart_ctrlq(unsigned long data)
1396 {
1397         struct sk_buff *skb;
1398         struct sge_qset *qs = (struct sge_qset *)data;
1399         struct sge_txq *q = &qs->txq[TXQ_CTRL];
1400
1401         spin_lock(&q->lock);
1402       again:reclaim_completed_tx_imm(q);
1403
1404         while (q->in_use < q->size &&
1405                (skb = __skb_dequeue(&q->sendq)) != NULL) {
1406
1407                 write_imm(&q->desc[q->pidx], skb, skb->len, q->gen);
1408
1409                 if (++q->pidx >= q->size) {
1410                         q->pidx = 0;
1411                         q->gen ^= 1;
1412                 }
1413                 q->in_use++;
1414         }
1415
1416         if (!skb_queue_empty(&q->sendq)) {
1417                 set_bit(TXQ_CTRL, &qs->txq_stopped);
1418                 smp_mb__after_clear_bit();
1419
1420                 if (should_restart_tx(q) &&
1421                     test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped))
1422                         goto again;
1423                 q->stops++;
1424         }
1425
1426         spin_unlock(&q->lock);
1427         wmb();
1428         t3_write_reg(qs->adap, A_SG_KDOORBELL,
1429                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1430 }
1431
1432 /*
1433  * Send a management message through control queue 0
1434  */
1435 int t3_mgmt_tx(struct adapter *adap, struct sk_buff *skb)
1436 {
1437         int ret;
1438         local_bh_disable();
1439         ret = ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], skb);
1440         local_bh_enable();
1441
1442         return ret;
1443 }
1444
1445 /**
1446  *      deferred_unmap_destructor - unmap a packet when it is freed
1447  *      @skb: the packet
1448  *
1449  *      This is the packet destructor used for Tx packets that need to remain
1450  *      mapped until they are freed rather than until their Tx descriptors are
1451  *      freed.
1452  */
1453 static void deferred_unmap_destructor(struct sk_buff *skb)
1454 {
1455         int i;
1456         const dma_addr_t *p;
1457         const struct skb_shared_info *si;
1458         const struct deferred_unmap_info *dui;
1459
1460         dui = (struct deferred_unmap_info *)skb->head;
1461         p = dui->addr;
1462
1463         if (skb->tail - skb->transport_header)
1464                 pci_unmap_single(dui->pdev, *p++,
1465                                  skb->tail - skb->transport_header,
1466                                  PCI_DMA_TODEVICE);
1467
1468         si = skb_shinfo(skb);
1469         for (i = 0; i < si->nr_frags; i++)
1470                 pci_unmap_page(dui->pdev, *p++, si->frags[i].size,
1471                                PCI_DMA_TODEVICE);
1472 }
1473
1474 static void setup_deferred_unmapping(struct sk_buff *skb, struct pci_dev *pdev,
1475                                      const struct sg_ent *sgl, int sgl_flits)
1476 {
1477         dma_addr_t *p;
1478         struct deferred_unmap_info *dui;
1479
1480         dui = (struct deferred_unmap_info *)skb->head;
1481         dui->pdev = pdev;
1482         for (p = dui->addr; sgl_flits >= 3; sgl++, sgl_flits -= 3) {
1483                 *p++ = be64_to_cpu(sgl->addr[0]);
1484                 *p++ = be64_to_cpu(sgl->addr[1]);
1485         }
1486         if (sgl_flits)
1487                 *p = be64_to_cpu(sgl->addr[0]);
1488 }
1489
1490 /**
1491  *      write_ofld_wr - write an offload work request
1492  *      @adap: the adapter
1493  *      @skb: the packet to send
1494  *      @q: the Tx queue
1495  *      @pidx: index of the first Tx descriptor to write
1496  *      @gen: the generation value to use
1497  *      @ndesc: number of descriptors the packet will occupy
1498  *
1499  *      Write an offload work request to send the supplied packet.  The packet
1500  *      data already carry the work request with most fields populated.
1501  */
1502 static void write_ofld_wr(struct adapter *adap, struct sk_buff *skb,
1503                           struct sge_txq *q, unsigned int pidx,
1504                           unsigned int gen, unsigned int ndesc)
1505 {
1506         unsigned int sgl_flits, flits;
1507         struct work_request_hdr *from;
1508         struct sg_ent *sgp, sgl[MAX_SKB_FRAGS / 2 + 1];
1509         struct tx_desc *d = &q->desc[pidx];
1510
1511         if (immediate(skb)) {
1512                 q->sdesc[pidx].skb = NULL;
1513                 write_imm(d, skb, skb->len, gen);
1514                 return;
1515         }
1516
1517         /* Only TX_DATA builds SGLs */
1518
1519         from = (struct work_request_hdr *)skb->data;
1520         memcpy(&d->flit[1], &from[1],
1521                skb_transport_offset(skb) - sizeof(*from));
1522
1523         flits = skb_transport_offset(skb) / 8;
1524         sgp = ndesc == 1 ? (struct sg_ent *)&d->flit[flits] : sgl;
1525         sgl_flits = make_sgl(skb, sgp, skb_transport_header(skb),
1526                              skb->tail - skb->transport_header,
1527                              adap->pdev);
1528         if (need_skb_unmap()) {
1529                 setup_deferred_unmapping(skb, adap->pdev, sgp, sgl_flits);
1530                 skb->destructor = deferred_unmap_destructor;
1531         }
1532
1533         write_wr_hdr_sgl(ndesc, skb, d, pidx, q, sgl, flits, sgl_flits,
1534                          gen, from->wr_hi, from->wr_lo);
1535 }
1536
1537 /**
1538  *      calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
1539  *      @skb: the packet
1540  *
1541  *      Returns the number of Tx descriptors needed for the given offload
1542  *      packet.  These packets are already fully constructed.
1543  */
1544 static inline unsigned int calc_tx_descs_ofld(const struct sk_buff *skb)
1545 {
1546         unsigned int flits, cnt;
1547
1548         if (skb->len <= WR_LEN)
1549                 return 1;       /* packet fits as immediate data */
1550
1551         flits = skb_transport_offset(skb) / 8;  /* headers */
1552         cnt = skb_shinfo(skb)->nr_frags;
1553         if (skb->tail != skb->transport_header)
1554                 cnt++;
1555         return flits_to_desc(flits + sgl_len(cnt));
1556 }
1557
1558 /**
1559  *      ofld_xmit - send a packet through an offload queue
1560  *      @adap: the adapter
1561  *      @q: the Tx offload queue
1562  *      @skb: the packet
1563  *
1564  *      Send an offload packet through an SGE offload queue.
1565  */
1566 static int ofld_xmit(struct adapter *adap, struct sge_txq *q,
1567                      struct sk_buff *skb)
1568 {
1569         int ret;
1570         unsigned int ndesc = calc_tx_descs_ofld(skb), pidx, gen;
1571
1572         spin_lock(&q->lock);
1573       again:reclaim_completed_tx(adap, q);
1574
1575         ret = check_desc_avail(adap, q, skb, ndesc, TXQ_OFLD);
1576         if (unlikely(ret)) {
1577                 if (ret == 1) {
1578                         skb->priority = ndesc;  /* save for restart */
1579                         spin_unlock(&q->lock);
1580                         return NET_XMIT_CN;
1581                 }
1582                 goto again;
1583         }
1584
1585         gen = q->gen;
1586         q->in_use += ndesc;
1587         pidx = q->pidx;
1588         q->pidx += ndesc;
1589         if (q->pidx >= q->size) {
1590                 q->pidx -= q->size;
1591                 q->gen ^= 1;
1592         }
1593         spin_unlock(&q->lock);
1594
1595         write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
1596         check_ring_tx_db(adap, q);
1597         return NET_XMIT_SUCCESS;
1598 }
1599
1600 /**
1601  *      restart_offloadq - restart a suspended offload queue
1602  *      @qs: the queue set cotaining the offload queue
1603  *
1604  *      Resumes transmission on a suspended Tx offload queue.
1605  */
1606 static void restart_offloadq(unsigned long data)
1607 {
1608         struct sk_buff *skb;
1609         struct sge_qset *qs = (struct sge_qset *)data;
1610         struct sge_txq *q = &qs->txq[TXQ_OFLD];
1611         const struct port_info *pi = netdev_priv(qs->netdev);
1612         struct adapter *adap = pi->adapter;
1613
1614         spin_lock(&q->lock);
1615       again:reclaim_completed_tx(adap, q);
1616
1617         while ((skb = skb_peek(&q->sendq)) != NULL) {
1618                 unsigned int gen, pidx;
1619                 unsigned int ndesc = skb->priority;
1620
1621                 if (unlikely(q->size - q->in_use < ndesc)) {
1622                         set_bit(TXQ_OFLD, &qs->txq_stopped);
1623                         smp_mb__after_clear_bit();
1624
1625                         if (should_restart_tx(q) &&
1626                             test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped))
1627                                 goto again;
1628                         q->stops++;
1629                         break;
1630                 }
1631
1632                 gen = q->gen;
1633                 q->in_use += ndesc;
1634                 pidx = q->pidx;
1635                 q->pidx += ndesc;
1636                 if (q->pidx >= q->size) {
1637                         q->pidx -= q->size;
1638                         q->gen ^= 1;
1639                 }
1640                 __skb_unlink(skb, &q->sendq);
1641                 spin_unlock(&q->lock);
1642
1643                 write_ofld_wr(adap, skb, q, pidx, gen, ndesc);
1644                 spin_lock(&q->lock);
1645         }
1646         spin_unlock(&q->lock);
1647
1648 #if USE_GTS
1649         set_bit(TXQ_RUNNING, &q->flags);
1650         set_bit(TXQ_LAST_PKT_DB, &q->flags);
1651 #endif
1652         wmb();
1653         t3_write_reg(adap, A_SG_KDOORBELL,
1654                      F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
1655 }
1656
1657 /**
1658  *      queue_set - return the queue set a packet should use
1659  *      @skb: the packet
1660  *
1661  *      Maps a packet to the SGE queue set it should use.  The desired queue
1662  *      set is carried in bits 1-3 in the packet's priority.
1663  */
1664 static inline int queue_set(const struct sk_buff *skb)
1665 {
1666         return skb->priority >> 1;
1667 }
1668
1669 /**
1670  *      is_ctrl_pkt - return whether an offload packet is a control packet
1671  *      @skb: the packet
1672  *
1673  *      Determines whether an offload packet should use an OFLD or a CTRL
1674  *      Tx queue.  This is indicated by bit 0 in the packet's priority.
1675  */
1676 static inline int is_ctrl_pkt(const struct sk_buff *skb)
1677 {
1678         return skb->priority & 1;
1679 }
1680
1681 /**
1682  *      t3_offload_tx - send an offload packet
1683  *      @tdev: the offload device to send to
1684  *      @skb: the packet
1685  *
1686  *      Sends an offload packet.  We use the packet priority to select the
1687  *      appropriate Tx queue as follows: bit 0 indicates whether the packet
1688  *      should be sent as regular or control, bits 1-3 select the queue set.
1689  */
1690 int t3_offload_tx(struct t3cdev *tdev, struct sk_buff *skb)
1691 {
1692         struct adapter *adap = tdev2adap(tdev);
1693         struct sge_qset *qs = &adap->sge.qs[queue_set(skb)];
1694
1695         if (unlikely(is_ctrl_pkt(skb)))
1696                 return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], skb);
1697
1698         return ofld_xmit(adap, &qs->txq[TXQ_OFLD], skb);
1699 }
1700
1701 /**
1702  *      offload_enqueue - add an offload packet to an SGE offload receive queue
1703  *      @q: the SGE response queue
1704  *      @skb: the packet
1705  *
1706  *      Add a new offload packet to an SGE response queue's offload packet
1707  *      queue.  If the packet is the first on the queue it schedules the RX
1708  *      softirq to process the queue.
1709  */
1710 static inline void offload_enqueue(struct sge_rspq *q, struct sk_buff *skb)
1711 {
1712         int was_empty = skb_queue_empty(&q->rx_queue);
1713
1714         __skb_queue_tail(&q->rx_queue, skb);
1715
1716         if (was_empty) {
1717                 struct sge_qset *qs = rspq_to_qset(q);
1718
1719                 napi_schedule(&qs->napi);
1720         }
1721 }
1722
1723 /**
1724  *      deliver_partial_bundle - deliver a (partial) bundle of Rx offload pkts
1725  *      @tdev: the offload device that will be receiving the packets
1726  *      @q: the SGE response queue that assembled the bundle
1727  *      @skbs: the partial bundle
1728  *      @n: the number of packets in the bundle
1729  *
1730  *      Delivers a (partial) bundle of Rx offload packets to an offload device.
1731  */
1732 static inline void deliver_partial_bundle(struct t3cdev *tdev,
1733                                           struct sge_rspq *q,
1734                                           struct sk_buff *skbs[], int n)
1735 {
1736         if (n) {
1737                 q->offload_bundles++;
1738                 tdev->recv(tdev, skbs, n);
1739         }
1740 }
1741
1742 /**
1743  *      ofld_poll - NAPI handler for offload packets in interrupt mode
1744  *      @dev: the network device doing the polling
1745  *      @budget: polling budget
1746  *
1747  *      The NAPI handler for offload packets when a response queue is serviced
1748  *      by the hard interrupt handler, i.e., when it's operating in non-polling
1749  *      mode.  Creates small packet batches and sends them through the offload
1750  *      receive handler.  Batches need to be of modest size as we do prefetches
1751  *      on the packets in each.
1752  */
1753 static int ofld_poll(struct napi_struct *napi, int budget)
1754 {
1755         struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
1756         struct sge_rspq *q = &qs->rspq;
1757         struct adapter *adapter = qs->adap;
1758         int work_done = 0;
1759
1760         while (work_done < budget) {
1761                 struct sk_buff *skb, *tmp, *skbs[RX_BUNDLE_SIZE];
1762                 struct sk_buff_head queue;
1763                 int ngathered;
1764
1765                 spin_lock_irq(&q->lock);
1766                 __skb_queue_head_init(&queue);
1767                 skb_queue_splice_init(&q->rx_queue, &queue);
1768                 if (skb_queue_empty(&queue)) {
1769                         napi_complete(napi);
1770                         spin_unlock_irq(&q->lock);
1771                         return work_done;
1772                 }
1773                 spin_unlock_irq(&q->lock);
1774
1775                 ngathered = 0;
1776                 skb_queue_walk_safe(&queue, skb, tmp) {
1777                         if (work_done >= budget)
1778                                 break;
1779                         work_done++;
1780
1781                         __skb_unlink(skb, &queue);
1782                         prefetch(skb->data);
1783                         skbs[ngathered] = skb;
1784                         if (++ngathered == RX_BUNDLE_SIZE) {
1785                                 q->offload_bundles++;
1786                                 adapter->tdev.recv(&adapter->tdev, skbs,
1787                                                    ngathered);
1788                                 ngathered = 0;
1789                         }
1790                 }
1791                 if (!skb_queue_empty(&queue)) {
1792                         /* splice remaining packets back onto Rx queue */
1793                         spin_lock_irq(&q->lock);
1794                         skb_queue_splice(&queue, &q->rx_queue);
1795                         spin_unlock_irq(&q->lock);
1796                 }
1797                 deliver_partial_bundle(&adapter->tdev, q, skbs, ngathered);
1798         }
1799
1800         return work_done;
1801 }
1802
1803 /**
1804  *      rx_offload - process a received offload packet
1805  *      @tdev: the offload device receiving the packet
1806  *      @rq: the response queue that received the packet
1807  *      @skb: the packet
1808  *      @rx_gather: a gather list of packets if we are building a bundle
1809  *      @gather_idx: index of the next available slot in the bundle
1810  *
1811  *      Process an ingress offload pakcet and add it to the offload ingress
1812  *      queue.  Returns the index of the next available slot in the bundle.
1813  */
1814 static inline int rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
1815                              struct sk_buff *skb, struct sk_buff *rx_gather[],
1816                              unsigned int gather_idx)
1817 {
1818         skb_reset_mac_header(skb);
1819         skb_reset_network_header(skb);
1820         skb_reset_transport_header(skb);
1821
1822         if (rq->polling) {
1823                 rx_gather[gather_idx++] = skb;
1824                 if (gather_idx == RX_BUNDLE_SIZE) {
1825                         tdev->recv(tdev, rx_gather, RX_BUNDLE_SIZE);
1826                         gather_idx = 0;
1827                         rq->offload_bundles++;
1828                 }
1829         } else
1830                 offload_enqueue(rq, skb);
1831
1832         return gather_idx;
1833 }
1834
1835 /**
1836  *      restart_tx - check whether to restart suspended Tx queues
1837  *      @qs: the queue set to resume
1838  *
1839  *      Restarts suspended Tx queues of an SGE queue set if they have enough
1840  *      free resources to resume operation.
1841  */
1842 static void restart_tx(struct sge_qset *qs)
1843 {
1844         if (test_bit(TXQ_ETH, &qs->txq_stopped) &&
1845             should_restart_tx(&qs->txq[TXQ_ETH]) &&
1846             test_and_clear_bit(TXQ_ETH, &qs->txq_stopped)) {
1847                 qs->txq[TXQ_ETH].restarts++;
1848                 if (netif_running(qs->netdev))
1849                         netif_tx_wake_queue(qs->tx_q);
1850         }
1851
1852         if (test_bit(TXQ_OFLD, &qs->txq_stopped) &&
1853             should_restart_tx(&qs->txq[TXQ_OFLD]) &&
1854             test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
1855                 qs->txq[TXQ_OFLD].restarts++;
1856                 tasklet_schedule(&qs->txq[TXQ_OFLD].qresume_tsk);
1857         }
1858         if (test_bit(TXQ_CTRL, &qs->txq_stopped) &&
1859             should_restart_tx(&qs->txq[TXQ_CTRL]) &&
1860             test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
1861                 qs->txq[TXQ_CTRL].restarts++;
1862                 tasklet_schedule(&qs->txq[TXQ_CTRL].qresume_tsk);
1863         }
1864 }
1865
1866 /**
1867  *      cxgb3_arp_process - process an ARP request probing a private IP address
1868  *      @adapter: the adapter
1869  *      @skb: the skbuff containing the ARP request
1870  *
1871  *      Check if the ARP request is probing the private IP address
1872  *      dedicated to iSCSI, generate an ARP reply if so.
1873  */
1874 static void cxgb3_arp_process(struct adapter *adapter, struct sk_buff *skb)
1875 {
1876         struct net_device *dev = skb->dev;
1877         struct port_info *pi;
1878         struct arphdr *arp;
1879         unsigned char *arp_ptr;
1880         unsigned char *sha;
1881         __be32 sip, tip;
1882
1883         if (!dev)
1884                 return;
1885
1886         skb_reset_network_header(skb);
1887         arp = arp_hdr(skb);
1888
1889         if (arp->ar_op != htons(ARPOP_REQUEST))
1890                 return;
1891
1892         arp_ptr = (unsigned char *)(arp + 1);
1893         sha = arp_ptr;
1894         arp_ptr += dev->addr_len;
1895         memcpy(&sip, arp_ptr, sizeof(sip));
1896         arp_ptr += sizeof(sip);
1897         arp_ptr += dev->addr_len;
1898         memcpy(&tip, arp_ptr, sizeof(tip));
1899
1900         pi = netdev_priv(dev);
1901         if (tip != pi->iscsi_ipv4addr)
1902                 return;
1903
1904         arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
1905                  dev->dev_addr, sha);
1906
1907 }
1908
1909 static inline int is_arp(struct sk_buff *skb)
1910 {
1911         return skb->protocol == htons(ETH_P_ARP);
1912 }
1913
1914 /**
1915  *      rx_eth - process an ingress ethernet packet
1916  *      @adap: the adapter
1917  *      @rq: the response queue that received the packet
1918  *      @skb: the packet
1919  *      @pad: amount of padding at the start of the buffer
1920  *
1921  *      Process an ingress ethernet pakcet and deliver it to the stack.
1922  *      The padding is 2 if the packet was delivered in an Rx buffer and 0
1923  *      if it was immediate data in a response.
1924  */
1925 static void rx_eth(struct adapter *adap, struct sge_rspq *rq,
1926                    struct sk_buff *skb, int pad, int lro)
1927 {
1928         struct cpl_rx_pkt *p = (struct cpl_rx_pkt *)(skb->data + pad);
1929         struct sge_qset *qs = rspq_to_qset(rq);
1930         struct port_info *pi;
1931
1932         skb_pull(skb, sizeof(*p) + pad);
1933         skb->protocol = eth_type_trans(skb, adap->port[p->iff]);
1934         pi = netdev_priv(skb->dev);
1935         if ((pi->rx_offload & T3_RX_CSUM) && p->csum_valid && p->csum == htons(0xffff) &&
1936             !p->fragment) {
1937                 qs->port_stats[SGE_PSTAT_RX_CSUM_GOOD]++;
1938                 skb->ip_summed = CHECKSUM_UNNECESSARY;
1939         } else
1940                 skb->ip_summed = CHECKSUM_NONE;
1941
1942         if (unlikely(p->vlan_valid)) {
1943                 struct vlan_group *grp = pi->vlan_grp;
1944
1945                 qs->port_stats[SGE_PSTAT_VLANEX]++;
1946                 if (likely(grp))
1947                         if (lro)
1948                                 lro_vlan_hwaccel_receive_skb(&qs->lro_mgr, skb,
1949                                                              grp,
1950                                                              ntohs(p->vlan),
1951                                                              p);
1952                         else {
1953                                 if (unlikely(pi->iscsi_ipv4addr &&
1954                                     is_arp(skb))) {
1955                                         unsigned short vtag = ntohs(p->vlan) &
1956                                                                 VLAN_VID_MASK;
1957                                         skb->dev = vlan_group_get_device(grp,
1958                                                                          vtag);
1959                                         cxgb3_arp_process(adap, skb);
1960                                 }
1961                                 __vlan_hwaccel_rx(skb, grp, ntohs(p->vlan),
1962                                                   rq->polling);
1963                         }
1964                 else
1965                         dev_kfree_skb_any(skb);
1966         } else if (rq->polling) {
1967                 if (lro)
1968                         lro_receive_skb(&qs->lro_mgr, skb, p);
1969                 else {
1970                         if (unlikely(pi->iscsi_ipv4addr && is_arp(skb)))
1971                                 cxgb3_arp_process(adap, skb);
1972                         netif_receive_skb(skb);
1973                 }
1974         } else
1975                 netif_rx(skb);
1976 }
1977
1978 static inline int is_eth_tcp(u32 rss)
1979 {
1980         return G_HASHTYPE(ntohl(rss)) == RSS_HASH_4_TUPLE;
1981 }
1982
1983 /**
1984  *      lro_frame_ok - check if an ingress packet is eligible for LRO
1985  *      @p: the CPL header of the packet
1986  *
1987  *      Returns true if a received packet is eligible for LRO.
1988  *      The following conditions must be true:
1989  *      - packet is TCP/IP Ethernet II (checked elsewhere)
1990  *      - not an IP fragment
1991  *      - no IP options
1992  *      - TCP/IP checksums are correct
1993  *      - the packet is for this host
1994  */
1995 static inline int lro_frame_ok(const struct cpl_rx_pkt *p)
1996 {
1997         const struct ethhdr *eh = (struct ethhdr *)(p + 1);
1998         const struct iphdr *ih = (struct iphdr *)(eh + 1);
1999
2000         return (*((u8 *)p + 1) & 0x90) == 0x10 && p->csum == htons(0xffff) &&
2001                 eh->h_proto == htons(ETH_P_IP) && ih->ihl == (sizeof(*ih) >> 2);
2002 }
2003
2004 static int t3_get_lro_header(void **eh,  void **iph, void **tcph,
2005                              u64 *hdr_flags, void *priv)
2006 {
2007         const struct cpl_rx_pkt *cpl = priv;
2008
2009         if (!lro_frame_ok(cpl))
2010                 return -1;
2011
2012         *eh = (struct ethhdr *)(cpl + 1);
2013         *iph = (struct iphdr *)((struct ethhdr *)*eh + 1);
2014         *tcph = (struct tcphdr *)((struct iphdr *)*iph + 1);
2015
2016         *hdr_flags = LRO_IPV4 | LRO_TCP;
2017         return 0;
2018 }
2019
2020 static int t3_get_skb_header(struct sk_buff *skb,
2021                               void **iph, void **tcph, u64 *hdr_flags,
2022                               void *priv)
2023 {
2024         void *eh;
2025
2026         return t3_get_lro_header(&eh, iph, tcph, hdr_flags, priv);
2027 }
2028
2029 static int t3_get_frag_header(struct skb_frag_struct *frag, void **eh,
2030                               void **iph, void **tcph, u64 *hdr_flags,
2031                               void *priv)
2032 {
2033         return t3_get_lro_header(eh, iph, tcph, hdr_flags, priv);
2034 }
2035
2036 /**
2037  *      lro_add_page - add a page chunk to an LRO session
2038  *      @adap: the adapter
2039  *      @qs: the associated queue set
2040  *      @fl: the free list containing the page chunk to add
2041  *      @len: packet length
2042  *      @complete: Indicates the last fragment of a frame
2043  *
2044  *      Add a received packet contained in a page chunk to an existing LRO
2045  *      session.
2046  */
2047 static void lro_add_page(struct adapter *adap, struct sge_qset *qs,
2048                          struct sge_fl *fl, int len, int complete)
2049 {
2050         struct rx_sw_desc *sd = &fl->sdesc[fl->cidx];
2051         struct cpl_rx_pkt *cpl;
2052         struct skb_frag_struct *rx_frag = qs->lro_frag_tbl;
2053         int nr_frags = qs->lro_nfrags, frag_len = qs->lro_frag_len;
2054         int offset = 0;
2055
2056         if (!nr_frags) {
2057                 offset = 2 + sizeof(struct cpl_rx_pkt);
2058                 qs->lro_va = cpl = sd->pg_chunk.va + 2;
2059         }
2060
2061         fl->credits--;
2062
2063         len -= offset;
2064         pci_unmap_single(adap->pdev, pci_unmap_addr(sd, dma_addr),
2065                          fl->buf_size, PCI_DMA_FROMDEVICE);
2066
2067         rx_frag += nr_frags;
2068         rx_frag->page = sd->pg_chunk.page;
2069         rx_frag->page_offset = sd->pg_chunk.offset + offset;
2070         rx_frag->size = len;
2071         frag_len += len;
2072         qs->lro_nfrags++;
2073         qs->lro_frag_len = frag_len;
2074
2075         if (!complete)
2076                 return;
2077
2078         qs->lro_nfrags = qs->lro_frag_len = 0;
2079         cpl = qs->lro_va;
2080
2081         if (unlikely(cpl->vlan_valid)) {
2082                 struct net_device *dev = qs->netdev;
2083                 struct port_info *pi = netdev_priv(dev);
2084                 struct vlan_group *grp = pi->vlan_grp;
2085
2086                 if (likely(grp != NULL)) {
2087                         lro_vlan_hwaccel_receive_frags(&qs->lro_mgr,
2088                                                        qs->lro_frag_tbl,
2089                                                        frag_len, frag_len,
2090                                                        grp, ntohs(cpl->vlan),
2091                                                        cpl, 0);
2092                         return;
2093                 }
2094         }
2095         lro_receive_frags(&qs->lro_mgr, qs->lro_frag_tbl,
2096                           frag_len, frag_len, cpl, 0);
2097 }
2098
2099 /**
2100  *      init_lro_mgr - initialize a LRO manager object
2101  *      @lro_mgr: the LRO manager object
2102  */
2103 static void init_lro_mgr(struct sge_qset *qs, struct net_lro_mgr *lro_mgr)
2104 {
2105         lro_mgr->dev = qs->netdev;
2106         lro_mgr->features = LRO_F_NAPI;
2107         lro_mgr->frag_align_pad = NET_IP_ALIGN;
2108         lro_mgr->ip_summed = CHECKSUM_UNNECESSARY;
2109         lro_mgr->ip_summed_aggr = CHECKSUM_UNNECESSARY;
2110         lro_mgr->max_desc = T3_MAX_LRO_SES;
2111         lro_mgr->lro_arr = qs->lro_desc;
2112         lro_mgr->get_frag_header = t3_get_frag_header;
2113         lro_mgr->get_skb_header = t3_get_skb_header;
2114         lro_mgr->max_aggr = T3_MAX_LRO_MAX_PKTS;
2115         if (lro_mgr->max_aggr > MAX_SKB_FRAGS)
2116                 lro_mgr->max_aggr = MAX_SKB_FRAGS;
2117 }
2118
2119 /**
2120  *      handle_rsp_cntrl_info - handles control information in a response
2121  *      @qs: the queue set corresponding to the response
2122  *      @flags: the response control flags
2123  *
2124  *      Handles the control information of an SGE response, such as GTS
2125  *      indications and completion credits for the queue set's Tx queues.
2126  *      HW coalesces credits, we don't do any extra SW coalescing.
2127  */
2128 static inline void handle_rsp_cntrl_info(struct sge_qset *qs, u32 flags)
2129 {
2130         unsigned int credits;
2131
2132 #if USE_GTS
2133         if (flags & F_RSPD_TXQ0_GTS)
2134                 clear_bit(TXQ_RUNNING, &qs->txq[TXQ_ETH].flags);
2135 #endif
2136
2137         credits = G_RSPD_TXQ0_CR(flags);
2138         if (credits)
2139                 qs->txq[TXQ_ETH].processed += credits;
2140
2141         credits = G_RSPD_TXQ2_CR(flags);
2142         if (credits)
2143                 qs->txq[TXQ_CTRL].processed += credits;
2144
2145 # if USE_GTS
2146         if (flags & F_RSPD_TXQ1_GTS)
2147                 clear_bit(TXQ_RUNNING, &qs->txq[TXQ_OFLD].flags);
2148 # endif
2149         credits = G_RSPD_TXQ1_CR(flags);
2150         if (credits)
2151                 qs->txq[TXQ_OFLD].processed += credits;
2152 }
2153
2154 /**
2155  *      check_ring_db - check if we need to ring any doorbells
2156  *      @adapter: the adapter
2157  *      @qs: the queue set whose Tx queues are to be examined
2158  *      @sleeping: indicates which Tx queue sent GTS
2159  *
2160  *      Checks if some of a queue set's Tx queues need to ring their doorbells
2161  *      to resume transmission after idling while they still have unprocessed
2162  *      descriptors.
2163  */
2164 static void check_ring_db(struct adapter *adap, struct sge_qset *qs,
2165                           unsigned int sleeping)
2166 {
2167         if (sleeping & F_RSPD_TXQ0_GTS) {
2168                 struct sge_txq *txq = &qs->txq[TXQ_ETH];
2169
2170                 if (txq->cleaned + txq->in_use != txq->processed &&
2171                     !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2172                         set_bit(TXQ_RUNNING, &txq->flags);
2173                         t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2174                                      V_EGRCNTX(txq->cntxt_id));
2175                 }
2176         }
2177
2178         if (sleeping & F_RSPD_TXQ1_GTS) {
2179                 struct sge_txq *txq = &qs->txq[TXQ_OFLD];
2180
2181                 if (txq->cleaned + txq->in_use != txq->processed &&
2182                     !test_and_set_bit(TXQ_LAST_PKT_DB, &txq->flags)) {
2183                         set_bit(TXQ_RUNNING, &txq->flags);
2184                         t3_write_reg(adap, A_SG_KDOORBELL, F_SELEGRCNTX |
2185                                      V_EGRCNTX(txq->cntxt_id));
2186                 }
2187         }
2188 }
2189
2190 /**
2191  *      is_new_response - check if a response is newly written
2192  *      @r: the response descriptor
2193  *      @q: the response queue
2194  *
2195  *      Returns true if a response descriptor contains a yet unprocessed
2196  *      response.
2197  */
2198 static inline int is_new_response(const struct rsp_desc *r,
2199                                   const struct sge_rspq *q)
2200 {
2201         return (r->intr_gen & F_RSPD_GEN2) == q->gen;
2202 }
2203
2204 static inline void clear_rspq_bufstate(struct sge_rspq * const q)
2205 {
2206         q->pg_skb = NULL;
2207         q->rx_recycle_buf = 0;
2208 }
2209
/* Response flag bits carrying the GTS indications of Tx queues 0 and 1. */
#define RSPD_GTS_MASK  (F_RSPD_TXQ0_GTS | F_RSPD_TXQ1_GTS)

/*
 * All response flag bits that carry control information: the GTS
 * indications plus the completion credit fields of Tx queues 0, 1 and 2.
 */
#define RSPD_CTRL_MASK (V_RSPD_TXQ0_CR(M_RSPD_TXQ0_CR) | \
                        V_RSPD_TXQ1_CR(M_RSPD_TXQ1_CR) | \
                        V_RSPD_TXQ2_CR(M_RSPD_TXQ2_CR) | \
                        RSPD_GTS_MASK)

/*
 * Interrupt holdoff used for the next interrupt after a memory allocation
 * failure, in units of 0.1us.
 */
#define NOMEM_INTR_DELAY 2500
2218
2219 /**
2220  *      process_responses - process responses from an SGE response queue
2221  *      @adap: the adapter
2222  *      @qs: the queue set to which the response queue belongs
2223  *      @budget: how many responses can be processed in this round
2224  *
2225  *      Process responses from an SGE response queue up to the supplied budget.
2226  *      Responses include received packets as well as credits and other events
2227  *      for the queues that belong to the response queue's queue set.
2228  *      A negative budget is effectively unlimited.
2229  *
2230  *      Additionally choose the interrupt holdoff time for the next interrupt
2231  *      on this queue.  If the system is under memory shortage use a fairly
2232  *      long delay to help recovery.
2233  */
2234 static int process_responses(struct adapter *adap, struct sge_qset *qs,
2235                              int budget)
2236 {
2237         struct sge_rspq *q = &qs->rspq;
2238         struct rsp_desc *r = &q->desc[q->cidx];
2239         int budget_left = budget;
2240         unsigned int sleeping = 0;
2241         struct sk_buff *offload_skbs[RX_BUNDLE_SIZE];
2242         int ngathered = 0;
2243
2244         q->next_holdoff = q->holdoff_tmr;
2245
2246         while (likely(budget_left && is_new_response(r, q))) {
2247                 int packet_complete, eth, ethpad = 2, lro = qs->lro_enabled;
2248                 struct sk_buff *skb = NULL;
2249                 u32 len, flags = ntohl(r->flags);
2250                 __be32 rss_hi = *(const __be32 *)r,
2251                        rss_lo = r->rss_hdr.rss_hash_val;
2252
2253                 eth = r->rss_hdr.opcode == CPL_RX_PKT;
2254
2255                 if (unlikely(flags & F_RSPD_ASYNC_NOTIF)) {
2256                         skb = alloc_skb(AN_PKT_SIZE, GFP_ATOMIC);
2257                         if (!skb)
2258                                 goto no_mem;
2259
2260                         memcpy(__skb_put(skb, AN_PKT_SIZE), r, AN_PKT_SIZE);
2261                         skb->data[0] = CPL_ASYNC_NOTIF;
2262                         rss_hi = htonl(CPL_ASYNC_NOTIF << 24);
2263                         q->async_notif++;
2264                 } else if (flags & F_RSPD_IMM_DATA_VALID) {
2265                         skb = get_imm_packet(r);
2266                         if (unlikely(!skb)) {
2267 no_mem:
2268                                 q->next_holdoff = NOMEM_INTR_DELAY;
2269                                 q->nomem++;
2270                                 /* consume one credit since we tried */
2271                                 budget_left--;
2272                                 break;
2273                         }
2274                         q->imm_data++;
2275                         ethpad = 0;
2276                 } else if ((len = ntohl(r->len_cq)) != 0) {
2277                         struct sge_fl *fl;
2278
2279                         lro &= eth && is_eth_tcp(rss_hi);
2280
2281                         fl = (len & F_RSPD_FLQ) ? &qs->fl[1] : &qs->fl[0];
2282                         if (fl->use_pages) {
2283                                 void *addr = fl->sdesc[fl->cidx].pg_chunk.va;
2284
2285                                 prefetch(addr);
2286 #if L1_CACHE_BYTES < 128
2287                                 prefetch(addr + L1_CACHE_BYTES);
2288 #endif
2289                                 __refill_fl(adap, fl);
2290                                 if (lro > 0) {
2291                                         lro_add_page(adap, qs, fl,
2292                                                      G_RSPD_LEN(len),
2293                                                      flags & F_RSPD_EOP);
2294                                          goto next_fl;
2295                                 }
2296
2297                                 skb = get_packet_pg(adap, fl, q,
2298                                                     G_RSPD_LEN(len),
2299                                                     eth ?
2300                                                     SGE_RX_DROP_THRES : 0);
2301                                 q->pg_skb = skb;
2302                         } else
2303                                 skb = get_packet(adap, fl, G_RSPD_LEN(len),
2304                                                  eth ? SGE_RX_DROP_THRES : 0);
2305                         if (unlikely(!skb)) {
2306                                 if (!eth)
2307                                         goto no_mem;
2308                                 q->rx_drops++;
2309                         } else if (unlikely(r->rss_hdr.opcode == CPL_TRACE_PKT))
2310                                 __skb_pull(skb, 2);
2311 next_fl:
2312                         if (++fl->cidx == fl->size)
2313                                 fl->cidx = 0;
2314                 } else
2315                         q->pure_rsps++;
2316
2317                 if (flags & RSPD_CTRL_MASK) {
2318                         sleeping |= flags & RSPD_GTS_MASK;
2319                         handle_rsp_cntrl_info(qs, flags);
2320                 }
2321
2322                 r++;
2323                 if (unlikely(++q->cidx == q->size)) {
2324                         q->cidx = 0;
2325                         q->gen ^= 1;
2326                         r = q->desc;
2327                 }
2328                 prefetch(r);
2329
2330                 if (++q->credits >= (q->size / 4)) {
2331                         refill_rspq(adap, q, q->credits);
2332                         q->credits = 0;
2333                 }
2334
2335                 packet_complete = flags &
2336                                   (F_RSPD_EOP | F_RSPD_IMM_DATA_VALID |
2337                                    F_RSPD_ASYNC_NOTIF);
2338
2339                 if (skb != NULL && packet_complete) {
2340                         if (eth)
2341                                 rx_eth(adap, q, skb, ethpad, lro);
2342                         else {
2343                                 q->offload_pkts++;
2344                                 /* Preserve the RSS info in csum & priority */
2345                                 skb->csum = rss_hi;
2346                                 skb->priority = rss_lo;
2347                                 ngathered = rx_offload(&adap->tdev, q, skb,
2348                                                        offload_skbs,
2349                                                        ngathered);
2350                         }
2351
2352                         if (flags & F_RSPD_EOP)
2353                                 clear_rspq_bufstate(q);
2354                 }
2355                 --budget_left;
2356         }
2357
2358         deliver_partial_bundle(&adap->tdev, q, offload_skbs, ngathered);
2359         lro_flush_all(&qs->lro_mgr);
2360         qs->port_stats[SGE_PSTAT_LRO_AGGR] = qs->lro_mgr.stats.aggregated;
2361         qs->port_stats[SGE_PSTAT_LRO_FLUSHED] = qs->lro_mgr.stats.flushed;
2362         qs->port_stats[SGE_PSTAT_LRO_NO_DESC] = qs->lro_mgr.stats.no_desc;
2363
2364         if (sleeping)
2365                 check_ring_db(adap, qs, sleeping);
2366
2367         smp_mb();               /* commit Tx queue .processed updates */
2368         if (unlikely(qs->txq_stopped != 0))
2369                 restart_tx(qs);
2370
2371         budget -= budget_left;
2372         return budget;
2373 }
2374
2375 static inline int is_pure_response(const struct rsp_desc *r)
2376 {
2377         __be32 n = r->flags & htonl(F_RSPD_ASYNC_NOTIF | F_RSPD_IMM_DATA_VALID);
2378
2379         return (n | r->len_cq) == 0;
2380 }
2381
2382 /**
2383  *      napi_rx_handler - the NAPI handler for Rx processing
2384  *      @napi: the napi instance
2385  *      @budget: how many packets we can process in this round
2386  *
2387  *      Handler for new data events when using NAPI.
2388  */
2389 static int napi_rx_handler(struct napi_struct *napi, int budget)
2390 {
2391         struct sge_qset *qs = container_of(napi, struct sge_qset, napi);
2392         struct adapter *adap = qs->adap;
2393         int work_done = process_responses(adap, qs, budget);
2394
2395         if (likely(work_done < budget)) {
2396                 napi_complete(napi);
2397
2398                 /*
2399                  * Because we don't atomically flush the following
2400                  * write it is possible that in very rare cases it can
2401                  * reach the device in a way that races with a new
2402                  * response being written plus an error interrupt
2403                  * causing the NAPI interrupt handler below to return
2404                  * unhandled status to the OS.  To protect against
2405                  * this would require flushing the write and doing
2406                  * both the write and the flush with interrupts off.
2407                  * Way too expensive and unjustifiable given the
2408                  * rarity of the race.
2409                  *
2410                  * The race cannot happen at all with MSI-X.
2411                  */
2412                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(qs->rspq.cntxt_id) |
2413                              V_NEWTIMER(qs->rspq.next_holdoff) |
2414                              V_NEWINDEX(qs->rspq.cidx));
2415         }
2416         return work_done;
2417 }
2418
2419 /*
2420  * Returns true if the device is already scheduled for polling.
2421  */
2422 static inline int napi_is_scheduled(struct napi_struct *napi)
2423 {
2424         return test_bit(NAPI_STATE_SCHED, &napi->state);
2425 }
2426
2427 /**
2428  *      process_pure_responses - process pure responses from a response queue
2429  *      @adap: the adapter
2430  *      @qs: the queue set owning the response queue
2431  *      @r: the first pure response to process
2432  *
2433  *      A simpler version of process_responses() that handles only pure (i.e.,
2434  *      non data-carrying) responses.  Such respones are too light-weight to
2435  *      justify calling a softirq under NAPI, so we handle them specially in
2436  *      the interrupt handler.  The function is called with a pointer to a
2437  *      response, which the caller must ensure is a valid pure response.
2438  *
2439  *      Returns 1 if it encounters a valid data-carrying response, 0 otherwise.
2440  */
2441 static int process_pure_responses(struct adapter *adap, struct sge_qset *qs,
2442                                   struct rsp_desc *r)
2443 {
2444         struct sge_rspq *q = &qs->rspq;
2445         unsigned int sleeping = 0;
2446
2447         do {
2448                 u32 flags = ntohl(r->flags);
2449
2450                 r++;
2451                 if (unlikely(++q->cidx == q->size)) {
2452                         q->cidx = 0;
2453                         q->gen ^= 1;
2454                         r = q->desc;
2455                 }
2456                 prefetch(r);
2457
2458                 if (flags & RSPD_CTRL_MASK) {
2459                         sleeping |= flags & RSPD_GTS_MASK;
2460                         handle_rsp_cntrl_info(qs, flags);
2461                 }
2462
2463                 q->pure_rsps++;
2464                 if (++q->credits >= (q->size / 4)) {
2465                         refill_rspq(adap, q, q->credits);
2466                         q->credits = 0;
2467                 }
2468         } while (is_new_response(r, q) && is_pure_response(r));
2469
2470         if (sleeping)
2471                 check_ring_db(adap, qs, sleeping);
2472
2473         smp_mb();               /* commit Tx queue .processed updates */
2474         if (unlikely(qs->txq_stopped != 0))
2475                 restart_tx(qs);
2476
2477         return is_new_response(r, q);
2478 }
2479
2480 /**
2481  *      handle_responses - decide what to do with new responses in NAPI mode
2482  *      @adap: the adapter
2483  *      @q: the response queue
2484  *
2485  *      This is used by the NAPI interrupt handlers to decide what to do with
2486  *      new SGE responses.  If there are no new responses it returns -1.  If
2487  *      there are new responses and they are pure (i.e., non-data carrying)
2488  *      it handles them straight in hard interrupt context as they are very
2489  *      cheap and don't deliver any packets.  Finally, if there are any data
2490  *      signaling responses it schedules the NAPI handler.  Returns 1 if it
2491  *      schedules NAPI, 0 if all new responses were pure.
2492  *
2493  *      The caller must ascertain NAPI is not already running.
2494  */
2495 static inline int handle_responses(struct adapter *adap, struct sge_rspq *q)
2496 {
2497         struct sge_qset *qs = rspq_to_qset(q);
2498         struct rsp_desc *r = &q->desc[q->cidx];
2499
2500         if (!is_new_response(r, q))
2501                 return -1;
2502         if (is_pure_response(r) && process_pure_responses(adap, qs, r) == 0) {
2503                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2504                              V_NEWTIMER(q->holdoff_tmr) | V_NEWINDEX(q->cidx));
2505                 return 0;
2506         }
2507         napi_schedule(&qs->napi);
2508         return 1;
2509 }
2510
2511 /*
2512  * The MSI-X interrupt handler for an SGE response queue for the non-NAPI case
2513  * (i.e., response queue serviced in hard interrupt).
2514  */
2515 irqreturn_t t3_sge_intr_msix(int irq, void *cookie)
2516 {
2517         struct sge_qset *qs = cookie;
2518         struct adapter *adap = qs->adap;
2519         struct sge_rspq *q = &qs->rspq;
2520
2521         spin_lock(&q->lock);
2522         if (process_responses(adap, qs, -1) == 0)
2523                 q->unhandled_irqs++;
2524         t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2525                      V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2526         spin_unlock(&q->lock);
2527         return IRQ_HANDLED;
2528 }
2529
2530 /*
2531  * The MSI-X interrupt handler for an SGE response queue for the NAPI case
2532  * (i.e., response queue serviced by NAPI polling).
2533  */
2534 static irqreturn_t t3_sge_intr_msix_napi(int irq, void *cookie)
2535 {
2536         struct sge_qset *qs = cookie;
2537         struct sge_rspq *q = &qs->rspq;
2538
2539         spin_lock(&q->lock);
2540
2541         if (handle_responses(qs->adap, q) < 0)
2542                 q->unhandled_irqs++;
2543         spin_unlock(&q->lock);
2544         return IRQ_HANDLED;
2545 }
2546
2547 /*
2548  * The non-NAPI MSI interrupt handler.  This needs to handle data events from
2549  * SGE response queues as well as error and other async events as they all use
2550  * the same MSI vector.  We use one SGE response queue per port in this mode
2551  * and protect all response queues with queue 0's lock.
2552  */
2553 static irqreturn_t t3_intr_msi(int irq, void *cookie)
2554 {
2555         int new_packets = 0;
2556         struct adapter *adap = cookie;
2557         struct sge_rspq *q = &adap->sge.qs[0].rspq;
2558
2559         spin_lock(&q->lock);
2560
2561         if (process_responses(adap, &adap->sge.qs[0], -1)) {
2562                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(q->cntxt_id) |
2563                              V_NEWTIMER(q->next_holdoff) | V_NEWINDEX(q->cidx));
2564                 new_packets = 1;
2565         }
2566
2567         if (adap->params.nports == 2 &&
2568             process_responses(adap, &adap->sge.qs[1], -1)) {
2569                 struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2570
2571                 t3_write_reg(adap, A_SG_GTS, V_RSPQ(q1->cntxt_id) |
2572                              V_NEWTIMER(q1->next_holdoff) |
2573                              V_NEWINDEX(q1->cidx));
2574                 new_packets = 1;
2575         }
2576
2577         if (!new_packets && t3_slow_intr_handler(adap) == 0)
2578                 q->unhandled_irqs++;
2579
2580         spin_unlock(&q->lock);
2581         return IRQ_HANDLED;
2582 }
2583
2584 static int rspq_check_napi(struct sge_qset *qs)
2585 {
2586         struct sge_rspq *q = &qs->rspq;
2587
2588         if (!napi_is_scheduled(&qs->napi) &&
2589             is_new_response(&q->desc[q->cidx], q)) {
2590                 napi_schedule(&qs->napi);
2591                 return 1;
2592         }
2593         return 0;
2594 }
2595
2596 /*
2597  * The MSI interrupt handler for the NAPI case (i.e., response queues serviced
2598  * by NAPI polling).  Handles data events from SGE response queues as well as
2599  * error and other async events as they all use the same MSI vector.  We use
2600  * one SGE response queue per port in this mode and protect all response
2601  * queues with queue 0's lock.
2602  */
2603 static irqreturn_t t3_intr_msi_napi(int irq, void *cookie)
2604 {
2605         int new_packets;
2606         struct adapter *adap = cookie;
2607         struct sge_rspq *q = &adap->sge.qs[0].rspq;
2608
2609         spin_lock(&q->lock);
2610
2611         new_packets = rspq_check_napi(&adap->sge.qs[0]);
2612         if (adap->params.nports == 2)
2613                 new_packets += rspq_check_napi(&adap->sge.qs[1]);
2614         if (!new_packets && t3_slow_intr_handler(adap) == 0)
2615                 q->unhandled_irqs++;
2616
2617         spin_unlock(&q->lock);
2618         return IRQ_HANDLED;
2619 }
2620
2621 /*
2622  * A helper function that processes responses and issues GTS.
2623  */
2624 static inline int process_responses_gts(struct adapter *adap,
2625                                         struct sge_rspq *rq)
2626 {
2627         int work;
2628
2629         work = process_responses(adap, rspq_to_qset(rq), -1);
2630         t3_write_reg(adap, A_SG_GTS, V_RSPQ(rq->cntxt_id) |
2631                      V_NEWTIMER(rq->next_holdoff) | V_NEWINDEX(rq->cidx));
2632         return work;
2633 }
2634
2635 /*
2636  * The legacy INTx interrupt handler.  This needs to handle data events from
2637  * SGE response queues as well as error and other async events as they all use
2638  * the same interrupt pin.  We use one SGE response queue per port in this mode
2639  * and protect all response queues with queue 0's lock.
2640  */
2641 static irqreturn_t t3_intr(int irq, void *cookie)
2642 {
2643         int work_done, w0, w1;
2644         struct adapter *adap = cookie;
2645         struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2646         struct sge_rspq *q1 = &adap->sge.qs[1].rspq;
2647
2648         spin_lock(&q0->lock);
2649
2650         w0 = is_new_response(&q0->desc[q0->cidx], q0);
2651         w1 = adap->params.nports == 2 &&
2652             is_new_response(&q1->desc[q1->cidx], q1);
2653
2654         if (likely(w0 | w1)) {
2655                 t3_write_reg(adap, A_PL_CLI, 0);
2656                 t3_read_reg(adap, A_PL_CLI);    /* flush */
2657
2658                 if (likely(w0))
2659                         process_responses_gts(adap, q0);
2660
2661                 if (w1)
2662                         process_responses_gts(adap, q1);
2663
2664                 work_done = w0 | w1;
2665         } else
2666                 work_done = t3_slow_intr_handler(adap);
2667
2668         spin_unlock(&q0->lock);
2669         return IRQ_RETVAL(work_done != 0);
2670 }
2671
2672 /*
2673  * Interrupt handler for legacy INTx interrupts for T3B-based cards.
2674  * Handles data events from SGE response queues as well as error and other
2675  * async events as they all use the same interrupt pin.  We use one SGE
2676  * response queue per port in this mode and protect all response queues with
2677  * queue 0's lock.
2678  */
2679 static irqreturn_t t3b_intr(int irq, void *cookie)
2680 {
2681         u32 map;
2682         struct adapter *adap = cookie;
2683         struct sge_rspq *q0 = &adap->sge.qs[0].rspq;
2684
2685         t3_write_reg(adap, A_PL_CLI, 0);
2686         map = t3_read_reg(adap, A_SG_DATA_INTR);
2687
2688         if (unlikely(!map))     /* shared interrupt, most likely */
2689                 return IRQ_NONE;
2690
2691         spin_lock(&q0->lock);
2692
2693         if (unlikely(map & F_ERRINTR))
2694                 t3_slow_intr_handler(adap);
2695
2696         if (likely(map & 1))
2697                 process_responses_gts(adap, q0);
2698
2699         if (map & 2)
2700                 process_responses_gts(adap, &adap->sge.qs[1].rspq);
2701
2702         spin_unlock(&q0->lock);
2703         return IRQ_HANDLED;
2704 }
2705
2706 /*
2707  * NAPI interrupt handler for legacy INTx interrupts for T3B-based cards.
2708  * Handles data events from SGE response queues as well as error and other
2709  * async events as they all use the same interrupt pin.  We use one SGE
2710  * response queue per port in this mode and protect all response queues with
2711  * queue 0's lock.
2712  */
2713 static irqreturn_t t3b_intr_napi(int irq, void *cookie)
2714 {
2715         u32 map;
2716         struct adapter *adap = cookie;
2717         struct sge_qset *qs0 = &adap->sge.qs[0];
2718         struct sge_rspq *q0 = &qs0->rspq;
2719
2720         t3_write_reg(adap, A_PL_CLI, 0);
2721         map = t3_read_reg(adap, A_SG_DATA_INTR);
2722
2723         if (unlikely(!map))     /* shared interrupt, most likely */
2724                 return IRQ_NONE;
2725
2726         spin_lock(&q0->lock);
2727
2728         if (unlikely(map & F_ERRINTR))
2729                 t3_slow_intr_handler(adap);
2730
2731         if (likely(map & 1))
2732                 napi_schedule(&qs0->napi);
2733
2734         if (map & 2)
2735                 napi_schedule(&adap->sge.qs[1].napi);
2736
2737         spin_unlock(&q0->lock);
2738         return IRQ_HANDLED;
2739 }
2740
2741 /**
2742  *      t3_intr_handler - select the top-level interrupt handler
2743  *      @adap: the adapter
2744  *      @polling: whether using NAPI to service response queues
2745  *
2746  *      Selects the top-level interrupt handler based on the type of interrupts
2747  *      (MSI-X, MSI, or legacy) and whether NAPI will be used to service the
2748  *      response queues.
2749  */
2750 irq_handler_t t3_intr_handler(struct adapter *adap, int polling)
2751 {
2752         if (adap->flags & USING_MSIX)
2753                 return polling ? t3_sge_intr_msix_napi : t3_sge_intr_msix;
2754         if (adap->flags & USING_MSI)
2755                 return polling ? t3_intr_msi_napi : t3_intr_msi;
2756         if (adap->params.rev > 0)
2757                 return polling ? t3b_intr_napi : t3b_intr;
2758         return t3_intr;
2759 }
2760
/* SGE interrupt cause bits that report parity errors. */
#define SGE_PARERR \
        (F_CPPARITYERROR | F_OCPARITYERROR | F_RCPARITYERROR | \
         F_IRPARITYERROR | \
         V_ITPARITYERROR(M_ITPARITYERROR) | \
         V_FLPARITYERROR(M_FLPARITYERROR) | \
         F_LODRBPARITYERROR | F_HIDRBPARITYERROR | \
         F_LORCQPARITYERROR | F_HIRCQPARITYERROR)

/* SGE interrupt cause bits that report framing errors. */
#define SGE_FRAMINGERR \
        (F_UC_REQ_FRAMINGERROR | F_R_REQ_FRAMINGERROR)

/* SGE interrupt causes that make the error handler call t3_fatal_err(). */
#define SGE_FATALERR \
        (SGE_PARERR | SGE_FRAMINGERR | F_RSPQCREDITOVERFOW | \
         F_RSPQDISABLED)
2769
2770 /**
2771  *      t3_sge_err_intr_handler - SGE async event interrupt handler
2772  *      @adapter: the adapter
2773  *
2774  *      Interrupt handler for SGE asynchronous (non-data) events.
2775  */
2776 void t3_sge_err_intr_handler(struct adapter *adapter)
2777 {
2778         unsigned int v, status = t3_read_reg(adapter, A_SG_INT_CAUSE);
2779
2780         if (status & SGE_PARERR)
2781                 CH_ALERT(adapter, "SGE parity error (0x%x)\n",
2782                          status & SGE_PARERR);
2783         if (status & SGE_FRAMINGERR)
2784                 CH_ALERT(adapter, "SGE framing error (0x%x)\n",
2785                          status & SGE_FRAMINGERR);
2786
2787         if (status & F_RSPQCREDITOVERFOW)
2788                 CH_ALERT(adapter, "SGE response queue credit overflow\n");
2789
2790         if (status & F_RSPQDISABLED) {
2791                 v = t3_read_reg(adapter, A_SG_RSPQ_FL_STATUS);
2792
2793                 CH_ALERT(adapter,
2794                          "packet delivered to disabled response queue "
2795                          "(0x%x)\n", (v >> S_RSPQ0DISABLED) & 0xff);
2796         }
2797
2798         if (status & (F_HIPIODRBDROPERR | F_LOPIODRBDROPERR))
2799                 CH_ALERT(adapter, "SGE dropped %s priority doorbell\n",
2800                          status & F_HIPIODRBDROPERR ? "high" : "lo");
2801
2802         t3_write_reg(adapter, A_SG_INT_CAUSE, status);
2803         if (status &  SGE_FATALERR)
2804                 t3_fatal_err(adapter);
2805 }
2806
2807 /**
2808  *      sge_timer_cb - perform periodic maintenance of an SGE qset
2809  *      @data: the SGE queue set to maintain
2810  *
2811  *      Runs periodically from a timer to perform maintenance of an SGE queue
2812  *      set.  It performs two tasks:
2813  *
2814  *      a) Cleans up any completed Tx descriptors that may still be pending.
2815  *      Normal descriptor cleanup happens when new packets are added to a Tx
2816  *      queue so this timer is relatively infrequent and does any cleanup only
2817  *      if the Tx queue has not seen any new packets in a while.  We make a
2818  *      best effort attempt to reclaim descriptors, in that we don't wait
2819  *      around if we cannot get a queue's lock (which most likely is because
2820  *      someone else is queueing new packets and so will also handle the clean
2821  *      up).  Since control queues use immediate data exclusively we don't
2822  *      bother cleaning them up here.
2823  *
2824  *      b) Replenishes Rx queues that have run out due to memory shortage.
2825  *      Normally new Rx buffers are added when existing ones are consumed but
2826  *      when out of memory a queue can become empty.  We try to add only a few
2827  *      buffers here, the queue will be replenished fully as these new buffers
2828  *      are used up if memory shortage has subsided.
2829  */
2830 static void sge_timer_cb(unsigned long data)
2831 {
2832         spinlock_t *lock;
2833         struct sge_qset *qs = (struct sge_qset *)data;
2834         struct adapter *adap = qs->adap;
2835
2836         if (spin_trylock(&qs->txq[TXQ_ETH].lock)) {
2837                 reclaim_completed_tx(adap, &qs->txq[TXQ_ETH]);
2838                 spin_unlock(&qs->txq[TXQ_ETH].lock);
2839         }
2840         if (spin_trylock(&qs->txq[TXQ_OFLD].lock)) {
2841                 reclaim_completed_tx(adap, &qs->txq[TXQ_OFLD]);
2842                 spin_unlock(&qs->txq[TXQ_OFLD].lock);
2843         }
2844         lock = (adap->flags & USING_MSIX) ? &qs->rspq.lock :
2845                                             &adap->sge.qs[0].rspq.lock;
2846         if (spin_trylock_irq(lock)) {
2847                 if (!napi_is_scheduled(&qs->napi)) {
2848                         u32 status = t3_read_reg(adap, A_SG_RSPQ_FL_STATUS);
2849
2850                         if (qs->fl[0].credits < qs->fl[0].size)
2851                                 __refill_fl(adap, &qs->fl[0]);
2852                         if (qs->fl[1].credits < qs->fl[1].size)
2853                                 __refill_fl(adap, &qs->fl[1]);
2854
2855                         if (status & (1 << qs->rspq.cntxt_id)) {
2856                                 qs->rspq.starved++;
2857                                 if (qs->rspq.credits) {
2858                                         refill_rspq(adap, &qs->rspq, 1);
2859                                         qs->rspq.credits--;
2860                                         qs->rspq.restarted++;
2861                                         t3_write_reg(adap, A_SG_RSPQ_FL_STATUS,
2862                                                      1 << qs->rspq.cntxt_id);
2863                                 }
2864                         }
2865                 }
2866                 spin_unlock_irq(lock);
2867         }
2868         mod_timer(&qs->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
2869 }
2870
2871 /**
2872  *      t3_update_qset_coalesce - update coalescing settings for a queue set
2873  *      @qs: the SGE queue set
2874  *      @p: new queue set parameters
2875  *
2876  *      Update the coalescing settings for an SGE queue set.  Nothing is done
2877  *      if the queue set is not initialized yet.
2878  */
2879 void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
2880 {
2881         qs->rspq.holdoff_tmr = max(p->coalesce_usecs * 10, 1U);/* can't be 0 */
2882         qs->rspq.polling = p->polling;
2883         qs->napi.poll = p->polling ? napi_rx_handler : ofld_poll;
2884 }
2885
2886 /**
2887  *      t3_sge_alloc_qset - initialize an SGE queue set
2888  *      @adapter: the adapter
2889  *      @id: the queue set id
2890  *      @nports: how many Ethernet ports will be using this queue set
2891  *      @irq_vec_idx: the IRQ vector index for response queue interrupts
2892  *      @p: configuration parameters for this queue set
2893  *      @ntxq: number of Tx queues for the queue set
2894  *      @netdev: net device associated with this queue set
2895  *      @netdevq: net device TX queue associated with this queue set
2896  *
2897  *      Allocate resources and initialize an SGE queue set.  A queue set
2898  *      comprises a response queue, two Rx free-buffer queues, and up to 3
2899  *      Tx queues.  The Tx queues are assigned roles in the order Ethernet
2900  *      queue, offload queue, and control queue.
2901  */
2902 int t3_sge_alloc_qset(struct adapter *adapter, unsigned int id, int nports,
2903                       int irq_vec_idx, const struct qset_params *p,
2904                       int ntxq, struct net_device *dev,
2905                       struct netdev_queue *netdevq)
2906 {
2907         int i, avail, ret = -ENOMEM;
2908         struct sge_qset *q = &adapter->sge.qs[id];
2909         struct net_lro_mgr *lro_mgr = &q->lro_mgr;
2910
2911         init_qset_cntxt(q, id);
2912         setup_timer(&q->tx_reclaim_timer, sge_timer_cb, (unsigned long)q);
2913
2914         q->fl[0].desc = alloc_ring(adapter->pdev, p->fl_size,
2915                                    sizeof(struct rx_desc),
2916                                    sizeof(struct rx_sw_desc),
2917                                    &q->fl[0].phys_addr, &q->fl[0].sdesc);
2918         if (!q->fl[0].desc)
2919                 goto err;
2920
2921         q->fl[1].desc = alloc_ring(adapter->pdev, p->jumbo_size,
2922                                    sizeof(struct rx_desc),
2923                                    sizeof(struct rx_sw_desc),
2924                                    &q->fl[1].phys_addr, &q->fl[1].sdesc);
2925         if (!q->fl[1].desc)
2926                 goto err;
2927
2928         q->rspq.desc = alloc_ring(adapter->pdev, p->rspq_size,
2929                                   sizeof(struct rsp_desc), 0,
2930                                   &q->rspq.phys_addr, NULL);
2931         if (!q->rspq.desc)
2932                 goto err;
2933
2934         for (i = 0; i < ntxq; ++i) {
2935                 /*
2936                  * The control queue always uses immediate data so does not
2937                  * need to keep track of any sk_buffs.
2938                  */
2939                 size_t sz = i == TXQ_CTRL ? 0 : sizeof(struct tx_sw_desc);
2940
2941                 q->txq[i].desc = alloc_ring(adapter->pdev, p->txq_size[i],
2942                                             sizeof(struct tx_desc), sz,
2943                                             &q->txq[i].phys_addr,
2944                                             &q->txq[i].sdesc);
2945                 if (!q->txq[i].desc)
2946                         goto err;
2947
2948                 q->txq[i].gen = 1;
2949                 q->txq[i].size = p->txq_size[i];
2950                 spin_lock_init(&q->txq[i].lock);
2951                 skb_queue_head_init(&q->txq[i].sendq);
2952         }
2953
2954         tasklet_init(&q->txq[TXQ_OFLD].qresume_tsk, restart_offloadq,
2955                      (unsigned long)q);
2956         tasklet_init(&q->txq[TXQ_CTRL].qresume_tsk, restart_ctrlq,
2957                      (unsigned long)q);
2958
2959         q->fl[0].gen = q->fl[1].gen = 1;
2960         q->fl[0].size = p->fl_size;
2961         q->fl[1].size = p->jumbo_size;
2962
2963         q->rspq.gen = 1;
2964         q->rspq.size = p->rspq_size;
2965         spin_lock_init(&q->rspq.lock);
2966         skb_queue_head_init(&q->rspq.rx_queue);
2967
2968         q->txq[TXQ_ETH].stop_thres = nports *
2969             flits_to_desc(sgl_len(MAX_SKB_FRAGS + 1) + 3);
2970
2971 #if FL0_PG_CHUNK_SIZE > 0
2972         q->fl[0].buf_size = FL0_PG_CHUNK_SIZE;
2973 #else
2974         q->fl[0].buf_size = SGE_RX_SM_BUF_SIZE + sizeof(struct cpl_rx_data);
2975 #endif
2976 #if FL1_PG_CHUNK_SIZE > 0
2977         q->fl[1].buf_size = FL1_PG_CHUNK_SIZE;
2978 #else
2979         q->fl[1].buf_size = is_offload(adapter) ?
2980                 (16 * 1024) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) :
2981                 MAX_FRAME_SIZE + 2 + sizeof(struct cpl_rx_pkt);
2982 #endif
2983
2984         q->fl[0].use_pages = FL0_PG_CHUNK_SIZE > 0;
2985         q->fl[1].use_pages = FL1_PG_CHUNK_SIZE > 0;
2986         q->fl[0].order = FL0_PG_ORDER;
2987         q->fl[1].order = FL1_PG_ORDER;
2988
2989         q->lro_frag_tbl = kcalloc(MAX_FRAME_SIZE / FL1_PG_CHUNK_SIZE + 1,
2990                                   sizeof(struct skb_frag_struct),
2991                                   GFP_KERNEL);
2992         q->lro_nfrags = q->lro_frag_len = 0;
2993         spin_lock_irq(&adapter->sge.reg_lock);
2994
2995         /* FL threshold comparison uses < */
2996         ret = t3_sge_init_rspcntxt(adapter, q->rspq.cntxt_id, irq_vec_idx,
2997                                    q->rspq.phys_addr, q->rspq.size,
2998                                    q->fl[0].buf_size, 1, 0);
2999         if (ret)
3000                 goto err_unlock;
3001
3002         for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
3003                 ret = t3_sge_init_flcntxt(adapter, q->fl[i].cntxt_id, 0,
3004                                           q->fl[i].phys_addr, q->fl[i].size,
3005                                           q->fl[i].buf_size, p->cong_thres, 1,
3006                                           0);
3007                 if (ret)
3008                         goto err_unlock;
3009         }
3010
3011         ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_ETH].cntxt_id, USE_GTS,
3012                                  SGE_CNTXT_ETH, id, q->txq[TXQ_ETH].phys_addr,
3013                                  q->txq[TXQ_ETH].size, q->txq[TXQ_ETH].token,
3014                                  1, 0);
3015         if (ret)
3016                 goto err_unlock;
3017
3018         if (ntxq > 1) {
3019                 ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_OFLD].cntxt_id,
3020                                          USE_GTS, SGE_CNTXT_OFLD, id,
3021                                          q->txq[TXQ_OFLD].phys_addr,
3022                                          q->txq[TXQ_OFLD].size, 0, 1, 0);
3023                 if (ret)
3024                         goto err_unlock;
3025         }
3026
3027         if (ntxq > 2) {
3028                 ret = t3_sge_init_ecntxt(adapter, q->txq[TXQ_CTRL].cntxt_id, 0,
3029                                          SGE_CNTXT_CTRL, id,
3030                                          q->txq[TXQ_CTRL].phys_addr,
3031                                          q->txq[TXQ_CTRL].size,
3032                                          q->txq[TXQ_CTRL].token, 1, 0);
3033                 if (ret)
3034                         goto err_unlock;
3035         }
3036
3037         spin_unlock_irq(&adapter->sge.reg_lock);
3038
3039         q->adap = adapter;
3040         q->netdev = dev;
3041         q->tx_q = netdevq;
3042         t3_update_qset_coalesce(q, p);
3043
3044         init_lro_mgr(q, lro_mgr);
3045
3046         avail = refill_fl(adapter, &q->fl[0], q->fl[0].size,
3047                           GFP_KERNEL | __GFP_COMP);
3048         if (!avail) {
3049                 CH_ALERT(adapter, "free list queue 0 initialization failed\n");
3050                 goto err;
3051         }
3052         if (avail < q->fl[0].size)
3053                 CH_WARN(adapter, "free list queue 0 enabled with %d credits\n",
3054                         avail);
3055
3056         avail = refill_fl(adapter, &q->fl[1], q->fl[1].size,
3057                           GFP_KERNEL | __GFP_COMP);
3058         if (avail < q->fl[1].size)
3059                 CH_WARN(adapter, "free list queue 1 enabled with %d credits\n",
3060                         avail);
3061         refill_rspq(adapter, &q->rspq, q->rspq.size - 1);
3062
3063         t3_write_reg(adapter, A_SG_GTS, V_RSPQ(q->rspq.cntxt_id) |
3064                      V_NEWTIMER(q->rspq.holdoff_tmr));
3065
3066         mod_timer(&q->tx_reclaim_timer, jiffies + TX_RECLAIM_PERIOD);
3067         return 0;
3068
3069 err_unlock:
3070         spin_unlock_irq(&adapter->sge.reg_lock);
3071 err:
3072         t3_free_qset(adapter, q);
3073         return ret;
3074 }
3075
3076 /**
3077  *      t3_stop_sge_timers - stop SGE timer call backs
3078  *      @adap: the adapter
3079  *
3080  *      Stops each SGE queue set's timer call back
3081  */
3082 void t3_stop_sge_timers(struct adapter *adap)
3083 {
3084         int i;
3085
3086         for (i = 0; i < SGE_QSETS; ++i) {
3087                 struct sge_qset *q = &adap->sge.qs[i];
3088
3089                 if (q->tx_reclaim_timer.function)
3090                         del_timer_sync(&q->tx_reclaim_timer);
3091         }
3092 }
3093
3094 /**
3095  *      t3_free_sge_resources - free SGE resources
3096  *      @adap: the adapter
3097  *
3098  *      Frees resources used by the SGE queue sets.
3099  */
3100 void t3_free_sge_resources(struct adapter *adap)
3101 {
3102         int i;
3103
3104         for (i = 0; i < SGE_QSETS; ++i)
3105                 t3_free_qset(adap, &adap->sge.qs[i]);
3106 }
3107
3108 /**
3109  *      t3_sge_start - enable SGE
3110  *      @adap: the adapter
3111  *
3112  *      Enables the SGE for DMAs.  This is the last step in starting packet
3113  *      transfers.
3114  */
3115 void t3_sge_start(struct adapter *adap)
3116 {
3117         t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, F_GLOBALENABLE);
3118 }
3119
3120 /**
3121  *      t3_sge_stop - disable SGE operation
3122  *      @adap: the adapter
3123  *
3124  *      Disables the DMA engine.  This can be called in emeregencies (e.g.,
3125  *      from error interrupts) or from normal process context.  In the latter
3126  *      case it also disables any pending queue restart tasklets.  Note that
3127  *      if it is called in interrupt context it cannot disable the restart
3128  *      tasklets as it cannot wait, however the tasklets will have no effect
3129  *      since the doorbells are disabled and the driver will call this again
3130  *      later from process context, at which time the tasklets will be stopped
3131  *      if they are still running.
3132  */
3133 void t3_sge_stop(struct adapter *adap)
3134 {
3135         t3_set_reg_field(adap, A_SG_CONTROL, F_GLOBALENABLE, 0);
3136         if (!in_interrupt()) {
3137                 int i;
3138
3139                 for (i = 0; i < SGE_QSETS; ++i) {
3140                         struct sge_qset *qs = &adap->sge.qs[i];
3141
3142                         tasklet_kill(&qs->txq[TXQ_OFLD].qresume_tsk);
3143                         tasklet_kill(&qs->txq[TXQ_CTRL].qresume_tsk);
3144                 }
3145         }
3146 }
3147
3148 /**
3149  *      t3_sge_init - initialize SGE
3150  *      @adap: the adapter
3151  *      @p: the SGE parameters
3152  *
3153  *      Performs SGE initialization needed every time after a chip reset.
3154  *      We do not initialize any of the queue sets here, instead the driver
3155  *      top-level must request those individually.  We also do not enable DMA
3156  *      here, that should be done after the queues have been set up.
3157  */
3158 void t3_sge_init(struct adapter *adap, struct sge_params *p)
3159 {
3160         unsigned int ctrl, ups = ffs(pci_resource_len(adap->pdev, 2) >> 12);
3161
3162         ctrl = F_DROPPKT | V_PKTSHIFT(2) | F_FLMODE | F_AVOIDCQOVFL |
3163             F_CQCRDTCTRL | F_CONGMODE | F_TNLFLMODE | F_FATLPERREN |
3164             V_HOSTPAGESIZE(PAGE_SHIFT - 11) | F_BIGENDIANINGRESS |
3165             V_USERSPACESIZE(ups ? ups - 1 : 0) | F_ISCSICOALESCING;
3166 #if SGE_NUM_GENBITS == 1
3167         ctrl |= F_EGRGENCTRL;
3168 #endif
3169         if (adap->params.rev > 0) {
3170                 if (!(adap->flags & (USING_MSIX | USING_MSI)))
3171                         ctrl |= F_ONEINTMULTQ | F_OPTONEINTMULTQ;
3172         }
3173         t3_write_reg(adap, A_SG_CONTROL, ctrl);
3174         t3_write_reg(adap, A_SG_EGR_RCQ_DRB_THRSH, V_HIRCQDRBTHRSH(512) |
3175                      V_LORCQDRBTHRSH(512));
3176         t3_write_reg(adap, A_SG_TIMER_TICK, core_ticks_per_usec(adap) / 10);
3177         t3_write_reg(adap, A_SG_CMDQ_CREDIT_TH, V_THRESHOLD(32) |
3178                      V_TIMEOUT(200 * core_ticks_per_usec(adap)));
3179         t3_write_reg(adap, A_SG_HI_DRB_HI_THRSH,
3180                      adap->params.rev < T3_REV_C ? 1000 : 500);
3181         t3_write_reg(adap, A_SG_HI_DRB_LO_THRSH, 256);
3182         t3_write_reg(adap, A_SG_LO_DRB_HI_THRSH, 1000);
3183         t3_write_reg(adap, A_SG_LO_DRB_LO_THRSH, 256);
3184         t3_write_reg(adap, A_SG_OCO_BASE, V_BASE1(0xfff));
3185         t3_write_reg(adap, A_SG_DRB_PRI_THRESH, 63 * 1024);
3186 }
3187
3188 /**
3189  *      t3_sge_prep - one-time SGE initialization
3190  *      @adap: the associated adapter
3191  *      @p: SGE parameters
3192  *
3193  *      Performs one-time initialization of SGE SW state.  Includes determining
3194  *      defaults for the assorted SGE parameters, which admins can change until
3195  *      they are used to initialize the SGE.
3196  */
3197 void t3_sge_prep(struct adapter *adap, struct sge_params *p)
3198 {
3199         int i;
3200
3201         p->max_pkt_size = (16 * 1024) - sizeof(struct cpl_rx_data) -
3202             SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
3203
3204         for (i = 0; i < SGE_QSETS; ++i) {
3205                 struct qset_params *q = p->qset + i;
3206
3207                 q->polling = adap->params.rev > 0;
3208                 q->coalesce_usecs = 5;
3209                 q->rspq_size = 1024;
3210                 q->fl_size = 1024;
3211                 q->jumbo_size = 512;
3212                 q->txq_size[TXQ_ETH] = 1024;
3213                 q->txq_size[TXQ_OFLD] = 1024;
3214                 q->txq_size[TXQ_CTRL] = 256;
3215                 q->cong_thres = 0;
3216         }
3217
3218         spin_lock_init(&adap->sge.reg_lock);
3219 }
3220
3221 /**
3222  *      t3_get_desc - dump an SGE descriptor for debugging purposes
3223  *      @qs: the queue set
3224  *      @qnum: identifies the specific queue (0..2: Tx, 3:response, 4..5: Rx)
3225  *      @idx: the descriptor index in the queue
3226  *      @data: where to dump the descriptor contents
3227  *
3228  *      Dumps the contents of a HW descriptor of an SGE queue.  Returns the
3229  *      size of the descriptor.
3230  */
3231 int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
3232                 unsigned char *data)
3233 {
3234         if (qnum >= 6)
3235                 return -EINVAL;
3236
3237         if (qnum < 3) {
3238                 if (!qs->txq[qnum].desc || idx >= qs->txq[qnum].size)
3239                         return -EINVAL;
3240                 memcpy(data, &qs->txq[qnum].desc[idx], sizeof(struct tx_desc));
3241                 return sizeof(struct tx_desc);
3242         }
3243
3244         if (qnum == 3) {
3245                 if (!qs->rspq.desc || idx >= qs->rspq.size)
3246                         return -EINVAL;
3247                 memcpy(data, &qs->rspq.desc[idx], sizeof(struct rsp_desc));
3248                 return sizeof(struct rsp_desc);
3249         }
3250
3251         qnum -= 4;
3252         if (!qs->fl[qnum].desc || idx >= qs->fl[qnum].size)
3253                 return -EINVAL;
3254         memcpy(data, &qs->fl[qnum].desc[idx], sizeof(struct rx_desc));
3255         return sizeof(struct rx_desc);
3256 }