mxge: Use ringmap to do MSI-X cpu assignment and fill redirect table.
sys/dev/netif/mxge/if_mxge.c
1 /******************************************************************************
3 Copyright (c) 2006-2013, Myricom Inc.
4 All rights reserved.
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are met:
9 1. Redistributions of source code must retain the above copyright notice,
10 this list of conditions and the following disclaimer.
12 2. Neither the name of the Myricom Inc, nor the names of its
13 contributors may be used to endorse or promote products derived from
14 this software without specific prior written permission.
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
20 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 POSSIBILITY OF SUCH DAMAGE.
28 $FreeBSD: head/sys/dev/mxge/if_mxge.c 254263 2013-08-12 23:30:01Z scottl $
30 ***************************************************************************/
32 #include "opt_ifpoll.h"
33 #include "opt_inet.h"
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/linker.h>
38 #include <sys/firmware.h>
39 #include <sys/endian.h>
40 #include <sys/in_cksum.h>
41 #include <sys/sockio.h>
42 #include <sys/mbuf.h>
43 #include <sys/malloc.h>
44 #include <sys/kernel.h>
45 #include <sys/module.h>
46 #include <sys/serialize.h>
47 #include <sys/socket.h>
48 #include <sys/sysctl.h>
50 #include <net/if.h>
51 #include <net/if_arp.h>
52 #include <net/ifq_var.h>
53 #include <net/ethernet.h>
54 #include <net/if_dl.h>
55 #include <net/if_media.h>
56 #include <net/if_poll.h>
58 #include <net/bpf.h>
60 #include <net/if_types.h>
61 #include <net/vlan/if_vlan_var.h>
62 #include <net/zlib.h>
63 #include <net/toeplitz.h>
65 #include <netinet/in_systm.h>
66 #include <netinet/in.h>
67 #include <netinet/ip.h>
68 #include <netinet/tcp.h>
70 #include <sys/bus.h>
71 #include <sys/rman.h>
73 #include <bus/pci/pcireg.h>
74 #include <bus/pci/pcivar.h>
75 #include <bus/pci/pci_private.h> /* XXX for pci_cfg_restore */
77 #include <vm/vm.h> /* for pmap_mapdev() */
78 #include <vm/pmap.h>
80 #if defined(__x86_64__)
81 #include <machine/specialreg.h>
82 #endif
84 #include <dev/netif/mxge/mxge_mcp.h>
85 #include <dev/netif/mxge/mcp_gen_header.h>
86 #include <dev/netif/mxge/if_mxge_var.h>
88 #define MXGE_IFM (IFM_ETHER | IFM_FDX | IFM_ETH_FORCEPAUSE)
90 #define MXGE_RX_SMALL_BUFLEN (MHLEN - MXGEFW_PAD)
91 #define MXGE_HWRSS_KEYLEN 16
93 /* Tunable params */
94 static int mxge_nvidia_ecrc_enable = 1;
95 static int mxge_force_firmware = 0;
96 static int mxge_intr_coal_delay = MXGE_INTR_COAL_DELAY;
97 static int mxge_deassert_wait = 1;
98 static int mxge_ticks;
99 static int mxge_num_slices = 0;
100 static int mxge_always_promisc = 0;
101 static int mxge_throttle = 0;
102 static int mxge_msi_enable = 1;
103 static int mxge_msix_enable = 1;
104 static int mxge_multi_tx = 1;
106 * Don't use RSS by default, it's just too slow
108 static int mxge_use_rss = 0;
110 static char mxge_flowctrl[IFM_ETH_FC_STRLEN] = IFM_ETH_FC_FORCE_FULL;
112 static const char *mxge_fw_unaligned = "mxge_ethp_z8e";
113 static const char *mxge_fw_aligned = "mxge_eth_z8e";
114 static const char *mxge_fw_rss_aligned = "mxge_rss_eth_z8e";
115 static const char *mxge_fw_rss_unaligned = "mxge_rss_ethp_z8e";
117 TUNABLE_INT("hw.mxge.num_slices", &mxge_num_slices);
118 TUNABLE_INT("hw.mxge.intr_coal_delay", &mxge_intr_coal_delay);
119 TUNABLE_INT("hw.mxge.nvidia_ecrc_enable", &mxge_nvidia_ecrc_enable);
120 TUNABLE_INT("hw.mxge.force_firmware", &mxge_force_firmware);
121 TUNABLE_INT("hw.mxge.deassert_wait", &mxge_deassert_wait);
122 TUNABLE_INT("hw.mxge.ticks", &mxge_ticks);
123 TUNABLE_INT("hw.mxge.always_promisc", &mxge_always_promisc);
124 TUNABLE_INT("hw.mxge.throttle", &mxge_throttle);
125 TUNABLE_INT("hw.mxge.multi_tx", &mxge_multi_tx);
126 TUNABLE_INT("hw.mxge.use_rss", &mxge_use_rss);
127 TUNABLE_INT("hw.mxge.msi.enable", &mxge_msi_enable);
128 TUNABLE_INT("hw.mxge.msix.enable", &mxge_msix_enable);
129 TUNABLE_STR("hw.mxge.flow_ctrl", mxge_flowctrl, sizeof(mxge_flowctrl));
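/*
 * Editor's note -- an illustrative (hypothetical) /boot/loader.conf
 * fragment showing how these knobs are set at boot; the values are
 * made-up examples, not recommendations:
 *
 *   hw.mxge.num_slices=4        # cap the number of RX/TX slices
 *   hw.mxge.intr_coal_delay=30  # interrupt coalescing delay, in usecs
 *   hw.mxge.multi_tx=0          # single TX ring even with many slices
 *
 * hw.mxge.flow_ctrl takes a flow-control mode string; the default is
 * forced full flow control (IFM_ETH_FC_FORCE_FULL).
 */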
131 static int mxge_probe(device_t dev);
132 static int mxge_attach(device_t dev);
133 static int mxge_detach(device_t dev);
134 static int mxge_shutdown(device_t dev);
136 static int mxge_alloc_intr(struct mxge_softc *sc);
137 static void mxge_free_intr(struct mxge_softc *sc);
138 static int mxge_setup_intr(struct mxge_softc *sc);
139 static void mxge_teardown_intr(struct mxge_softc *sc, int cnt);
141 static device_method_t mxge_methods[] = {
142 /* Device interface */
143 DEVMETHOD(device_probe, mxge_probe),
144 DEVMETHOD(device_attach, mxge_attach),
145 DEVMETHOD(device_detach, mxge_detach),
146 DEVMETHOD(device_shutdown, mxge_shutdown),
147 DEVMETHOD_END
150 static driver_t mxge_driver = {
151 "mxge",
152 mxge_methods,
153 sizeof(mxge_softc_t),
156 static devclass_t mxge_devclass;
158 /* Declare ourselves to be a child of the PCI bus.*/
159 DRIVER_MODULE(mxge, pci, mxge_driver, mxge_devclass, NULL, NULL);
160 MODULE_DEPEND(mxge, firmware, 1, 1, 1);
161 MODULE_DEPEND(mxge, zlib, 1, 1, 1);
163 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
164 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
165 static void mxge_close(mxge_softc_t *sc, int down);
166 static int mxge_open(mxge_softc_t *sc);
167 static void mxge_tick(void *arg);
168 static void mxge_watchdog_reset(mxge_softc_t *sc);
169 static void mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice);
171 static int
172 mxge_probe(device_t dev)
174 if (pci_get_vendor(dev) == MXGE_PCI_VENDOR_MYRICOM &&
175 (pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E ||
176 pci_get_device(dev) == MXGE_PCI_DEVICE_Z8E_9)) {
177 int rev = pci_get_revid(dev);
179 switch (rev) {
180 case MXGE_PCI_REV_Z8E:
181 device_set_desc(dev, "Myri10G-PCIE-8A");
182 break;
183 case MXGE_PCI_REV_Z8ES:
184 device_set_desc(dev, "Myri10G-PCIE-8B");
185 break;
186 default:
187 device_set_desc(dev, "Myri10G-PCIE-8??");
188 device_printf(dev, "Unrecognized rev %d NIC\n", rev);
189 break;
191 return 0;
193 return ENXIO;
196 static void
197 mxge_enable_wc(mxge_softc_t *sc)
199 #if defined(__x86_64__)
200 vm_offset_t len;
202 sc->wc = 1;
203 len = rman_get_size(sc->mem_res);
204 pmap_change_attr((vm_offset_t) sc->sram, len / PAGE_SIZE,
205 PAT_WRITE_COMBINING);
206 #endif
209 static int
210 mxge_dma_alloc(mxge_softc_t *sc, bus_dmamem_t *dma, size_t bytes,
211 bus_size_t alignment)
213 bus_size_t boundary;
214 int err;
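/*
 * Editor's note: a non-zero busdma boundary forbids the allocation from
 * crossing that boundary.  A 4KB-aligned allocation larger than 4KB must
 * necessarily cross 4KB lines, so the restriction is dropped for that
 * case; all other allocations are kept within a single 4KB line.
 */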
216 if (bytes > 4096 && alignment == 4096)
217 boundary = 0;
218 else
219 boundary = 4096;
221 err = bus_dmamem_coherent(sc->parent_dmat, alignment, boundary,
222 BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, bytes,
223 BUS_DMA_WAITOK | BUS_DMA_ZERO, dma);
224 if (err != 0) {
225 device_printf(sc->dev, "bus_dmamem_coherent failed: %d\n", err);
226 return err;
228 return 0;
231 static void
232 mxge_dma_free(bus_dmamem_t *dma)
234 bus_dmamap_unload(dma->dmem_tag, dma->dmem_map);
235 bus_dmamem_free(dma->dmem_tag, dma->dmem_addr, dma->dmem_map);
236 bus_dma_tag_destroy(dma->dmem_tag);
240 * The eeprom strings on the lanaiX have the format
241 * SN=x\0
242 * MAC=x:x:x:x:x:x\0
243 * PC=text\0
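 * A hypothetical example of the raw string set, for illustration only:
 *   SN=123456\0MAC=00:60:dd:12:34:56\0PC=EXAMPLE\0SN2=654321\0\0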
245 static int
246 mxge_parse_strings(mxge_softc_t *sc)
248 const char *ptr;
249 int i, found_mac, found_sn2;
250 char *endptr;
252 ptr = sc->eeprom_strings;
253 found_mac = 0;
254 found_sn2 = 0;
255 while (*ptr != '\0') {
256 if (strncmp(ptr, "MAC=", 4) == 0) {
257 ptr += 4;
258 for (i = 0;;) {
259 sc->mac_addr[i] = strtoul(ptr, &endptr, 16);
260 if (endptr - ptr != 2)
261 goto abort;
262 ptr = endptr;
263 if (++i == 6)
264 break;
265 if (*ptr++ != ':')
266 goto abort;
268 found_mac = 1;
269 } else if (strncmp(ptr, "PC=", 3) == 0) {
270 ptr += 3;
271 strlcpy(sc->product_code_string, ptr,
272 sizeof(sc->product_code_string));
273 } else if (!found_sn2 && (strncmp(ptr, "SN=", 3) == 0)) {
274 ptr += 3;
275 strlcpy(sc->serial_number_string, ptr,
276 sizeof(sc->serial_number_string));
277 } else if (strncmp(ptr, "SN2=", 4) == 0) {
278 /* SN2 takes precedence over SN */
279 ptr += 4;
280 found_sn2 = 1;
281 strlcpy(sc->serial_number_string, ptr,
282 sizeof(sc->serial_number_string));
284 while (*ptr++ != '\0') {}
287 if (found_mac)
288 return 0;
290 abort:
291 device_printf(sc->dev, "failed to parse eeprom_strings\n");
292 return ENXIO;
295 #if defined(__x86_64__)
297 static void
298 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
300 uint32_t val;
301 unsigned long base, off;
302 char *va, *cfgptr;
303 device_t pdev, mcp55;
304 uint16_t vendor_id, device_id, word;
305 uintptr_t bus, slot, func, ivend, idev;
306 uint32_t *ptr32;
308 if (!mxge_nvidia_ecrc_enable)
309 return;
311 pdev = device_get_parent(device_get_parent(sc->dev));
312 if (pdev == NULL) {
313 device_printf(sc->dev, "could not find parent?\n");
314 return;
316 vendor_id = pci_read_config(pdev, PCIR_VENDOR, 2);
317 device_id = pci_read_config(pdev, PCIR_DEVICE, 2);
319 if (vendor_id != 0x10de)
320 return;
322 base = 0;
324 if (device_id == 0x005d) {
325 /* ck804, base address is magic */
326 base = 0xe0000000UL;
327 } else if (device_id >= 0x0374 && device_id <= 0x378) {
328 /* mcp55, base address stored in chipset */
329 mcp55 = pci_find_bsf(0, 0, 0);
330 if (mcp55 &&
331 0x10de == pci_read_config(mcp55, PCIR_VENDOR, 2) &&
332 0x0369 == pci_read_config(mcp55, PCIR_DEVICE, 2)) {
333 word = pci_read_config(mcp55, 0x90, 2);
334 base = ((unsigned long)word & 0x7ffeU) << 25;
337 if (!base)
338 return;
341 * XXXX
342 * Test below is commented because it is believed that doing
343 * config read/write beyond 0xff will access the config space
344 * for the next larger function. Uncomment this and remove
345 * the hacky pmap_mapdev() way of accessing config space when
346 * DragonFly grows support for extended pcie config space access.
348 #if 0
350 * See if we can, by some miracle, access the extended
351 * config space
353 val = pci_read_config(pdev, 0x178, 4);
354 if (val != 0xffffffff) {
355 val |= 0x40;
356 pci_write_config(pdev, 0x178, val, 4);
357 return;
359 #endif
361 * Rather than using normal pci config space writes, we must
362 * map the Nvidia config space ourselves. This is because on
363 * opteron/nvidia class machine the 0xe000000 mapping is
364 * handled by the nvidia chipset, that means the internal PCI
365 * device (the on-chip northbridge), or the amd-8131 bridge
366 * and things behind them are not visible by this method.
369 BUS_READ_IVAR(device_get_parent(pdev), pdev,
370 PCI_IVAR_BUS, &bus);
371 BUS_READ_IVAR(device_get_parent(pdev), pdev,
372 PCI_IVAR_SLOT, &slot);
373 BUS_READ_IVAR(device_get_parent(pdev), pdev,
374 PCI_IVAR_FUNCTION, &func);
375 BUS_READ_IVAR(device_get_parent(pdev), pdev,
376 PCI_IVAR_VENDOR, &ivend);
377 BUS_READ_IVAR(device_get_parent(pdev), pdev,
378 PCI_IVAR_DEVICE, &idev);
380 off = base + 0x00100000UL * (unsigned long)bus +
381 0x00001000UL * (unsigned long)(func + 8 * slot);
383 /* map it into the kernel */
384 va = pmap_mapdev(trunc_page((vm_paddr_t)off), PAGE_SIZE);
385 if (va == NULL) {
386 device_printf(sc->dev, "pmap_mapdev() failed\n");
387 return;
389 /* get a pointer to the config space mapped into the kernel */
390 cfgptr = va + (off & PAGE_MASK);
392 /* make sure that we can really access it */
393 vendor_id = *(uint16_t *)(cfgptr + PCIR_VENDOR);
394 device_id = *(uint16_t *)(cfgptr + PCIR_DEVICE);
395 if (!(vendor_id == ivend && device_id == idev)) {
396 device_printf(sc->dev, "mapping failed: 0x%x:0x%x\n",
397 vendor_id, device_id);
398 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
399 return;
402 ptr32 = (uint32_t*)(cfgptr + 0x178);
403 val = *ptr32;
405 if (val == 0xffffffff) {
406 device_printf(sc->dev, "extended mapping failed\n");
407 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
408 return;
410 *ptr32 = val | 0x40;
411 pmap_unmapdev((vm_offset_t)va, PAGE_SIZE);
412 if (bootverbose) {
413 device_printf(sc->dev, "Enabled ECRC on upstream "
414 "Nvidia bridge at %d:%d:%d\n",
415 (int)bus, (int)slot, (int)func);
419 #else /* __x86_64__ */
421 static void
422 mxge_enable_nvidia_ecrc(mxge_softc_t *sc)
424 device_printf(sc->dev, "Nforce 4 chipset on non-x86/x86_64!?!?!\n");
427 #endif
429 static int
430 mxge_dma_test(mxge_softc_t *sc, int test_type)
432 mxge_cmd_t cmd;
433 bus_addr_t dmatest_bus = sc->dmabench_dma.dmem_busaddr;
434 int status;
435 uint32_t len;
436 const char *test = " ";
439 * Run a small DMA test.
440 * The magic multipliers to the length tell the firmware
441 * to do DMA read, write, or read+write tests. The
442 * results are returned in cmd.data0. The upper 16
443 * bits of the return is the number of transfers completed.
444 * The lower 16 bits is the time in 0.5us ticks that the
445 * transfers took to complete.
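 *
 * Illustrative arithmetic with a made-up return value: if the read
 * test returns cmd.data0 = (200 << 16) | 1000, then 200 transfers of
 * len = 4096 bytes finished in 1000 ticks (500us), and the code below
 * computes sc->read_dma = (200 * 4096 * 2) / 1000 ~= 1638 MB/s.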
448 len = sc->tx_boundary;
450 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
451 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
452 cmd.data2 = len * 0x10000;
453 status = mxge_send_cmd(sc, test_type, &cmd);
454 if (status != 0) {
455 test = "read";
456 goto abort;
458 sc->read_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
460 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
461 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
462 cmd.data2 = len * 0x1;
463 status = mxge_send_cmd(sc, test_type, &cmd);
464 if (status != 0) {
465 test = "write";
466 goto abort;
468 sc->write_dma = ((cmd.data0>>16) * len * 2) / (cmd.data0 & 0xffff);
470 cmd.data0 = MXGE_LOWPART_TO_U32(dmatest_bus);
471 cmd.data1 = MXGE_HIGHPART_TO_U32(dmatest_bus);
472 cmd.data2 = len * 0x10001;
473 status = mxge_send_cmd(sc, test_type, &cmd);
474 if (status != 0) {
475 test = "read/write";
476 goto abort;
478 sc->read_write_dma = ((cmd.data0>>16) * len * 2 * 2) /
479 (cmd.data0 & 0xffff);
481 abort:
482 if (status != 0 && test_type != MXGEFW_CMD_UNALIGNED_TEST) {
483 device_printf(sc->dev, "DMA %s benchmark failed: %d\n",
484 test, status);
486 return status;
490 * The Lanai Z8E PCI-E interface achieves higher Read-DMA throughput
491 * when the PCI-E Completion packets are aligned on an 8-byte
492 * boundary. Some PCI-E chip sets always align Completion packets; on
493 * the ones that do not, the alignment can be enforced by enabling
494 * ECRC generation (if supported).
496 * When PCI-E Completion packets are not aligned, it is actually more
497 * efficient to limit Read-DMA transactions to 2KB, rather than 4KB.
499 * If the driver can neither enable ECRC nor verify that it has
500 * already been enabled, then it must use a firmware image which works
501 * around unaligned completion packets (ethp_z8e.dat), and it should
502 * also ensure that it never gives the device a Read-DMA which is
503 * larger than 2KB by setting the tx_boundary to 2KB. If ECRC is
504 * enabled, then the driver should use the aligned (eth_z8e.dat)
505 * firmware image, and set tx_boundary to 4KB.
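 *
 * Summarizing the rules above:
 *   completions known aligned       -> eth_z8e firmware,  tx_boundary 4096
 *   unaligned / cannot be verified  -> ethp_z8e firmware, tx_boundary 2048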
507 static int
508 mxge_firmware_probe(mxge_softc_t *sc)
510 device_t dev = sc->dev;
511 int reg, status;
512 uint16_t pectl;
514 sc->tx_boundary = 4096;
517 * Verify the max read request size was set to 4KB
518 * before trying the test with 4KB.
520 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
521 pectl = pci_read_config(dev, reg + 0x8, 2);
522 if ((pectl & (5 << 12)) != (5 << 12)) {
523 device_printf(dev, "Max Read Req. size != 4k (0x%x)\n",
524 pectl);
525 sc->tx_boundary = 2048;
530 * Load the optimized firmware (which assumes aligned PCIe
531 * completions) in order to see if it works on this host.
533 sc->fw_name = mxge_fw_aligned;
534 status = mxge_load_firmware(sc, 1);
535 if (status != 0)
536 return status;
539 * Enable ECRC if possible
541 mxge_enable_nvidia_ecrc(sc);
544 * Run a DMA test which watches for unaligned completions and
545 * aborts on the first one seen. Not required on Z8ES or newer.
547 if (pci_get_revid(sc->dev) >= MXGE_PCI_REV_Z8ES)
548 return 0;
550 status = mxge_dma_test(sc, MXGEFW_CMD_UNALIGNED_TEST);
551 if (status == 0)
552 return 0; /* keep the aligned firmware */
554 if (status != E2BIG)
555 device_printf(dev, "DMA test failed: %d\n", status);
556 if (status == ENOSYS) {
557 device_printf(dev, "Falling back to ethp! "
558 "Please install up to date fw\n");
560 return status;
563 static int
564 mxge_select_firmware(mxge_softc_t *sc)
566 int aligned = 0;
567 int force_firmware = mxge_force_firmware;
569 if (sc->throttle)
570 force_firmware = sc->throttle;
572 if (force_firmware != 0) {
573 if (force_firmware == 1)
574 aligned = 1;
575 else
576 aligned = 0;
577 if (bootverbose) {
578 device_printf(sc->dev,
579 "Assuming %s completions (forced)\n",
580 aligned ? "aligned" : "unaligned");
582 goto abort;
586 * If the PCIe link width is 4 or less, we can use the aligned
587 * firmware and skip any checks
589 if (sc->link_width != 0 && sc->link_width <= 4) {
590 device_printf(sc->dev, "PCIe x%d Link, "
591 "expect reduced performance\n", sc->link_width);
592 aligned = 1;
593 goto abort;
596 if (mxge_firmware_probe(sc) == 0)
597 return 0;
599 abort:
600 if (aligned) {
601 sc->fw_name = mxge_fw_aligned;
602 sc->tx_boundary = 4096;
603 } else {
604 sc->fw_name = mxge_fw_unaligned;
605 sc->tx_boundary = 2048;
607 return mxge_load_firmware(sc, 0);
610 static int
611 mxge_validate_firmware(mxge_softc_t *sc, const mcp_gen_header_t *hdr)
613 if (be32toh(hdr->mcp_type) != MCP_TYPE_ETH) {
614 if_printf(sc->ifp, "Bad firmware type: 0x%x\n",
615 be32toh(hdr->mcp_type));
616 return EIO;
619 /* Save firmware version for sysctl */
620 strlcpy(sc->fw_version, hdr->version, sizeof(sc->fw_version));
621 if (bootverbose)
622 if_printf(sc->ifp, "firmware id: %s\n", hdr->version);
624 ksscanf(sc->fw_version, "%d.%d.%d", &sc->fw_ver_major,
625 &sc->fw_ver_minor, &sc->fw_ver_tiny);
627 if (!(sc->fw_ver_major == MXGEFW_VERSION_MAJOR &&
628 sc->fw_ver_minor == MXGEFW_VERSION_MINOR)) {
629 if_printf(sc->ifp, "Found firmware version %s\n",
630 sc->fw_version);
631 if_printf(sc->ifp, "Driver needs %d.%d\n",
632 MXGEFW_VERSION_MAJOR, MXGEFW_VERSION_MINOR);
633 return EINVAL;
635 return 0;
638 static void *
639 z_alloc(void *nil, u_int items, u_int size)
641 return kmalloc(items * size, M_TEMP, M_WAITOK);
644 static void
645 z_free(void *nil, void *ptr)
647 kfree(ptr, M_TEMP);
650 static int
651 mxge_load_firmware_helper(mxge_softc_t *sc, uint32_t *limit)
653 z_stream zs;
654 char *inflate_buffer;
655 const struct firmware *fw;
656 const mcp_gen_header_t *hdr;
657 unsigned hdr_offset;
658 int status;
659 unsigned int i;
660 char dummy;
661 size_t fw_len;
663 fw = firmware_get(sc->fw_name);
664 if (fw == NULL) {
665 if_printf(sc->ifp, "Could not find firmware image %s\n",
666 sc->fw_name);
667 return ENOENT;
670 /* Setup zlib and decompress f/w */
671 bzero(&zs, sizeof(zs));
672 zs.zalloc = z_alloc;
673 zs.zfree = z_free;
674 status = inflateInit(&zs);
675 if (status != Z_OK) {
676 status = EIO;
677 goto abort_with_fw;
681 * The uncompressed size is stored as the firmware version,
682 * which would otherwise go unused
684 fw_len = (size_t)fw->version;
685 inflate_buffer = kmalloc(fw_len, M_TEMP, M_WAITOK);
686 zs.avail_in = fw->datasize;
687 zs.next_in = __DECONST(char *, fw->data);
688 zs.avail_out = fw_len;
689 zs.next_out = inflate_buffer;
690 status = inflate(&zs, Z_FINISH);
691 if (status != Z_STREAM_END) {
692 if_printf(sc->ifp, "zlib %d\n", status);
693 status = EIO;
694 goto abort_with_buffer;
697 /* Check id */
698 hdr_offset =
699 htobe32(*(const uint32_t *)(inflate_buffer + MCP_HEADER_PTR_OFFSET));
700 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > fw_len) {
701 if_printf(sc->ifp, "Bad firmware file\n");
702 status = EIO;
703 goto abort_with_buffer;
705 hdr = (const void*)(inflate_buffer + hdr_offset);
707 status = mxge_validate_firmware(sc, hdr);
708 if (status != 0)
709 goto abort_with_buffer;
711 /* Copy the inflated firmware to NIC SRAM. */
712 for (i = 0; i < fw_len; i += 256) {
713 mxge_pio_copy(sc->sram + MXGE_FW_OFFSET + i, inflate_buffer + i,
714 min(256U, (unsigned)(fw_len - i)));
715 wmb();
716 dummy = *sc->sram;
717 wmb();
720 *limit = fw_len;
721 status = 0;
722 abort_with_buffer:
723 kfree(inflate_buffer, M_TEMP);
724 inflateEnd(&zs);
725 abort_with_fw:
726 firmware_put(fw, FIRMWARE_UNLOAD);
727 return status;
731 * Enable or disable periodic RDMAs from the host to make certain
732 * chipsets resend dropped PCIe messages
734 static void
735 mxge_dummy_rdma(mxge_softc_t *sc, int enable)
737 char buf_bytes[72];
738 volatile uint32_t *confirm;
739 volatile char *submit;
740 uint32_t *buf, dma_low, dma_high;
741 int i;
743 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
745 /* Clear confirmation addr */
746 confirm = (volatile uint32_t *)sc->cmd;
747 *confirm = 0;
748 wmb();
751 * Send an rdma command to the PCIe engine, and wait for the
752 * response in the confirmation address. The firmware should
753 * write a -1 there to indicate it is alive and well
755 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
756 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
757 buf[0] = htobe32(dma_high); /* confirm addr MSW */
758 buf[1] = htobe32(dma_low); /* confirm addr LSW */
759 buf[2] = htobe32(0xffffffff); /* confirm data */
760 dma_low = MXGE_LOWPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
761 dma_high = MXGE_HIGHPART_TO_U32(sc->zeropad_dma.dmem_busaddr);
762 buf[3] = htobe32(dma_high); /* dummy addr MSW */
763 buf[4] = htobe32(dma_low); /* dummy addr LSW */
764 buf[5] = htobe32(enable); /* enable? */
766 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_DUMMY_RDMA);
768 mxge_pio_copy(submit, buf, 64);
769 wmb();
770 DELAY(1000);
771 wmb();
772 i = 0;
773 while (*confirm != 0xffffffff && i < 20) {
774 DELAY(1000);
775 i++;
777 if (*confirm != 0xffffffff) {
778 if_printf(sc->ifp, "dummy rdma %s failed (%p = 0x%x)\n",
779 (enable ? "enable" : "disable"), confirm, *confirm);
783 static int
784 mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data)
786 mcp_cmd_t *buf;
787 char buf_bytes[sizeof(*buf) + 8];
788 volatile mcp_cmd_response_t *response = sc->cmd;
789 volatile char *cmd_addr = sc->sram + MXGEFW_ETH_CMD;
790 uint32_t dma_low, dma_high;
791 int err, sleep_total = 0;
793 /* Ensure buf is aligned to 8 bytes */
794 buf = (mcp_cmd_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
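/*
 * (Align-up illustration, editor's note: if buf_bytes were to start at
 * an address ending in 0x0b, then +7 gives ...0x12 and & ~7UL yields
 * ...0x10, the next 8-byte boundary.)
 */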
796 buf->data0 = htobe32(data->data0);
797 buf->data1 = htobe32(data->data1);
798 buf->data2 = htobe32(data->data2);
799 buf->cmd = htobe32(cmd);
800 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
801 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
803 buf->response_addr.low = htobe32(dma_low);
804 buf->response_addr.high = htobe32(dma_high);
806 response->result = 0xffffffff;
807 wmb();
808 mxge_pio_copy((volatile void *)cmd_addr, buf, sizeof (*buf));
811 * Wait up to 20ms
813 err = EAGAIN;
814 for (sleep_total = 0; sleep_total < 20; sleep_total++) {
815 wmb();
816 switch (be32toh(response->result)) {
817 case 0:
818 data->data0 = be32toh(response->data);
819 err = 0;
820 break;
821 case 0xffffffff:
822 DELAY(1000);
823 break;
824 case MXGEFW_CMD_UNKNOWN:
825 err = ENOSYS;
826 break;
827 case MXGEFW_CMD_ERROR_UNALIGNED:
828 err = E2BIG;
829 break;
830 case MXGEFW_CMD_ERROR_BUSY:
831 err = EBUSY;
832 break;
833 case MXGEFW_CMD_ERROR_I2C_ABSENT:
834 err = ENXIO;
835 break;
836 default:
837 if_printf(sc->ifp, "command %d failed, result = %d\n",
838 cmd, be32toh(response->result));
839 err = ENXIO;
840 break;
842 if (err != EAGAIN)
843 break;
845 if (err == EAGAIN) {
846 if_printf(sc->ifp, "command %d timed out result = %d\n",
847 cmd, be32toh(response->result));
849 return err;
852 static int
853 mxge_adopt_running_firmware(mxge_softc_t *sc)
855 struct mcp_gen_header *hdr;
856 const size_t bytes = sizeof(struct mcp_gen_header);
857 size_t hdr_offset;
858 int status;
861 * Find running firmware header
863 hdr_offset =
864 htobe32(*(volatile uint32_t *)(sc->sram + MCP_HEADER_PTR_OFFSET));
866 if ((hdr_offset & 3) || hdr_offset + sizeof(*hdr) > sc->sram_size) {
867 if_printf(sc->ifp, "Running firmware has bad header offset "
868 "(%zu)\n", hdr_offset);
869 return EIO;
873 * Copy header of running firmware from SRAM to host memory to
874 * validate firmware
876 hdr = kmalloc(bytes, M_DEVBUF, M_WAITOK);
877 bus_space_read_region_1(rman_get_bustag(sc->mem_res),
878 rman_get_bushandle(sc->mem_res), hdr_offset, (char *)hdr, bytes);
879 status = mxge_validate_firmware(sc, hdr);
880 kfree(hdr, M_DEVBUF);
883 * Check to see if adopted firmware has bug where adopting
884 * it will cause broadcasts to be filtered unless the NIC
885 * is kept in ALLMULTI mode
887 if (sc->fw_ver_major == 1 && sc->fw_ver_minor == 4 &&
888 sc->fw_ver_tiny >= 4 && sc->fw_ver_tiny <= 11) {
889 sc->adopted_rx_filter_bug = 1;
890 if_printf(sc->ifp, "Adopting fw %d.%d.%d: "
891 "working around rx filter bug\n",
892 sc->fw_ver_major, sc->fw_ver_minor, sc->fw_ver_tiny);
895 return status;
898 static int
899 mxge_load_firmware(mxge_softc_t *sc, int adopt)
901 volatile uint32_t *confirm;
902 volatile char *submit;
903 char buf_bytes[72];
904 uint32_t *buf, size, dma_low, dma_high;
905 int status, i;
907 buf = (uint32_t *)((unsigned long)(buf_bytes + 7) & ~7UL);
909 size = sc->sram_size;
910 status = mxge_load_firmware_helper(sc, &size);
911 if (status) {
912 if (!adopt)
913 return status;
916 * Try to use the currently running firmware, if
917 * it is new enough
919 status = mxge_adopt_running_firmware(sc);
920 if (status) {
921 if_printf(sc->ifp,
922 "failed to adopt running firmware\n");
923 return status;
925 if_printf(sc->ifp, "Successfully adopted running firmware\n");
927 if (sc->tx_boundary == 4096) {
928 if_printf(sc->ifp,
929 "Using firmware currently running on NIC. "
930 "For optimal\n");
931 if_printf(sc->ifp, "performance consider loading "
932 "optimized firmware\n");
934 sc->fw_name = mxge_fw_unaligned;
935 sc->tx_boundary = 2048;
936 return 0;
939 /* Clear confirmation addr */
940 confirm = (volatile uint32_t *)sc->cmd;
941 *confirm = 0;
942 wmb();
945 * Send a reload command to the bootstrap MCP, and wait for the
946 * response in the confirmation address. The firmware should
947 * write a -1 there to indicate it is alive and well
950 dma_low = MXGE_LOWPART_TO_U32(sc->cmd_dma.dmem_busaddr);
951 dma_high = MXGE_HIGHPART_TO_U32(sc->cmd_dma.dmem_busaddr);
953 buf[0] = htobe32(dma_high); /* confirm addr MSW */
954 buf[1] = htobe32(dma_low); /* confirm addr LSW */
955 buf[2] = htobe32(0xffffffff); /* confirm data */
958 * FIX: All newest firmware should un-protect the bottom of
959 * the sram before handoff. However, the very first interfaces
960 * do not. Therefore the handoff copy must skip the first 8 bytes
962 /* where the code starts*/
963 buf[3] = htobe32(MXGE_FW_OFFSET + 8);
964 buf[4] = htobe32(size - 8); /* length of code */
965 buf[5] = htobe32(8); /* where to copy to */
966 buf[6] = htobe32(0); /* where to jump to */
968 submit = (volatile char *)(sc->sram + MXGEFW_BOOT_HANDOFF);
969 mxge_pio_copy(submit, buf, 64);
970 wmb();
971 DELAY(1000);
972 wmb();
973 i = 0;
974 while (*confirm != 0xffffffff && i < 20) {
975 DELAY(1000*10);
976 i++;
978 if (*confirm != 0xffffffff) {
979 if_printf(sc->ifp, "handoff failed (%p = 0x%x)\n",
980 confirm, *confirm);
981 return ENXIO;
983 return 0;
986 static int
987 mxge_update_mac_address(mxge_softc_t *sc)
989 mxge_cmd_t cmd;
990 uint8_t *addr = sc->mac_addr;
992 cmd.data0 = (addr[0] << 24) | (addr[1] << 16) |
993 (addr[2] << 8) | addr[3];
994 cmd.data1 = (addr[4] << 8) | (addr[5]);
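/*
 * (Worked example with a hypothetical MAC 00:60:dd:12:34:56:
 * data0 = 0x0060dd12, data1 = 0x00003456.)
 */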
995 return mxge_send_cmd(sc, MXGEFW_SET_MAC_ADDRESS, &cmd);
998 static int
999 mxge_change_pause(mxge_softc_t *sc, int pause)
1001 mxge_cmd_t cmd;
1002 int status;
1004 bzero(&cmd, sizeof(cmd)); /* silence gcc warning */
1005 if (pause)
1006 status = mxge_send_cmd(sc, MXGEFW_ENABLE_FLOW_CONTROL, &cmd);
1007 else
1008 status = mxge_send_cmd(sc, MXGEFW_DISABLE_FLOW_CONTROL, &cmd);
1009 if (status) {
1010 if_printf(sc->ifp, "Failed to set flow control mode\n");
1011 return ENXIO;
1013 sc->pause = pause;
1014 return 0;
1017 static void
1018 mxge_change_promisc(mxge_softc_t *sc, int promisc)
1020 mxge_cmd_t cmd;
1021 int status;
1023 bzero(&cmd, sizeof(cmd)); /* avoid gcc warning */
1024 if (mxge_always_promisc)
1025 promisc = 1;
1027 if (promisc)
1028 status = mxge_send_cmd(sc, MXGEFW_ENABLE_PROMISC, &cmd);
1029 else
1030 status = mxge_send_cmd(sc, MXGEFW_DISABLE_PROMISC, &cmd);
1031 if (status)
1032 if_printf(sc->ifp, "Failed to set promisc mode\n");
1035 static void
1036 mxge_set_multicast_list(mxge_softc_t *sc)
1038 mxge_cmd_t cmd;
1039 struct ifmultiaddr *ifma;
1040 struct ifnet *ifp = sc->ifp;
1041 int err;
1043 /* This firmware is known to not support multicast */
1044 if (!sc->fw_multicast_support)
1045 return;
1047 /* Disable multicast filtering while we play with the lists*/
1048 bzero(&cmd, sizeof(cmd)); /* silence gcc warning */
1049 err = mxge_send_cmd(sc, MXGEFW_ENABLE_ALLMULTI, &cmd);
1050 if (err != 0) {
1051 if_printf(ifp, "Failed MXGEFW_ENABLE_ALLMULTI, "
1052 "error status: %d\n", err);
1053 return;
1056 if (sc->adopted_rx_filter_bug)
1057 return;
1059 if (ifp->if_flags & IFF_ALLMULTI) {
1060 /* Request to disable multicast filtering, so quit here */
1061 return;
1064 /* Flush all the filters */
1065 err = mxge_send_cmd(sc, MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, &cmd);
1066 if (err != 0) {
1067 if_printf(ifp, "Failed MXGEFW_LEAVE_ALL_MULTICAST_GROUPS, "
1068 "error status: %d\n", err);
1069 return;
1073 * Walk the multicast list, and add each address
1075 TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
1076 if (ifma->ifma_addr->sa_family != AF_LINK)
1077 continue;
1079 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
1080 &cmd.data0, 4);
1081 bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr) + 4,
1082 &cmd.data1, 2);
1083 cmd.data0 = htonl(cmd.data0);
1084 cmd.data1 = htonl(cmd.data1);
1085 err = mxge_send_cmd(sc, MXGEFW_JOIN_MULTICAST_GROUP, &cmd);
1086 if (err != 0) {
1087 if_printf(ifp, "Failed MXGEFW_JOIN_MULTICAST_GROUP, "
1088 "error status: %d\n", err);
1089 /* Abort, leaving multicast filtering off */
1090 return;
1094 /* Enable multicast filtering */
1095 err = mxge_send_cmd(sc, MXGEFW_DISABLE_ALLMULTI, &cmd);
1096 if (err != 0) {
1097 if_printf(ifp, "Failed MXGEFW_DISABLE_ALLMULTI, "
1098 "error status: %d\n", err);
1102 #if 0
1103 static int
1104 mxge_max_mtu(mxge_softc_t *sc)
1106 mxge_cmd_t cmd;
1107 int status;
1109 if (MJUMPAGESIZE - MXGEFW_PAD > MXGEFW_MAX_MTU)
1110 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1112 /* try to set nbufs to see if we can
1113 use virtually contiguous jumbos */
1114 cmd.data0 = 0;
1115 status = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS,
1116 &cmd);
1117 if (status == 0)
1118 return MXGEFW_MAX_MTU - MXGEFW_PAD;
1120 /* otherwise, we're limited to MJUMPAGESIZE */
1121 return MJUMPAGESIZE - MXGEFW_PAD;
1123 #endif
1125 static int
1126 mxge_reset(mxge_softc_t *sc, int interrupts_setup)
1128 struct mxge_slice_state *ss;
1129 mxge_rx_done_t *rx_done;
1130 volatile uint32_t *irq_claim;
1131 mxge_cmd_t cmd;
1132 int slice, status, rx_intr_size;
1135 * Try to send a reset command to the card to see if it
1136 * is alive
1138 memset(&cmd, 0, sizeof (cmd));
1139 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
1140 if (status != 0) {
1141 if_printf(sc->ifp, "failed reset\n");
1142 return ENXIO;
1145 mxge_dummy_rdma(sc, 1);
1148 * Set the intrq size
1149 * XXX assume 4byte mcp_slot
1151 rx_intr_size = sc->rx_intr_slots * sizeof(mcp_slot_t);
1152 cmd.data0 = rx_intr_size;
1153 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
1156 * Even though we already know how many slices are supported
1157 * via mxge_slice_probe(), MXGEFW_CMD_GET_MAX_RSS_QUEUES
1158 * has magic side effects, and must be called after a reset.
1159 * It must be called prior to calling any RSS related cmds,
1160 * including assigning an interrupt queue for anything but
1161 * slice 0. It must also be called *after*
1162 * MXGEFW_CMD_SET_INTRQ_SIZE, since the intrq size is used by
1163 * the firmware to compute offsets.
1165 if (sc->num_slices > 1) {
1166 /* Ask the maximum number of slices it supports */
1167 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
1168 if (status != 0) {
1169 if_printf(sc->ifp, "failed to get number of slices\n");
1170 return status;
1174 * MXGEFW_CMD_ENABLE_RSS_QUEUES must be called prior
1175 * to setting up the interrupt queue DMA
1177 cmd.data0 = sc->num_slices;
1178 cmd.data1 = MXGEFW_SLICE_INTR_MODE_ONE_PER_SLICE;
1179 if (sc->num_tx_rings > 1)
1180 cmd.data1 |= MXGEFW_SLICE_ENABLE_MULTIPLE_TX_QUEUES;
1181 status = mxge_send_cmd(sc, MXGEFW_CMD_ENABLE_RSS_QUEUES, &cmd);
1182 if (status != 0) {
1183 if_printf(sc->ifp, "failed to set number of slices\n");
1184 return status;
1188 if (interrupts_setup) {
1189 /* Now exchange information about interrupts */
1190 for (slice = 0; slice < sc->num_slices; slice++) {
1191 ss = &sc->ss[slice];
1193 rx_done = &ss->rx_data.rx_done;
1194 memset(rx_done->entry, 0, rx_intr_size);
1196 cmd.data0 =
1197 MXGE_LOWPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1198 cmd.data1 =
1199 MXGE_HIGHPART_TO_U32(ss->rx_done_dma.dmem_busaddr);
1200 cmd.data2 = slice;
1201 status |= mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_DMA,
1202 &cmd);
1206 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_INTR_COAL_DELAY_OFFSET,
1207 &cmd);
1208 sc->intr_coal_delay_ptr = (volatile uint32_t *)(sc->sram + cmd.data0);
1210 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_ACK_OFFSET, &cmd);
1211 irq_claim = (volatile uint32_t *)(sc->sram + cmd.data0);
1213 status |= mxge_send_cmd(sc, MXGEFW_CMD_GET_IRQ_DEASSERT_OFFSET, &cmd);
1214 sc->irq_deassert = (volatile uint32_t *)(sc->sram + cmd.data0);
1216 if (status != 0) {
1217 if_printf(sc->ifp, "failed set interrupt parameters\n");
1218 return status;
1221 *sc->intr_coal_delay_ptr = htobe32(sc->intr_coal_delay);
1223 /* Run a DMA benchmark */
1224 mxge_dma_test(sc, MXGEFW_DMA_TEST);
1226 for (slice = 0; slice < sc->num_slices; slice++) {
1227 ss = &sc->ss[slice];
1229 ss->irq_claim = irq_claim + (2 * slice);
1231 /* Reset mcp/driver shared state back to 0 */
1232 ss->rx_data.rx_done.idx = 0;
1233 ss->tx.req = 0;
1234 ss->tx.done = 0;
1235 ss->tx.pkt_done = 0;
1236 ss->tx.queue_active = 0;
1237 ss->tx.activate = 0;
1238 ss->tx.deactivate = 0;
1239 ss->rx_data.rx_big.cnt = 0;
1240 ss->rx_data.rx_small.cnt = 0;
1241 if (ss->fw_stats != NULL)
1242 bzero(ss->fw_stats, sizeof(*ss->fw_stats));
1244 sc->rdma_tags_available = 15;
1246 status = mxge_update_mac_address(sc);
1247 mxge_change_promisc(sc, sc->ifp->if_flags & IFF_PROMISC);
1248 mxge_change_pause(sc, sc->pause);
1249 mxge_set_multicast_list(sc);
1251 if (sc->throttle) {
1252 cmd.data0 = sc->throttle;
1253 if (mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd))
1254 if_printf(sc->ifp, "can't enable throttle\n");
1256 return status;
1259 static int
1260 mxge_change_throttle(SYSCTL_HANDLER_ARGS)
1262 mxge_cmd_t cmd;
1263 mxge_softc_t *sc;
1264 int err;
1265 unsigned int throttle;
1267 sc = arg1;
1268 throttle = sc->throttle;
1269 err = sysctl_handle_int(oidp, &throttle, arg2, req);
1270 if (err != 0)
1271 return err;
1273 if (throttle == sc->throttle)
1274 return 0;
1276 if (throttle < MXGE_MIN_THROTTLE || throttle > MXGE_MAX_THROTTLE)
1277 return EINVAL;
1279 ifnet_serialize_all(sc->ifp);
1281 cmd.data0 = throttle;
1282 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_THROTTLE_FACTOR, &cmd);
1283 if (err == 0)
1284 sc->throttle = throttle;
1286 ifnet_deserialize_all(sc->ifp);
1287 return err;
1290 static int
1291 mxge_change_use_rss(SYSCTL_HANDLER_ARGS)
1293 mxge_softc_t *sc;
1294 int err, use_rss;
1296 sc = arg1;
1297 use_rss = sc->use_rss;
1298 err = sysctl_handle_int(oidp, &use_rss, arg2, req);
1299 if (err != 0)
1300 return err;
1302 if (use_rss == sc->use_rss)
1303 return 0;
1305 ifnet_serialize_all(sc->ifp);
1307 sc->use_rss = use_rss;
1308 if (sc->ifp->if_flags & IFF_RUNNING) {
1309 mxge_close(sc, 0);
1310 mxge_open(sc);
1313 ifnet_deserialize_all(sc->ifp);
1314 return err;
1317 static int
1318 mxge_change_intr_coal(SYSCTL_HANDLER_ARGS)
1320 mxge_softc_t *sc;
1321 unsigned int intr_coal_delay;
1322 int err;
1324 sc = arg1;
1325 intr_coal_delay = sc->intr_coal_delay;
1326 err = sysctl_handle_int(oidp, &intr_coal_delay, arg2, req);
1327 if (err != 0)
1328 return err;
1330 if (intr_coal_delay == sc->intr_coal_delay)
1331 return 0;
1333 if (intr_coal_delay == 0 || intr_coal_delay > 1000*1000)
1334 return EINVAL;
1336 ifnet_serialize_all(sc->ifp);
1338 *sc->intr_coal_delay_ptr = htobe32(intr_coal_delay);
1339 sc->intr_coal_delay = intr_coal_delay;
1341 ifnet_deserialize_all(sc->ifp);
1342 return err;
1345 static int
1346 mxge_handle_be32(SYSCTL_HANDLER_ARGS)
1348 int err;
1350 if (arg1 == NULL)
1351 return EFAULT;
1352 arg2 = be32toh(*(int *)arg1);
1353 arg1 = NULL;
1354 err = sysctl_handle_int(oidp, arg1, arg2, req);
1356 return err;
1359 static void
1360 mxge_rem_sysctls(mxge_softc_t *sc)
1362 if (sc->ss != NULL) {
1363 struct mxge_slice_state *ss;
1364 int slice;
1366 for (slice = 0; slice < sc->num_slices; slice++) {
1367 ss = &sc->ss[slice];
1368 if (ss->sysctl_tree != NULL) {
1369 sysctl_ctx_free(&ss->sysctl_ctx);
1370 ss->sysctl_tree = NULL;
1375 if (sc->slice_sysctl_tree != NULL) {
1376 sysctl_ctx_free(&sc->slice_sysctl_ctx);
1377 sc->slice_sysctl_tree = NULL;
1381 static void
1382 mxge_add_sysctls(mxge_softc_t *sc)
1384 struct sysctl_ctx_list *ctx;
1385 struct sysctl_oid_list *children;
1386 mcp_irq_data_t *fw;
1387 struct mxge_slice_state *ss;
1388 int slice;
1389 char slice_num[8];
1391 ctx = device_get_sysctl_ctx(sc->dev);
1392 children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
1393 fw = sc->ss[0].fw_stats;
1396 * Random information
1398 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "firmware_version",
1399 CTLFLAG_RD, &sc->fw_version, 0, "firmware version");
1401 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "serial_number",
1402 CTLFLAG_RD, &sc->serial_number_string, 0, "serial number");
1404 SYSCTL_ADD_STRING(ctx, children, OID_AUTO, "product_code",
1405 CTLFLAG_RD, &sc->product_code_string, 0, "product code");
1407 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "pcie_link_width",
1408 CTLFLAG_RD, &sc->link_width, 0, "link width");
1410 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_boundary",
1411 CTLFLAG_RD, &sc->tx_boundary, 0, "tx boundary");
1413 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_combine",
1414 CTLFLAG_RD, &sc->wc, 0, "write combining PIO");
1416 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_dma_MBs",
1417 CTLFLAG_RD, &sc->read_dma, 0, "DMA Read speed in MB/s");
1419 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "write_dma_MBs",
1420 CTLFLAG_RD, &sc->write_dma, 0, "DMA Write speed in MB/s");
1422 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "read_write_dma_MBs",
1423 CTLFLAG_RD, &sc->read_write_dma, 0,
1424 "DMA concurrent Read/Write speed in MB/s");
1426 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "watchdog_resets",
1427 CTLFLAG_RD, &sc->watchdog_resets, 0,
1428 "Number of times NIC was reset");
1430 if (sc->num_slices > 1) {
1431 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "slice_cpumap",
1432 CTLTYPE_OPAQUE | CTLFLAG_RD, sc->ring_map, 0,
1433 if_ringmap_cpumap_sysctl, "I", "slice CPU map");
1437 * Performance related tunables
1439 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "intr_coal_delay",
1440 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_intr_coal, "I",
1441 "Interrupt coalescing delay in usecs");
1443 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "throttle",
1444 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_throttle, "I",
1445 "Transmit throttling");
1447 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "use_rss",
1448 CTLTYPE_INT|CTLFLAG_RW, sc, 0, mxge_change_use_rss, "I",
1449 "Use RSS");
1451 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "deassert_wait",
1452 CTLFLAG_RW, &mxge_deassert_wait, 0,
1453 "Wait for IRQ line to go low in ihandler");
1456 * Stats block from firmware is in network byte order.
1457 * Need to swap it
1459 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "link_up",
1460 CTLTYPE_INT|CTLFLAG_RD, &fw->link_up, 0,
1461 mxge_handle_be32, "I", "link up");
1463 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "rdma_tags_available",
1464 CTLTYPE_INT|CTLFLAG_RD, &fw->rdma_tags_available, 0,
1465 mxge_handle_be32, "I", "rdma_tags_available");
1467 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_crc32",
1468 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_crc32, 0,
1469 mxge_handle_be32, "I", "dropped_bad_crc32");
1471 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_bad_phy",
1472 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_bad_phy, 0,
1473 mxge_handle_be32, "I", "dropped_bad_phy");
1475 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_error_or_filtered",
1476 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_error_or_filtered, 0,
1477 mxge_handle_be32, "I", "dropped_link_error_or_filtered");
1479 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_link_overflow",
1480 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_link_overflow, 0,
1481 mxge_handle_be32, "I", "dropped_link_overflow");
1483 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_multicast_filtered",
1484 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_multicast_filtered, 0,
1485 mxge_handle_be32, "I", "dropped_multicast_filtered");
1487 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_big_buffer",
1488 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_big_buffer, 0,
1489 mxge_handle_be32, "I", "dropped_no_big_buffer");
1491 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_no_small_buffer",
1492 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_no_small_buffer, 0,
1493 mxge_handle_be32, "I", "dropped_no_small_buffer");
1495 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_overrun",
1496 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_overrun, 0,
1497 mxge_handle_be32, "I", "dropped_overrun");
1499 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_pause",
1500 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_pause, 0,
1501 mxge_handle_be32, "I", "dropped_pause");
1503 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_runt",
1504 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_runt, 0,
1505 mxge_handle_be32, "I", "dropped_runt");
1507 SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "dropped_unicast_filtered",
1508 CTLTYPE_INT|CTLFLAG_RD, &fw->dropped_unicast_filtered, 0,
1509 mxge_handle_be32, "I", "dropped_unicast_filtered");
1511 /* add counters exported for debugging from all slices */
1512 sysctl_ctx_init(&sc->slice_sysctl_ctx);
1513 sc->slice_sysctl_tree = SYSCTL_ADD_NODE(&sc->slice_sysctl_ctx,
1514 children, OID_AUTO, "slice", CTLFLAG_RD, 0, "");
1515 if (sc->slice_sysctl_tree == NULL) {
1516 device_printf(sc->dev, "can't add slice sysctl node\n");
1517 return;
1520 for (slice = 0; slice < sc->num_slices; slice++) {
1521 ss = &sc->ss[slice];
1522 sysctl_ctx_init(&ss->sysctl_ctx);
1523 ctx = &ss->sysctl_ctx;
1524 children = SYSCTL_CHILDREN(sc->slice_sysctl_tree);
1525 ksprintf(slice_num, "%d", slice);
1526 ss->sysctl_tree = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
1527 slice_num, CTLFLAG_RD, 0, "");
1528 if (ss->sysctl_tree == NULL) {
1529 device_printf(sc->dev,
1530 "can't add %d slice sysctl node\n", slice);
1531 return; /* XXX continue? */
1533 children = SYSCTL_CHILDREN(ss->sysctl_tree);
1536 * XXX change to ULONG
1539 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_small_cnt",
1540 CTLFLAG_RD, &ss->rx_data.rx_small.cnt, 0, "rx_small_cnt");
1542 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "rx_big_cnt",
1543 CTLFLAG_RD, &ss->rx_data.rx_big.cnt, 0, "rx_big_cnt");
1545 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_req",
1546 CTLFLAG_RD, &ss->tx.req, 0, "tx_req");
1548 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_done",
1549 CTLFLAG_RD, &ss->tx.done, 0, "tx_done");
1551 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_pkt_done",
1552 CTLFLAG_RD, &ss->tx.pkt_done, 0, "tx_pkt_done");
1554 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_queue_active",
1555 CTLFLAG_RD, &ss->tx.queue_active, 0, "tx_queue_active");
1557 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_activate",
1558 CTLFLAG_RD, &ss->tx.activate, 0, "tx_activate");
1560 SYSCTL_ADD_INT(ctx, children, OID_AUTO, "tx_deactivate",
1561 CTLFLAG_RD, &ss->tx.deactivate, 0, "tx_deactivate");
1566 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1567 * backwards one at a time and handle ring wraps
1569 static __inline void
1570 mxge_submit_req_backwards(mxge_tx_ring_t *tx,
1571 mcp_kreq_ether_send_t *src, int cnt)
1573 int idx, starting_slot;
1575 starting_slot = tx->req;
1576 while (cnt > 1) {
1577 cnt--;
1578 idx = (starting_slot + cnt) & tx->mask;
1579 mxge_pio_copy(&tx->lanai[idx], &src[cnt], sizeof(*src));
1580 wmb();
1585 * Copy an array of mcp_kreq_ether_send_t's to the mcp. Copy
1586 * at most 32 bytes at a time, so as to avoid involving the software
1587 * pio handler in the nic. We re-write the first segment's flags
1588 * to mark them valid only after writing the entire chain
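 * (Each mcp_kreq_ether_send_t is a 16-byte block -- note the "complete
 * solid 16-byte block" padding in mxge_encap() -- so the loop below
 * moves two requests per 32-byte PIO burst.)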
1590 static __inline void
1591 mxge_submit_req(mxge_tx_ring_t *tx, mcp_kreq_ether_send_t *src, int cnt)
1593 int idx, i;
1594 uint32_t *src_ints;
1595 volatile uint32_t *dst_ints;
1596 mcp_kreq_ether_send_t *srcp;
1597 volatile mcp_kreq_ether_send_t *dstp, *dst;
1598 uint8_t last_flags;
1600 idx = tx->req & tx->mask;
1602 last_flags = src->flags;
1603 src->flags = 0;
1604 wmb();
1605 dst = dstp = &tx->lanai[idx];
1606 srcp = src;
1608 if ((idx + cnt) < tx->mask) {
1609 for (i = 0; i < cnt - 1; i += 2) {
1610 mxge_pio_copy(dstp, srcp, 2 * sizeof(*src));
1611 wmb(); /* force write every 32 bytes */
1612 srcp += 2;
1613 dstp += 2;
1615 } else {
1617 * Submit all but the first request, and ensure
1618 * that it is submitted below
1620 mxge_submit_req_backwards(tx, src, cnt);
1621 i = 0;
1623 if (i < cnt) {
1624 /* Submit the first request */
1625 mxge_pio_copy(dstp, srcp, sizeof(*src));
1626 wmb(); /* barrier before setting valid flag */
1629 /* Re-write the last 32-bits with the valid flags */
1630 src->flags = last_flags;
1631 src_ints = (uint32_t *)src;
1632 src_ints+=3;
1633 dst_ints = (volatile uint32_t *)dst;
1634 dst_ints+=3;
1635 *dst_ints = *src_ints;
1636 tx->req += cnt;
1637 wmb();
1640 static int
1641 mxge_pullup_tso(struct mbuf **mp)
1643 int hoff, iphlen, thoff;
1644 struct mbuf *m;
1646 m = *mp;
1647 KASSERT(M_WRITABLE(m), ("TSO mbuf not writable"));
1649 iphlen = m->m_pkthdr.csum_iphlen;
1650 thoff = m->m_pkthdr.csum_thlen;
1651 hoff = m->m_pkthdr.csum_lhlen;
1653 KASSERT(iphlen > 0, ("invalid ip hlen"));
1654 KASSERT(thoff > 0, ("invalid tcp hlen"));
1655 KASSERT(hoff > 0, ("invalid ether hlen"));
1657 if (__predict_false(m->m_len < hoff + iphlen + thoff)) {
1658 m = m_pullup(m, hoff + iphlen + thoff);
1659 if (m == NULL) {
1660 *mp = NULL;
1661 return ENOBUFS;
1663 *mp = m;
1665 return 0;
1668 static int
1669 mxge_encap_tso(mxge_tx_ring_t *tx, struct mxge_buffer_state *info_map,
1670 struct mbuf *m, int busdma_seg_cnt)
1672 mcp_kreq_ether_send_t *req;
1673 bus_dma_segment_t *seg;
1674 uint32_t low, high_swapped;
1675 int len, seglen, cum_len, cum_len_next;
1676 int next_is_first, chop, cnt, rdma_count, small;
1677 uint16_t pseudo_hdr_offset, cksum_offset, mss;
1678 uint8_t flags, flags_next;
1679 struct mxge_buffer_state *info_last;
1680 bus_dmamap_t map = info_map->map;
1682 mss = m->m_pkthdr.tso_segsz;
1685 * Negative cum_len signifies to the send loop that we are
1686 * still in the header portion of the TSO packet.
1688 cum_len = -(m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen +
1689 m->m_pkthdr.csum_thlen);
1692 * TSO implies checksum offload on this hardware
1694 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1695 flags = MXGEFW_FLAGS_TSO_HDR | MXGEFW_FLAGS_FIRST;
1698 * For TSO, pseudo_hdr_offset holds mss. The firmware figures
1699 * out where to put the checksum by parsing the header.
1701 pseudo_hdr_offset = htobe16(mss);
1703 req = tx->req_list;
1704 seg = tx->seg_list;
1705 cnt = 0;
1706 rdma_count = 0;
1709 * "rdma_count" is the number of RDMAs belonging to the current
1710 * packet BEFORE the current send request. For non-TSO packets,
1711 * this is equal to "count".
1713 * For TSO packets, rdma_count needs to be reset to 0 after a
1714 * segment cut.
1716 * The rdma_count field of the send request is the number of
1717 * RDMAs of the packet starting at that request. For TSO send
1718 * requests with one or more cuts in the middle, this is the
1719 * number of RDMAs starting after the last cut in the request.
1720 * All previous segments before the last cut implicitly have 1
1721 * RDMA.
1723 * Since the number of RDMAs is not known beforehand, it must be
1724 * filled-in retroactively - after each segmentation cut or at
1725 * the end of the entire packet.
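 *
 * A hypothetical illustration: for a packet whose TSO header ends
 * inside the first busdma segment and whose payload is then cut once
 * at an mss boundary, the header's request gets rdma_count = 1 when
 * the header ends, the requests up to the cut share one retroactively
 * written count, and the tail count is patched in after the loop by
 * "(req - rdma_count)->rdma_count = rdma_count".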
1728 while (busdma_seg_cnt) {
1730 * Break the busdma segment up into pieces
1732 low = MXGE_LOWPART_TO_U32(seg->ds_addr);
1733 high_swapped = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1734 len = seg->ds_len;
1736 while (len) {
1737 flags_next = flags & ~MXGEFW_FLAGS_FIRST;
1738 seglen = len;
1739 cum_len_next = cum_len + seglen;
1740 (req - rdma_count)->rdma_count = rdma_count + 1;
1741 if (__predict_true(cum_len >= 0)) {
1742 /* Payload */
1743 chop = (cum_len_next > mss);
1744 cum_len_next = cum_len_next % mss;
1745 next_is_first = (cum_len_next == 0);
1746 flags |= chop * MXGEFW_FLAGS_TSO_CHOP;
1747 flags_next |=
1748 next_is_first * MXGEFW_FLAGS_FIRST;
1749 rdma_count |= -(chop | next_is_first);
1750 rdma_count += chop & !next_is_first;
1751 } else if (cum_len_next >= 0) {
1752 /* Header ends */
1753 rdma_count = -1;
1754 cum_len_next = 0;
1755 seglen = -cum_len;
1756 small = (mss <= MXGEFW_SEND_SMALL_SIZE);
1757 flags_next = MXGEFW_FLAGS_TSO_PLD |
1758 MXGEFW_FLAGS_FIRST |
1759 (small * MXGEFW_FLAGS_SMALL);
1762 req->addr_high = high_swapped;
1763 req->addr_low = htobe32(low);
1764 req->pseudo_hdr_offset = pseudo_hdr_offset;
1765 req->pad = 0;
1766 req->rdma_count = 1;
1767 req->length = htobe16(seglen);
1768 req->cksum_offset = cksum_offset;
1769 req->flags =
1770 flags | ((cum_len & 1) * MXGEFW_FLAGS_ALIGN_ODD);
1771 low += seglen;
1772 len -= seglen;
1773 cum_len = cum_len_next;
1774 flags = flags_next;
1775 req++;
1776 cnt++;
1777 rdma_count++;
1778 if (__predict_false(cksum_offset > seglen))
1779 cksum_offset -= seglen;
1780 else
1781 cksum_offset = 0;
1782 if (__predict_false(cnt > tx->max_desc))
1783 goto drop;
1785 busdma_seg_cnt--;
1786 seg++;
1788 (req - rdma_count)->rdma_count = rdma_count;
1790 do {
1791 req--;
1792 req->flags |= MXGEFW_FLAGS_TSO_LAST;
1793 } while (!(req->flags & (MXGEFW_FLAGS_TSO_CHOP | MXGEFW_FLAGS_FIRST)));
1795 info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1797 info_map->map = info_last->map;
1798 info_last->map = map;
1799 info_last->m = m;
1801 mxge_submit_req(tx, tx->req_list, cnt);
1803 if (tx->send_go != NULL && tx->queue_active == 0) {
1804 /* Tell the NIC to start polling this slice */
1805 *tx->send_go = 1;
1806 tx->queue_active = 1;
1807 tx->activate++;
1808 wmb();
1810 return 0;
1812 drop:
1813 bus_dmamap_unload(tx->dmat, tx->info[tx->req & tx->mask].map);
1814 m_freem(m);
1815 return ENOBUFS;
1818 static int
1819 mxge_encap(mxge_tx_ring_t *tx, struct mbuf *m, bus_addr_t zeropad)
1821 mcp_kreq_ether_send_t *req;
1822 bus_dma_segment_t *seg;
1823 bus_dmamap_t map;
1824 int cnt, cum_len, err, i, idx, odd_flag;
1825 uint16_t pseudo_hdr_offset;
1826 uint8_t flags, cksum_offset;
1827 struct mxge_buffer_state *info_map, *info_last;
1829 if (m->m_pkthdr.csum_flags & CSUM_TSO) {
1830 err = mxge_pullup_tso(&m);
1831 if (__predict_false(err))
1832 return err;
1836 * Map the frame for DMA
1838 idx = tx->req & tx->mask;
1839 info_map = &tx->info[idx];
1840 map = info_map->map;
1842 err = bus_dmamap_load_mbuf_defrag(tx->dmat, map, &m,
1843 tx->seg_list, tx->max_desc - 2, &cnt, BUS_DMA_NOWAIT);
1844 if (__predict_false(err != 0))
1845 goto drop;
1846 bus_dmamap_sync(tx->dmat, map, BUS_DMASYNC_PREWRITE);
1849 * TSO is different enough, we handle it in another routine
1851 if (m->m_pkthdr.csum_flags & CSUM_TSO)
1852 return mxge_encap_tso(tx, info_map, m, cnt);
1854 req = tx->req_list;
1855 cksum_offset = 0;
1856 pseudo_hdr_offset = 0;
1857 flags = MXGEFW_FLAGS_NO_TSO;
1860 * Checksum offloading
1862 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1863 cksum_offset = m->m_pkthdr.csum_lhlen + m->m_pkthdr.csum_iphlen;
1864 pseudo_hdr_offset = cksum_offset + m->m_pkthdr.csum_data;
1865 pseudo_hdr_offset = htobe16(pseudo_hdr_offset);
1866 req->cksum_offset = cksum_offset;
1867 flags |= MXGEFW_FLAGS_CKSUM;
1868 odd_flag = MXGEFW_FLAGS_ALIGN_ODD;
1869 } else {
1870 odd_flag = 0;
1872 if (m->m_pkthdr.len < MXGEFW_SEND_SMALL_SIZE)
1873 flags |= MXGEFW_FLAGS_SMALL;
1876 * Convert segments into a request list
1878 cum_len = 0;
1879 seg = tx->seg_list;
1880 req->flags = MXGEFW_FLAGS_FIRST;
1881 for (i = 0; i < cnt; i++) {
1882 req->addr_low = htobe32(MXGE_LOWPART_TO_U32(seg->ds_addr));
1883 req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg->ds_addr));
1884 req->length = htobe16(seg->ds_len);
1885 req->cksum_offset = cksum_offset;
1886 if (cksum_offset > seg->ds_len)
1887 cksum_offset -= seg->ds_len;
1888 else
1889 cksum_offset = 0;
1890 req->pseudo_hdr_offset = pseudo_hdr_offset;
1891 req->pad = 0; /* complete solid 16-byte block */
1892 req->rdma_count = 1;
1893 req->flags |= flags | ((cum_len & 1) * odd_flag);
1894 cum_len += seg->ds_len;
1895 seg++;
1896 req++;
1897 req->flags = 0;
1899 req--;
1902 * Pad runt to 60 bytes
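 * (60 bytes is ETHER_MIN_LEN (64) minus the 4-byte FCS that the NIC
 * appends; the padding is DMAed from the shared zeropad buffer.)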
1904 if (cum_len < 60) {
1905 req++;
1906 req->addr_low = htobe32(MXGE_LOWPART_TO_U32(zeropad));
1907 req->addr_high = htobe32(MXGE_HIGHPART_TO_U32(zeropad));
1908 req->length = htobe16(60 - cum_len);
1909 req->cksum_offset = 0;
1910 req->pseudo_hdr_offset = pseudo_hdr_offset;
1911 req->pad = 0; /* complete solid 16-byte block */
1912 req->rdma_count = 1;
1913 req->flags |= flags | ((cum_len & 1) * odd_flag);
1914 cnt++;
1917 tx->req_list[0].rdma_count = cnt;
1918 #if 0
1919 /* print what the firmware will see */
1920 for (i = 0; i < cnt; i++) {
1921 kprintf("%d: addr: 0x%x 0x%x len:%d pso%d,"
1922 "cso:%d, flags:0x%x, rdma:%d\n",
1923 i, (int)ntohl(tx->req_list[i].addr_high),
1924 (int)ntohl(tx->req_list[i].addr_low),
1925 (int)ntohs(tx->req_list[i].length),
1926 (int)ntohs(tx->req_list[i].pseudo_hdr_offset),
1927 tx->req_list[i].cksum_offset, tx->req_list[i].flags,
1928 tx->req_list[i].rdma_count);
1930 kprintf("--------------\n");
1931 #endif
1932 info_last = &tx->info[((cnt - 1) + tx->req) & tx->mask];
1934 info_map->map = info_last->map;
1935 info_last->map = map;
1936 info_last->m = m;
1938 mxge_submit_req(tx, tx->req_list, cnt);
1940 if (tx->send_go != NULL && tx->queue_active == 0) {
1941 /* Tell the NIC to start polling this slice */
1942 *tx->send_go = 1;
1943 tx->queue_active = 1;
1944 tx->activate++;
1945 wmb();
1947 return 0;
1949 drop:
1950 m_freem(m);
1951 return err;
1954 static void
1955 mxge_start(struct ifnet *ifp, struct ifaltq_subque *ifsq)
1957 mxge_softc_t *sc = ifp->if_softc;
1958 mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
1959 bus_addr_t zeropad;
1960 int encap = 0;
1962 KKASSERT(tx->ifsq == ifsq);
1963 ASSERT_SERIALIZED(&tx->tx_serialize);
1965 if ((ifp->if_flags & IFF_RUNNING) == 0 || ifsq_is_oactive(ifsq))
1966 return;
1968 zeropad = sc->zeropad_dma.dmem_busaddr;
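/*
 * Editor's note: (tx->req - tx->done) is the number of descriptors
 * currently in flight; transmit only while enough ring slots remain
 * free to hold a maximum-sized request list.
 */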
1969 while (tx->mask - (tx->req - tx->done) > tx->max_desc) {
1970 struct mbuf *m;
1971 int error;
1973 m = ifsq_dequeue(ifsq);
1974 if (m == NULL)
1975 goto done;
1977 BPF_MTAP(ifp, m);
1978 error = mxge_encap(tx, m, zeropad);
1979 if (!error)
1980 encap = 1;
1981 else
1982 IFNET_STAT_INC(ifp, oerrors, 1);
1985 /* Ran out of transmit slots */
1986 ifsq_set_oactive(ifsq);
1987 done:
1988 if (encap)
1989 tx->watchdog.wd_timer = 5;
1992 static void
1993 mxge_watchdog(struct ifaltq_subque *ifsq)
1995 struct ifnet *ifp = ifsq_get_ifp(ifsq);
1996 struct mxge_softc *sc = ifp->if_softc;
1997 uint32_t rx_pause = be32toh(sc->ss->fw_stats->dropped_pause);
1998 mxge_tx_ring_t *tx = ifsq_get_priv(ifsq);
2000 ASSERT_IFNET_SERIALIZED_ALL(ifp);
2002 /* Check for pause blocking before resetting */
2003 if (tx->watchdog_rx_pause == rx_pause) {
2004 mxge_warn_stuck(sc, tx, 0);
2005 mxge_watchdog_reset(sc);
2006 return;
2007 } else {
2008 if_printf(ifp, "Flow control blocking xmits, "
2009 "check link partner\n");
2010 }
2011 tx->watchdog_rx_pause = rx_pause;
2012 }
2014 /*
2015 * Copy an array of mcp_kreq_ether_recv_t's to the mcp. Copy
2016 * at most 32 bytes at a time, so as to avoid involving the software
2017 * pio handler in the nic. We re-write the first segment's low
2018 * DMA address to mark it valid only after we write the entire chunk
2019 * in a burst.
2020 */
2021 static __inline void
2022 mxge_submit_8rx(volatile mcp_kreq_ether_recv_t *dst,
2023 mcp_kreq_ether_recv_t *src)
2025 uint32_t low;
2027 low = src->addr_low;
2028 src->addr_low = 0xffffffff;
2029 mxge_pio_copy(dst, src, 4 * sizeof (*src));
2030 wmb();
2031 mxge_pio_copy(dst + 4, src + 4, 4 * sizeof (*src));
2032 wmb();
2033 src->addr_low = low;
2034 dst->addr_low = low;
2035 wmb();
2038 static int
2039 mxge_get_buf_small(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2040 boolean_t init)
2042 bus_dma_segment_t seg;
2043 struct mbuf *m;
2044 int cnt, err, mflag;
2046 mflag = M_NOWAIT;
2047 if (__predict_false(init))
2048 mflag = M_WAITOK;
2050 m = m_gethdr(mflag, MT_DATA);
2051 if (m == NULL) {
2052 err = ENOBUFS;
2053 if (__predict_false(init)) {
2054 /*
2055 * During initialization, there
2056 * is nothing to setup; bail out
2057 */
2058 return err;
2059 }
2060 goto done;
2061 }
2062 m->m_len = m->m_pkthdr.len = MHLEN;
2064 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2065 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2066 if (err != 0) {
2067 m_freem(m);
2068 if (__predict_false(init)) {
2069 /*
2070 * During initialization, there
2071 * is nothing to setup; bail out
2072 */
2073 return err;
2074 }
2075 goto done;
2076 }
2078 rx->info[idx].m = m;
2079 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2080 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2082 done:
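/*
 * Buffers are handed to the NIC eight at a time: once the last
 * slot of an 8-entry group has been refilled, mxge_submit_8rx()
 * copies the whole group to the lanai ring in two 32-byte PIO
 * bursts.
 */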
2083 if ((idx & 7) == 7)
2084 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2085 return err;
2088 static int
2089 mxge_get_buf_big(mxge_rx_ring_t *rx, bus_dmamap_t map, int idx,
2090 boolean_t init)
2092 bus_dma_segment_t seg;
2093 struct mbuf *m;
2094 int cnt, err, mflag;
2096 mflag = M_NOWAIT;
2097 if (__predict_false(init))
2098 mflag = M_WAITOK;
2100 if (rx->cl_size == MCLBYTES)
2101 m = m_getcl(mflag, MT_DATA, M_PKTHDR);
2102 else
2103 m = m_getjcl(mflag, MT_DATA, M_PKTHDR, MJUMPAGESIZE);
2104 if (m == NULL) {
2105 err = ENOBUFS;
2106 if (__predict_false(init)) {
2107 /*
2108 * During initialization, there
2109 * is nothing to setup; bail out
2110 */
2111 return err;
2112 }
2113 goto done;
2114 }
2115 m->m_len = m->m_pkthdr.len = rx->cl_size;
2117 err = bus_dmamap_load_mbuf_segment(rx->dmat, map, m,
2118 &seg, 1, &cnt, BUS_DMA_NOWAIT);
2119 if (err != 0) {
2120 m_freem(m);
2121 if (__predict_false(init)) {
2122 /*
2123 * During initialization, there
2124 * is nothing to setup; bail out
2125 */
2126 return err;
2127 }
2128 goto done;
2129 }
2131 rx->info[idx].m = m;
2132 rx->shadow[idx].addr_low = htobe32(MXGE_LOWPART_TO_U32(seg.ds_addr));
2133 rx->shadow[idx].addr_high = htobe32(MXGE_HIGHPART_TO_U32(seg.ds_addr));
2135 done:
2136 if ((idx & 7) == 7)
2137 mxge_submit_8rx(&rx->lanai[idx - 7], &rx->shadow[idx - 7]);
2138 return err;
2139 }
2141 /*
2142 * Myri10GE hardware checksums are not valid if the sender
2143 * padded the frame with non-zero padding. This is because
2144 * the firmware just does a simple 16-bit 1s complement
2145 * checksum across the entire frame, excluding the first 14
2146 * bytes. It is best to simply check the checksum and
2147 * tell the stack about it only if the checksum is good.
2148 */
2149 static __inline uint16_t
2150 mxge_rx_csum(struct mbuf *m, int csum)
2152 const struct ether_header *eh;
2153 const struct ip *ip;
2154 uint16_t c;
2156 eh = mtod(m, const struct ether_header *);
2158 /* Only deal with IPv4 TCP & UDP for now */
2159 if (__predict_false(eh->ether_type != htons(ETHERTYPE_IP)))
2160 return 1;
2162 ip = (const struct ip *)(eh + 1);
2163 if (__predict_false(ip->ip_p != IPPROTO_TCP && ip->ip_p != IPPROTO_UDP))
2164 return 1;
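/*
 * Fold the IPv4 pseudo header into the raw sum computed by the
 * firmware; after the final xor a return value of 0 means the
 * TCP/UDP checksum verified.
 */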
2166 #ifdef INET
2167 c = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
2168 htonl(ntohs(csum) + ntohs(ip->ip_len) -
2169 (ip->ip_hl << 2) + ip->ip_p));
2170 #else
2171 c = 1;
2172 #endif
2173 c ^= 0xffff;
2174 return c;
2177 static void
2178 mxge_vlan_tag_remove(struct mbuf *m, uint32_t *csum)
2180 struct ether_vlan_header *evl;
2181 uint32_t partial;
2183 evl = mtod(m, struct ether_vlan_header *);
2185 /*
2186 * Fix checksum by subtracting EVL_ENCAPLEN bytes after
2187 * what the firmware thought was the end of the ethernet
2188 * header.
2189 */
2191 /* Put checksum into host byte order */
2192 *csum = ntohs(*csum);
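/*
 * One's complement subtraction of the 4 removed bytes: add the
 * complement of the 32-bit word just past the Ethernet header,
 * propagate the carry, then fold the sum back into 16 bits
 * (twice, since the first fold may itself produce a carry).
 */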
2194 partial = ntohl(*(uint32_t *)(mtod(m, char *) + ETHER_HDR_LEN));
2195 *csum += ~partial;
2196 *csum += ((*csum) < ~partial);
2197 *csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2198 *csum = ((*csum) >> 16) + ((*csum) & 0xFFFF);
2200 /*
2201 * Restore checksum to network byte order;
2202 * later consumers expect this
2203 */
2204 *csum = htons(*csum);
2206 /* save the tag */
2207 m->m_pkthdr.ether_vlantag = ntohs(evl->evl_tag);
2208 m->m_flags |= M_VLANTAG;
2210 /*
2211 * Remove the 802.1q header by copying the Ethernet
2212 * addresses over it and adjusting the beginning of
2213 * the data in the mbuf. The encapsulated Ethernet
2214 * type field is already in place.
2215 */
2216 bcopy((char *)evl, (char *)evl + EVL_ENCAPLEN,
2217 ETHER_HDR_LEN - ETHER_TYPE_LEN);
2218 m_adj(m, EVL_ENCAPLEN);
2222 static __inline void
2223 mxge_rx_done_big(struct ifnet *ifp, mxge_rx_ring_t *rx,
2224 uint32_t len, uint32_t csum)
2226 struct mbuf *m;
2227 const struct ether_header *eh;
2228 bus_dmamap_t old_map;
2229 int idx;
2231 idx = rx->cnt & rx->mask;
2232 rx->cnt++;
2234 /* Save a pointer to the received mbuf */
2235 m = rx->info[idx].m;
2237 /* Try to replace the received mbuf */
2238 if (mxge_get_buf_big(rx, rx->extra_map, idx, FALSE)) {
2239 /* Drop the frame -- the old mbuf is re-cycled */
2240 IFNET_STAT_INC(ifp, ierrors, 1);
2241 return;
2242 }
2244 /* Unmap the received buffer */
2245 old_map = rx->info[idx].map;
2246 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2247 bus_dmamap_unload(rx->dmat, old_map);
2249 /* Swap the bus_dmamap_t's */
2250 rx->info[idx].map = rx->extra_map;
2251 rx->extra_map = old_map;
2253 /*
2254 * mcp implicitly skips 1st 2 bytes so that packet is properly
2255 * aligned
2256 */
2257 m->m_data += MXGEFW_PAD;
2259 m->m_pkthdr.rcvif = ifp;
2260 m->m_len = m->m_pkthdr.len = len;
2262 IFNET_STAT_INC(ifp, ipackets, 1);
2264 eh = mtod(m, const struct ether_header *);
2265 if (eh->ether_type == htons(ETHERTYPE_VLAN))
2266 mxge_vlan_tag_remove(m, &csum);
2268 /* If the checksum is valid, mark it in the mbuf header */
2269 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2270 mxge_rx_csum(m, csum) == 0) {
2271 /* Tell the stack that the checksum is good */
2272 m->m_pkthdr.csum_data = 0xffff;
2273 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2274 CSUM_DATA_VALID;
2275 }
2276 ifp->if_input(ifp, m, NULL, -1);
2279 static __inline void
2280 mxge_rx_done_small(struct ifnet *ifp, mxge_rx_ring_t *rx,
2281 uint32_t len, uint32_t csum)
2283 const struct ether_header *eh;
2284 struct mbuf *m;
2285 bus_dmamap_t old_map;
2286 int idx;
2288 idx = rx->cnt & rx->mask;
2289 rx->cnt++;
2291 /* Save a pointer to the received mbuf */
2292 m = rx->info[idx].m;
2294 /* Try to replace the received mbuf */
2295 if (mxge_get_buf_small(rx, rx->extra_map, idx, FALSE)) {
2296 /* Drop the frame -- the old mbuf is re-cycled */
2297 IFNET_STAT_INC(ifp, ierrors, 1);
2298 return;
2299 }
2301 /* Unmap the received buffer */
2302 old_map = rx->info[idx].map;
2303 bus_dmamap_sync(rx->dmat, old_map, BUS_DMASYNC_POSTREAD);
2304 bus_dmamap_unload(rx->dmat, old_map);
2306 /* Swap the bus_dmamap_t's */
2307 rx->info[idx].map = rx->extra_map;
2308 rx->extra_map = old_map;
2310 /*
2311 * mcp implicitly skips 1st 2 bytes so that packet is properly
2312 * aligned
2313 */
2314 m->m_data += MXGEFW_PAD;
2316 m->m_pkthdr.rcvif = ifp;
2317 m->m_len = m->m_pkthdr.len = len;
2319 IFNET_STAT_INC(ifp, ipackets, 1);
2321 eh = mtod(m, const struct ether_header *);
2322 if (eh->ether_type == htons(ETHERTYPE_VLAN))
2323 mxge_vlan_tag_remove(m, &csum);
2325 /* If the checksum is valid, mark it in the mbuf header */
2326 if ((ifp->if_capenable & IFCAP_RXCSUM) &&
2327 mxge_rx_csum(m, csum) == 0) {
2328 /* Tell the stack that the checksum is good */
2329 m->m_pkthdr.csum_data = 0xffff;
2330 m->m_pkthdr.csum_flags = CSUM_PSEUDO_HDR |
2331 CSUM_DATA_VALID;
2332 }
2333 ifp->if_input(ifp, m, NULL, -1);
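/*
 * Drain the rx completion ring. A negative 'cycle' (both
 * interrupt paths pass -1) keeps consuming entries until the
 * ring is empty; the polling path passes its burst budget
 * instead, bounding the work done per call.
 */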
2336 static __inline void
2337 mxge_clean_rx_done(struct ifnet *ifp, struct mxge_rx_data *rx_data, int cycle)
2339 mxge_rx_done_t *rx_done = &rx_data->rx_done;
2341 while (rx_done->entry[rx_done->idx].length != 0 && cycle != 0) {
2342 uint16_t length, checksum;
2344 length = ntohs(rx_done->entry[rx_done->idx].length);
2345 rx_done->entry[rx_done->idx].length = 0;
2347 checksum = rx_done->entry[rx_done->idx].checksum;
2349 if (length <= MXGE_RX_SMALL_BUFLEN) {
2350 mxge_rx_done_small(ifp, &rx_data->rx_small,
2351 length, checksum);
2352 } else {
2353 mxge_rx_done_big(ifp, &rx_data->rx_big,
2354 length, checksum);
2355 }
2357 rx_done->idx++;
2358 rx_done->idx &= rx_done->mask;
2359 --cycle;
2363 static __inline void
2364 mxge_tx_done(struct ifnet *ifp, mxge_tx_ring_t *tx, uint32_t mcp_idx)
2366 ASSERT_SERIALIZED(&tx->tx_serialize);
2368 while (tx->pkt_done != mcp_idx) {
2369 struct mbuf *m;
2370 int idx;
2372 idx = tx->done & tx->mask;
2373 tx->done++;
2375 m = tx->info[idx].m;
2376 /*
2377 * mbuf and DMA map only attached to the first
2378 * segment per-mbuf.
2379 */
2380 if (m != NULL) {
2381 tx->pkt_done++;
2382 IFNET_STAT_INC(ifp, opackets, 1);
2383 tx->info[idx].m = NULL;
2384 bus_dmamap_unload(tx->dmat, tx->info[idx].map);
2385 m_freem(m);
2386 }
2387 }
2389 /*
2390 * If we have space, clear OACTIVE to tell the stack that
2391 * it's OK to send packets
2392 */
2393 if (tx->req - tx->done < (tx->mask + 1) / 2) {
2394 ifsq_clr_oactive(tx->ifsq);
2395 if (tx->req == tx->done) {
2396 /* Reset watchdog */
2397 tx->watchdog.wd_timer = 0;
2398 }
2399 }
2401 if (!ifsq_is_empty(tx->ifsq))
2402 ifsq_devstart(tx->ifsq);
2404 if (tx->send_stop != NULL && tx->req == tx->done) {
2405 /*
2406 * Let the NIC stop polling this queue, since there
2407 * are no more transmits pending
2408 */
2409 *tx->send_stop = 1;
2410 tx->queue_active = 0;
2411 tx->deactivate++;
2412 wmb();
2416 static struct mxge_media_type mxge_xfp_media_types[] = {
2417 {IFM_10G_CX4, 0x7f, "10GBASE-CX4 (module)"},
2418 {IFM_10G_SR, (1 << 7), "10GBASE-SR"},
2419 {IFM_10G_LR, (1 << 6), "10GBASE-LR"},
2420 {IFM_NONE, (1 << 5), "10GBASE-ER"},
2421 {IFM_10G_LRM, (1 << 4), "10GBASE-LRM"},
2422 {IFM_NONE, (1 << 3), "10GBASE-SW"},
2423 {IFM_NONE, (1 << 2), "10GBASE-LW"},
2424 {IFM_NONE, (1 << 1), "10GBASE-EW"},
2425 {IFM_NONE, (1 << 0), "Reserved"}
2426 };
2428 static struct mxge_media_type mxge_sfp_media_types[] = {
2429 {IFM_10G_TWINAX, 0, "10GBASE-Twinax"},
2430 {IFM_NONE, (1 << 7), "Reserved"},
2431 {IFM_10G_LRM, (1 << 6), "10GBASE-LRM"},
2432 {IFM_10G_LR, (1 << 5), "10GBASE-LR"},
2433 {IFM_10G_SR, (1 << 4), "10GBASE-SR"},
2434 {IFM_10G_TWINAX,(1 << 0), "10GBASE-Twinax"}
2435 };
2437 static void
2438 mxge_media_set(mxge_softc_t *sc, int media_type)
2440 int fc_opt = 0;
2442 if (media_type == IFM_NONE)
2443 return;
2445 if (sc->pause)
2446 fc_opt = IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
2448 ifmedia_add(&sc->media, MXGE_IFM | media_type, 0, NULL);
2449 ifmedia_set(&sc->media, MXGE_IFM | media_type | fc_opt);
2451 sc->current_media = media_type;
2454 static void
2455 mxge_media_unset(mxge_softc_t *sc)
2457 ifmedia_removeall(&sc->media);
2458 sc->current_media = IFM_NONE;
2461 static void
2462 mxge_media_init(mxge_softc_t *sc)
2464 const char *ptr;
2465 int i;
2467 mxge_media_unset(sc);
2469 /*
2470 * Parse the product code to determine the interface type
2471 * (CX4, XFP, Quad Ribbon Fiber) by looking at the character
2472 * after the 3rd dash in the driver's cached copy of the
2473 * EEPROM's product code string.
2474 */
2475 ptr = sc->product_code_string;
2476 if (ptr == NULL) {
2477 if_printf(sc->ifp, "Missing product code\n");
2478 return;
2479 }
2481 for (i = 0; i < 3; i++, ptr++) {
2482 ptr = strchr(ptr, '-');
2483 if (ptr == NULL) {
2484 if_printf(sc->ifp, "only %d dashes in PC?!?\n", i);
2485 return;
2486 }
2487 }
2488 if (*ptr == 'C' || *(ptr +1) == 'C') {
2489 /* -C is CX4 */
2490 sc->connector = MXGE_CX4;
2491 mxge_media_set(sc, IFM_10G_CX4);
2492 } else if (*ptr == 'Q') {
2493 /* -Q is Quad Ribbon Fiber */
2494 sc->connector = MXGE_QRF;
2495 if_printf(sc->ifp, "Quad Ribbon Fiber Media\n");
2496 /* DragonFly has no media type for Quad ribbon fiber */
2497 } else if (*ptr == 'R') {
2498 /* -R is XFP */
2499 sc->connector = MXGE_XFP;
2500 /* NOTE: ifmedia will be installed later */
2501 } else if (*ptr == 'S' || *(ptr +1) == 'S') {
2502 /* -S or -2S is SFP+ */
2503 sc->connector = MXGE_SFP;
2504 /* NOTE: ifmedia will be installed later */
2505 } else {
2506 sc->connector = MXGE_UNK;
2507 if_printf(sc->ifp, "Unknown media type: %c\n", *ptr);
2508 }
2509 }
2511 /*
2512 * Determine the media type for a NIC. Some XFPs will identify
2513 * themselves only when their link is up, so this is initiated via a
2514 * link up interrupt. However, this can potentially take up to
2515 * several milliseconds, so it is run via the watchdog routine, rather
2516 * than in the interrupt handler itself.
2517 */
2518 static void
2519 mxge_media_probe(mxge_softc_t *sc)
2521 mxge_cmd_t cmd;
2522 const char *cage_type;
2523 struct mxge_media_type *mxge_media_types = NULL;
2524 int i, err, ms, mxge_media_type_entries;
2525 uint32_t byte;
2527 sc->need_media_probe = 0;
2529 if (sc->connector == MXGE_XFP) {
2530 /* -R is XFP */
2531 mxge_media_types = mxge_xfp_media_types;
2532 mxge_media_type_entries = NELEM(mxge_xfp_media_types);
2533 byte = MXGE_XFP_COMPLIANCE_BYTE;
2534 cage_type = "XFP";
2535 } else if (sc->connector == MXGE_SFP) {
2536 /* -S or -2S is SFP+ */
2537 mxge_media_types = mxge_sfp_media_types;
2538 mxge_media_type_entries = NELEM(mxge_sfp_media_types);
2539 cage_type = "SFP+";
2540 byte = 3;
2541 } else {
2542 /* nothing to do; media type cannot change */
2543 return;
2544 }
2546 /*
2547 * At this point we know the NIC has an XFP cage, so now we
2548 * try to determine what is in the cage by using the
2549 * firmware's XFP I2C commands to read the XFP 10GbE compliance
2550 * register. We read just one byte, which may take over
2551 * a millisecond.
2552 */
2554 bzero(&cmd, sizeof(cmd)); /* silence gcc warning */
2555 cmd.data0 = 0; /* just fetch 1 byte, not all 256 */
2556 cmd.data1 = byte;
2557 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_READ, &cmd);
2558 if (err != MXGEFW_CMD_OK) {
2559 if (err == MXGEFW_CMD_ERROR_I2C_FAILURE)
2560 if_printf(sc->ifp, "failed to read XFP\n");
2561 else if (err == MXGEFW_CMD_ERROR_I2C_ABSENT)
2562 if_printf(sc->ifp, "Type R/S with no XFP!?!?\n");
2563 else
2564 if_printf(sc->ifp, "I2C read failed, err: %d\n", err);
2565 mxge_media_unset(sc);
2566 return;
2567 }
2569 /* Now we wait for the data to be cached */
2570 cmd.data0 = byte;
2571 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2572 for (ms = 0; err == EBUSY && ms < 50; ms++) {
2573 DELAY(1000);
2574 cmd.data0 = byte;
2575 err = mxge_send_cmd(sc, MXGEFW_CMD_I2C_BYTE, &cmd);
2576 }
2577 if (err != MXGEFW_CMD_OK) {
2578 if_printf(sc->ifp, "failed to read %s (%d, %dms)\n",
2579 cage_type, err, ms);
2580 mxge_media_unset(sc);
2581 return;
2582 }
2584 if (cmd.data0 == mxge_media_types[0].bitmask) {
2585 if (bootverbose) {
2586 if_printf(sc->ifp, "%s:%s\n", cage_type,
2587 mxge_media_types[0].name);
2588 }
2589 if (sc->current_media != mxge_media_types[0].flag) {
2590 mxge_media_unset(sc);
2591 mxge_media_set(sc, mxge_media_types[0].flag);
2592 }
2593 return;
2594 }
2595 for (i = 1; i < mxge_media_type_entries; i++) {
2596 if (cmd.data0 & mxge_media_types[i].bitmask) {
2597 if (bootverbose) {
2598 if_printf(sc->ifp, "%s:%s\n", cage_type,
2599 mxge_media_types[i].name);
2600 }
2602 if (sc->current_media != mxge_media_types[i].flag) {
2603 mxge_media_unset(sc);
2604 mxge_media_set(sc, mxge_media_types[i].flag);
2605 }
2606 return;
2607 }
2608 }
2609 mxge_media_unset(sc);
2610 if (bootverbose) {
2611 if_printf(sc->ifp, "%s media 0x%x unknown\n", cage_type,
2612 cmd.data0);
2616 static void
2617 mxge_intr_status(struct mxge_softc *sc, const mcp_irq_data_t *stats)
2619 if (sc->link_state != stats->link_up) {
2620 sc->link_state = stats->link_up;
2621 if (sc->link_state) {
2622 sc->ifp->if_link_state = LINK_STATE_UP;
2623 if_link_state_change(sc->ifp);
2624 if (bootverbose)
2625 if_printf(sc->ifp, "link up\n");
2626 } else {
2627 sc->ifp->if_link_state = LINK_STATE_DOWN;
2628 if_link_state_change(sc->ifp);
2629 if (bootverbose)
2630 if_printf(sc->ifp, "link down\n");
2631 }
2632 sc->need_media_probe = 1;
2633 }
2635 if (sc->rdma_tags_available != be32toh(stats->rdma_tags_available)) {
2636 sc->rdma_tags_available = be32toh(stats->rdma_tags_available);
2637 if_printf(sc->ifp, "RDMA timed out! %d tags left\n",
2638 sc->rdma_tags_available);
2639 }
2641 if (stats->link_down) {
2642 sc->down_cnt += stats->link_down;
2643 sc->link_state = 0;
2644 sc->ifp->if_link_state = LINK_STATE_DOWN;
2645 if_link_state_change(sc->ifp);
2649 static void
2650 mxge_serialize_skipmain(struct mxge_softc *sc)
2652 lwkt_serialize_array_enter(sc->serializes, sc->nserialize, 1);
2655 static void
2656 mxge_deserialize_skipmain(struct mxge_softc *sc)
2658 lwkt_serialize_array_exit(sc->serializes, sc->nserialize, 1);
2661 static void
2662 mxge_legacy(void *arg)
2664 struct mxge_slice_state *ss = arg;
2665 mxge_softc_t *sc = ss->sc;
2666 mcp_irq_data_t *stats = ss->fw_stats;
2667 mxge_tx_ring_t *tx = &ss->tx;
2668 mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2669 uint32_t send_done_count;
2670 uint8_t valid;
2672 ASSERT_SERIALIZED(&sc->main_serialize);
2674 /* Make sure the DMA has finished */
2675 if (!stats->valid)
2676 return;
2677 valid = stats->valid;
2679 /* Lower legacy IRQ */
2680 *sc->irq_deassert = 0;
2681 if (!mxge_deassert_wait) {
2682 /* Don't wait for conf. that irq is low */
2683 stats->valid = 0;
2684 }
2686 mxge_serialize_skipmain(sc);
2688 /*
2689 * Loop while waiting for legacy irq deassertion
2690 * XXX do we really want to loop?
2691 */
2692 do {
2693 /* Check for transmit completes and receives */
2694 send_done_count = be32toh(stats->send_done_count);
2695 while ((send_done_count != tx->pkt_done) ||
2696 (rx_done->entry[rx_done->idx].length != 0)) {
2697 if (send_done_count != tx->pkt_done) {
2698 mxge_tx_done(&sc->arpcom.ac_if, tx,
2699 (int)send_done_count);
2700 }
2701 mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2702 send_done_count = be32toh(stats->send_done_count);
2703 }
2704 if (mxge_deassert_wait)
2705 wmb();
2706 } while (*((volatile uint8_t *)&stats->valid));
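/*
 * When mxge_deassert_wait is set, the NIC is expected to clear
 * stats->valid itself once the legacy irq line is observed low,
 * so the loop above keeps reaping completions until then.
 */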
2708 mxge_deserialize_skipmain(sc);
2710 /* Fw link & error stats meaningful only on the first slice */
2711 if (__predict_false(stats->stats_updated))
2712 mxge_intr_status(sc, stats);
2714 /* Check to see if we have rx token to pass back */
2715 if (valid & 0x1)
2716 *ss->irq_claim = be32toh(3);
2717 *(ss->irq_claim + 1) = be32toh(3);
2720 static void
2721 mxge_msi(void *arg)
2723 struct mxge_slice_state *ss = arg;
2724 mxge_softc_t *sc = ss->sc;
2725 mcp_irq_data_t *stats = ss->fw_stats;
2726 mxge_tx_ring_t *tx = &ss->tx;
2727 mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2728 uint32_t send_done_count;
2729 uint8_t valid;
2730 #ifndef IFPOLL_ENABLE
2731 const boolean_t polling = FALSE;
2732 #else
2733 boolean_t polling = FALSE;
2734 #endif
2736 ASSERT_SERIALIZED(&sc->main_serialize);
2738 /* Make sure the DMA has finished */
2739 if (__predict_false(!stats->valid))
2740 return;
2742 valid = stats->valid;
2743 stats->valid = 0;
2745 #ifdef IFPOLL_ENABLE
2746 if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2747 polling = TRUE;
2748 #endif
2750 if (!polling) {
2751 /* Check for receives */
2752 lwkt_serialize_enter(&ss->rx_data.rx_serialize);
2753 if (rx_done->entry[rx_done->idx].length != 0)
2754 mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2755 lwkt_serialize_exit(&ss->rx_data.rx_serialize);
2756 }
2758 /*
2759 * Check for transmit completes
2760 *
2761 * NOTE:
2762 * Since pkt_done is only changed by mxge_tx_done(),
2763 * which is called only in the interrupt handler, the
2764 * check w/o holding the tx serializer is MPSAFE.
2765 */
2766 send_done_count = be32toh(stats->send_done_count);
2767 if (send_done_count != tx->pkt_done) {
2768 lwkt_serialize_enter(&tx->tx_serialize);
2769 mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2770 lwkt_serialize_exit(&tx->tx_serialize);
2771 }
2773 if (__predict_false(stats->stats_updated))
2774 mxge_intr_status(sc, stats);
2776 /* Check to see if we have rx token to pass back */
2777 if (!polling && (valid & 0x1))
2778 *ss->irq_claim = be32toh(3);
2779 *(ss->irq_claim + 1) = be32toh(3);
2782 static void
2783 mxge_msix_rx(void *arg)
2785 struct mxge_slice_state *ss = arg;
2786 mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2788 #ifdef IFPOLL_ENABLE
2789 if (ss->sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2790 return;
2791 #endif
2793 ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2795 if (rx_done->entry[rx_done->idx].length != 0)
2796 mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, -1);
2798 *ss->irq_claim = be32toh(3);
2801 static void
2802 mxge_msix_rxtx(void *arg)
2804 struct mxge_slice_state *ss = arg;
2805 mxge_softc_t *sc = ss->sc;
2806 mcp_irq_data_t *stats = ss->fw_stats;
2807 mxge_tx_ring_t *tx = &ss->tx;
2808 mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;
2809 uint32_t send_done_count;
2810 uint8_t valid;
2811 #ifndef IFPOLL_ENABLE
2812 const boolean_t polling = FALSE;
2813 #else
2814 boolean_t polling = FALSE;
2815 #endif
2817 ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);
2819 /* Make sure the DMA has finished */
2820 if (__predict_false(!stats->valid))
2821 return;
2823 valid = stats->valid;
2824 stats->valid = 0;
2826 #ifdef IFPOLL_ENABLE
2827 if (sc->arpcom.ac_if.if_flags & IFF_NPOLLING)
2828 polling = TRUE;
2829 #endif
2831 /* Check for receives */
2832 if (!polling && rx_done->entry[rx_done->idx].length != 0)
2833 mxge_clean_rx_done(&sc->arpcom.ac_if, &ss->rx_data, -1);
2835 /*
2836 * Check for transmit completes
2837 *
2838 * NOTE:
2839 * Since pkt_done is only changed by mxge_tx_done(),
2840 * which is called only in the interrupt handler, the
2841 * check w/o holding the tx serializer is MPSAFE.
2842 */
2843 send_done_count = be32toh(stats->send_done_count);
2844 if (send_done_count != tx->pkt_done) {
2845 lwkt_serialize_enter(&tx->tx_serialize);
2846 mxge_tx_done(&sc->arpcom.ac_if, tx, (int)send_done_count);
2847 lwkt_serialize_exit(&tx->tx_serialize);
2848 }
2850 /* Check to see if we have rx token to pass back */
2851 if (!polling && (valid & 0x1))
2852 *ss->irq_claim = be32toh(3);
2853 *(ss->irq_claim + 1) = be32toh(3);
2856 static void
2857 mxge_init(void *arg)
2859 struct mxge_softc *sc = arg;
2861 ASSERT_IFNET_SERIALIZED_ALL(sc->ifp);
2862 if ((sc->ifp->if_flags & IFF_RUNNING) == 0)
2863 mxge_open(sc);
2866 static void
2867 mxge_free_slice_mbufs(struct mxge_slice_state *ss)
2869 int i;
2871 for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2872 if (ss->rx_data.rx_big.info[i].m == NULL)
2873 continue;
2874 bus_dmamap_unload(ss->rx_data.rx_big.dmat,
2875 ss->rx_data.rx_big.info[i].map);
2876 m_freem(ss->rx_data.rx_big.info[i].m);
2877 ss->rx_data.rx_big.info[i].m = NULL;
2878 }
2880 for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2881 if (ss->rx_data.rx_small.info[i].m == NULL)
2882 continue;
2883 bus_dmamap_unload(ss->rx_data.rx_small.dmat,
2884 ss->rx_data.rx_small.info[i].map);
2885 m_freem(ss->rx_data.rx_small.info[i].m);
2886 ss->rx_data.rx_small.info[i].m = NULL;
2887 }
2889 /* Transmit ring used only on the first slice */
2890 if (ss->tx.info == NULL)
2891 return;
2893 for (i = 0; i <= ss->tx.mask; i++) {
2894 if (ss->tx.info[i].m == NULL)
2895 continue;
2896 bus_dmamap_unload(ss->tx.dmat, ss->tx.info[i].map);
2897 m_freem(ss->tx.info[i].m);
2898 ss->tx.info[i].m = NULL;
2902 static void
2903 mxge_free_mbufs(mxge_softc_t *sc)
2905 int slice;
2907 for (slice = 0; slice < sc->num_slices; slice++)
2908 mxge_free_slice_mbufs(&sc->ss[slice]);
2911 static void
2912 mxge_free_slice_rings(struct mxge_slice_state *ss)
2914 int i;
2916 if (ss->rx_data.rx_done.entry != NULL) {
2917 mxge_dma_free(&ss->rx_done_dma);
2918 ss->rx_data.rx_done.entry = NULL;
2919 }
2921 if (ss->tx.req_list != NULL) {
2922 kfree(ss->tx.req_list, M_DEVBUF);
2923 ss->tx.req_list = NULL;
2924 }
2926 if (ss->tx.seg_list != NULL) {
2927 kfree(ss->tx.seg_list, M_DEVBUF);
2928 ss->tx.seg_list = NULL;
2929 }
2931 if (ss->rx_data.rx_small.shadow != NULL) {
2932 kfree(ss->rx_data.rx_small.shadow, M_DEVBUF);
2933 ss->rx_data.rx_small.shadow = NULL;
2934 }
2936 if (ss->rx_data.rx_big.shadow != NULL) {
2937 kfree(ss->rx_data.rx_big.shadow, M_DEVBUF);
2938 ss->rx_data.rx_big.shadow = NULL;
2939 }
2941 if (ss->tx.info != NULL) {
2942 if (ss->tx.dmat != NULL) {
2943 for (i = 0; i <= ss->tx.mask; i++) {
2944 bus_dmamap_destroy(ss->tx.dmat,
2945 ss->tx.info[i].map);
2946 }
2947 bus_dma_tag_destroy(ss->tx.dmat);
2948 }
2949 kfree(ss->tx.info, M_DEVBUF);
2950 ss->tx.info = NULL;
2951 }
2953 if (ss->rx_data.rx_small.info != NULL) {
2954 if (ss->rx_data.rx_small.dmat != NULL) {
2955 for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
2956 bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2957 ss->rx_data.rx_small.info[i].map);
2958 }
2959 bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
2960 ss->rx_data.rx_small.extra_map);
2961 bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
2962 }
2963 kfree(ss->rx_data.rx_small.info, M_DEVBUF);
2964 ss->rx_data.rx_small.info = NULL;
2965 }
2967 if (ss->rx_data.rx_big.info != NULL) {
2968 if (ss->rx_data.rx_big.dmat != NULL) {
2969 for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
2970 bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2971 ss->rx_data.rx_big.info[i].map);
2972 }
2973 bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
2974 ss->rx_data.rx_big.extra_map);
2975 bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
2976 }
2977 kfree(ss->rx_data.rx_big.info, M_DEVBUF);
2978 ss->rx_data.rx_big.info = NULL;
2982 static void
2983 mxge_free_rings(mxge_softc_t *sc)
2985 int slice;
2987 if (sc->ss == NULL)
2988 return;
2990 for (slice = 0; slice < sc->num_slices; slice++)
2991 mxge_free_slice_rings(&sc->ss[slice]);
2994 static int
2995 mxge_alloc_slice_rings(struct mxge_slice_state *ss, int rx_ring_entries,
2996 int tx_ring_entries)
2998 mxge_softc_t *sc = ss->sc;
2999 size_t bytes;
3000 int err, i;
3002 /*
3003 * Allocate per-slice receive resources
3004 */
3006 ss->rx_data.rx_small.mask = ss->rx_data.rx_big.mask =
3007 rx_ring_entries - 1;
3008 ss->rx_data.rx_done.mask = (2 * rx_ring_entries) - 1;
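/*
 * Note: the completion ring is shared by the small and big
 * receive rings, hence twice the entries of either ring.
 */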
3010 /* Allocate the rx shadow rings */
3011 bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.shadow);
3012 ss->rx_data.rx_small.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3014 bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.shadow);
3015 ss->rx_data.rx_big.shadow = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3017 /* Allocate the rx host info rings */
3018 bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_small.info);
3019 ss->rx_data.rx_small.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3021 bytes = rx_ring_entries * sizeof(*ss->rx_data.rx_big.info);
3022 ss->rx_data.rx_big.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3024 /* Allocate the rx busdma resources */
3025 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3026 1, /* alignment */
3027 4096, /* boundary */
3028 BUS_SPACE_MAXADDR, /* low */
3029 BUS_SPACE_MAXADDR, /* high */
3030 NULL, NULL, /* filter */
3031 MHLEN, /* maxsize */
3032 1, /* num segs */
3033 MHLEN, /* maxsegsize */
3034 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3035 /* flags */
3036 &ss->rx_data.rx_small.dmat); /* tag */
3037 if (err != 0) {
3038 device_printf(sc->dev, "Err %d allocating rx_small dmat\n",
3039 err);
3040 return err;
3041 }
3043 err = bus_dmamap_create(ss->rx_data.rx_small.dmat, BUS_DMA_WAITOK,
3044 &ss->rx_data.rx_small.extra_map);
3045 if (err != 0) {
3046 device_printf(sc->dev, "Err %d extra rx_small dmamap\n", err);
3047 bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3048 ss->rx_data.rx_small.dmat = NULL;
3049 return err;
3050 }
3051 for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3052 err = bus_dmamap_create(ss->rx_data.rx_small.dmat,
3053 BUS_DMA_WAITOK, &ss->rx_data.rx_small.info[i].map);
3054 if (err != 0) {
3055 int j;
3057 device_printf(sc->dev, "Err %d rx_small dmamap\n", err);
3059 for (j = 0; j < i; ++j) {
3060 bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3061 ss->rx_data.rx_small.info[j].map);
3063 bus_dmamap_destroy(ss->rx_data.rx_small.dmat,
3064 ss->rx_data.rx_small.extra_map);
3065 bus_dma_tag_destroy(ss->rx_data.rx_small.dmat);
3066 ss->rx_data.rx_small.dmat = NULL;
3067 return err;
3068 }
3069 }
3071 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3072 1, /* alignment */
3073 4096, /* boundary */
3074 BUS_SPACE_MAXADDR, /* low */
3075 BUS_SPACE_MAXADDR, /* high */
3076 NULL, NULL, /* filter */
3077 4096, /* maxsize */
3078 1, /* num segs */
3079 4096, /* maxsegsize*/
3080 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW,
3081 /* flags */
3082 &ss->rx_data.rx_big.dmat); /* tag */
3083 if (err != 0) {
3084 device_printf(sc->dev, "Err %d allocating rx_big dmat\n",
3085 err);
3086 return err;
3087 }
3089 err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3090 &ss->rx_data.rx_big.extra_map);
3091 if (err != 0) {
3092 device_printf(sc->dev, "Err %d extra rx_big dmamap\n", err);
3093 bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3094 ss->rx_data.rx_big.dmat = NULL;
3095 return err;
3096 }
3097 for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3098 err = bus_dmamap_create(ss->rx_data.rx_big.dmat, BUS_DMA_WAITOK,
3099 &ss->rx_data.rx_big.info[i].map);
3100 if (err != 0) {
3101 int j;
3103 device_printf(sc->dev, "Err %d rx_big dmamap\n", err);
3104 for (j = 0; j < i; ++j) {
3105 bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3106 ss->rx_data.rx_big.info[j].map);
3108 bus_dmamap_destroy(ss->rx_data.rx_big.dmat,
3109 ss->rx_data.rx_big.extra_map);
3110 bus_dma_tag_destroy(ss->rx_data.rx_big.dmat);
3111 ss->rx_data.rx_big.dmat = NULL;
3112 return err;
3113 }
3114 }
3116 /*
3117 * Now allocate TX resources
3118 */
3120 ss->tx.mask = tx_ring_entries - 1;
3121 ss->tx.max_desc = MIN(MXGE_MAX_SEND_DESC, tx_ring_entries / 4);
3123 /*
3124 * Allocate the tx request copy block; MUST be at least 8 bytes
3125 * aligned
3126 */
3127 bytes = sizeof(*ss->tx.req_list) * (ss->tx.max_desc + 4);
3128 ss->tx.req_list = kmalloc_cachealign(__VM_CACHELINE_ALIGN(bytes),
3129 M_DEVBUF, M_WAITOK);
3131 /* Allocate the tx busdma segment list */
3132 bytes = sizeof(*ss->tx.seg_list) * ss->tx.max_desc;
3133 ss->tx.seg_list = kmalloc(bytes, M_DEVBUF, M_WAITOK);
3135 /* Allocate the tx host info ring */
3136 bytes = tx_ring_entries * sizeof(*ss->tx.info);
3137 ss->tx.info = kmalloc(bytes, M_DEVBUF, M_ZERO|M_WAITOK);
3139 /* Allocate the tx busdma resources */
3140 err = bus_dma_tag_create(sc->parent_dmat, /* parent */
3141 1, /* alignment */
3142 sc->tx_boundary, /* boundary */
3143 BUS_SPACE_MAXADDR, /* low */
3144 BUS_SPACE_MAXADDR, /* high */
3145 NULL, NULL, /* filter */
3146 IP_MAXPACKET +
3147 sizeof(struct ether_vlan_header),
3148 /* maxsize */
3149 ss->tx.max_desc - 2, /* num segs */
3150 sc->tx_boundary, /* maxsegsz */
3151 BUS_DMA_WAITOK | BUS_DMA_ALLOCNOW |
3152 BUS_DMA_ONEBPAGE, /* flags */
3153 &ss->tx.dmat); /* tag */
3154 if (err != 0) {
3155 device_printf(sc->dev, "Err %d allocating tx dmat\n", err);
3156 return err;
3157 }
3159 /*
3160 * Now use these tags to setup DMA maps for each slot in the ring
3161 */
3162 for (i = 0; i <= ss->tx.mask; i++) {
3163 err = bus_dmamap_create(ss->tx.dmat,
3164 BUS_DMA_WAITOK | BUS_DMA_ONEBPAGE, &ss->tx.info[i].map);
3165 if (err != 0) {
3166 int j;
3168 device_printf(sc->dev, "Err %d tx dmamap\n", err);
3169 for (j = 0; j < i; ++j) {
3170 bus_dmamap_destroy(ss->tx.dmat,
3171 ss->tx.info[j].map);
3173 bus_dma_tag_destroy(ss->tx.dmat);
3174 ss->tx.dmat = NULL;
3175 return err;
3176 }
3177 }
3178 return 0;
3179 }
3181 static int
3182 mxge_alloc_rings(mxge_softc_t *sc)
3184 mxge_cmd_t cmd;
3185 int tx_ring_size;
3186 int tx_ring_entries, rx_ring_entries;
3187 int err, slice;
3189 /* Get ring sizes */
3190 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_SEND_RING_SIZE, &cmd);
3191 if (err != 0) {
3192 device_printf(sc->dev, "Cannot determine tx ring sizes\n");
3193 return err;
3194 }
3195 tx_ring_size = cmd.data0;
3197 tx_ring_entries = tx_ring_size / sizeof(mcp_kreq_ether_send_t);
3198 rx_ring_entries = sc->rx_intr_slots / 2;
3200 if (bootverbose) {
3201 device_printf(sc->dev, "tx desc %d, rx desc %d\n",
3202 tx_ring_entries, rx_ring_entries);
3203 }
3205 sc->ifp->if_nmbclusters = rx_ring_entries * sc->num_slices;
3206 sc->ifp->if_nmbjclusters = sc->ifp->if_nmbclusters;
3208 ifq_set_maxlen(&sc->ifp->if_snd, tx_ring_entries - 1);
3209 ifq_set_ready(&sc->ifp->if_snd);
3210 ifq_set_subq_cnt(&sc->ifp->if_snd, sc->num_tx_rings);
3212 if (sc->num_tx_rings > 1) {
3213 sc->ifp->if_mapsubq = ifq_mapsubq_modulo;
3214 ifq_set_subq_divisor(&sc->ifp->if_snd, sc->num_tx_rings);
3215 }
3217 for (slice = 0; slice < sc->num_slices; slice++) {
3218 err = mxge_alloc_slice_rings(&sc->ss[slice],
3219 rx_ring_entries, tx_ring_entries);
3220 if (err != 0) {
3221 device_printf(sc->dev,
3222 "alloc %d slice rings failed\n", slice);
3223 return err;
3224 }
3225 }
3226 return 0;
3227 }
3229 static void
3230 mxge_choose_params(int mtu, int *cl_size)
3232 int bufsize = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN + MXGEFW_PAD;
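/*
 * e.g. a standard 1500 byte MTU needs 1500 + 14 (Ethernet
 * header) + 4 (802.1q tag) + 2 (MXGEFW_PAD) = 1520 bytes and
 * therefore fits in an MCLBYTES (2KB) cluster.
 */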
3234 if (bufsize < MCLBYTES) {
3235 *cl_size = MCLBYTES;
3236 } else {
3237 KASSERT(bufsize < MJUMPAGESIZE, ("invalid MTU %d", mtu));
3238 *cl_size = MJUMPAGESIZE;
3242 static int
3243 mxge_slice_open(struct mxge_slice_state *ss, int cl_size)
3245 mxge_cmd_t cmd;
3246 int err, i, slice;
3248 slice = ss - ss->sc->ss;
3250 /*
3251 * Get the lanai pointers to the send and receive rings
3252 */
3253 err = 0;
3255 bzero(&cmd, sizeof(cmd)); /* silence gcc warning */
3256 if (ss->sc->num_tx_rings == 1) {
3257 if (slice == 0) {
3258 cmd.data0 = slice;
3259 err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET,
3260 &cmd);
3261 ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3262 (ss->sc->sram + cmd.data0);
3263 /* Leave send_go and send_stop as NULL */
3264 }
3265 } else {
3266 cmd.data0 = slice;
3267 err = mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SEND_OFFSET, &cmd);
3268 ss->tx.lanai = (volatile mcp_kreq_ether_send_t *)
3269 (ss->sc->sram + cmd.data0);
3270 ss->tx.send_go = (volatile uint32_t *)
3271 (ss->sc->sram + MXGEFW_ETH_SEND_GO + 64 * slice);
3272 ss->tx.send_stop = (volatile uint32_t *)
3273 (ss->sc->sram + MXGEFW_ETH_SEND_STOP + 64 * slice);
3274 }
3276 cmd.data0 = slice;
3277 err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_SMALL_RX_OFFSET, &cmd);
3278 ss->rx_data.rx_small.lanai =
3279 (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3281 cmd.data0 = slice;
3282 err |= mxge_send_cmd(ss->sc, MXGEFW_CMD_GET_BIG_RX_OFFSET, &cmd);
3283 ss->rx_data.rx_big.lanai =
3284 (volatile mcp_kreq_ether_recv_t *)(ss->sc->sram + cmd.data0);
3286 if (err != 0) {
3287 if_printf(ss->sc->ifp,
3288 "failed to get ring sizes or locations\n");
3289 return EIO;
3290 }
3292 /*
3293 * Stock small receive ring
3294 */
3295 for (i = 0; i <= ss->rx_data.rx_small.mask; i++) {
3296 err = mxge_get_buf_small(&ss->rx_data.rx_small,
3297 ss->rx_data.rx_small.info[i].map, i, TRUE);
3298 if (err) {
3299 if_printf(ss->sc->ifp, "alloced %d/%d smalls\n", i,
3300 ss->rx_data.rx_small.mask + 1);
3301 return ENOMEM;
3302 }
3303 }
3305 /*
3306 * Stock big receive ring
3307 */
3308 for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3309 ss->rx_data.rx_big.shadow[i].addr_low = 0xffffffff;
3310 ss->rx_data.rx_big.shadow[i].addr_high = 0xffffffff;
3311 }
3313 ss->rx_data.rx_big.cl_size = cl_size;
3315 for (i = 0; i <= ss->rx_data.rx_big.mask; i++) {
3316 err = mxge_get_buf_big(&ss->rx_data.rx_big,
3317 ss->rx_data.rx_big.info[i].map, i, TRUE);
3318 if (err) {
3319 if_printf(ss->sc->ifp, "alloced %d/%d bigs\n", i,
3320 ss->rx_data.rx_big.mask + 1);
3321 return ENOMEM;
3322 }
3323 }
3324 return 0;
3325 }
3327 static int
3328 mxge_open(mxge_softc_t *sc)
3330 struct ifnet *ifp = sc->ifp;
3331 mxge_cmd_t cmd;
3332 int err, slice, cl_size, i;
3333 bus_addr_t bus;
3334 volatile uint8_t *itable;
3335 struct mxge_slice_state *ss;
3337 ASSERT_IFNET_SERIALIZED_ALL(ifp);
3339 /* Copy the MAC address in case it was overridden */
3340 bcopy(IF_LLADDR(ifp), sc->mac_addr, ETHER_ADDR_LEN);
3342 err = mxge_reset(sc, 1);
3343 if (err != 0) {
3344 if_printf(ifp, "failed to reset\n");
3345 return EIO;
3346 }
3348 if (sc->num_slices > 1) {
3349 if (sc->use_rss) {
3350 volatile uint8_t *hwkey;
3351 uint8_t swkey[MXGE_HWRSS_KEYLEN];
3353 /*
3354 * Setup the indirect table.
3355 */
3356 if_ringmap_rdrtable(sc->ring_map, sc->rdr_table,
3357 NETISR_CPUMAX);
3359 cmd.data0 = NETISR_CPUMAX;
3360 err = mxge_send_cmd(sc,
3361 MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3363 err |= mxge_send_cmd(sc,
3364 MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3365 if (err != 0) {
3366 if_printf(ifp, "failed to setup rss tables\n");
3367 return err;
3368 }
3370 itable = sc->sram + cmd.data0;
3371 for (i = 0; i < NETISR_CPUMAX; i++)
3372 itable[i] = sc->rdr_table[i];
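/*
 * sc->rdr_table was filled by if_ringmap_rdrtable() above; it
 * maps each of the NETISR_CPUMAX hash buckets to a slice, and
 * the NIC reads it from SRAM at the firmware-supplied offset.
 */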
3374 /*
3375 * Setup Toeplitz key.
3376 */
3377 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RSS_KEY_OFFSET,
3378 &cmd);
3379 if (err != 0) {
3380 if_printf(ifp, "failed to get rsskey\n");
3381 return err;
3382 }
3383 hwkey = sc->sram + cmd.data0;
3385 toeplitz_get_key(swkey, MXGE_HWRSS_KEYLEN);
3386 for (i = 0; i < MXGE_HWRSS_KEYLEN; ++i)
3387 hwkey[i] = swkey[i];
3388 wmb();
3390 err = mxge_send_cmd(sc, MXGEFW_CMD_RSS_KEY_UPDATED,
3391 &cmd);
3392 if (err != 0) {
3393 if_printf(ifp, "failed to update rsskey\n");
3394 return err;
3395 }
3396 if (bootverbose)
3397 if_printf(ifp, "RSS key updated\n");
3398 } else {
3399 /* Setup the indirection table */
3400 cmd.data0 = sc->num_slices;
3401 err = mxge_send_cmd(sc,
3402 MXGEFW_CMD_SET_RSS_TABLE_SIZE, &cmd);
3404 err |= mxge_send_cmd(sc,
3405 MXGEFW_CMD_GET_RSS_TABLE_OFFSET, &cmd);
3406 if (err != 0) {
3407 if_printf(ifp, "failed to setup rss tables\n");
3408 return err;
3409 }
3411 /* Just enable an identity mapping */
3412 itable = sc->sram + cmd.data0;
3413 for (i = 0; i < sc->num_slices; i++)
3414 itable[i] = (uint8_t)i;
3415 }
3417 cmd.data0 = 1;
3418 if (sc->use_rss) {
3419 if (bootverbose)
3420 if_printf(ifp, "input hash: RSS\n");
3421 cmd.data1 = MXGEFW_RSS_HASH_TYPE_IPV4 |
3422 MXGEFW_RSS_HASH_TYPE_TCP_IPV4;
3423 } else {
3424 if (bootverbose)
3425 if_printf(ifp, "input hash: SRC_DST_PORT\n");
3426 cmd.data1 = MXGEFW_RSS_HASH_TYPE_SRC_DST_PORT;
3428 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_RSS_ENABLE, &cmd);
3429 if (err != 0) {
3430 if_printf(ifp, "failed to enable slices\n");
3431 return err;
3432 }
3433 }
3435 cmd.data0 = MXGEFW_TSO_MODE_NDIS;
3436 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_TSO_MODE, &cmd);
3437 if (err) {
3438 /*
3439 * Can't change TSO mode to NDIS; never allow TSO then
3440 */
3441 if_printf(ifp, "failed to set TSO mode\n");
3442 ifp->if_capenable &= ~IFCAP_TSO;
3443 ifp->if_capabilities &= ~IFCAP_TSO;
3444 ifp->if_hwassist &= ~CSUM_TSO;
3445 }
3447 mxge_choose_params(ifp->if_mtu, &cl_size);
3449 cmd.data0 = 1;
3450 err = mxge_send_cmd(sc, MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS, &cmd);
3451 /*
3452 * Error is only meaningful if we're trying to set
3453 * MXGEFW_CMD_ALWAYS_USE_N_BIG_BUFFERS > 1
3454 */
3456 /*
3457 * Give the firmware the mtu and the big and small buffer
3458 * sizes. The firmware wants the big buf size to be a power
3459 * of two. Luckily, DragonFly's clusters are powers of two.
3460 */
3461 cmd.data0 = ifp->if_mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3462 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_MTU, &cmd);
3464 cmd.data0 = MXGE_RX_SMALL_BUFLEN;
3465 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_SMALL_BUFFER_SIZE, &cmd);
3467 cmd.data0 = cl_size;
3468 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_BIG_BUFFER_SIZE, &cmd);
3470 if (err != 0) {
3471 if_printf(ifp, "failed to setup params\n");
3472 goto abort;
3473 }
3475 /* Now give him the pointer to the stats block */
3476 for (slice = 0; slice < sc->num_slices; slice++) {
3477 ss = &sc->ss[slice];
3478 cmd.data0 = MXGE_LOWPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3479 cmd.data1 = MXGE_HIGHPART_TO_U32(ss->fw_stats_dma.dmem_busaddr);
3480 cmd.data2 = sizeof(struct mcp_irq_data);
3481 cmd.data2 |= (slice << 16);
3482 err |= mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_V2, &cmd);
3483 }
3485 if (err != 0) {
3486 bus = sc->ss->fw_stats_dma.dmem_busaddr;
3487 bus += offsetof(struct mcp_irq_data, send_done_count);
3488 cmd.data0 = MXGE_LOWPART_TO_U32(bus);
3489 cmd.data1 = MXGE_HIGHPART_TO_U32(bus);
3490 err = mxge_send_cmd(sc, MXGEFW_CMD_SET_STATS_DMA_OBSOLETE,
3491 &cmd);
3493 /* Firmware cannot support multicast without STATS_DMA_V2 */
3494 sc->fw_multicast_support = 0;
3495 } else {
3496 sc->fw_multicast_support = 1;
3497 }
3499 if (err != 0) {
3500 if_printf(ifp, "failed to setup params\n");
3501 goto abort;
3502 }
3504 for (slice = 0; slice < sc->num_slices; slice++) {
3505 err = mxge_slice_open(&sc->ss[slice], cl_size);
3506 if (err != 0) {
3507 if_printf(ifp, "couldn't open slice %d\n", slice);
3508 goto abort;
3509 }
3510 }
3512 /* Finally, start the firmware running */
3513 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_UP, &cmd);
3514 if (err) {
3515 if_printf(ifp, "Couldn't bring up link\n");
3516 goto abort;
3517 }
3519 ifp->if_flags |= IFF_RUNNING;
3520 for (i = 0; i < sc->num_tx_rings; ++i) {
3521 mxge_tx_ring_t *tx = &sc->ss[i].tx;
3523 ifsq_clr_oactive(tx->ifsq);
3524 ifsq_watchdog_start(&tx->watchdog);
3525 }
3527 return 0;
3529 abort:
3530 mxge_free_mbufs(sc);
3531 return err;
3534 static void
3535 mxge_close(mxge_softc_t *sc, int down)
3537 struct ifnet *ifp = sc->ifp;
3538 mxge_cmd_t cmd;
3539 int err, old_down_cnt, i;
3541 ASSERT_IFNET_SERIALIZED_ALL(ifp);
3543 if (!down) {
3544 old_down_cnt = sc->down_cnt;
3545 wmb();
3547 err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
3548 if (err)
3549 if_printf(ifp, "Couldn't bring down link\n");
3551 if (old_down_cnt == sc->down_cnt) {
3552 /*
3553 * Wait for down irq
3554 * XXX racy
3555 */
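/*
 * The down irq shows up as a link_down event in the firmware
 * stats block; mxge_intr_status() adds it to sc->down_cnt,
 * which is what the old_down_cnt comparisons detect.
 */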
3556 ifnet_deserialize_all(ifp);
3557 DELAY(10 * sc->intr_coal_delay);
3558 ifnet_serialize_all(ifp);
3559 }
3561 wmb();
3562 if (old_down_cnt == sc->down_cnt)
3563 if_printf(ifp, "never got down irq\n");
3564 }
3565 mxge_free_mbufs(sc);
3567 ifp->if_flags &= ~IFF_RUNNING;
3568 for (i = 0; i < sc->num_tx_rings; ++i) {
3569 mxge_tx_ring_t *tx = &sc->ss[i].tx;
3571 ifsq_clr_oactive(tx->ifsq);
3572 ifsq_watchdog_stop(&tx->watchdog);
3576 static void
3577 mxge_setup_cfg_space(mxge_softc_t *sc)
3579 device_t dev = sc->dev;
3580 int reg;
3581 uint16_t lnk, pectl;
3583 /* Find the PCIe link width and set max read request to 4KB */
3584 if (pci_find_extcap(dev, PCIY_EXPRESS, &reg) == 0) {
3585 lnk = pci_read_config(dev, reg + 0x12, 2);
3586 sc->link_width = (lnk >> 4) & 0x3f;
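/*
 * Bits 14:12 of the PCIe device control register hold the max
 * read request size; the value 5 below selects 128 << 5 = 4096
 * bytes.
 */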
3588 if (sc->pectl == 0) {
3589 pectl = pci_read_config(dev, reg + 0x8, 2);
3590 pectl = (pectl & ~0x7000) | (5 << 12);
3591 pci_write_config(dev, reg + 0x8, pectl, 2);
3592 sc->pectl = pectl;
3593 } else {
3594 /* Restore saved pectl after watchdog reset */
3595 pci_write_config(dev, reg + 0x8, sc->pectl, 2);
3596 }
3597 }
3599 /* Enable DMA and memory space access */
3600 pci_enable_busmaster(dev);
3603 static uint32_t
3604 mxge_read_reboot(mxge_softc_t *sc)
3606 device_t dev = sc->dev;
3607 uint32_t vs;
3609 /* Find the vendor specific offset */
3610 if (pci_find_extcap(dev, PCIY_VENDOR, &vs) != 0) {
3611 if_printf(sc->ifp, "could not find vendor specific offset\n");
3612 return (uint32_t)-1;
3613 }
3614 /* Enable read32 mode */
3615 pci_write_config(dev, vs + 0x10, 0x3, 1);
3616 /* Tell NIC which register to read */
3617 pci_write_config(dev, vs + 0x18, 0xfffffff0, 4);
3618 return pci_read_config(dev, vs + 0x14, 4);
3621 static void
3622 mxge_watchdog_reset(mxge_softc_t *sc)
3624 struct pci_devinfo *dinfo;
3625 int err, running;
3626 uint32_t reboot;
3627 uint16_t cmd;
3629 err = ENXIO;
3631 if_printf(sc->ifp, "Watchdog reset!\n");
3633 /*
3634 * Check to see if the NIC rebooted. If it did, then all of
3635 * PCI config space has been reset, and things like the
3636 * busmaster bit will be zero. If this is the case, then we
3637 * must restore PCI config space before the NIC can be used
3638 * again.
3639 */
3640 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3641 if (cmd == 0xffff) {
3642 /*
3643 * Maybe the watchdog caught the NIC rebooting; wait
3644 * up to 100ms for it to finish. If it does not come
3645 * back, then give up.
3646 */
3647 DELAY(1000*100);
3648 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3649 if (cmd == 0xffff)
3650 if_printf(sc->ifp, "NIC disappeared!\n");
3651 }
3652 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3653 /* Print the reboot status */
3654 reboot = mxge_read_reboot(sc);
3655 if_printf(sc->ifp, "NIC rebooted, status = 0x%x\n", reboot);
3657 running = sc->ifp->if_flags & IFF_RUNNING;
3658 if (running) {
3659 /*
3660 * Quiesce NIC so that TX routines will not try to
3661 * xmit after restoration of BAR
3662 */
3664 /* Mark the link as down */
3665 if (sc->link_state) {
3666 sc->ifp->if_link_state = LINK_STATE_DOWN;
3667 if_link_state_change(sc->ifp);
3668 }
3669 mxge_close(sc, 1);
3670 }
3671 /* Restore PCI configuration space */
3672 dinfo = device_get_ivars(sc->dev);
3673 pci_cfg_restore(sc->dev, dinfo);
3675 /* And redo any changes we made to our config space */
3676 mxge_setup_cfg_space(sc);
3678 /* Reload f/w */
3679 err = mxge_load_firmware(sc, 0);
3680 if (err)
3681 if_printf(sc->ifp, "Unable to re-load f/w\n");
3682 if (running && !err) {
3683 int i;
3685 err = mxge_open(sc);
3687 for (i = 0; i < sc->num_tx_rings; ++i)
3688 ifsq_devstart_sched(sc->ss[i].tx.ifsq);
3689 }
3690 sc->watchdog_resets++;
3691 } else {
3692 if_printf(sc->ifp, "NIC did not reboot, not resetting\n");
3693 err = 0;
3694 }
3695 if (err) {
3696 if_printf(sc->ifp, "watchdog reset failed\n");
3697 } else {
3698 if (sc->dying == 2)
3699 sc->dying = 0;
3700 callout_reset(&sc->co_hdl, mxge_ticks, mxge_tick, sc);
3704 static void
3705 mxge_warn_stuck(mxge_softc_t *sc, mxge_tx_ring_t *tx, int slice)
3707 if_printf(sc->ifp, "slice %d stuck? ring state:\n", slice);
3708 if_printf(sc->ifp, "tx.req=%d tx.done=%d, tx.queue_active=%d\n",
3709 tx->req, tx->done, tx->queue_active);
3710 if_printf(sc->ifp, "tx.activate=%d tx.deactivate=%d\n",
3711 tx->activate, tx->deactivate);
3712 if_printf(sc->ifp, "pkt_done=%d fw=%d\n",
3713 tx->pkt_done, be32toh(sc->ss->fw_stats->send_done_count));
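/*
 * Return the number of packets moved (rx + tx) since the last
 * call; mxge_tick() uses a zero return to detect an idle NIC.
 */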
3716 static u_long
3717 mxge_update_stats(mxge_softc_t *sc)
3719 u_long ipackets, opackets, pkts;
3721 IFNET_STAT_GET(sc->ifp, ipackets, ipackets);
3722 IFNET_STAT_GET(sc->ifp, opackets, opackets);
3724 pkts = ipackets - sc->ipackets;
3725 pkts += opackets - sc->opackets;
3727 sc->ipackets = ipackets;
3728 sc->opackets = opackets;
3730 return pkts;
3733 static void
3734 mxge_tick(void *arg)
3736 mxge_softc_t *sc = arg;
3737 u_long pkts = 0;
3738 int err = 0;
3739 int ticks;
3741 lwkt_serialize_enter(&sc->main_serialize);
3743 ticks = mxge_ticks;
3744 if (sc->ifp->if_flags & IFF_RUNNING) {
3745 /* Aggregate stats from different slices */
3746 pkts = mxge_update_stats(sc);
3747 if (sc->need_media_probe)
3748 mxge_media_probe(sc);
3750 if (pkts == 0) {
3751 uint16_t cmd;
3753 /* Ensure NIC did not suffer h/w fault while idle */
3754 cmd = pci_read_config(sc->dev, PCIR_COMMAND, 2);
3755 if ((cmd & PCIM_CMD_BUSMASTEREN) == 0) {
3756 sc->dying = 2;
3757 mxge_serialize_skipmain(sc);
3758 mxge_watchdog_reset(sc);
3759 mxge_deserialize_skipmain(sc);
3760 err = ENXIO;
3761 }
3763 /* Look less often if NIC is idle */
3764 ticks *= 4;
3765 }
3766 }
3767 if (err == 0)
3768 callout_reset(&sc->co_hdl, ticks, mxge_tick, sc);
3770 lwkt_serialize_exit(&sc->main_serialize);
3773 static int
3774 mxge_media_change(struct ifnet *ifp)
3776 mxge_softc_t *sc = ifp->if_softc;
3777 const struct ifmedia *ifm = &sc->media;
3778 int pause;
3780 if (IFM_OPTIONS(ifm->ifm_media) & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE)) {
3781 if (sc->pause)
3782 return 0;
3783 pause = 1;
3784 } else {
3785 if (!sc->pause)
3786 return 0;
3787 pause = 0;
3788 }
3789 return mxge_change_pause(sc, pause);
3792 static int
3793 mxge_change_mtu(mxge_softc_t *sc, int mtu)
3795 struct ifnet *ifp = sc->ifp;
3796 int real_mtu, old_mtu;
3797 int err = 0;
3799 real_mtu = mtu + ETHER_HDR_LEN + EVL_ENCAPLEN;
3800 if (mtu > sc->max_mtu || real_mtu < 60)
3801 return EINVAL;
3803 old_mtu = ifp->if_mtu;
3804 ifp->if_mtu = mtu;
3805 if (ifp->if_flags & IFF_RUNNING) {
3806 mxge_close(sc, 0);
3807 err = mxge_open(sc);
3808 if (err != 0) {
3809 ifp->if_mtu = old_mtu;
3810 mxge_close(sc, 0);
3811 mxge_open(sc);
3812 }
3813 }
3814 return err;
3815 }
3817 static void
3818 mxge_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
3820 mxge_softc_t *sc = ifp->if_softc;
3822 ifmr->ifm_status = IFM_AVALID;
3823 ifmr->ifm_active = IFM_ETHER;
3825 if (sc->link_state)
3826 ifmr->ifm_status |= IFM_ACTIVE;
3828 /*
3829 * Autoselect is not supported, so the current media
3830 * should be delivered.
3831 */
3832 ifmr->ifm_active |= sc->current_media;
3833 if (sc->current_media != IFM_NONE) {
3834 ifmr->ifm_active |= MXGE_IFM;
3835 if (sc->pause)
3836 ifmr->ifm_active |= IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE;
3840 static int
3841 mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data,
3842 struct ucred *cr __unused)
3844 mxge_softc_t *sc = ifp->if_softc;
3845 struct ifreq *ifr = (struct ifreq *)data;
3846 int err, mask;
3848 ASSERT_IFNET_SERIALIZED_ALL(ifp);
3849 err = 0;
3851 switch (command) {
3852 case SIOCSIFMTU:
3853 err = mxge_change_mtu(sc, ifr->ifr_mtu);
3854 break;
3856 case SIOCSIFFLAGS:
3857 if (sc->dying)
3858 return EINVAL;
3860 if (ifp->if_flags & IFF_UP) {
3861 if (!(ifp->if_flags & IFF_RUNNING)) {
3862 err = mxge_open(sc);
3863 } else {
3864 /*
3865 * Take care of PROMISC and ALLMULTI
3866 * flag changes
3867 */
3868 mxge_change_promisc(sc,
3869 ifp->if_flags & IFF_PROMISC);
3870 mxge_set_multicast_list(sc);
3871 }
3872 } else {
3873 if (ifp->if_flags & IFF_RUNNING)
3874 mxge_close(sc, 0);
3875 }
3876 break;
3878 case SIOCADDMULTI:
3879 case SIOCDELMULTI:
3880 mxge_set_multicast_list(sc);
3881 break;
3883 case SIOCSIFCAP:
3884 mask = ifr->ifr_reqcap ^ ifp->if_capenable;
3885 if (mask & IFCAP_TXCSUM) {
3886 ifp->if_capenable ^= IFCAP_TXCSUM;
3887 if (ifp->if_capenable & IFCAP_TXCSUM)
3888 ifp->if_hwassist |= CSUM_TCP | CSUM_UDP;
3889 else
3890 ifp->if_hwassist &= ~(CSUM_TCP | CSUM_UDP);
3891 }
3892 if (mask & IFCAP_TSO) {
3893 ifp->if_capenable ^= IFCAP_TSO;
3894 if (ifp->if_capenable & IFCAP_TSO)
3895 ifp->if_hwassist |= CSUM_TSO;
3896 else
3897 ifp->if_hwassist &= ~CSUM_TSO;
3898 }
3899 if (mask & IFCAP_RXCSUM)
3900 ifp->if_capenable ^= IFCAP_RXCSUM;
3901 if (mask & IFCAP_VLAN_HWTAGGING)
3902 ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
3903 break;
3905 case SIOCGIFMEDIA:
3906 case SIOCSIFMEDIA:
3907 err = ifmedia_ioctl(ifp, (struct ifreq *)data,
3908 &sc->media, command);
3909 break;
3911 default:
3912 err = ether_ioctl(ifp, command, data);
3913 break;
3914 }
3915 return err;
3916 }
3918 static void
3919 mxge_fetch_tunables(mxge_softc_t *sc)
3921 int ifm;
3923 sc->intr_coal_delay = mxge_intr_coal_delay;
3924 if (sc->intr_coal_delay < 0 || sc->intr_coal_delay > (10 * 1000))
3925 sc->intr_coal_delay = MXGE_INTR_COAL_DELAY;
3927 /* XXX */
3928 if (mxge_ticks == 0)
3929 mxge_ticks = hz / 2;
3931 ifm = ifmedia_str2ethfc(mxge_flowctrl);
3932 if (ifm & (IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE))
3933 sc->pause = 1;
3935 sc->use_rss = mxge_use_rss;
3937 sc->throttle = mxge_throttle;
3938 if (sc->throttle && sc->throttle > MXGE_MAX_THROTTLE)
3939 sc->throttle = MXGE_MAX_THROTTLE;
3940 if (sc->throttle && sc->throttle < MXGE_MIN_THROTTLE)
3941 sc->throttle = MXGE_MIN_THROTTLE;
3944 static void
3945 mxge_free_slices(mxge_softc_t *sc)
3947 struct mxge_slice_state *ss;
3948 int i;
3950 if (sc->ss == NULL)
3951 return;
3953 for (i = 0; i < sc->num_slices; i++) {
3954 ss = &sc->ss[i];
3955 if (ss->fw_stats != NULL) {
3956 mxge_dma_free(&ss->fw_stats_dma);
3957 ss->fw_stats = NULL;
3958 }
3959 if (ss->rx_data.rx_done.entry != NULL) {
3960 mxge_dma_free(&ss->rx_done_dma);
3961 ss->rx_data.rx_done.entry = NULL;
3962 }
3963 }
3964 kfree(sc->ss, M_DEVBUF);
3965 sc->ss = NULL;
3966 }
3968 static int
3969 mxge_alloc_slices(mxge_softc_t *sc)
3971 mxge_cmd_t cmd;
3972 struct mxge_slice_state *ss;
3973 size_t bytes;
3974 int err, i, rx_ring_size;
3976 err = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
3977 if (err != 0) {
3978 device_printf(sc->dev, "Cannot determine rx ring size\n");
3979 return err;
3980 }
3981 rx_ring_size = cmd.data0;
3982 sc->rx_intr_slots = 2 * (rx_ring_size / sizeof (mcp_dma_addr_t));
3984 bytes = sizeof(*sc->ss) * sc->num_slices;
3985 sc->ss = kmalloc_cachealign(bytes, M_DEVBUF, M_WAITOK | M_ZERO);
3987 for (i = 0; i < sc->num_slices; i++) {
3988 ss = &sc->ss[i];
3990 ss->sc = sc;
3992 lwkt_serialize_init(&ss->rx_data.rx_serialize);
3993 lwkt_serialize_init(&ss->tx.tx_serialize);
3994 ss->intr_rid = -1;
3996 /*
3997 * Allocate per-slice rx interrupt queue
3998 * XXX assume 4bytes mcp_slot
3999 */
4000 bytes = sc->rx_intr_slots * sizeof(mcp_slot_t);
4001 err = mxge_dma_alloc(sc, &ss->rx_done_dma, bytes, 4096);
4002 if (err != 0) {
4003 device_printf(sc->dev,
4004 "alloc %d slice rx_done failed\n", i);
4005 return err;
4006 }
4007 ss->rx_data.rx_done.entry = ss->rx_done_dma.dmem_addr;
4009 /*
4010 * Allocate the per-slice firmware stats
4011 */
4012 bytes = sizeof(*ss->fw_stats);
4013 err = mxge_dma_alloc(sc, &ss->fw_stats_dma,
4014 sizeof(*ss->fw_stats), 64);
4015 if (err != 0) {
4016 device_printf(sc->dev,
4017 "alloc %d fw_stats failed\n", i);
4018 return err;
4019 }
4020 ss->fw_stats = ss->fw_stats_dma.dmem_addr;
4021 }
4022 return 0;
4023 }
4025 static void
4026 mxge_slice_probe(mxge_softc_t *sc)
4028 int status, max_intr_slots, max_slices, num_slices;
4029 int msix_cnt, msix_enable, multi_tx;
4030 mxge_cmd_t cmd;
4031 const char *old_fw;
4033 sc->num_slices = 1;
4034 sc->num_tx_rings = 1;
4036 num_slices = device_getenv_int(sc->dev, "num_slices", mxge_num_slices);
4037 if (num_slices == 1)
4038 return;
4040 if (netisr_ncpus == 1)
4041 return;
4043 msix_enable = device_getenv_int(sc->dev, "msix.enable",
4044 mxge_msix_enable);
4045 if (!msix_enable)
4046 return;
4048 msix_cnt = pci_msix_count(sc->dev);
4049 if (msix_cnt < 2)
4050 return;
4051 if (bootverbose)
4052 device_printf(sc->dev, "MSI-X count %d\n", msix_cnt);
4054 /*
4055 * Now load the slice aware firmware and see what it supports
4056 */
4057 old_fw = sc->fw_name;
4058 if (old_fw == mxge_fw_aligned)
4059 sc->fw_name = mxge_fw_rss_aligned;
4060 else
4061 sc->fw_name = mxge_fw_rss_unaligned;
4062 status = mxge_load_firmware(sc, 0);
4063 if (status != 0) {
4064 device_printf(sc->dev, "Falling back to a single slice\n");
4065 return;
4066 }
4068 /*
4069 * Try to send a reset command to the card to see if it is alive
4070 */
4071 memset(&cmd, 0, sizeof(cmd));
4072 status = mxge_send_cmd(sc, MXGEFW_CMD_RESET, &cmd);
4073 if (status != 0) {
4074 device_printf(sc->dev, "failed reset\n");
4075 goto abort_with_fw;
4076 }
4078 /*
4079 * Get rx ring size to calculate rx interrupt queue size
4080 */
4081 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_RX_RING_SIZE, &cmd);
4082 if (status != 0) {
4083 device_printf(sc->dev, "Cannot determine rx ring size\n");
4084 goto abort_with_fw;
4085 }
4086 max_intr_slots = 2 * (cmd.data0 / sizeof(mcp_dma_addr_t));
4088 /*
4089 * Tell it the size of the rx interrupt queue
4090 */
4091 cmd.data0 = max_intr_slots * sizeof(struct mcp_slot);
4092 status = mxge_send_cmd(sc, MXGEFW_CMD_SET_INTRQ_SIZE, &cmd);
4093 if (status != 0) {
4094 device_printf(sc->dev, "failed MXGEFW_CMD_SET_INTRQ_SIZE\n");
4095 goto abort_with_fw;
4096 }
4098 /*
4099 * Ask for the maximum number of slices it supports
4100 */
4101 status = mxge_send_cmd(sc, MXGEFW_CMD_GET_MAX_RSS_QUEUES, &cmd);
4102 if (status != 0) {
4103 device_printf(sc->dev,
4104 "failed MXGEFW_CMD_GET_MAX_RSS_QUEUES\n");
4105 goto abort_with_fw;
4106 }
4107 max_slices = cmd.data0;
4108 if (bootverbose)
4109 device_printf(sc->dev, "max slices %d\n", max_slices);
4111 if (max_slices > msix_cnt)
4112 max_slices = msix_cnt;
4114 sc->ring_map = if_ringmap_alloc(sc->dev, num_slices, max_slices);
4115 sc->num_slices = if_ringmap_count(sc->ring_map);
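/*
 * The ringmap settles the final slice count from the requested
 * 'num_slices' and the max_slices/MSI-X limit; it also records
 * the cpu assignments that mxge_open() later uses to fill the
 * RSS redirect table.
 */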
4117 multi_tx = device_getenv_int(sc->dev, "multi_tx", mxge_multi_tx);
4118 if (multi_tx)
4119 sc->num_tx_rings = sc->num_slices;
4121 if (bootverbose) {
4122 device_printf(sc->dev, "using %d slices, max %d\n",
4123 sc->num_slices, max_slices);
4124 }
4126 if (sc->num_slices == 1)
4127 goto abort_with_fw;
4128 return;
4130 abort_with_fw:
4131 sc->fw_name = old_fw;
4132 mxge_load_firmware(sc, 0);
4135 static void
4136 mxge_setup_serialize(struct mxge_softc *sc)
4138 int i = 0, slice;
4140 /* Main + rx + tx */
4141 sc->nserialize = (2 * sc->num_slices) + 1;
4142 sc->serializes =
4143 kmalloc(sc->nserialize * sizeof(struct lwkt_serialize *),
4144 M_DEVBUF, M_WAITOK | M_ZERO);
4146 /*
4147 * Setup serializes
4148 *
4149 * NOTE: Order is critical
4150 */
4152 KKASSERT(i < sc->nserialize);
4153 sc->serializes[i++] = &sc->main_serialize;
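/*
 * main_serialize must stay at index 0: the skipmain helpers
 * enter/exit this array starting at index 1 to skip it.
 */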
4155 for (slice = 0; slice < sc->num_slices; ++slice) {
4156 KKASSERT(i < sc->nserialize);
4157 sc->serializes[i++] = &sc->ss[slice].rx_data.rx_serialize;
4158 }
4160 for (slice = 0; slice < sc->num_slices; ++slice) {
4161 KKASSERT(i < sc->nserialize);
4162 sc->serializes[i++] = &sc->ss[slice].tx.tx_serialize;
4163 }
4165 KKASSERT(i == sc->nserialize);

static void
mxge_serialize(struct ifnet *ifp, enum ifnet_serialize slz)
{
	struct mxge_softc *sc = ifp->if_softc;

	ifnet_serialize_array_enter(sc->serializes, sc->nserialize, slz);
}

static void
mxge_deserialize(struct ifnet *ifp, enum ifnet_serialize slz)
{
	struct mxge_softc *sc = ifp->if_softc;

	ifnet_serialize_array_exit(sc->serializes, sc->nserialize, slz);
}

static int
mxge_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz)
{
	struct mxge_softc *sc = ifp->if_softc;

	return ifnet_serialize_array_try(sc->serializes, sc->nserialize, slz);
}

#ifdef INVARIANTS

static void
mxge_serialize_assert(struct ifnet *ifp, enum ifnet_serialize slz,
    boolean_t serialized)
{
	struct mxge_softc *sc = ifp->if_softc;

	ifnet_serialize_array_assert(sc->serializes, sc->nserialize,
	    slz, serialized);
}

#endif	/* INVARIANTS */

#ifdef IFPOLL_ENABLE

static void
mxge_npoll_rx(struct ifnet *ifp, void *xss, int cycle)
{
	struct mxge_slice_state *ss = xss;
	mxge_rx_done_t *rx_done = &ss->rx_data.rx_done;

	ASSERT_SERIALIZED(&ss->rx_data.rx_serialize);

	if (rx_done->entry[rx_done->idx].length != 0) {
		mxge_clean_rx_done(&ss->sc->arpcom.ac_if, &ss->rx_data, cycle);
	} else {
		/*
		 * XXX
		 * This register write obviously has a cost; however,
		 * if we don't hand back the rx token, the upcoming
		 * packets may suffer a ridiculously large delay, as
		 * observed on 8AL-C using ping(8).
		 */
		*ss->irq_claim = be32toh(3);
	}
}

static void
mxge_npoll(struct ifnet *ifp, struct ifpoll_info *info)
{
	struct mxge_softc *sc = ifp->if_softc;
	int i;

	if (info == NULL)
		return;

	/*
	 * Only poll rx; polling tx and status don't seem to work
	 */
	for (i = 0; i < sc->num_slices; ++i) {
		struct mxge_slice_state *ss = &sc->ss[i];
		int cpu = ss->intr_cpuid;

		KKASSERT(cpu < netisr_ncpus);
		info->ifpi_rx[cpu].poll_func = mxge_npoll_rx;
		info->ifpi_rx[cpu].arg = ss;
		info->ifpi_rx[cpu].serializer = &ss->rx_data.rx_serialize;
	}
}
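
/*
 * NOTE: The registration above is keyed by ss->intr_cpuid, so each
 * slice is polled on the same CPU its MSI-X vector targets, and the
 * per-slice rx serializer keeps the polling and interrupt paths from
 * touching the same rx ring concurrently.
 */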

#endif	/* IFPOLL_ENABLE */

static int
mxge_attach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);
	struct ifnet *ifp = &sc->arpcom.ac_if;
	int err, rid, i;

	/*
	 * Avoid rewriting half the lines in this file to use
	 * &sc->arpcom.ac_if instead
	 */
	sc->ifp = ifp;
	sc->dev = dev;
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* IFM_ETH_FORCEPAUSE can't be changed */
	ifmedia_init(&sc->media, IFM_ETH_RXPAUSE | IFM_ETH_TXPAUSE,
	    mxge_media_change, mxge_media_status);

	lwkt_serialize_init(&sc->main_serialize);

	mxge_fetch_tunables(sc);

	err = bus_dma_tag_create(NULL,		/* parent */
	    1,					/* alignment */
	    0,					/* boundary */
	    BUS_SPACE_MAXADDR,			/* low */
	    BUS_SPACE_MAXADDR,			/* high */
	    NULL, NULL,				/* filter */
	    BUS_SPACE_MAXSIZE_32BIT,		/* maxsize */
	    0,					/* num segs */
	    BUS_SPACE_MAXSIZE_32BIT,		/* maxsegsize */
	    0,					/* flags */
	    &sc->parent_dmat);			/* tag */
	if (err != 0) {
		device_printf(dev, "Err %d allocating parent dmat\n", err);
		goto failed;
	}

	callout_init_mp(&sc->co_hdl);

	mxge_setup_cfg_space(sc);

	/*
	 * Map the board into the kernel
	 */
	rid = PCIR_BARS;
	sc->mem_res = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
	    &rid, RF_ACTIVE);
	if (sc->mem_res == NULL) {
		device_printf(dev, "could not map memory\n");
		err = ENXIO;
		goto failed;
	}

	sc->sram = rman_get_virtual(sc->mem_res);
	sc->sram_size = 2*1024*1024 - (2*(48*1024)+(32*1024)) - 0x100;
	if (sc->sram_size > rman_get_size(sc->mem_res)) {
		device_printf(dev, "impossible memory region size %ld\n",
		    rman_get_size(sc->mem_res));
		err = ENXIO;
		goto failed;
	}
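	/*
	 * NOTE: The 2MB figure above is the size of the NIC's on-board
	 * SRAM; the subtracted regions (2 x 48KB, 32KB and 0x100 bytes)
	 * appear to be reserved at the top for firmware use, leaving
	 * sram_size as the host-usable window.
	 */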

	/*
	 * Make NULL terminated copy of the EEPROM strings section of
	 * lanai SRAM
	 */
	bzero(sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE);
	bus_space_read_region_1(rman_get_bustag(sc->mem_res),
	    rman_get_bushandle(sc->mem_res),
	    sc->sram_size - MXGE_EEPROM_STRINGS_SIZE,
	    sc->eeprom_strings, MXGE_EEPROM_STRINGS_SIZE - 2);
	err = mxge_parse_strings(sc);
	if (err != 0) {
		device_printf(dev, "parse EEPROM string failed\n");
		goto failed;
	}

	/*
	 * Enable write combining for efficient use of PCIe bus
	 */
	mxge_enable_wc(sc);

	/*
	 * Allocate the out of band DMA memory
	 */
	err = mxge_dma_alloc(sc, &sc->cmd_dma, sizeof(mxge_cmd_t), 64);
	if (err != 0) {
		device_printf(dev, "alloc cmd DMA buf failed\n");
		goto failed;
	}
	sc->cmd = sc->cmd_dma.dmem_addr;

	err = mxge_dma_alloc(sc, &sc->zeropad_dma, 64, 64);
	if (err != 0) {
		device_printf(dev, "alloc zeropad DMA buf failed\n");
		goto failed;
	}

	err = mxge_dma_alloc(sc, &sc->dmabench_dma, 4096, 4096);
	if (err != 0) {
		device_printf(dev, "alloc dmabench DMA buf failed\n");
		goto failed;
	}

	/* Select & load the firmware */
	err = mxge_select_firmware(sc);
	if (err != 0) {
		device_printf(dev, "select firmware failed\n");
		goto failed;
	}

	mxge_slice_probe(sc);
	err = mxge_alloc_slices(sc);
	if (err != 0) {
		device_printf(dev, "alloc slices failed\n");
		goto failed;
	}

	err = mxge_alloc_intr(sc);
	if (err != 0) {
		device_printf(dev, "alloc intr failed\n");
		goto failed;
	}

	/* Setup serializes */
	mxge_setup_serialize(sc);

	err = mxge_reset(sc, 0);
	if (err != 0) {
		device_printf(dev, "reset failed\n");
		goto failed;
	}

	err = mxge_alloc_rings(sc);
	if (err != 0) {
		device_printf(dev, "failed to allocate rings\n");
		goto failed;
	}
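	/*
	 * NOTE: The ordering above matters: mxge_slice_probe() settles
	 * sc->num_slices, mxge_alloc_intr() then binds one vector per
	 * slice, and mxge_setup_serialize() sizes its serializer array
	 * from the final slice count.
	 */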

	ifp->if_baudrate = IF_Gbps(10UL);
	ifp->if_capabilities = IFCAP_RXCSUM | IFCAP_TXCSUM | IFCAP_TSO;
	ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_TSO;

	ifp->if_capabilities |= IFCAP_VLAN_MTU;
#if 0
	/* Well, it's software, sigh */
	ifp->if_capabilities |= IFCAP_VLAN_HWTAGGING;
#endif
	ifp->if_capenable = ifp->if_capabilities;

	ifp->if_softc = sc;
	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
	ifp->if_init = mxge_init;
	ifp->if_ioctl = mxge_ioctl;
	ifp->if_start = mxge_start;
#ifdef IFPOLL_ENABLE
	if (sc->intr_type != PCI_INTR_TYPE_LEGACY)
		ifp->if_npoll = mxge_npoll;
#endif
	ifp->if_serialize = mxge_serialize;
	ifp->if_deserialize = mxge_deserialize;
	ifp->if_tryserialize = mxge_tryserialize;
#ifdef INVARIANTS
	ifp->if_serialize_assert = mxge_serialize_assert;
#endif

	/* Increase TSO burst length */
	ifp->if_tsolen = (32 * ETHERMTU);

	/* Initialise the ifmedia structure */
	mxge_media_init(sc);
	mxge_media_probe(sc);

	ether_ifattach(ifp, sc->mac_addr, NULL);

	/* Setup TX rings and subqueues */
	for (i = 0; i < sc->num_tx_rings; ++i) {
		struct ifaltq_subque *ifsq = ifq_get_subq(&ifp->if_snd, i);
		struct mxge_slice_state *ss = &sc->ss[i];

		ifsq_set_cpuid(ifsq, ss->intr_cpuid);
		ifsq_set_hw_serialize(ifsq, &ss->tx.tx_serialize);
		ifsq_set_priv(ifsq, &ss->tx);
		ss->tx.ifsq = ifsq;

		ifsq_watchdog_init(&ss->tx.watchdog, ifsq, mxge_watchdog);
	}
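	/*
	 * Binding each tx subqueue to its slice's interrupt CPU above
	 * keeps a ring's transmit, completion and watchdog processing
	 * on one CPU; ifsq_set_hw_serialize() names the serializer that
	 * protects the ring's hardware state.
	 */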

	/*
	 * XXX
	 * We are not ready to do "gather" jumbo frames, so
	 * limit the MTU to MJUMPAGESIZE
	 */
	sc->max_mtu = MJUMPAGESIZE -
	    ETHER_HDR_LEN - EVL_ENCAPLEN - MXGEFW_PAD - 1;
	sc->dying = 0;

	err = mxge_setup_intr(sc);
	if (err != 0) {
		device_printf(dev, "alloc and setup intr failed\n");
		ether_ifdetach(ifp);
		goto failed;
	}

	mxge_add_sysctls(sc);

	/* Increase non-cluster mbuf limit; used by small RX rings */
	mb_inclimit(ifp->if_nmbclusters);

	callout_reset_bycpu(&sc->co_hdl, mxge_ticks, mxge_tick, sc,
	    sc->ss[0].intr_cpuid);
	return 0;

failed:
	mxge_detach(dev);
	return err;
}

static int
mxge_detach(device_t dev)
{
	mxge_softc_t *sc = device_get_softc(dev);

	if (device_is_attached(dev)) {
		struct ifnet *ifp = sc->ifp;
		int mblimit = ifp->if_nmbclusters;

		ifnet_serialize_all(ifp);

		sc->dying = 1;
		if (ifp->if_flags & IFF_RUNNING)
			mxge_close(sc, 1);
		callout_stop(&sc->co_hdl);

		mxge_teardown_intr(sc, sc->num_slices);

		ifnet_deserialize_all(ifp);

		callout_terminate(&sc->co_hdl);

		ether_ifdetach(ifp);

		/* Decrease non-cluster mbuf limit increased by us */
		mb_inclimit(-mblimit);
	}
	ifmedia_removeall(&sc->media);

	if (sc->cmd != NULL && sc->zeropad_dma.dmem_addr != NULL &&
	    sc->sram != NULL)
		mxge_dummy_rdma(sc, 0);

	mxge_free_intr(sc);
	mxge_rem_sysctls(sc);
	mxge_free_rings(sc);

	/* MUST after sysctls, intr and rings are freed */
	mxge_free_slices(sc);

	if (sc->dmabench_dma.dmem_addr != NULL)
		mxge_dma_free(&sc->dmabench_dma);
	if (sc->zeropad_dma.dmem_addr != NULL)
		mxge_dma_free(&sc->zeropad_dma);
	if (sc->cmd_dma.dmem_addr != NULL)
		mxge_dma_free(&sc->cmd_dma);

	if (sc->msix_table_res != NULL) {
		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(2),
		    sc->msix_table_res);
	}
	if (sc->mem_res != NULL) {
		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BARS,
		    sc->mem_res);
	}

	if (sc->parent_dmat != NULL)
		bus_dma_tag_destroy(sc->parent_dmat);

	if (sc->ring_map != NULL)
		if_ringmap_free(sc->ring_map);

	return 0;
}
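
/*
 * NOTE: mxge_detach() doubles as the error unwind for mxge_attach()
 * (see the "failed" label there), which is why every release above is
 * guarded by a NULL or validity check.
 */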

static int
mxge_shutdown(device_t dev)
{
	return 0;
}

static void
mxge_free_msix(struct mxge_softc *sc, boolean_t setup)
{
	int i;

	KKASSERT(sc->num_slices > 1);

	for (i = 0; i < sc->num_slices; ++i) {
		struct mxge_slice_state *ss = &sc->ss[i];

		if (ss->intr_res != NULL) {
			bus_release_resource(sc->dev, SYS_RES_IRQ,
			    ss->intr_rid, ss->intr_res);
		}
		if (ss->intr_rid >= 0)
			pci_release_msix_vector(sc->dev, ss->intr_rid);
	}

	if (setup)
		pci_teardown_msix(sc->dev);
}

static int
mxge_alloc_msix(struct mxge_softc *sc)
{
	struct mxge_slice_state *ss;
	int rid, error, i;
	boolean_t setup = FALSE;

	KKASSERT(sc->num_slices > 1);

	ss = &sc->ss[0];

	ss->intr_serialize = &sc->main_serialize;
	ss->intr_func = mxge_msi;
	ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
	    "%s comb", device_get_nameunit(sc->dev));
	ss->intr_desc = ss->intr_desc0;
	ss->intr_cpuid = if_ringmap_cpumap(sc->ring_map, 0);

	for (i = 1; i < sc->num_slices; ++i) {
		ss = &sc->ss[i];

		ss->intr_serialize = &ss->rx_data.rx_serialize;
		if (sc->num_tx_rings == 1) {
			ss->intr_func = mxge_msix_rx;
			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
			    "%s rx%d", device_get_nameunit(sc->dev), i);
		} else {
			ss->intr_func = mxge_msix_rxtx;
			ksnprintf(ss->intr_desc0, sizeof(ss->intr_desc0),
			    "%s rxtx%d", device_get_nameunit(sc->dev), i);
		}
		ss->intr_desc = ss->intr_desc0;
		ss->intr_cpuid = if_ringmap_cpumap(sc->ring_map, i);
	}
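	/*
	 * NOTE: Vector 0 is the combined interrupt: it runs slice 0's
	 * rx/tx and, judging by the "comb" description, the status
	 * events as well, while vectors 1..n-1 handle only their
	 * slice's rx (or rx+tx when multiple tx rings are enabled).
	 * Each vector's target CPU comes from the ringmap set up in
	 * mxge_slice_probe().
	 */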

	rid = PCIR_BAR(2);
	sc->msix_table_res = bus_alloc_resource_any(sc->dev, SYS_RES_MEMORY,
	    &rid, RF_ACTIVE);
	if (sc->msix_table_res == NULL) {
		device_printf(sc->dev, "couldn't alloc MSI-X table res\n");
		return ENXIO;
	}

	error = pci_setup_msix(sc->dev);
	if (error) {
		device_printf(sc->dev, "could not setup MSI-X\n");
		goto back;
	}
	setup = TRUE;

	for (i = 0; i < sc->num_slices; ++i) {
		ss = &sc->ss[i];

		error = pci_alloc_msix_vector(sc->dev, i, &ss->intr_rid,
		    ss->intr_cpuid);
		if (error) {
			device_printf(sc->dev, "could not alloc "
			    "MSI-X %d on cpu%d\n", i, ss->intr_cpuid);
			goto back;
		}

		ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
		    &ss->intr_rid, RF_ACTIVE);
		if (ss->intr_res == NULL) {
			device_printf(sc->dev, "could not alloc "
			    "MSI-X %d resource\n", i);
			error = ENXIO;
			goto back;
		}
	}

	pci_enable_msix(sc->dev);
	sc->intr_type = PCI_INTR_TYPE_MSIX;
back:
	if (error)
		mxge_free_msix(sc, setup);
	return error;
}
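
/*
 * NOTE: The "setup" flag records how far MSI-X bring-up got, so the
 * error path above only calls pci_teardown_msix() when
 * pci_setup_msix() actually succeeded.
 */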

static int
mxge_alloc_intr(struct mxge_softc *sc)
{
	struct mxge_slice_state *ss;
	u_int irq_flags;

	if (sc->num_slices > 1) {
		int error;

		error = mxge_alloc_msix(sc);
		if (error)
			return error;
		KKASSERT(sc->intr_type == PCI_INTR_TYPE_MSIX);
		return 0;
	}

	ss = &sc->ss[0];

	sc->intr_type = pci_alloc_1intr(sc->dev, mxge_msi_enable,
	    &ss->intr_rid, &irq_flags);

	ss->intr_res = bus_alloc_resource_any(sc->dev, SYS_RES_IRQ,
	    &ss->intr_rid, irq_flags);
	if (ss->intr_res == NULL) {
		device_printf(sc->dev, "could not alloc interrupt\n");
		return ENXIO;
	}

	if (sc->intr_type == PCI_INTR_TYPE_LEGACY)
		ss->intr_func = mxge_legacy;
	else
		ss->intr_func = mxge_msi;
	ss->intr_serialize = &sc->main_serialize;
	ss->intr_cpuid = rman_get_cpuid(ss->intr_res);

	return 0;
}
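
/*
 * NOTE: In the single-slice case pci_alloc_1intr() picks MSI or
 * legacy INTx (subject to the mxge_msi_enable tunable), and the
 * interrupt CPU is whatever CPU the allocated resource reports rather
 * than a ringmap assignment.
 */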

static int
mxge_setup_intr(struct mxge_softc *sc)
{
	int i;

	for (i = 0; i < sc->num_slices; ++i) {
		struct mxge_slice_state *ss = &sc->ss[i];
		int error;

		error = bus_setup_intr_descr(sc->dev, ss->intr_res,
		    INTR_MPSAFE, ss->intr_func, ss, &ss->intr_hand,
		    ss->intr_serialize, ss->intr_desc);
		if (error) {
			device_printf(sc->dev, "can't setup %dth intr\n", i);
			mxge_teardown_intr(sc, i);
			return error;
		}
	}
	return 0;
}

static void
mxge_teardown_intr(struct mxge_softc *sc, int cnt)
{
	int i;

	if (sc->ss == NULL)
		return;

	for (i = 0; i < cnt; ++i) {
		struct mxge_slice_state *ss = &sc->ss[i];

		bus_teardown_intr(sc->dev, ss->intr_res, ss->intr_hand);
	}
}

static void
mxge_free_intr(struct mxge_softc *sc)
{
	if (sc->ss == NULL)
		return;

	if (sc->intr_type != PCI_INTR_TYPE_MSIX) {
		struct mxge_slice_state *ss = &sc->ss[0];

		if (ss->intr_res != NULL) {
			bus_release_resource(sc->dev, SYS_RES_IRQ,
			    ss->intr_rid, ss->intr_res);
		}
		if (sc->intr_type == PCI_INTR_TYPE_MSI)
			pci_release_msi(sc->dev);
	} else {
		mxge_free_msix(sc, TRUE);