nvme - Cleanups, limit nirqs
[dragonfly.git] / sys / dev / disk / nvme / nvme_attach.c
blob 8c90d851fcb061284878c9495a596a02625ced4c
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "nvme.h"
static int nvme_pci_attach(device_t);
static int nvme_pci_detach(device_t);

static const nvme_device_t nvme_devices[] = {
	/* Vendor-specific table goes here (see ahci for example) */
	{ 0, 0, nvme_pci_attach, nvme_pci_detach, "NVME-PCIe" }
};
static int nvme_msix_enable = 1;
TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable);
static int nvme_msi_enable = 0;
TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable);
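
/*
 * e.g. both message-signaled interrupt modes can be turned off from
 * /boot/loader.conf (hypothetical override; the driver then falls back
 * to a legacy line interrupt via pci_alloc_1intr() below):
 *
 *	hw.nvme.msix.enable="0"
 *	hw.nvme.msi.enable="0"
 */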
TAILQ_HEAD(, nvme_softc) nvme_sc_list = TAILQ_HEAD_INITIALIZER(nvme_sc_list);
struct lock nvme_master_lock = LOCK_INITIALIZER("nvmstr", 0, 0);

static int last_global_cpu;
/*
 * Match during probe and attach.  The device does not yet have a softc.
 */
const nvme_device_t *
nvme_lookup_device(device_t dev)
{
	const nvme_device_t *ad;
	uint16_t vendor = pci_get_vendor(dev);
	uint16_t product = pci_get_device(dev);
	uint8_t class = pci_get_class(dev);
	uint8_t subclass = pci_get_subclass(dev);
	uint8_t progif = pci_read_config(dev, PCIR_PROGIF, 1);
	int is_nvme;
	/*
	 * Generally speaking if the pci device does not identify as
	 * NVMe we skip it.
	 */
	if (class == PCIC_STORAGE && subclass == PCIS_STORAGE_NVM &&
	    progif == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) {
		is_nvme = 1;
	} else {
		is_nvme = 0;
	}
	for (ad = &nvme_devices[0]; ad->vendor; ++ad) {
		if (ad->vendor == vendor && ad->product == product)
			return (ad);
	}
	/*
	 * Last ad is the default match if the PCI device matches NVMe.
	 */
	if (is_nvme == 0)
		ad = NULL;
	return (ad);
}
/*
 * Attach functions.  They all eventually fall through to nvme_pci_attach().
 */
static int
nvme_pci_attach(device_t dev)
{
	nvme_softc_t *sc = device_get_softc(dev);
	uint32_t reg;
	int error;
	int msi_enable;
	int msix_enable;
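
	/*
	 * NB: 0x0400 below is the INTx disable bit in the PCI command
	 * register (PCIM_CMD_INTxDIS); some BIOSes leave it set.
	 */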
	if (pci_read_config(dev, PCIR_COMMAND, 2) & 0x0400) {
		device_printf(dev, "BIOS disabled PCI interrupt, "
				   "re-enabling\n");
		pci_write_config(dev, PCIR_COMMAND,
			pci_read_config(dev, PCIR_COMMAND, 2) & ~0x0400, 2);
	}

	sc->dev = dev;
	/*
	 * Map the register window
	 */
	sc->rid_regs = PCIR_BAR(0);
	sc->regs = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
					  &sc->rid_regs, RF_ACTIVE);
	if (sc->regs == NULL) {
		device_printf(dev, "unable to map registers\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	sc->iot = rman_get_bustag(sc->regs);
	sc->ioh = rman_get_bushandle(sc->regs);
	/*
	 * NVMe allows the MSI-X table to be mapped to BAR 4/5.
	 * Always try to map BAR4, but it's ok if it fails.  Must
	 * be done prior to allocating our interrupts.
	 */
	sc->rid_bar4 = PCIR_BAR(4);
	sc->bar4 = bus_alloc_resource_any(dev, SYS_RES_MEMORY,
					  &sc->rid_bar4, RF_ACTIVE);
	/*
	 * Map the interrupt or initial interrupt which will be used for
	 * the admin queue.  NVME chipsets can potentially support a huge
	 * number of MSIX vectors but we really only need enough for
	 * available cpus, plus 1.
	 */
	msi_enable = device_getenv_int(dev, "msi.enable", nvme_msi_enable);
	msix_enable = device_getenv_int(dev, "msix.enable", nvme_msix_enable);
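
	/*
	 * (device_getenv_int() consults a per-device tunable first and
	 *  falls back to the global hw.nvme.* default passed as the third
	 *  argument.)
	 */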
	error = 0;
	if (msix_enable) {
		int i;
		int cpu;

		sc->nirqs = pci_msix_count(dev);
		sc->irq_type = PCI_INTR_TYPE_MSIX;
		if (sc->nirqs > ncpus + 1)		/* max we need */
			sc->nirqs = ncpus + 1;
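
		/*
		 * e.g. on a 16-cpu machine we use at most 17 vectors:
		 * one per cpu for the I/O queues plus one for the admin
		 * queue, regardless of how many the controller offers.
		 */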
		error = pci_setup_msix(dev);
		cpu = (last_global_cpu + 0) % ncpus;	/* GCC warn */
		for (i = 0; error == 0 && i < sc->nirqs; ++i) {
			cpu = (last_global_cpu + i) % ncpus;
			error = pci_alloc_msix_vector(dev, i,
						      &sc->rid_irq[i], cpu);
			if (error)
				break;
			sc->irq[i] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
							    &sc->rid_irq[i],
							    RF_ACTIVE);
			/*
			 * We want this to overwrite queue 0's cpu vector
			 * when the cpus rotate through later on.
			 */
			if (sc->cputovect[cpu] == 0)
				sc->cputovect[cpu] = i;
		}
		/*
		 * If we did not iterate enough cpus (that is, there weren't
		 * enough irqs for all available cpus) we still need to
		 * finish out the sc->cputovect[] mapping.
		 */
		while (error == 0) {
			cpu = (cpu + 1) % ncpus;
			i = (i + 1) % sc->nirqs;
			if (i == 0)
				i = 1;
			if (sc->cputovect[cpu] != 0)
				break;
			sc->cputovect[cpu] = i;
		}
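
		/*
		 * Illustration (assuming last_global_cpu == 0): with 4
		 * vectors on 8 cpus the for-loop above maps cpu0-cpu3 to
		 * vectors 0-3, the while-loop then wraps cpu4-cpu7 onto
		 * vectors 1,2,3,1 (vector 0 is reserved for the admin
		 * queue), and finally replaces cpu0's vector-0 slot with
		 * a non-zero I/O vector before hitting an already-mapped
		 * cpu and breaking out.
		 */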
		if (error) {
			while (--i >= 0) {
				bus_release_resource(dev, SYS_RES_IRQ,
						     sc->rid_irq[i],
						     sc->irq[i]);
				pci_release_msix_vector(dev, sc->rid_irq[i]);
				sc->irq[i] = NULL;
			}
			/* leave error intact to fall through to normal */
		} else {
			last_global_cpu = (last_global_cpu + sc->nirqs) % ncpus;
			pci_enable_msix(dev);
		}
	}
	if (error) {
		uint32_t irq_flags;

		error = 0;
		sc->nirqs = 1;
		sc->irq_type = pci_alloc_1intr(dev, msi_enable,
					       &sc->rid_irq[0], &irq_flags);
		sc->irq[0] = bus_alloc_resource_any(dev, SYS_RES_IRQ,
						    &sc->rid_irq[0], irq_flags);
	}
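
	/*
	 * (pci_alloc_1intr() hands back an MSI vector when msi_enable is
	 *  set and the device supports it, otherwise a legacy INTx line;
	 *  its return value records which type we actually got.)
	 */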
	if (sc->irq[0] == NULL) {
		device_printf(dev, "unable to map interrupt\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	} else {
		const char *type;

		switch(sc->irq_type) {
		case PCI_INTR_TYPE_MSI:
			type = "MSI";
			break;
		case PCI_INTR_TYPE_MSIX:
			type = "MSIX";
			break;
		default:
			type = "normal-int";
			break;
		}
		device_printf(dev, "mapped %d %s IRQs\n", sc->nirqs, type);
	}
	/*
	 * Make sure the chip is disabled, which will reset all controller
	 * registers except for the admin queue registers.  Device should
	 * already be disabled so this is usually instantaneous.  Use a
	 * fixed 5-second timeout in case it is not.  I'd like my other
	 * reads to occur after the device has been disabled.
	 */
	sc->entimo = hz * 5;
	error = nvme_enable(sc, 0);
	if (error) {
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	/*
	 * Get capabilities and version and report
	 */
	sc->vers = nvme_read(sc, NVME_REG_VERS);
	sc->cap = nvme_read8(sc, NVME_REG_CAP);
	sc->maxqe = NVME_CAP_MQES_GET(sc->cap);
	sc->dstrd4 = NVME_CAP_DSTRD_GET(sc->cap);

	device_printf(dev, "NVME Version %u.%u maxqe=%u caps=%016jx\n",
		      NVME_VERS_MAJOR_GET(sc->vers),
		      NVME_VERS_MINOR_GET(sc->vers),
		      sc->maxqe, sc->cap);
	/*
	 * Enable timeout, 500ms increments.  Convert to ticks.
	 */
	sc->entimo = NVME_CAP_TIMEOUT_GET(sc->cap) * hz / 2; /* in ticks */
	++sc->entimo;		/* fudge */
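
	/*
	 * e.g. a CAP timeout field of 6 (6 x 500ms = 3 seconds) yields
	 * 3 * hz ticks, plus one tick of fudge.
	 */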
	/*
	 * Validate maxqe.  To cap the amount of memory we reserve for
	 * PRPs we limit maxqe to 256.  Also make sure it is a power of
	 * two.
	 */
	if (sc->maxqe < 2) {
		device_printf(dev,
			      "Attach failed, max queue entries (%d) "
			      "below minimum (2)\n", sc->maxqe);
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	if (sc->maxqe > 256)
		sc->maxqe = 256;
	for (reg = 2; reg <= sc->maxqe; reg <<= 1)
		;
	sc->maxqe = reg >> 1;
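
	/*
	 * e.g. a reported maxqe of 100 rounds down to 64, while 256
	 * (already a power of two) is kept as-is.
	 */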
	/*
	 * DMA tags
	 *
	 * PRP - Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE
	 *	 (typically 64), multiplied by maxqe (typ 256).  Roughly
	 *	 ~128KB per queue.  Align for cache performance.  We actually
	 *	 need one more PRP per queue entry worst-case to handle
	 *	 buffer overlap, but we have an extra one in the command
	 *	 structure so we don't have to calculate that out.
	 *
	 *	 Remember that we intend to allocate potentially many queues,
	 *	 so we don't want to bloat this too much.  A queue depth of
	 *	 256 is plenty.
	 *
	 * CMD - Storage for the submit queue.  maxqe * 64	(~16KB)
	 *
	 * RES - Storage for the completion queue.  maxqe * 16	(~4KB)
	 *
	 * ADM - Storage for admin command DMA data.  Maximum admin command
	 *	 DMA data is 4KB so reserve maxqe * 4KB (~1MB).  There is
	 *	 only one admin queue.
	 *
	 * NOTE: There are no boundary requirements for NVMe, but I specify a
	 *	 4MB boundary anyway because this reduces mass-bit flipping
	 *	 of address bits inside the controller when incrementing
	 *	 DMA addresses.  Why not?  Can't hurt.
	 */
	sc->prp_bytes = sizeof(uint64_t) * (MAXPHYS / PAGE_SIZE) * sc->maxqe;
	sc->cmd_bytes = sizeof(nvme_subq_item_t) * sc->maxqe;
	sc->res_bytes = sizeof(nvme_comq_item_t) * sc->maxqe;
	sc->adm_bytes = NVME_MAX_ADMIN_BUFFER * sc->maxqe;
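
	/*
	 * With the typical values from the comment above (maxqe 256,
	 * 64 PRPs per entry, 64-byte submission and 16-byte completion
	 * entries, 4KB admin buffers) this works out to:
	 *
	 *	prp_bytes = 8 * 64 * 256 = 128KB per queue
	 *	cmd_bytes = 64 * 256	 = 16KB
	 *	res_bytes = 16 * 256	 = 4KB
	 *	adm_bytes = 4096 * 256	 = 1MB (admin queue only)
	 */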
	error = 0;

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			NULL,				/* filter */
			NULL,				/* filterarg */
			sc->prp_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->prp_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->prps_tag);			/* return tag */

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			NULL,				/* filter */
			NULL,				/* filterarg */
			sc->cmd_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->cmd_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->sque_tag);			/* return tag */

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			NULL,				/* filter */
			NULL,				/* filterarg */
			sc->res_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->res_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->cque_tag);			/* return tag */

	error += bus_dma_tag_create(
			NULL,				/* parent tag */
			PAGE_SIZE,			/* alignment */
			4 * 1024 * 1024,		/* boundary */
			BUS_SPACE_MAXADDR,		/* loaddr? */
			BUS_SPACE_MAXADDR,		/* hiaddr */
			NULL,				/* filter */
			NULL,				/* filterarg */
			sc->adm_bytes,			/* [max]size */
			1,				/* maxsegs */
			sc->adm_bytes,			/* maxsegsz */
			0,				/* flags */
			&sc->adm_tag);			/* return tag */
	if (error) {
		device_printf(dev, "unable to create dma tags\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	/*
	 * Setup the admin queues (qid 0).
	 */
	error = nvme_alloc_subqueue(sc, 0);
	if (error) {
		device_printf(dev, "unable to allocate admin subqueue\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	error = nvme_alloc_comqueue(sc, 0);
	if (error) {
		device_printf(dev, "unable to allocate admin comqueue\n");
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	/*
	 * Initialize the admin queue registers
	 */
	reg = NVME_ATTR_COM_SET(sc->maxqe) | NVME_ATTR_SUB_SET(sc->maxqe);
	nvme_write(sc, NVME_REG_ADM_ATTR, reg);
	nvme_write8(sc, NVME_REG_ADM_SUBADR, (uint64_t)sc->subqueues[0].psubq);
	nvme_write8(sc, NVME_REG_ADM_COMADR, (uint64_t)sc->comqueues[0].pcomq);
	/*
	 * Other configuration registers
	 */
	reg = NVME_CONFIG_IOSUB_ES_SET(6) |		/* 64 byte sub entry */
	      NVME_CONFIG_IOCOM_ES_SET(4) |		/* 16 byte com entry */
	      NVME_CONFIG_MEMPG_SET(PAGE_SHIFT) |	/* 4K pages */
	      NVME_CONFIG_CSS_NVM;			/* NVME command set */
	nvme_write(sc, NVME_REG_CONFIG, reg);
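
	/*
	 * (The entry-size fields are log2-encoded per the NVMe spec:
	 *  6 -> 2^6 = 64-byte submission entries, 4 -> 2^4 = 16-byte
	 *  completion entries.)
	 */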
	reg = nvme_read(sc, NVME_REG_MEMSIZE);
	/*
	 * Enable the chip for operation
	 */
	error = nvme_enable(sc, 1);
	if (error) {
		nvme_enable(sc, 0);
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	/*
	 * Start the admin thread.  This will also setup the admin queue
	 * interrupt.
	 */
	error = nvme_start_admin_thread(sc);
	if (error) {
		nvme_pci_detach(dev);
		return (ENXIO);
	}
	lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
	sc->flags |= NVME_SC_ATTACHED;
	TAILQ_INSERT_TAIL(&nvme_sc_list, sc, entry);
	lockmgr(&nvme_master_lock, LK_RELEASE);

	return(0);
}
/*
 * Device unload / detachment
 */
static int
nvme_pci_detach(device_t dev)
{
	nvme_softc_t *sc = device_get_softc(dev);
	int i;
	/*
	 * Stop the admin thread
	 */
	nvme_stop_admin_thread(sc);

	/*
	 * Issue a normal shutdown and wait for completion
	 */
	nvme_issue_shutdown(sc);

	/*
	 * Disable the chip
	 */
	nvme_enable(sc, 0);

	/*
	 * Free admin memory
	 */
	nvme_free_subqueue(sc, 0);
	nvme_free_comqueue(sc, 0);
	/*
	 * Release related resources.
	 */
	for (i = 0; i < sc->nirqs; ++i) {
		if (sc->irq[i]) {
			bus_release_resource(dev, SYS_RES_IRQ,
					     sc->rid_irq[i], sc->irq[i]);
			sc->irq[i] = NULL;
			if (sc->irq_type == PCI_INTR_TYPE_MSIX)
				pci_release_msix_vector(dev, sc->rid_irq[i]);
		}
	}
	switch(sc->irq_type) {
	case PCI_INTR_TYPE_MSI:
		pci_release_msi(dev);
		break;
	case PCI_INTR_TYPE_MSIX:
		pci_teardown_msix(dev);
		break;
	default:
		break;
	}
	/*
	 * Release remaining chipset resources
	 */
	if (sc->regs) {
		bus_release_resource(dev, SYS_RES_MEMORY,
				     sc->rid_regs, sc->regs);
		sc->regs = NULL;
	}
	if (sc->bar4) {
		bus_release_resource(dev, SYS_RES_MEMORY,
				     sc->rid_bar4, sc->bar4);
		sc->bar4 = NULL;
	}
	/*
	 * Cleanup the DMA tags
	 */
	if (sc->prps_tag) {
		bus_dma_tag_destroy(sc->prps_tag);
		sc->prps_tag = NULL;
	}
	if (sc->sque_tag) {
		bus_dma_tag_destroy(sc->sque_tag);
		sc->sque_tag = NULL;
	}
	if (sc->cque_tag) {
		bus_dma_tag_destroy(sc->cque_tag);
		sc->cque_tag = NULL;
	}
	if (sc->adm_tag) {
		bus_dma_tag_destroy(sc->adm_tag);
		sc->adm_tag = NULL;
	}
	if (sc->flags & NVME_SC_ATTACHED) {
		lockmgr(&nvme_master_lock, LK_EXCLUSIVE);
		sc->flags &= ~NVME_SC_ATTACHED;
		TAILQ_REMOVE(&nvme_sc_list, sc, entry);
		lockmgr(&nvme_master_lock, LK_RELEASE);
	}
	return (0);
}