2 * Copyright (c) 2016 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 static int nvme_pci_attach(device_t
);
38 static int nvme_pci_detach(device_t
);
40 static const nvme_device_t nvme_devices
[] = {
41 /* Vendor-specific table goes here (see ahci for example) */
42 { 0, 0, nvme_pci_attach
, nvme_pci_detach
, "NVME-PCIe" }
45 static int nvme_msix_enable
= 1;
46 TUNABLE_INT("hw.nvme.msix.enable", &nvme_msix_enable
);
47 static int nvme_msi_enable
= 0;
48 TUNABLE_INT("hw.nvme.msi.enable", &nvme_msi_enable
);
50 TAILQ_HEAD(, nvme_softc
) nvme_sc_list
= TAILQ_HEAD_INITIALIZER(nvme_sc_list
);
51 struct lock nvme_master_lock
= LOCK_INITIALIZER("nvmstr", 0, 0);
53 static int last_global_cpu
;
56 * Match during probe and attach. The device does not yet have a softc.
59 nvme_lookup_device(device_t dev
)
61 const nvme_device_t
*ad
;
62 uint16_t vendor
= pci_get_vendor(dev
);
63 uint16_t product
= pci_get_device(dev
);
64 uint8_t class = pci_get_class(dev
);
65 uint8_t subclass
= pci_get_subclass(dev
);
66 uint8_t progif
= pci_read_config(dev
, PCIR_PROGIF
, 1);
70 * Generally speaking if the pci device does not identify as
73 if (class == PCIC_STORAGE
&& subclass
== PCIS_STORAGE_NVM
&&
74 progif
== PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0
) {
80 for (ad
= &nvme_devices
[0]; ad
->vendor
; ++ad
) {
81 if (ad
->vendor
== vendor
&& ad
->product
== product
)
86 * Last ad is the default match if the PCI device identifies as NVMe.
94 * Attach functions. They all eventually fall through to nvme_pci_attach().
97 nvme_pci_attach(device_t dev
)
99 nvme_softc_t
*sc
= device_get_softc(dev
);
106 if (pci_read_config(dev
, PCIR_COMMAND
, 2) & 0x0400) {
107 device_printf(dev
, "BIOS disabled PCI interrupt, "
109 pci_write_config(dev
, PCIR_COMMAND
,
110 pci_read_config(dev
, PCIR_COMMAND
, 2) & ~0x0400, 2);
117 * Map the register window
119 sc
->rid_regs
= PCIR_BAR(0);
120 sc
->regs
= bus_alloc_resource_any(dev
, SYS_RES_MEMORY
,
121 &sc
->rid_regs
, RF_ACTIVE
);
122 if (sc
->regs
== NULL
) {
123 device_printf(dev
, "unable to map registers\n");
124 nvme_pci_detach(dev
);
127 sc
->iot
= rman_get_bustag(sc
->regs
);
128 sc
->ioh
= rman_get_bushandle(sc
->regs
);
131 * NVMe allows the MSI-X table to be mapped to BAR 4/5.
132 * Always try to map BAR4, but it's ok if it fails. Must
133 * be done prior to allocating our interrupts.
135 sc
->rid_bar4
= PCIR_BAR(4);
136 sc
->bar4
= bus_alloc_resource_any(dev
, SYS_RES_MEMORY
,
137 &sc
->rid_bar4
, RF_ACTIVE
);
140 * Map the interrupt or initial interrupt which will be used for
141 * the admin queue. NVME chipsets can potentially support a huge
142 * number of MSIX vectors but we really only need enough for
143 * available cpus, plus 1.
145 msi_enable
= device_getenv_int(dev
, "msi.enable", nvme_msi_enable
);
146 msix_enable
= device_getenv_int(dev
, "msix.enable", nvme_msix_enable
);
153 sc
->nirqs
= pci_msix_count(dev
);
154 sc
->irq_type
= PCI_INTR_TYPE_MSIX
;
155 if (sc
->nirqs
> ncpus
+ 1) /* max we need */
156 sc
->nirqs
= ncpus
+ 1;
158 error
= pci_setup_msix(dev
);
159 cpu
= (last_global_cpu
+ 0) % ncpus
; /* GCC warn */
160 for (i
= 0; error
== 0 && i
< sc
->nirqs
; ++i
) {
161 cpu
= (last_global_cpu
+ i
) % ncpus
;
162 error
= pci_alloc_msix_vector(dev
, i
,
163 &sc
->rid_irq
[i
], cpu
);
166 sc
->irq
[i
] = bus_alloc_resource_any(dev
, SYS_RES_IRQ
,
170 * We want this to overwrite queue 0's cpu vector
171 * when the cpus rotate through later on.
173 if (sc
->cputovect
[cpu
] == 0)
174 sc
->cputovect
[cpu
] = i
;
178 * If we did not iterate enough cpus (that is, there weren't
179 * enough irqs for all available cpus) we still need to
180 * finish our sc->cputovect[] mapping.
183 cpu
= (cpu
+ 1) % ncpus
;
184 i
= (i
+ 1) % sc
->nirqs
;
187 if (sc
->cputovect
[cpu
] != 0)
189 sc
->cputovect
[cpu
] = i
;
194 bus_release_resource(dev
, SYS_RES_IRQ
,
197 pci_release_msix_vector(dev
, sc
->rid_irq
[i
]);
200 /* leave error intact to fall through to normal */
202 last_global_cpu
= (last_global_cpu
+ sc
->nirqs
) % ncpus
;
203 pci_enable_msix(dev
);
206 if (msix_enable
== 0 || error
) {
211 sc
->irq_type
= pci_alloc_1intr(dev
, msi_enable
,
212 &sc
->rid_irq
[0], &irq_flags
);
213 sc
->irq
[0] = bus_alloc_resource_any(dev
, SYS_RES_IRQ
,
214 &sc
->rid_irq
[0], irq_flags
);
216 if (sc
->irq
[0] == NULL
) {
217 device_printf(dev
, "unable to map interrupt\n");
218 nvme_pci_detach(dev
);
222 switch(sc
->irq_type
) {
223 case PCI_INTR_TYPE_MSI
:
226 case PCI_INTR_TYPE_MSIX
:
233 device_printf(dev
, "mapped %d %s IRQs\n", sc
->nirqs
, type
);
237 * Make sure the chip is disabled, which will reset all controller
238 * registers except for the admin queue registers. Device should
239 * already be disabled so this is usually instantaneous. Use a
240 * fixed 5-second timeout in case it is not. I'd like my other
241 * reads to occur after the device has been disabled.
244 error
= nvme_enable(sc
, 0);
246 nvme_pci_detach(dev
);
251 * Get capabilities and version and report
253 sc
->vers
= nvme_read(sc
, NVME_REG_VERS
);
254 sc
->cap
= nvme_read8(sc
, NVME_REG_CAP
);
255 sc
->maxqe
= NVME_CAP_MQES_GET(sc
->cap
);
256 sc
->dstrd4
= NVME_CAP_DSTRD_GET(sc
->cap
);
258 device_printf(dev
, "NVME Version %u.%u maxqe=%u caps=%016jx\n",
259 NVME_VERS_MAJOR_GET(sc
->vers
),
260 NVME_VERS_MINOR_GET(sc
->vers
),
264 * Enable timeout, 500ms increments. Convert to ticks.
266 sc
->entimo
= NVME_CAP_TIMEOUT_GET(sc
->cap
) * hz
/ 2; /* in ticks */
267 ++sc
->entimo
; /* fudge */
270 * Validate maxqe. To cap the amount of memory we reserve for
271 * PRPs we limit maxqe to 256. Also make sure it is a power of
276 "Attach failed, max queue entries (%d) "
277 "below minimum (2)\n", sc
->maxqe
);
278 nvme_pci_detach(dev
);
283 for (reg
= 2; reg
<= sc
->maxqe
; reg
<<= 1)
285 sc
->maxqe
= reg
>> 1;
290 * PRP - Worst case PRPs needed per queue is MAXPHYS / PAGE_SIZE
291 * (typically 64), multiplied by maxqe (typ 256). Roughly
292 * ~128KB per queue. Align for cache performance. We actually
293 * need one more PRP per queue entry worst-case to handle
294 * buffer overlap, but we have an extra one in the command
295 * structure so we don't have to calculate that out.
297 * Remember that we intend to allocate potentially many queues,
298 * so we don't want to bloat this too much. A queue depth of
301 * CMD - Storage for the submit queue. maxqe * 64 (~16KB)
303 * RES - Storage for the completion queue. maxqe * 16 (~4KB)
305 * ADM - Storage for admin command DMA data. Maximum admin command
306 * DMA data is 4KB so reserve maxqe * 4KB (~1MB). There is only
309 * NOTE: There are no boundary requirements for NVMe, but I specify a
310 * 4MB boundary anyway because this reduces mass-bit flipping
311 * of address bits inside the controller when incrementing
312 * DMA addresses. Why not? Can't hurt.
314 sc
->prp_bytes
= sizeof(uint64_t) * (MAXPHYS
/ PAGE_SIZE
) * sc
->maxqe
;
315 sc
->cmd_bytes
= sizeof(nvme_subq_item_t
) * sc
->maxqe
;
316 sc
->res_bytes
= sizeof(nvme_comq_item_t
) * sc
->maxqe
;
317 sc
->adm_bytes
= NVME_MAX_ADMIN_BUFFER
* sc
->maxqe
;
321 error
+= bus_dma_tag_create(
322 NULL
, /* parent tag */
323 PAGE_SIZE
, /* alignment */
324 4 * 1024 * 1024, /* boundary */
325 BUS_SPACE_MAXADDR
, /* loaddr? */
326 BUS_SPACE_MAXADDR
, /* hiaddr */
328 NULL
, /* filterarg */
329 sc
->prp_bytes
, /* [max]size */
331 sc
->prp_bytes
, /* maxsegsz */
333 &sc
->prps_tag
); /* return tag */
335 error
+= bus_dma_tag_create(
336 NULL
, /* parent tag */
337 PAGE_SIZE
, /* alignment */
338 4 * 1024 * 1024, /* boundary */
339 BUS_SPACE_MAXADDR
, /* loaddr? */
340 BUS_SPACE_MAXADDR
, /* hiaddr */
342 NULL
, /* filterarg */
343 sc
->cmd_bytes
, /* [max]size */
345 sc
->cmd_bytes
, /* maxsegsz */
347 &sc
->sque_tag
); /* return tag */
349 error
+= bus_dma_tag_create(
350 NULL
, /* parent tag */
351 PAGE_SIZE
, /* alignment */
352 4 * 1024 * 1024, /* boundary */
353 BUS_SPACE_MAXADDR
, /* loaddr? */
354 BUS_SPACE_MAXADDR
, /* hiaddr */
356 NULL
, /* filterarg */
357 sc
->res_bytes
, /* [max]size */
359 sc
->res_bytes
, /* maxsegsz */
361 &sc
->cque_tag
); /* return tag */
363 error
+= bus_dma_tag_create(
364 NULL
, /* parent tag */
365 PAGE_SIZE
, /* alignment */
366 4 * 1024 * 1024, /* boundary */
367 BUS_SPACE_MAXADDR
, /* loaddr? */
368 BUS_SPACE_MAXADDR
, /* hiaddr */
370 NULL
, /* filterarg */
371 sc
->adm_bytes
, /* [max]size */
373 sc
->adm_bytes
, /* maxsegsz */
375 &sc
->adm_tag
); /* return tag */
378 device_printf(dev
, "unable to create dma tags\n");
379 nvme_pci_detach(dev
);
384 * Setup the admin queues (qid 0).
386 error
= nvme_alloc_subqueue(sc
, 0);
388 device_printf(dev
, "unable to allocate admin subqueue\n");
389 nvme_pci_detach(dev
);
392 error
= nvme_alloc_comqueue(sc
, 0);
394 device_printf(dev
, "unable to allocate admin comqueue\n");
395 nvme_pci_detach(dev
);
400 * Initialize the admin queue registers
402 reg
= NVME_ATTR_COM_SET(sc
->maxqe
) | NVME_ATTR_SUB_SET(sc
->maxqe
);
403 nvme_write(sc
, NVME_REG_ADM_ATTR
, reg
);
404 nvme_write8(sc
, NVME_REG_ADM_SUBADR
, (uint64_t)sc
->subqueues
[0].psubq
);
405 nvme_write8(sc
, NVME_REG_ADM_COMADR
, (uint64_t)sc
->comqueues
[0].pcomq
);
408 * qemu appears to require this, real hardware does not appear
411 pci_enable_busmaster(dev
);
414 * Other configuration registers
416 reg
= NVME_CONFIG_IOSUB_ES_SET(6) | /* 64 byte sub entry */
417 NVME_CONFIG_IOCOM_ES_SET(4) | /* 16 byte com entry */
418 NVME_CONFIG_MEMPG_SET(PAGE_SHIFT
) | /* 4K pages */
419 NVME_CONFIG_CSS_NVM
; /* NVME command set */
420 nvme_write(sc
, NVME_REG_CONFIG
, reg
);
422 reg
= nvme_read(sc
, NVME_REG_MEMSIZE
);
425 * Enable the chip for operation
427 error
= nvme_enable(sc
, 1);
430 nvme_pci_detach(dev
);
435 * Start the admin thread. This will also setup the admin queue
438 error
= nvme_start_admin_thread(sc
);
440 nvme_pci_detach(dev
);
443 lockmgr(&nvme_master_lock
, LK_EXCLUSIVE
);
444 sc
->flags
|= NVME_SC_ATTACHED
;
445 TAILQ_INSERT_TAIL(&nvme_sc_list
, sc
, entry
);
446 lockmgr(&nvme_master_lock
, LK_RELEASE
);
452 * Device unload / detachment
455 nvme_pci_detach(device_t dev
)
457 nvme_softc_t
*sc
= device_get_softc(dev
);
461 * Stop the admin thread
463 nvme_stop_admin_thread(sc
);
466 * Issue a normal shutdown and wait for completion
468 nvme_issue_shutdown(sc
);
478 nvme_free_subqueue(sc
, 0);
479 nvme_free_comqueue(sc
, 0);
482 * Release related resources.
484 for (i
= 0; i
< sc
->nirqs
; ++i
) {
486 bus_release_resource(dev
, SYS_RES_IRQ
,
487 sc
->rid_irq
[i
], sc
->irq
[i
]);
489 if (sc
->irq_type
== PCI_INTR_TYPE_MSIX
)
490 pci_release_msix_vector(dev
, sc
->rid_irq
[i
]);
493 switch(sc
->irq_type
) {
494 case PCI_INTR_TYPE_MSI
:
495 pci_release_msi(dev
);
497 case PCI_INTR_TYPE_MSIX
:
498 pci_teardown_msix(dev
);
505 * Release remaining chipset resources
508 bus_release_resource(dev
, SYS_RES_MEMORY
,
509 sc
->rid_regs
, sc
->regs
);
513 bus_release_resource(dev
, SYS_RES_MEMORY
,
514 sc
->rid_bar4
, sc
->bar4
);
519 * Cleanup the DMA tags
522 bus_dma_tag_destroy(sc
->prps_tag
);
526 bus_dma_tag_destroy(sc
->sque_tag
);
530 bus_dma_tag_destroy(sc
->cque_tag
);
534 bus_dma_tag_destroy(sc
->adm_tag
);
538 if (sc
->flags
& NVME_SC_ATTACHED
) {
539 lockmgr(&nvme_master_lock
, LK_EXCLUSIVE
);
540 sc
->flags
&= ~NVME_SC_ATTACHED
;
541 TAILQ_REMOVE(&nvme_sc_list
, sc
, entry
);
542 lockmgr(&nvme_master_lock
, LK_RELEASE
);