/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * blkdev driver for NVMe compliant storage devices
 *
 * This driver was written to conform to version 1.2.1 of the NVMe
 * specification. It may work with newer versions, but that is completely
 * untested and disabled by default.
 *
 * The driver has only been tested on x86 systems and will not work on big-
 * endian systems without changes to the code accessing registers and data
 * structures used by the hardware.
 *
 * Interrupt Usage:
 *
 * The driver will use a single interrupt while configuring the device as the
 * specification requires, but contrary to the specification it will try to use
 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
 * will switch to multiple-message MSI(-X) if supported. The driver wants to
 * have one interrupt vector per CPU, but it will work correctly if fewer are
 * available. Interrupts can be shared by queues; the interrupt handler will
 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
 * the admin queue will share an interrupt with one I/O queue. The interrupt
 * handler will retrieve completed commands from all queues sharing an interrupt
 * vector and will post them to a taskq for completion processing.
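 *
 * As an illustrative sketch (simplified pseudocode with assumed names, not
 * the actual handler), an interrupt vector `inum' scans every queue assigned
 * to it:
 *
 *	for (qnum = inum; qnum < nqueues; qnum += n_intr_cnt)
 *		while ((cmd = nvme_retrieve_cmd(nvme, ioq[qnum])) != NULL)
 *			taskq_dispatch(cmd_taskq, cmd->nc_callback, cmd, ...);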
 *
 * Command Processing:
 *
 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
 * to 65536 I/O commands. The driver will configure one I/O queue pair per
 * available interrupt vector, with the queue length usually much smaller than
 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
 * interrupt vectors will be used.
 *
 * Additionally the hardware provides a single special admin queue pair that can
 * hold up to 4096 admin commands.
 *
 * From the hardware perspective both queues of a queue pair are independent,
 * but they share some driver state: the command array (holding pointers to
 * commands currently being processed by the hardware) and the active command
 * counter. Access to a queue pair and the shared state is protected by
 * nq_mutex.
 *
 * When a command is submitted to a queue pair the active command counter is
 * incremented and a pointer to the command is stored in the command array. The
 * array index is used as command identifier (CID) in the submission queue
 * entry. Some commands may take a very long time to complete, and if the queue
 * wraps around in that time a submission may find the next array slot still in
 * use by a long-running command. In this case the array is sequentially
 * searched for the next free slot, as sketched below. The length of the
 * command array is the same as the configured queue length. Queue overrun is
 * prevented by the semaphore, so a command submission may block if the queue
 * is full.
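 *
 * Sketch of the slot search described above (the actual code is in
 * nvme_submit_cmd_common() below):
 *
 *	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
 *		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
 *	qp->nq_cmd[qp->nq_next_cmd] = cmd;
 *	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;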
 *
 * Polled I/O Support:
 *
 * For kernel core dump support the driver can do polled I/O. As interrupts are
 * turned off while dumping the driver will just submit a command in the regular
 * way, and then repeatedly attempt a command retrieval until it gets the
 * command back.
 *
 * Namespace Support:
 *
 * NVMe devices can have multiple namespaces, each being an independent data
 * store. The driver supports multiple namespaces and creates a blkdev interface
 * for each namespace found. Namespaces can have various attributes to support
 * thin provisioning and protection information. This driver does not support
 * any of this and ignores namespaces that have these attributes.
 *
 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
 * (EUI64). This driver uses the EUI64 if present to generate the devid and
 * passes it to blkdev to use it in the device node names. As this is currently
 * untested, namespaces with EUI64 are ignored by default.
 *
 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
 * single controller. This is an artificial limit imposed by the driver to be
 * able to address a reasonable number of controllers and namespaces using a
 * 32bit minor node number.
 *
 * Minor nodes:
 *
 * For each NVMe device the driver exposes one minor node for the controller and
 * one minor node for each namespace. The only operations supported by those
 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
 * interface for the nvmeadm(8) utility.
 *
 * Blkdev Interface:
 *
 * This driver uses blkdev to do all the heavy lifting involved with presenting
 * a disk device to the system. As a result, the processing of I/O requests is
 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
 * setup, and splitting of transfers into manageable chunks.
 *
 * I/O requests coming in from blkdev are turned into NVM commands and posted to
 * an I/O queue. The queue is selected by taking the CPU id modulo the number of
 * queues. There is currently no timeout handling of I/O commands.
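 *
 * In pseudocode with assumed names (illustrative only, not the exact
 * expression used in the blkdev entry points further down):
 *
 *	qp = nvme->n_ioq[CPU->cpu_id % nqueues];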
 *
 * Blkdev also supports querying device/media information and generating a
 * devid. The driver reports the best block size as determined by the namespace
 * format back to blkdev as physical block size to support partition and block
 * alignment. The devid is either based on the namespace EUI64, if present, or
 * composed using the device vendor ID, model number, serial number, and the
 * namespace ID.
 *
 * Error Handling:
 *
 * Error handling is currently limited to detecting fatal hardware errors,
 * either by asynchronous events, or synchronously through command status or
 * admin command timeouts. In case of severe errors the device is fenced off,
 * and all further requests will return EIO. FMA is then called to fault the
 * device.
 *
 * The hardware has a limit for outstanding asynchronous event requests. Before
 * this limit is known the driver assumes it is at least 1 and posts a single
 * asynchronous request. Later, when the limit is known, more asynchronous event
 * requests are posted to allow quicker reception of error information. When an
 * asynchronous event is posted by the hardware the driver will parse the error
 * status fields and log information or fault the device, depending on the
 * severity of the asynchronous event. The asynchronous event request is then
 * reused and posted to the admin queue again.
 *
 * On command completion the command status is checked for errors. In case of
 * errors indicating a driver bug the driver panics. Almost all other error
 * status values just cause EIO to be returned.
 *
 * Command timeouts are currently detected for all admin commands except
 * asynchronous event requests. If a command times out and the hardware appears
 * to be healthy the driver attempts to abort the command. The original command
 * timeout is also applied to the abort command. If the abort times out too the
 * driver assumes the device to be dead, fences it off, and calls FMA to retire
 * it. In all other cases the aborted command should return immediately with a
 * status indicating it was aborted, and the driver will wait indefinitely for
 * that to happen. No timeout handling of normal I/O commands is presently done.
 *
 * Any command that times out due to the controller dropping dead will be put on
 * the nvme_lost_cmds list if it references DMA memory. This will prevent the
 * DMA memory from being reused by the system and later being written to by a
 * "dead" NVMe controller.
 *
 * Locking:
 *
 * Each queue pair has its own nq_mutex, which must be held when accessing the
 * associated queue registers or the shared state of the queue pair. Callers of
 * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
 * themselves.
 *
 * Each command also has its own nc_mutex, which is associated with the
 * condition variable nc_cv. It is only used on admin commands which are run
 * synchronously. In that case it must be held across calls to
 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
 * nvme_admin_cmd(). It must also be held whenever the completion state of the
 * command is changed or while an admin command timeout is handled.
 *
 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
 * More than one nc_mutex may only be held when aborting commands. In this case,
 * the nc_mutex of the command to be aborted must be held across the call to
 * nvme_abort_cmd() to prevent the command from completing while the abort is in
 * progress.
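 *
 * As a sketch, the ordering rule means nested acquisition always looks like
 * this (illustrative only):
 *
 *	mutex_enter(&cmd->nc_mutex);	(first the command mutex)
 *	mutex_enter(&qp->nq_mutex);	(then the queue pair mutex)
 *	...
 *	mutex_exit(&qp->nq_mutex);
 *	mutex_exit(&cmd->nc_mutex);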
 *
 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
 * and exclusive-open flag nm_oexcl.
 *
 * Quiesce / Fast Reboot:
 *
 * The driver currently does not support fast reboot. A quiesce(9E) entry point
 * is still provided which is used to send a shutdown notification to the
 * device.
 *
 * Driver Configuration:
 *
 * The following driver properties can be changed to control some aspects of the
 * driver operation (an example configuration fragment follows this list):
 * - strict-version: can be set to 0 to allow devices conforming to newer
 *   versions or namespaces with EUI64 to be used
 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
 *   specific command status as a fatal error leading to device faulting
 * - admin-queue-len: the maximum length of the admin queue (16-4096)
 * - io-queue-len: the maximum length of the I/O queues (16-65536)
 * - async-event-limit: the maximum number of asynchronous event requests to be
 *   posted by the driver
 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
 *   cache if present
 * - min-phys-block-size: the minimum physical block size to report to blkdev,
 *   which is among other things the basis for ZFS vdev ashift
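 *
 * For example, a hypothetical /kernel/drv/nvme.conf overriding two of these
 * properties could contain:
 *
 *	strict-version=0;
 *	min-phys-block-size=4096;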
 *
 * TODO:
 * - figure out sane default for I/O queue depth reported to blkdev
 * - FMA handling of media errors
 * - support for devices supporting very large I/O requests using chained PRPs
 * - support for configuring hardware parameters like interrupt coalescing
 * - support for media formatting and hard partitioning into namespaces
 * - support for big-endian systems
 * - support for fast reboot
 * - support for firmware updates
 * - support for NVMe Subsystem Reset (1.1)
 * - support for Scatter/Gather lists (1.1)
 * - support for Reservations (1.1)
 * - support for power management
 */

#include <sys/byteorder.h>
#ifdef _BIG_ENDIAN
#error nvme driver needs porting for big-endian platforms
#endif

#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/bitmap.h>
#include <sys/sysmacros.h>
#include <sys/param.h>
#include <sys/varargs.h>
#include <sys/cpuvar.h>
#include <sys/disp.h>
#include <sys/blkdev.h>
#include <sys/atomic.h>
#include <sys/archsystm.h>
#include <sys/sata/sata_hba.h>
#include <sys/stat.h>
#include <sys/policy.h>
#include <sys/list.h>

#include <sys/nvme.h>

#ifdef __x86
#include <sys/x86_archext.h>
#endif

#include "nvme_reg.h"
#include "nvme_var.h"

/* NVMe spec version supported */
static const int nvme_version_major = 1;
static const int nvme_version_minor = 2;

/* tunable for admin command timeout in seconds, default is 1s */
int nvme_admin_cmd_timeout = 1;

/* tunable for FORMAT NVM command timeout in seconds, default is 600s */
int nvme_format_cmd_timeout = 600;

static int nvme_attach(dev_info_t *, ddi_attach_cmd_t);
static int nvme_detach(dev_info_t *, ddi_detach_cmd_t);
static int nvme_quiesce(dev_info_t *);
static int nvme_fm_errcb(dev_info_t *, ddi_fm_error_t *, const void *);
static int nvme_setup_interrupts(nvme_t *, int, int);
static void nvme_release_interrupts(nvme_t *);
static uint_t nvme_intr(caddr_t, caddr_t);

static void nvme_shutdown(nvme_t *, int, boolean_t);
static boolean_t nvme_reset(nvme_t *, boolean_t);
static int nvme_init(nvme_t *);
static nvme_cmd_t *nvme_alloc_cmd(nvme_t *, int);
static void nvme_free_cmd(nvme_cmd_t *);
static nvme_cmd_t *nvme_create_nvm_cmd(nvme_namespace_t *, uint8_t,
    bd_xfer_t *);
static void nvme_admin_cmd(nvme_cmd_t *, int);
static void nvme_submit_admin_cmd(nvme_qpair_t *, nvme_cmd_t *);
static int nvme_submit_io_cmd(nvme_qpair_t *, nvme_cmd_t *);
static void nvme_submit_cmd_common(nvme_qpair_t *, nvme_cmd_t *);
static nvme_cmd_t *nvme_unqueue_cmd(nvme_t *, nvme_qpair_t *, int);
static nvme_cmd_t *nvme_retrieve_cmd(nvme_t *, nvme_qpair_t *);
static void nvme_wait_cmd(nvme_cmd_t *, uint_t);
static void nvme_wakeup_cmd(void *);
static void nvme_async_event_task(void *);

static int nvme_check_unknown_cmd_status(nvme_cmd_t *);
static int nvme_check_vendor_cmd_status(nvme_cmd_t *);
static int nvme_check_integrity_cmd_status(nvme_cmd_t *);
static int nvme_check_specific_cmd_status(nvme_cmd_t *);
static int nvme_check_generic_cmd_status(nvme_cmd_t *);
static inline int nvme_check_cmd_status(nvme_cmd_t *);

static int nvme_abort_cmd(nvme_cmd_t *, uint_t);
static void nvme_async_event(nvme_t *);
static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t,
    boolean_t, uint8_t);
static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...);
static int nvme_identify(nvme_t *, uint32_t, void **);
static int nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t,
    uint32_t *);
static int nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *,
    void **, size_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
static int nvme_set_nqueues(nvme_t *, uint16_t *);

static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
    nvme_dma_t **);
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
    nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
static inline void nvme_put32(nvme_t *, uintptr_t, uint32_t);
static inline uint64_t nvme_get64(nvme_t *, uintptr_t);
static inline uint32_t nvme_get32(nvme_t *, uintptr_t);

static boolean_t nvme_check_regs_hdl(nvme_t *);
static boolean_t nvme_check_dma_hdl(nvme_dma_t *);

static int nvme_fill_prp(nvme_cmd_t *, bd_xfer_t *);

static void nvme_bd_xfer_done(void *);
static void nvme_bd_driveinfo(void *, bd_drive_t *);
static int nvme_bd_mediainfo(void *, bd_media_t *);
static int nvme_bd_cmd(nvme_namespace_t *, bd_xfer_t *, uint8_t);
static int nvme_bd_read(void *, bd_xfer_t *);
static int nvme_bd_write(void *, bd_xfer_t *);
static int nvme_bd_sync(void *, bd_xfer_t *);
static int nvme_bd_devid(void *, dev_info_t *, ddi_devid_t *);

static int nvme_prp_dma_constructor(void *, void *, int);
static void nvme_prp_dma_destructor(void *, void *);

static void nvme_prepare_devid(nvme_t *, uint32_t);

static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);

#define NVME_MINOR_INST_SHIFT	9
#define NVME_MINOR(inst, nsid)	(((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
#define NVME_MINOR_INST(minor)	((minor) >> NVME_MINOR_INST_SHIFT)
#define NVME_MINOR_NSID(minor)	((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
#define NVME_MINOR_MAX		(NVME_MINOR(1, 0) - 2)
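
/*
 * Example (illustrative arithmetic only): with NVME_MINOR_INST_SHIFT == 9,
 * NVME_MINOR(2, 3) == (2 << 9) | 3 == 1027, and decoding gives back
 * NVME_MINOR_INST(1027) == 2 and NVME_MINOR_NSID(1027) == 3. NSID 0 is
 * used for the controller minor node itself.
 */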

static void *nvme_state;
static kmem_cache_t *nvme_cmd_cache;

/*
 * DMA attributes for queue DMA memory
 *
 * Queue DMA memory must be page aligned. The maximum length of a queue is
 * 65536 entries, and an entry can be 64 bytes long.
 */
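/*
 * Worked example (arithmetic only, not a spec quote): a full-sized
 * submission queue needs 65536 * 64 = 4 MiB of page-aligned memory, while
 * a completion queue entry is 16 bytes, so the matching completion queue
 * needs 65536 * 16 = 1 MiB.
 */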
static ddi_dma_attr_t nvme_queue_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t) - 1,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= (UINT16_MAX + 1) * sizeof (nvme_sqe_t),
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= 1,
	.dma_attr_granular	= 1,
};

/*
 * DMA attributes for transfers using Physical Region Page (PRP) entries
 *
 * A PRP entry describes one page of DMA memory using the page size specified
 * in the controller configuration's memory page size register (CC.MPS). It uses
 * a 64bit base address aligned to this page size. There is no limitation on
 * chaining PRPs together for arbitrarily large DMA transfers.
 */
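/*
 * Worked example (arithmetic only): with the default 4 KiB page size each
 * PRP entry is an 8 byte page pointer, so a single page used as a PRP list
 * holds 4096 / 8 = 512 entries and can map up to 512 * 4 KiB = 2 MiB before
 * another list page must be chained.
 */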
static ddi_dma_attr_t nvme_prp_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xfff,
	.dma_attr_align		= 0x1000,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x1000,
	.dma_attr_maxxfer	= 0x1000,
	.dma_attr_seg		= 0xfff,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 1,
};

/*
 * DMA attributes for transfers using scatter/gather lists
 *
 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
 * 32bit length field. SGL Segment and SGL Last Segment entries require the
 * length to be a multiple of 16 bytes.
 */
static ddi_dma_attr_t nvme_sgl_dma_attr = {
	.dma_attr_version	= DMA_ATTR_V0,
	.dma_attr_addr_lo	= 0,
	.dma_attr_addr_hi	= 0xffffffffffffffffULL,
	.dma_attr_count_max	= 0xffffffffUL,
	.dma_attr_align		= 1,
	.dma_attr_burstsizes	= 0x7ff,
	.dma_attr_minxfer	= 0x10,
	.dma_attr_maxxfer	= 0xfffffffffULL,
	.dma_attr_seg		= 0xffffffffffffffffULL,
	.dma_attr_sgllen	= -1,
	.dma_attr_granular	= 0x10,
};

static ddi_device_acc_attr_t nvme_reg_acc_attr = {
	.devacc_attr_version	= DDI_DEVICE_ATTR_V0,
	.devacc_attr_endian_flags = DDI_STRUCTURE_LE_ACC,
	.devacc_attr_dataorder	= DDI_STRICTORDER_ACC
};

static struct cb_ops nvme_cb_ops = {
	.cb_open	= nvme_open,
	.cb_close	= nvme_close,
	.cb_strategy	= nodev,
	.cb_print	= nodev,
	.cb_dump	= nodev,
	.cb_read	= nodev,
	.cb_write	= nodev,
	.cb_ioctl	= nvme_ioctl,
	.cb_devmap	= nodev,
	.cb_mmap	= nodev,
	.cb_segmap	= nodev,
	.cb_chpoll	= nochpoll,
	.cb_prop_op	= ddi_prop_op,
	.cb_str		= 0,
	.cb_flag	= D_NEW | D_MP,
	.cb_rev		= CB_REV,
	.cb_aread	= nodev,
	.cb_awrite	= nodev
};

static struct dev_ops nvme_dev_ops = {
	.devo_rev	= DEVO_REV,
	.devo_refcnt	= 0,
	.devo_getinfo	= ddi_no_info,
	.devo_identify	= nulldev,
	.devo_probe	= nulldev,
	.devo_attach	= nvme_attach,
	.devo_detach	= nvme_detach,
	.devo_reset	= nodev,
	.devo_cb_ops	= &nvme_cb_ops,
	.devo_bus_ops	= NULL,
	.devo_power	= NULL,
	.devo_quiesce	= nvme_quiesce,
};

static struct modldrv nvme_modldrv = {
	.drv_modops	= &mod_driverops,
	.drv_linkinfo	= "NVMe v1.1b",
	.drv_dev_ops	= &nvme_dev_ops
};

static struct modlinkage nvme_modlinkage = {
	.ml_rev		= MODREV_1,
	.ml_linkage	= { &nvme_modldrv, NULL }
};

static bd_ops_t nvme_bd_ops = {
	.o_version	= BD_OPS_VERSION_0,
	.o_drive_info	= nvme_bd_driveinfo,
	.o_media_info	= nvme_bd_mediainfo,
	.o_devid_init	= nvme_bd_devid,
	.o_sync_cache	= nvme_bd_sync,
	.o_read		= nvme_bd_read,
	.o_write	= nvme_bd_write,
};

/*
 * This list will hold commands that have timed out and couldn't be aborted.
 * As we don't know what the hardware may still do with the DMA memory we can't
 * free them, so we'll keep them forever on this list where we can easily look
 * at them with mdb.
 */
static struct list nvme_lost_cmds;
static kmutex_t nvme_lc_mutex;

int
_init(void)
{
	int error;

	error = ddi_soft_state_init(&nvme_state, sizeof (nvme_t), 1);
	if (error != DDI_SUCCESS)
		return (error);

	nvme_cmd_cache = kmem_cache_create("nvme_cmd_cache",
	    sizeof (nvme_cmd_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	mutex_init(&nvme_lc_mutex, NULL, MUTEX_DRIVER, NULL);
	list_create(&nvme_lost_cmds, sizeof (nvme_cmd_t),
	    offsetof(nvme_cmd_t, nc_list));

	bd_mod_init(&nvme_dev_ops);

	error = mod_install(&nvme_modlinkage);
	if (error != DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_fini(void)
{
	int error;

	if (!list_is_empty(&nvme_lost_cmds))
		return (DDI_FAILURE);

	error = mod_remove(&nvme_modlinkage);
	if (error == DDI_SUCCESS) {
		ddi_soft_state_fini(&nvme_state);
		kmem_cache_destroy(nvme_cmd_cache);
		mutex_destroy(&nvme_lc_mutex);
		list_destroy(&nvme_lost_cmds);
		bd_mod_fini(&nvme_dev_ops);
	}

	return (error);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&nvme_modlinkage, modinfop));
}

static inline void
nvme_put64(nvme_t *nvme, uintptr_t reg, uint64_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg), val);
}

static inline void
nvme_put32(nvme_t *nvme, uintptr_t reg, uint32_t val)
{
	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	ddi_put32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg), val);
}

static inline uint64_t
nvme_get64(nvme_t *nvme, uintptr_t reg)
{
	uint64_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x7) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get64(nvme->n_regh, (uint64_t *)(nvme->n_regs + reg));

	return (val);
}

static inline uint32_t
nvme_get32(nvme_t *nvme, uintptr_t reg)
{
	uint32_t val;

	ASSERT(((uintptr_t)(nvme->n_regs + reg) & 0x3) == 0);

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	val = ddi_get32(nvme->n_regh, (uint32_t *)(nvme->n_regs + reg));

	return (val);
}

static boolean_t
nvme_check_regs_hdl(nvme_t *nvme)
{
	ddi_fm_error_t error;

	ddi_fm_acc_err_get(nvme->n_regh, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static boolean_t
nvme_check_dma_hdl(nvme_dma_t *dma)
{
	ddi_fm_error_t error;

	if (dma == NULL)
		return (B_FALSE);

	ddi_fm_dma_err_get(dma->nd_dmah, &error, DDI_FME_VERSION);

	if (error.fme_status != DDI_FM_OK)
		return (B_TRUE);

	return (B_FALSE);
}

static void
nvme_free_dma_common(nvme_dma_t *dma)
{
	if (dma->nd_dmah != NULL)
		(void) ddi_dma_unbind_handle(dma->nd_dmah);
	if (dma->nd_acch != NULL)
		ddi_dma_mem_free(&dma->nd_acch);
	if (dma->nd_dmah != NULL)
		ddi_dma_free_handle(&dma->nd_dmah);
}

static void
nvme_free_dma(nvme_dma_t *dma)
{
	nvme_free_dma_common(dma);
	kmem_free(dma, sizeof (*dma));
}

static void
nvme_prp_dma_destructor(void *buf, void *private)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;

	nvme_free_dma_common(dma);
}

static int
nvme_alloc_dma_common(nvme_t *nvme, nvme_dma_t *dma,
    size_t len, uint_t flags, ddi_dma_attr_t *dma_attr)
{
	if (ddi_dma_alloc_handle(nvme->n_dip, dma_attr, DDI_DMA_SLEEP, NULL,
	    &dma->nd_dmah) != DDI_SUCCESS) {
		/*
		 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
		 * the only other possible error is DDI_DMA_BADATTR which
		 * indicates a driver bug which should cause a panic.
		 */
		dev_err(nvme->n_dip, CE_PANIC,
		    "!failed to get DMA handle, check DMA attributes");
		return (DDI_FAILURE);
	}

	/*
	 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
	 * or the flags are conflicting, which isn't the case here.
	 */
	(void) ddi_dma_mem_alloc(dma->nd_dmah, len, &nvme->n_reg_acc_attr,
	    DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL, &dma->nd_memp,
	    &dma->nd_len, &dma->nd_acch);

	if (ddi_dma_addr_bind_handle(dma->nd_dmah, NULL, dma->nd_memp,
	    dma->nd_len, flags | DDI_DMA_CONSISTENT, DDI_DMA_SLEEP, NULL,
	    &dma->nd_cookie, &dma->nd_ncookie) != DDI_DMA_MAPPED) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to bind DMA memory");
		atomic_inc_32(&nvme->n_dma_bind_err);
		nvme_free_dma_common(dma);
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
nvme_zalloc_dma(nvme_t *nvme, size_t len, uint_t flags,
    ddi_dma_attr_t *dma_attr, nvme_dma_t **ret)
{
	nvme_dma_t *dma = kmem_zalloc(sizeof (nvme_dma_t), KM_SLEEP);

	if (nvme_alloc_dma_common(nvme, dma, len, flags, dma_attr) !=
	    DDI_SUCCESS) {
		kmem_free(dma, sizeof (nvme_dma_t));
		return (DDI_FAILURE);
	}

	bzero(dma->nd_memp, dma->nd_len);

	*ret = dma;
	return (DDI_SUCCESS);
}

static int
nvme_prp_dma_constructor(void *buf, void *private, int flags)
{
	nvme_dma_t *dma = (nvme_dma_t *)buf;
	nvme_t *nvme = (nvme_t *)private;

	dma->nd_dmah = NULL;
	dma->nd_acch = NULL;

	if (nvme_alloc_dma_common(nvme, dma, nvme->n_pagesize,
	    DDI_DMA_READ, &nvme->n_prp_dma_attr) != DDI_SUCCESS) {
		return (-1);
	}

	ASSERT(dma->nd_ncookie == 1);

	dma->nd_cached = B_TRUE;

	return (0);
}

static int
nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
    uint_t flags, nvme_dma_t **dma)
{
	uint32_t len = nentry * qe_len;
	ddi_dma_attr_t q_dma_attr = nvme->n_queue_dma_attr;

	len = roundup(len, nvme->n_pagesize);

	q_dma_attr.dma_attr_minxfer = len;

	if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to get DMA memory for queue");
		goto fail;
	}

	if ((*dma)->nd_ncookie != 1) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!got too many cookies for queue DMA");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	if (*dma) {
		nvme_free_dma(*dma);
		*dma = NULL;
	}

	return (DDI_FAILURE);
}

static void
nvme_free_qpair(nvme_qpair_t *qp)
{
	int i;

	mutex_destroy(&qp->nq_mutex);
	sema_destroy(&qp->nq_sema);

	if (qp->nq_sqdma != NULL)
		nvme_free_dma(qp->nq_sqdma);
	if (qp->nq_cqdma != NULL)
		nvme_free_dma(qp->nq_cqdma);

	if (qp->nq_active_cmds > 0)
		for (i = 0; i != qp->nq_nentry; i++)
			if (qp->nq_cmd[i] != NULL)
				nvme_free_cmd(qp->nq_cmd[i]);

	if (qp->nq_cmd != NULL)
		kmem_free(qp->nq_cmd, sizeof (nvme_cmd_t *) * qp->nq_nentry);

	kmem_free(qp, sizeof (nvme_qpair_t));
}

static int
nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
    int idx)
{
	nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);

	mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));
	sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
	    DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
		goto fail;

	if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
	    DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
		goto fail;

	qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
	qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
	qp->nq_nentry = nentry;

	qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
	qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);

	qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);

	*nqp = qp;
	return (DDI_SUCCESS);

fail:
	nvme_free_qpair(qp);
	*nqp = NULL;

	return (DDI_FAILURE);
}

static nvme_cmd_t *
nvme_alloc_cmd(nvme_t *nvme, int kmflag)
{
	nvme_cmd_t *cmd = kmem_cache_alloc(nvme_cmd_cache, kmflag);

	if (cmd == NULL)
		return (NULL);

	bzero(cmd, sizeof (nvme_cmd_t));

	cmd->nc_nvme = nvme;

	mutex_init(&cmd->nc_mutex, NULL, MUTEX_DRIVER,
	    DDI_INTR_PRI(nvme->n_intr_pri));
	cv_init(&cmd->nc_cv, NULL, CV_DRIVER, NULL);

	return (cmd);
}

static void
nvme_free_cmd(nvme_cmd_t *cmd)
{
	/* Don't free commands on the lost commands list. */
	if (list_link_active(&cmd->nc_list))
		return;

	if (cmd->nc_dma) {
		if (cmd->nc_dma->nd_cached)
			kmem_cache_free(cmd->nc_nvme->n_prp_cache,
			    cmd->nc_dma);
		else
			nvme_free_dma(cmd->nc_dma);
		cmd->nc_dma = NULL;
	}

	cv_destroy(&cmd->nc_cv);
	mutex_destroy(&cmd->nc_mutex);

	kmem_cache_free(nvme_cmd_cache, cmd);
}

static void
nvme_submit_admin_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	sema_p(&qp->nq_sema);
	nvme_submit_cmd_common(qp, cmd);
}

static int
nvme_submit_io_cmd(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	if (sema_tryp(&qp->nq_sema) == 0)
		return (EAGAIN);

	nvme_submit_cmd_common(qp, cmd);
	return (0);
}

static void
nvme_submit_cmd_common(nvme_qpair_t *qp, nvme_cmd_t *cmd)
{
	nvme_reg_sqtdbl_t tail = { 0 };

	mutex_enter(&qp->nq_mutex);
	cmd->nc_completed = B_FALSE;

	/*
	 * Try to insert the cmd into the active cmd array at the nq_next_cmd
	 * slot. If the slot is already occupied advance to the next slot and
	 * try again. This can happen for long running commands like async event
	 * requests.
	 */
	while (qp->nq_cmd[qp->nq_next_cmd] != NULL)
		qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;
	qp->nq_cmd[qp->nq_next_cmd] = cmd;

	qp->nq_active_cmds++;

	cmd->nc_sqe.sqe_cid = qp->nq_next_cmd;
	bcopy(&cmd->nc_sqe, &qp->nq_sq[qp->nq_sqtail], sizeof (nvme_sqe_t));
	(void) ddi_dma_sync(qp->nq_sqdma->nd_dmah,
	    sizeof (nvme_sqe_t) * qp->nq_sqtail,
	    sizeof (nvme_sqe_t), DDI_DMA_SYNC_FORDEV);
	qp->nq_next_cmd = (qp->nq_next_cmd + 1) % qp->nq_nentry;

	tail.b.sqtdbl_sqt = qp->nq_sqtail = (qp->nq_sqtail + 1) % qp->nq_nentry;
	nvme_put32(cmd->nc_nvme, qp->nq_sqtdbl, tail.r);

	mutex_exit(&qp->nq_mutex);
}

static nvme_cmd_t *
nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
{
	nvme_cmd_t *cmd;

	ASSERT(mutex_owned(&qp->nq_mutex));
	ASSERT3S(cid, <, qp->nq_nentry);

	cmd = qp->nq_cmd[cid];
	qp->nq_cmd[cid] = NULL;
	ASSERT3U(qp->nq_active_cmds, >, 0);
	qp->nq_active_cmds--;
	sema_v(&qp->nq_sema);

	ASSERT3P(cmd, !=, NULL);
	ASSERT3P(cmd->nc_nvme, ==, nvme);
	ASSERT3S(cmd->nc_sqe.sqe_cid, ==, cid);

	return (cmd);
}

static nvme_cmd_t *
nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
{
	nvme_reg_cqhdbl_t head = { 0 };
	nvme_cqe_t *cqe;
	nvme_cmd_t *cmd;

	(void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
	    sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);

	mutex_enter(&qp->nq_mutex);
	cqe = &qp->nq_cq[qp->nq_cqhead];

	/* Check phase tag of CQE. Hardware inverts it for new entries. */
	if (cqe->cqe_sf.sf_p == qp->nq_phase) {
		mutex_exit(&qp->nq_mutex);
		return (NULL);
	}

	ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);

	cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);

	ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
	bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));

	qp->nq_sqhead = cqe->cqe_sqhd;

	head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;

	/* Toggle phase on wrap-around. */
	if (qp->nq_cqhead == 0)
		qp->nq_phase = qp->nq_phase ? 0 : 1;

	nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
	mutex_exit(&qp->nq_mutex);

	return (cmd);
}

static int
nvme_check_unknown_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);

	if (cmd->nc_xfer != NULL)
		bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);

	if (cmd->nc_nvme->n_strict_version) {
		cmd->nc_nvme->n_dead = B_TRUE;
		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
	}

	return (EIO);
}

static int
nvme_check_vendor_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	dev_err(cmd->nc_nvme->n_dip, CE_WARN,
	    "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
	    "sc = %x, sct = %x, dnr = %d, m = %d", cmd->nc_sqe.sqe_opc,
	    cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct,
	    cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m);

	if (!cmd->nc_nvme->n_ignore_unknown_vendor_status) {
		cmd->nc_nvme->n_dead = B_TRUE;
		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
	}

	return (EIO);
}

static int
nvme_check_integrity_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	switch (cqe->cqe_sf.sf_sc) {
	case NVME_CQE_SC_INT_NVM_WRITE:
		/* write fail */
		/* TODO: post ereport */
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
		return (EIO);

	case NVME_CQE_SC_INT_NVM_READ:
		/* read fail */
		/* TODO: post ereport */
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
		return (EIO);

	default:
		return (nvme_check_unknown_cmd_status(cmd));
	}
}

static int
nvme_check_generic_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	switch (cqe->cqe_sf.sf_sc) {
	case NVME_CQE_SC_GEN_SUCCESS:
		return (0);

	/*
	 * Errors indicating a bug in the driver should cause a panic.
	 */
	case NVME_CQE_SC_GEN_INV_OPC:
		/* Invalid Command Opcode */
		if (!cmd->nc_dontpanic)
			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
			    "programming error: invalid opcode in cmd %p",
			    (void *)cmd);
		return (EINVAL);

	case NVME_CQE_SC_GEN_INV_FLD:
		/* Invalid Field in Command */
		if (!cmd->nc_dontpanic)
			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
			    "programming error: invalid field in cmd %p",
			    (void *)cmd);
		return (EIO);

	case NVME_CQE_SC_GEN_ID_CNFL:
		/* Command ID Conflict */
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "cmd ID conflict in cmd %p", (void *)cmd);
		return (0);

	case NVME_CQE_SC_GEN_INV_NS:
		/* Invalid Namespace or Format */
		if (!cmd->nc_dontpanic)
			dev_err(cmd->nc_nvme->n_dip, CE_PANIC,
			    "programming error: invalid NS/format in cmd %p",
			    (void *)cmd);
		return (EINVAL);

	case NVME_CQE_SC_GEN_NVM_LBA_RANGE:
		/* LBA Out Of Range */
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "LBA out of range in cmd %p", (void *)cmd);
		return (0);

	/*
	 * Non-fatal errors, handle gracefully.
	 */
	case NVME_CQE_SC_GEN_DATA_XFR_ERR:
		/* Data Transfer Error (DMA) */
		/* TODO: post ereport */
		atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
		return (EIO);

	case NVME_CQE_SC_GEN_INTERNAL_ERR:
		/*
		 * Internal Error. The spec (v1.0, section 4.5.1.2) says
		 * detailed error information is returned as async event,
		 * so we pretty much ignore the error here and handle it
		 * in the async event handler.
		 */
		atomic_inc_32(&cmd->nc_nvme->n_internal_err);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
		return (EIO);

	case NVME_CQE_SC_GEN_ABORT_REQUEST:
		/*
		 * Command Abort Requested. This normally happens only when a
		 * command times out.
		 */
		/* TODO: post ereport or change blkdev to handle this? */
		atomic_inc_32(&cmd->nc_nvme->n_abort_rq_err);
		return (ECANCELED);

	case NVME_CQE_SC_GEN_ABORT_PWRLOSS:
		/* Command Aborted due to Power Loss Notification */
		ddi_fm_service_impact(cmd->nc_nvme->n_dip, DDI_SERVICE_LOST);
		cmd->nc_nvme->n_dead = B_TRUE;
		return (EIO);

	case NVME_CQE_SC_GEN_ABORT_SQ_DEL:
		/* Command Aborted due to SQ Deletion */
		atomic_inc_32(&cmd->nc_nvme->n_abort_sq_del);
		return (0);

	case NVME_CQE_SC_GEN_NVM_CAP_EXC:
		/* Capacity Exceeded */
		atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_MEDIA);
		return (EIO);

	case NVME_CQE_SC_GEN_NVM_NS_NOTRDY:
		/* Namespace Not Ready */
		atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_NTRDY);
		return (EAGAIN);

	default:
		return (nvme_check_unknown_cmd_status(cmd));
	}
}

static int
nvme_check_specific_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	switch (cqe->cqe_sf.sf_sc) {
	case NVME_CQE_SC_SPC_INV_CQ:
		/* Completion Queue Invalid */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_cq_err);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_QID:
		/* Invalid Queue Identifier */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_SQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_qid_err);
		return (EINVAL);

	case NVME_CQE_SC_SPC_MAX_QSZ_EXC:
		/* Max Queue Size Exceeded */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_SQUEUE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_max_qsz_exc);
		return (EINVAL);

	case NVME_CQE_SC_SPC_ABRT_CMD_EXC:
		/* Abort Command Limit Exceeded */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT);
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "abort command limit exceeded in cmd %p", (void *)cmd);
		return (0);

	case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC:
		/* Async Event Request Limit Exceeded */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_ASYNC_EVENT);
		dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: "
		    "async event request limit exceeded in cmd %p",
		    (void *)cmd);
		return (0);

	case NVME_CQE_SC_SPC_INV_INT_VECT:
		/* Invalid Interrupt Vector */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_CREATE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_int_vect);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_LOG_PAGE:
		/* Invalid Log Page */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_log_page);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_FORMAT:
		/* Invalid Format */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT);
		atomic_inc_32(&cmd->nc_nvme->n_inv_format);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EINVAL);

	case NVME_CQE_SC_SPC_INV_Q_DEL:
		/* Invalid Queue Deletion */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_DELETE_CQUEUE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_q_del);
		return (EINVAL);

	case NVME_CQE_SC_SPC_NVM_CNFL_ATTR:
		/* Conflicting Attributes */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_DSET_MGMT ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
		atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EINVAL);

	case NVME_CQE_SC_SPC_NVM_INV_PROT:
		/* Invalid Protection Information */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_COMPARE ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ ||
		    cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
		atomic_inc_32(&cmd->nc_nvme->n_inv_prot);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EINVAL);

	case NVME_CQE_SC_SPC_NVM_READONLY:
		/* Write to Read Only Range */
		ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE);
		atomic_inc_32(&cmd->nc_nvme->n_readonly);
		if (cmd->nc_xfer != NULL)
			bd_error(cmd->nc_xfer, BD_ERR_ILLRQ);
		return (EROFS);

	default:
		return (nvme_check_unknown_cmd_status(cmd));
	}
}

static inline int
nvme_check_cmd_status(nvme_cmd_t *cmd)
{
	nvme_cqe_t *cqe = &cmd->nc_cqe;

	/*
	 * Take a shortcut if the controller is dead, or if
	 * command status indicates no error.
	 */
	if (cmd->nc_nvme->n_dead)
		return (EIO);

	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
	    cqe->cqe_sf.sf_sc == NVME_CQE_SC_GEN_SUCCESS)
		return (0);

	if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC)
		return (nvme_check_generic_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_SPECIFIC)
		return (nvme_check_specific_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_INTEGRITY)
		return (nvme_check_integrity_cmd_status(cmd));
	else if (cqe->cqe_sf.sf_sct == NVME_CQE_SCT_VENDOR)
		return (nvme_check_vendor_cmd_status(cmd));

	return (nvme_check_unknown_cmd_status(cmd));
}

static int
nvme_abort_cmd(nvme_cmd_t *abort_cmd, uint_t sec)
{
	nvme_t *nvme = abort_cmd->nc_nvme;
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_abort_cmd_t ac = { 0 };
	int ret;

	sema_p(&nvme->n_abort_sema);

	ac.b.ac_cid = abort_cmd->nc_sqe.sqe_cid;
	ac.b.ac_sqid = abort_cmd->nc_sqid;

	cmd->nc_sqid = 0;
	cmd->nc_sqe.sqe_opc = NVME_OPC_ABORT;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_cdw10 = ac.r;

	/*
	 * Send the ABORT to the hardware. The ABORT command will return _after_
	 * the aborted command has completed (aborted or otherwise), but since
	 * we still hold the aborted command's mutex its callback hasn't been
	 * processed yet.
	 */
	nvme_admin_cmd(cmd, sec);
	sema_v(&nvme->n_abort_sema);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ABORT failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		atomic_inc_32(&nvme->n_abort_failed);
	} else {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ABORT of command %d/%d %ssuccessful",
		    abort_cmd->nc_sqe.sqe_cid, abort_cmd->nc_sqid,
		    cmd->nc_cqe.cqe_dw0 & 1 ? "un" : "");
		if ((cmd->nc_cqe.cqe_dw0 & 1) == 0)
			atomic_inc_32(&nvme->n_cmd_aborted);
	}

	nvme_free_cmd(cmd);
	return (ret);
}

/*
 * nvme_wait_cmd -- wait for command completion or timeout
 *
 * In case of a serious error or a timeout of the abort command the hardware
 * will be declared dead and FMA will be notified.
 */
static void
nvme_wait_cmd(nvme_cmd_t *cmd, uint_t sec)
{
	clock_t timeout = ddi_get_lbolt() + drv_usectohz(sec * MICROSEC);
	nvme_t *nvme = cmd->nc_nvme;
	nvme_reg_csts_t csts;
	nvme_qpair_t *qp;

	ASSERT(mutex_owned(&cmd->nc_mutex));

	while (!cmd->nc_completed) {
		if (cv_timedwait(&cmd->nc_cv, &cmd->nc_mutex, timeout) == -1)
			break;
	}

	if (cmd->nc_completed)
		return;

	/*
	 * The command timed out.
	 *
	 * Check controller for fatal status, any errors associated with the
	 * register or DMA handle, or for a double timeout (abort command timed
	 * out). If necessary log a warning and call FMA.
	 */
	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	dev_err(nvme->n_dip, CE_WARN, "!command %d/%d timeout, "
	    "OPC = %x, CFS = %d", cmd->nc_sqe.sqe_cid, cmd->nc_sqid,
	    cmd->nc_sqe.sqe_opc, csts.b.csts_cfs);
	atomic_inc_32(&nvme->n_cmd_timeout);

	if (csts.b.csts_cfs ||
	    nvme_check_regs_hdl(nvme) ||
	    nvme_check_dma_hdl(cmd->nc_dma) ||
	    cmd->nc_sqe.sqe_opc == NVME_OPC_ABORT) {
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
	} else if (nvme_abort_cmd(cmd, sec) == 0) {
		/*
		 * If the abort succeeded the command should complete
		 * immediately with an appropriate status.
		 */
		while (!cmd->nc_completed)
			cv_wait(&cmd->nc_cv, &cmd->nc_mutex);

		return;
	}

	qp = nvme->n_ioq[cmd->nc_sqid];

	mutex_enter(&qp->nq_mutex);
	(void) nvme_unqueue_cmd(nvme, qp, cmd->nc_sqe.sqe_cid);
	mutex_exit(&qp->nq_mutex);

	/*
	 * As we don't know what the presumed dead hardware might still do with
	 * the DMA memory, we'll put the command on the lost commands list if it
	 * has any DMA memory.
	 */
	if (cmd->nc_dma != NULL) {
		mutex_enter(&nvme_lc_mutex);
		list_insert_head(&nvme_lost_cmds, cmd);
		mutex_exit(&nvme_lc_mutex);
	}
}

static void
nvme_wakeup_cmd(void *arg)
{
	nvme_cmd_t *cmd = arg;

	mutex_enter(&cmd->nc_mutex);
	cmd->nc_completed = B_TRUE;
	cv_signal(&cmd->nc_cv);
	mutex_exit(&cmd->nc_mutex);
}

static void
nvme_async_event_task(void *arg)
{
	nvme_cmd_t *cmd = arg;
	nvme_t *nvme = cmd->nc_nvme;
	nvme_error_log_entry_t *error_log = NULL;
	nvme_health_log_t *health_log = NULL;
	size_t logsize = 0;
	nvme_async_event_t event;

	/*
	 * Check for errors associated with the async request itself. The only
	 * command-specific error is "async event limit exceeded", which
	 * indicates a programming error in the driver and causes a panic in
	 * nvme_check_cmd_status().
	 *
	 * Other possible errors are various scenarios where the async request
	 * was aborted, or internal errors in the device. Internal errors are
	 * reported to FMA, the command aborts need no special handling here.
	 *
	 * And finally, at least qemu nvme does not support async events,
	 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we
	 * will avoid posting async events.
	 */

	if (nvme_check_cmd_status(cmd) != 0) {
		dev_err(cmd->nc_nvme->n_dip, CE_WARN,
		    "!async event request returned failure, sct = %x, "
		    "sc = %x, dnr = %d, m = %d", cmd->nc_cqe.cqe_sf.sf_sct,
		    cmd->nc_cqe.cqe_sf.sf_sc, cmd->nc_cqe.cqe_sf.sf_dnr,
		    cmd->nc_cqe.cqe_sf.sf_m);

		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INTERNAL_ERR) {
			cmd->nc_nvme->n_dead = B_TRUE;
			ddi_fm_service_impact(cmd->nc_nvme->n_dip,
			    DDI_SERVICE_LOST);
		}

		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_OPC &&
		    cmd->nc_cqe.cqe_sf.sf_dnr == 1) {
			nvme->n_async_event_supported = B_FALSE;
		}

		nvme_free_cmd(cmd);
		return;
	}

	event.r = cmd->nc_cqe.cqe_dw0;

	/* Clear CQE and re-submit the async request. */
	bzero(&cmd->nc_cqe, sizeof (nvme_cqe_t));
	nvme_submit_admin_cmd(nvme->n_adminq, cmd);

	switch (event.b.ae_type) {
	case NVME_ASYNC_TYPE_ERROR:
		if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) {
			(void) nvme_get_logpage(nvme, (void **)&error_log,
			    &logsize, event.b.ae_logpage);
		} else {
			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
			    "async event reply: %d", event.b.ae_logpage);
			atomic_inc_32(&nvme->n_wrong_logpage);
		}

		switch (event.b.ae_info) {
		case NVME_ASYNC_ERROR_INV_SQ:
			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
			    "invalid submission queue");
			return;

		case NVME_ASYNC_ERROR_INV_DBL:
			dev_err(nvme->n_dip, CE_PANIC, "programming error: "
			    "invalid doorbell write value");
			return;

		case NVME_ASYNC_ERROR_DIAGFAIL:
			dev_err(nvme->n_dip, CE_WARN, "!diagnostic failure");
			ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
			nvme->n_dead = B_TRUE;
			atomic_inc_32(&nvme->n_diagfail_event);
			break;

		case NVME_ASYNC_ERROR_PERSISTENT:
			dev_err(nvme->n_dip, CE_WARN, "!persistent internal "
			    "device error");
			ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
			nvme->n_dead = B_TRUE;
			atomic_inc_32(&nvme->n_persistent_event);
			break;

		case NVME_ASYNC_ERROR_TRANSIENT:
			dev_err(nvme->n_dip, CE_WARN, "!transient internal "
			    "device error");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_transient_event);
			break;

		case NVME_ASYNC_ERROR_FW_LOAD:
			dev_err(nvme->n_dip, CE_WARN,
			    "!firmware image load error");
			atomic_inc_32(&nvme->n_fw_load_event);
			break;
		}
		break;

	case NVME_ASYNC_TYPE_HEALTH:
		if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) {
			(void) nvme_get_logpage(nvme, (void **)&health_log,
			    &logsize, event.b.ae_logpage, -1);
		} else {
			dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in "
			    "async event reply: %d", event.b.ae_logpage);
			atomic_inc_32(&nvme->n_wrong_logpage);
		}

		switch (event.b.ae_info) {
		case NVME_ASYNC_HEALTH_RELIABILITY:
			dev_err(nvme->n_dip, CE_WARN,
			    "!device reliability compromised");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_reliability_event);
			break;

		case NVME_ASYNC_HEALTH_TEMPERATURE:
			dev_err(nvme->n_dip, CE_WARN,
			    "!temperature above threshold");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_temperature_event);
			break;

		case NVME_ASYNC_HEALTH_SPARE:
			dev_err(nvme->n_dip, CE_WARN,
			    "!spare space below threshold");
			/* TODO: send ereport */
			atomic_inc_32(&nvme->n_spare_event);
			break;
		}
		break;

	case NVME_ASYNC_TYPE_VENDOR:
		dev_err(nvme->n_dip, CE_WARN, "!vendor specific async event "
		    "received, info = %x, logpage = %x", event.b.ae_info,
		    event.b.ae_logpage);
		atomic_inc_32(&nvme->n_vendor_event);
		break;

	default:
		dev_err(nvme->n_dip, CE_WARN, "!unknown async event received, "
		    "type = %x, info = %x, logpage = %x", event.b.ae_type,
		    event.b.ae_info, event.b.ae_logpage);
		atomic_inc_32(&nvme->n_unknown_event);
		break;
	}

	if (error_log)
		kmem_free(error_log, logsize);

	if (health_log)
		kmem_free(health_log, logsize);
}

static void
nvme_admin_cmd(nvme_cmd_t *cmd, int sec)
{
	mutex_enter(&cmd->nc_mutex);
	nvme_submit_admin_cmd(cmd->nc_nvme->n_adminq, cmd);
	nvme_wait_cmd(cmd, sec);
	mutex_exit(&cmd->nc_mutex);
}

static void
nvme_async_event(nvme_t *nvme)
{
	nvme_cmd_t *cmd;

	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	cmd->nc_sqid = 0;
	cmd->nc_sqe.sqe_opc = NVME_OPC_ASYNC_EVENT;
	cmd->nc_callback = nvme_async_event_task;
	cmd->nc_dontpanic = B_TRUE;

	nvme_submit_admin_cmd(nvme->n_adminq, cmd);
}

static int
nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms,
    uint8_t pi, boolean_t pil, uint8_t ses)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_format_nvm_t format_nvm = { 0 };
	int ret;

	format_nvm.b.fm_lbaf = lbaf & 0xf;
	format_nvm.b.fm_ms = ms ? 1 : 0;
	format_nvm.b.fm_pi = pi & 0x7;
	format_nvm.b.fm_pil = pil ? 1 : 0;
	format_nvm.b.fm_ses = ses & 0x7;

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_nsid = nsid;
	cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT;
	cmd->nc_sqe.sqe_cdw10 = format_nvm.r;

	/*
	 * Some devices like Samsung SM951 don't allow formatting of all
	 * namespaces in one command. Handle that gracefully.
	 */
	if (nsid == (uint32_t)-1)
		cmd->nc_dontpanic = B_TRUE;

	nvme_admin_cmd(cmd, nvme_format_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!FORMAT failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
	}

	nvme_free_cmd(cmd);
	return (ret);
}

static int
nvme_get_logpage(nvme_t *nvme, void **buf, size_t *bufsize, uint8_t logpage,
    ...)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_getlogpage_t getlogpage = { 0 };
	va_list ap;
	int ret;

	va_start(ap, logpage);

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_LOG_PAGE;

	getlogpage.b.lp_lid = logpage;

	switch (logpage) {
	case NVME_LOGPAGE_ERROR:
		cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
		/*
		 * The GET LOG PAGE command can use at most 2 pages to return
		 * data, PRP lists are not supported.
		 */
		*bufsize = MIN(2 * nvme->n_pagesize,
		    nvme->n_error_log_len * sizeof (nvme_error_log_entry_t));
		break;

	case NVME_LOGPAGE_HEALTH:
		cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t);
		*bufsize = sizeof (nvme_health_log_t);
		break;

	case NVME_LOGPAGE_FWSLOT:
		cmd->nc_sqe.sqe_nsid = (uint32_t)-1;
		*bufsize = sizeof (nvme_fwslot_log_t);
		break;

	default:
		dev_err(nvme->n_dip, CE_WARN, "!unknown log page requested: %d",
		    logpage);
		atomic_inc_32(&nvme->n_unknown_logpage);
		ret = EINVAL;
		goto fail;
	}

	va_end(ap);

	getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1;

	cmd->nc_sqe.sqe_cdw10 = getlogpage.r;

	if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
	    DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_zalloc_dma failed for GET LOG PAGE");
		ret = ENOMEM;
		goto fail;
	}

	if (cmd->nc_dma->nd_ncookie > 2) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many DMA cookies for GET LOG PAGE");
		atomic_inc_32(&nvme->n_too_many_cookies);
		ret = ENOMEM;
		goto fail;
	}

	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
	if (cmd->nc_dma->nd_ncookie > 1) {
		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
		    &cmd->nc_dma->nd_cookie);
		cmd->nc_sqe.sqe_dptr.d_prp[1] =
		    cmd->nc_dma->nd_cookie.dmac_laddress;
	}

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!GET LOG PAGE failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	*buf = kmem_alloc(*bufsize, KM_SLEEP);
	bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);

fail:
	nvme_free_cmd(cmd);

	return (ret);
}

static int
nvme_identify(nvme_t *nvme, uint32_t nsid, void **buf)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	int ret;

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_IDENTIFY;
	cmd->nc_sqe.sqe_nsid = nsid;
	cmd->nc_sqe.sqe_cdw10 = nsid ? NVME_IDENTIFY_NSID : NVME_IDENTIFY_CTRL;

	if (nvme_zalloc_dma(nvme, NVME_IDENTIFY_BUFSIZE, DDI_DMA_READ,
	    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!nvme_zalloc_dma failed for IDENTIFY");
		ret = ENOMEM;
		goto fail;
	}

	if (cmd->nc_dma->nd_ncookie > 2) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many DMA cookies for IDENTIFY");
		atomic_inc_32(&nvme->n_too_many_cookies);
		ret = ENOMEM;
		goto fail;
	}

	cmd->nc_sqe.sqe_dptr.d_prp[0] = cmd->nc_dma->nd_cookie.dmac_laddress;
	if (cmd->nc_dma->nd_ncookie > 1) {
		ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
		    &cmd->nc_dma->nd_cookie);
		cmd->nc_sqe.sqe_dptr.d_prp[1] =
		    cmd->nc_dma->nd_cookie.dmac_laddress;
	}

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!IDENTIFY failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	*buf = kmem_alloc(NVME_IDENTIFY_BUFSIZE, KM_SLEEP);
	bcopy(cmd->nc_dma->nd_memp, *buf, NVME_IDENTIFY_BUFSIZE);

fail:
	nvme_free_cmd(cmd);

	return (ret);
}

static int
nvme_set_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t val,
    uint32_t *res)
{
	_NOTE(ARGUNUSED(nsid));
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	int ret = EINVAL;

	ASSERT(res != NULL);

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_SET_FEATURES;
	cmd->nc_sqe.sqe_cdw10 = feature;
	cmd->nc_sqe.sqe_cdw11 = val;

	switch (feature) {
	case NVME_FEAT_WRITE_CACHE:
		if (!nvme->n_write_cache_present)
			goto fail;

		break;

	case NVME_FEAT_NQUEUES:
		break;

	default:
		goto fail;
	}

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!SET FEATURES %d failed with sct = %x, sc = %x",
		    feature, cmd->nc_cqe.cqe_sf.sf_sct,
		    cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	*res = cmd->nc_cqe.cqe_dw0;

fail:
	nvme_free_cmd(cmd);
	return (ret);
}

static int
nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res,
    void **buf, size_t *bufsize)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	int ret = EINVAL;

	ASSERT(res != NULL);

	if (bufsize != NULL)
		*bufsize = 0;

	cmd->nc_sqid = 0;
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES;
	cmd->nc_sqe.sqe_cdw10 = feature;
	cmd->nc_sqe.sqe_cdw11 = *res;

	/*
	 * For some of the optional features there doesn't seem to be a method
	 * of detecting whether it is supported other than using it. This will
	 * cause "Invalid Field in Command" error, which is normally considered
	 * a programming error. Set the nc_dontpanic flag to override the panic
	 * in nvme_check_generic_cmd_status().
	 */
	switch (feature) {
	case NVME_FEAT_ARBITRATION:
	case NVME_FEAT_POWER_MGMT:
	case NVME_FEAT_TEMPERATURE:
	case NVME_FEAT_ERROR:
	case NVME_FEAT_NQUEUES:
	case NVME_FEAT_INTR_COAL:
	case NVME_FEAT_INTR_VECT:
	case NVME_FEAT_WRITE_ATOM:
	case NVME_FEAT_ASYNC_EVENT:
		break;

	case NVME_FEAT_WRITE_CACHE:
		if (!nvme->n_write_cache_present)
			goto fail;

		break;

	case NVME_FEAT_LBA_RANGE:
		if (!nvme->n_lba_range_supported)
			goto fail;

		cmd->nc_dontpanic = B_TRUE;
		cmd->nc_sqe.sqe_nsid = nsid;
		ASSERT(bufsize != NULL);
		*bufsize = NVME_LBA_RANGE_BUFSIZE;
		break;

	case NVME_FEAT_AUTO_PST:
		if (!nvme->n_auto_pst_supported)
			goto fail;

		ASSERT(bufsize != NULL);
		*bufsize = NVME_AUTO_PST_BUFSIZE;
		break;

	case NVME_FEAT_PROGRESS:
		if (!nvme->n_progress_supported)
			goto fail;

		cmd->nc_dontpanic = B_TRUE;
		break;

	default:
		goto fail;
	}

	if (bufsize != NULL && *bufsize != 0) {
		if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ,
		    &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!nvme_zalloc_dma failed for GET FEATURES");
			ret = ENOMEM;
			goto fail;
		}

		if (cmd->nc_dma->nd_ncookie > 2) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!too many DMA cookies for GET FEATURES");
			atomic_inc_32(&nvme->n_too_many_cookies);
			ret = ENOMEM;
			goto fail;
		}

		cmd->nc_sqe.sqe_dptr.d_prp[0] =
		    cmd->nc_dma->nd_cookie.dmac_laddress;
		if (cmd->nc_dma->nd_ncookie > 1) {
			ddi_dma_nextcookie(cmd->nc_dma->nd_dmah,
			    &cmd->nc_dma->nd_cookie);
			cmd->nc_sqe.sqe_dptr.d_prp[1] =
			    cmd->nc_dma->nd_cookie.dmac_laddress;
		}
	}

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		boolean_t known = B_TRUE;

		/* Check if this is unsupported optional feature */
		if (cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC &&
		    cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) {
			switch (feature) {
			case NVME_FEAT_LBA_RANGE:
				nvme->n_lba_range_supported = B_FALSE;
				break;
			case NVME_FEAT_PROGRESS:
				nvme->n_progress_supported = B_FALSE;
				break;
			default:
				known = B_FALSE;
				break;
			}
		} else {
			known = B_FALSE;
		}

		/* Report the error otherwise */
		if (!known) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!GET FEATURES %d failed with sct = %x, sc = %x",
			    feature, cmd->nc_cqe.cqe_sf.sf_sct,
			    cmd->nc_cqe.cqe_sf.sf_sc);
		}

		goto fail;
	}

	if (bufsize != NULL && *bufsize != 0) {
		ASSERT(buf != NULL);
		*buf = kmem_alloc(*bufsize, KM_SLEEP);
		bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize);
	}

	*res = cmd->nc_cqe.cqe_dw0;

fail:
	nvme_free_cmd(cmd);
	return (ret);
}

static int
nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
{
	nvme_write_cache_t nwc = { 0 };

	if (enable)
		nwc.b.wc_wce = 1;

	return (nvme_set_features(nvme, 0, NVME_FEAT_WRITE_CACHE, nwc.r,
	    &nwc.r));
}

static int
nvme_set_nqueues(nvme_t *nvme, uint16_t *nqueues)
{
	nvme_nqueues_t nq = { 0 };
	int ret;

	nq.b.nq_nsq = nq.b.nq_ncq = *nqueues - 1;

	ret = nvme_set_features(nvme, 0, NVME_FEAT_NQUEUES, nq.r, &nq.r);

	if (ret == 0) {
		/*
		 * Always use the same number of submission and completion
		 * queues, and never use more than the requested number of
		 * queues.
		 */
		*nqueues = MIN(*nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1);
	}

	return (ret);
}
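
/*
 * Worked example (illustrative numbers): the Number of Queues feature
 * is zero's based, so asking for 8 queue pairs sends cdw11 = 0x00070007
 * (NSQR = NCQR = 7). A controller that only supports 4 pairs completes
 * the command with dw0 = 0x00030003, and the MIN() above then clamps
 * *nqueues to 4.
 */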
static int
nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
{
	nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	nvme_create_queue_dw10_t dw10 = { 0 };
	nvme_create_cq_dw11_t c_dw11 = { 0 };
	nvme_create_sq_dw11_t s_dw11 = { 0 };
	int ret;

	dw10.b.q_qid = idx;
	dw10.b.q_qsize = qp->nq_nentry - 1;

	c_dw11.b.cq_pc = 1;
	c_dw11.b.cq_ien = 1;
	c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;

	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
	cmd->nc_sqe.sqe_cdw10 = dw10.r;
	cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!CREATE CQUEUE failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

	nvme_free_cmd(cmd);

	s_dw11.b.sq_pc = 1;
	s_dw11.b.sq_cqid = idx;

	cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
	cmd->nc_callback = nvme_wakeup_cmd;
	cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_SQUEUE;
	cmd->nc_sqe.sqe_cdw10 = dw10.r;
	cmd->nc_sqe.sqe_cdw11 = s_dw11.r;
	cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_sqdma->nd_cookie.dmac_laddress;

	nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

	if ((ret = nvme_check_cmd_status(cmd)) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!CREATE SQUEUE failed with sct = %x, sc = %x",
		    cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
		goto fail;
	}

fail:
	nvme_free_cmd(cmd);

	return (ret);
}
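
/*
 * Note on the queue wiring above: submission queue idx is bound to the
 * completion queue with the same id (sq_cqid = idx), and that
 * completion queue raises interrupt vector idx % n_intr_cnt. For
 * example, with 4 vectors the completions of I/O queues 1 and 5 both
 * arrive on vector 1.
 */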
static boolean_t
nvme_reset(nvme_t *nvme, boolean_t quiesce)
{
	nvme_reg_csts_t csts;
	int i;

	nvme_put32(nvme, NVME_REG_CC, 0);

	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	if (csts.b.csts_rdy == 1) {
		nvme_put32(nvme, NVME_REG_CC, 0);
		for (i = 0; i != nvme->n_timeout * 10; i++) {
			csts.r = nvme_get32(nvme, NVME_REG_CSTS);
			if (csts.b.csts_rdy == 0)
				break;

			if (quiesce)
				drv_usecwait(50000);
			else
				delay(drv_usectohz(50000));
		}
	}

	nvme_put32(nvme, NVME_REG_AQA, 0);
	nvme_put32(nvme, NVME_REG_ASQ, 0);
	nvme_put32(nvme, NVME_REG_ACQ, 0);

	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	return (csts.b.csts_rdy == 0 ? B_TRUE : B_FALSE);
}
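
/*
 * Timeout arithmetic: CAP.TO (n_timeout) is expressed in 500ms units
 * per the spec, and the loop above polls CSTS.RDY up to n_timeout * 10
 * times with a 50ms wait each, i.e. exactly the n_timeout * 500ms
 * budget. A controller reporting TO = 20, for example, is given 10s
 * to clear RDY.
 */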
static void
nvme_shutdown(nvme_t *nvme, int mode, boolean_t quiesce)
{
	nvme_reg_cc_t cc;
	nvme_reg_csts_t csts;
	int i;

	ASSERT(mode == NVME_CC_SHN_NORMAL || mode == NVME_CC_SHN_ABRUPT);

	cc.r = nvme_get32(nvme, NVME_REG_CC);
	cc.b.cc_shn = mode & 0x3;
	nvme_put32(nvme, NVME_REG_CC, cc.r);

	for (i = 0; i != 10; i++) {
		csts.r = nvme_get32(nvme, NVME_REG_CSTS);
		if (csts.b.csts_shst == NVME_CSTS_SHN_COMPLETE)
			break;

		if (quiesce)
			drv_usecwait(100000);
		else
			delay(drv_usectohz(100000));
	}
}
static void
nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
{
	/*
	 * Section 7.7 of the spec describes how to get a unique ID for
	 * the controller: the vendor ID, the model name and the serial
	 * number shall be unique when combined.
	 *
	 * If a namespace has no EUI64 we use the above and add the hex
	 * namespace ID to get a unique ID for the namespace.
	 */
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char serial[sizeof (nvme->n_idctl->id_serial) + 1];

	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
	bcopy(nvme->n_idctl->id_serial, serial,
	    sizeof (nvme->n_idctl->id_serial));

	model[sizeof (nvme->n_idctl->id_model)] = '\0';
	serial[sizeof (nvme->n_idctl->id_serial)] = '\0';

	nvme->n_ns[nsid - 1].ns_devid = kmem_asprintf("%4X-%s-%s-%X",
	    nvme->n_idctl->id_vid, model, serial, nsid);
}
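
/*
 * Hypothetical example of the resulting devid: a controller with PCI
 * vendor ID 0x8086, model "ACME X1" and serial "S1234" would give
 * namespace 1 the devid "8086-ACME X1-S1234-1".
 */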
static int
nvme_init_ns(nvme_t *nvme, int nsid)
{
	nvme_namespace_t *ns = &nvme->n_ns[nsid - 1];
	nvme_identify_nsid_t *idns;
	int last_rp;

	ns->ns_nvme = nvme;

	if (nvme_identify(nvme, nsid, (void **)&idns) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to identify namespace %d", nsid);
		return (DDI_FAILURE);
	}

	ns->ns_idns = idns;
	ns->ns_id = nsid;
	ns->ns_block_count = idns->id_nsize;
	ns->ns_block_size =
	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
	ns->ns_best_block_size = ns->ns_block_size;

	/*
	 * Get the EUI64 if present. Use it for devid and device node names.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
		bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64));

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)ns->ns_eui64 != 0) {
		uint8_t *eui64 = ns->ns_eui64;

		(void) snprintf(ns->ns_name, sizeof (ns->ns_name),
		    "%02x%02x%02x%02x%02x%02x%02x%02x",
		    eui64[0], eui64[1], eui64[2], eui64[3],
		    eui64[4], eui64[5], eui64[6], eui64[7]);
	} else {
		(void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d",
		    ns->ns_id);

		nvme_prepare_devid(nvme, ns->ns_id);
	}

	/*
	 * Find the LBA format with no metadata and the best relative
	 * performance. A value of 3 means "degraded", 0 is best.
	 */
	last_rp = 3;
	for (int j = 0; j <= idns->id_nlbaf; j++) {
		if (idns->id_lbaf[j].lbaf_lbads == 0)
			break;
		if (idns->id_lbaf[j].lbaf_ms != 0)
			continue;
		if (idns->id_lbaf[j].lbaf_rp >= last_rp)
			continue;
		last_rp = idns->id_lbaf[j].lbaf_rp;
		ns->ns_best_block_size =
		    1 << idns->id_lbaf[j].lbaf_lbads;
	}

	if (ns->ns_best_block_size < nvme->n_min_block_size)
		ns->ns_best_block_size = nvme->n_min_block_size;

	/*
	 * We currently don't support namespaces that use either:
	 * - thin provisioning
	 * - protection information
	 * - illegal block size (< 512)
	 */
	if (idns->id_nsfeat.f_thin ||
	    idns->id_dps.dp_pinfo) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ignoring namespace %d, unsupported features: "
		    "thin = %d, pinfo = %d", nsid,
		    idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo);
		ns->ns_ignore = B_TRUE;
	} else if (ns->ns_block_size < 512) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!ignoring namespace %d, unsupported block size %"PRIu64,
		    nsid, (uint64_t)ns->ns_block_size);
		ns->ns_ignore = B_TRUE;
	} else {
		ns->ns_ignore = B_FALSE;
	}

	return (DDI_SUCCESS);
}
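
/*
 * Worked example for the block size selection above: an LBA format
 * with lbaf_lbads = 9 describes 1 << 9 = 512 byte blocks, one with
 * lbaf_lbads = 12 describes 4096 byte blocks. If both carry no
 * metadata and the 4k format reports the better (lower) relative
 * performance value, ns_best_block_size ends up as 4096.
 */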
static int
nvme_init(nvme_t *nvme)
{
	nvme_reg_cc_t cc = { 0 };
	nvme_reg_aqa_t aqa = { 0 };
	nvme_reg_asq_t asq = { 0 };
	nvme_reg_acq_t acq = { 0 };
	nvme_reg_cap_t cap;
	nvme_reg_vs_t vs;
	nvme_reg_csts_t csts;
	int i = 0;
	uint16_t nqueues;
	char model[sizeof (nvme->n_idctl->id_model) + 1];
	char *vendor, *product;

	/* Check controller version */
	vs.r = nvme_get32(nvme, NVME_REG_VS);
	nvme->n_version.v_major = vs.b.vs_mjr;
	nvme->n_version.v_minor = vs.b.vs_mnr;
	dev_err(nvme->n_dip, CE_CONT, "?NVMe spec version %d.%d",
	    nvme->n_version.v_major, nvme->n_version.v_minor);

	if (NVME_VERSION_HIGHER(&nvme->n_version,
	    nvme_version_major, nvme_version_minor)) {
		dev_err(nvme->n_dip, CE_WARN, "!no support for version > %d.%d",
		    nvme_version_major, nvme_version_minor);
		if (nvme->n_strict_version)
			goto fail;
	}
	/* retrieve controller configuration */
	cap.r = nvme_get64(nvme, NVME_REG_CAP);

	if ((cap.b.cap_css & NVME_CAP_CSS_NVM) == 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!NVM command set not supported by hardware");
		goto fail;
	}

	nvme->n_nssr_supported = cap.b.cap_nssrs;
	nvme->n_doorbell_stride = 4 << cap.b.cap_dstrd;
	nvme->n_timeout = cap.b.cap_to;
	nvme->n_arbitration_mechanisms = cap.b.cap_ams;
	nvme->n_cont_queues_reqd = cap.b.cap_cqr;
	nvme->n_max_queue_entries = cap.b.cap_mqes + 1;

	/*
	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
	 * the base page size of 4k (1<<12), so add 12 here to get the real
	 * page size value.
	 */
	nvme->n_pageshift = MIN(MAX(cap.b.cap_mpsmin + 12, PAGESHIFT),
	    cap.b.cap_mpsmax + 12);
	nvme->n_pagesize = 1UL << (nvme->n_pageshift);
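
	/*
	 * Example: a controller reporting cap_mpsmin = 0 and cap_mpsmax = 4
	 * supports page sizes from 1 << 12 (4k) to 1 << 16 (64k). On a
	 * kernel with PAGESHIFT == 12 the MIN/MAX above yields
	 * n_pageshift = 12 and n_pagesize = 4096.
	 */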
	/*
	 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
	 */
	nvme->n_queue_dma_attr.dma_attr_align = nvme->n_pagesize;
	nvme->n_queue_dma_attr.dma_attr_minxfer = nvme->n_pagesize;

	/*
	 * Set up PRP DMA to transfer 1 page-aligned page at a time.
	 * Maxxfer may be increased after the controller limits have been
	 * identified.
	 */
	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_minxfer = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_align = nvme->n_pagesize;
	nvme->n_prp_dma_attr.dma_attr_seg = nvme->n_pagesize - 1;

	/*
	 * Reset controller if it's still in ready state.
	 */
	if (nvme_reset(nvme, B_FALSE) == B_FALSE) {
		dev_err(nvme->n_dip, CE_WARN, "!unable to reset controller");
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		goto fail;
	}
	/*
	 * Create the admin queue pair.
	 */
	if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
	    != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!unable to allocate admin qpair");
		goto fail;
	}
	nvme->n_ioq = kmem_alloc(sizeof (nvme_qpair_t *), KM_SLEEP);
	nvme->n_ioq[0] = nvme->n_adminq;

	nvme->n_progress |= NVME_ADMIN_QUEUE;

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "admin-queue-len", nvme->n_admin_queue_len);

	aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
	asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
	acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress;

	ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
	ASSERT((acq & (nvme->n_pagesize - 1)) == 0);

	nvme_put32(nvme, NVME_REG_AQA, aqa.r);
	nvme_put64(nvme, NVME_REG_ASQ, asq);
	nvme_put64(nvme, NVME_REG_ACQ, acq);

	cc.b.cc_ams = 0;	/* use Round-Robin arbitration */
	cc.b.cc_css = 0;	/* use NVM command set */
	cc.b.cc_mps = nvme->n_pageshift - 12;
	cc.b.cc_shn = 0;	/* no shutdown in progress */
	cc.b.cc_en = 1;		/* enable controller */
	cc.b.cc_iosqes = 6;	/* submission queue entry is 2^6 bytes long */
	cc.b.cc_iocqes = 4;	/* completion queue entry is 2^4 bytes long */

	nvme_put32(nvme, NVME_REG_CC, cc.r);
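
	/*
	 * The entry sizes written to CC above are log2 values:
	 * 1 << 6 = 64 bytes is sizeof (nvme_sqe_t) and 1 << 4 = 16 bytes
	 * is sizeof (nvme_cqe_t). The qes_min/qes_max sanity check
	 * further down relies on the same arithmetic.
	 */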
	/*
	 * Wait for the controller to become ready.
	 */
	csts.r = nvme_get32(nvme, NVME_REG_CSTS);
	if (csts.b.csts_rdy == 0) {
		for (i = 0; i != nvme->n_timeout * 10; i++) {
			delay(drv_usectohz(50000));
			csts.r = nvme_get32(nvme, NVME_REG_CSTS);

			if (csts.b.csts_cfs == 1) {
				dev_err(nvme->n_dip, CE_WARN,
				    "!controller fatal status at init");
				ddi_fm_service_impact(nvme->n_dip,
				    DDI_SERVICE_LOST);
				nvme->n_dead = B_TRUE;
				goto fail;
			}

			if (csts.b.csts_rdy == 1)
				break;
		}
	}

	if (csts.b.csts_rdy == 0) {
		dev_err(nvme->n_dip, CE_WARN, "!controller not ready");
		ddi_fm_service_impact(nvme->n_dip, DDI_SERVICE_LOST);
		nvme->n_dead = B_TRUE;
		goto fail;
	}

	/*
	 * Assume an abort command limit of 1. We'll destroy and re-init
	 * that later when we know the true abort command limit.
	 */
	sema_init(&nvme->n_abort_sema, 1, NULL, SEMA_DRIVER, NULL);
	/*
	 * Setup initial interrupt for admin queue.
	 */
	if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX, 1)
	    != DDI_SUCCESS) &&
	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI, 1)
	    != DDI_SUCCESS) &&
	    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_FIXED, 1)
	    != DDI_SUCCESS)) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to setup initial interrupt");
		goto fail;
	}

	/*
	 * Post an asynchronous event command to catch errors.
	 * We assume the asynchronous events are supported as required by
	 * specification (Figure 40 in section 5 of NVMe 1.2).
	 * However, since at least qemu does not follow the specification,
	 * we need a mechanism to protect ourselves.
	 */
	nvme->n_async_event_supported = B_TRUE;
	nvme_async_event(nvme);
	/*
	 * Identify Controller
	 */
	if (nvme_identify(nvme, 0, (void **)&nvme->n_idctl) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to identify controller");
		goto fail;
	}

	/*
	 * Get Vendor & Product ID
	 */
	bcopy(nvme->n_idctl->id_model, model, sizeof (nvme->n_idctl->id_model));
	model[sizeof (nvme->n_idctl->id_model)] = '\0';
	sata_split_model(model, &vendor, &product);

	if (vendor == NULL)
		nvme->n_vendor = strdup("NVMe");
	else
		nvme->n_vendor = strdup(vendor);

	nvme->n_product = strdup(product);
	/*
	 * Get controller limits.
	 */
	nvme->n_async_event_limit = MAX(NVME_MIN_ASYNC_EVENT_LIMIT,
	    MIN(nvme->n_admin_queue_len / 10,
	    MIN(nvme->n_idctl->id_aerl + 1, nvme->n_async_event_limit)));

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "async-event-limit", nvme->n_async_event_limit);

	nvme->n_abort_command_limit = nvme->n_idctl->id_acl + 1;

	/*
	 * Reinitialize the semaphore with the true abort command limit
	 * supported by the hardware. It's not necessary to disable interrupts
	 * as only command aborts use the semaphore, and no commands are
	 * executed or aborted while we're here.
	 */
	sema_destroy(&nvme->n_abort_sema);
	sema_init(&nvme->n_abort_sema, nvme->n_abort_command_limit - 1, NULL,
	    SEMA_DRIVER, NULL);

	nvme->n_progress |= NVME_CTRL_LIMITS;

	if (nvme->n_idctl->id_mdts == 0)
		nvme->n_max_data_transfer_size = nvme->n_pagesize * 65536;
	else
		nvme->n_max_data_transfer_size =
		    1ull << (nvme->n_pageshift + nvme->n_idctl->id_mdts);

	nvme->n_error_log_len = nvme->n_idctl->id_elpe + 1;

	/*
	 * Limit n_max_data_transfer_size to what we can handle in one PRP.
	 * Chained PRPs are currently unsupported.
	 *
	 * This is a no-op on hardware which doesn't support a transfer size
	 * big enough to require chained PRPs.
	 */
	nvme->n_max_data_transfer_size = MIN(nvme->n_max_data_transfer_size,
	    (nvme->n_pagesize / sizeof (uint64_t) * nvme->n_pagesize));

	nvme->n_prp_dma_attr.dma_attr_maxxfer = nvme->n_max_data_transfer_size;
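
	/*
	 * Worked example: with a 4k page a PRP list page holds
	 * 4096 / 8 = 512 entries, so the clamp above works out to
	 * 512 * 4096 bytes = 2MB, the largest transfer that can be
	 * described without chaining PRP lists.
	 */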
	/*
	 * Make sure the minimum/maximum queue entry sizes are not
	 * larger/smaller than the default.
	 */
	if (((1 << nvme->n_idctl->id_sqes.qes_min) > sizeof (nvme_sqe_t)) ||
	    ((1 << nvme->n_idctl->id_sqes.qes_max) < sizeof (nvme_sqe_t)) ||
	    ((1 << nvme->n_idctl->id_cqes.qes_min) > sizeof (nvme_cqe_t)) ||
	    ((1 << nvme->n_idctl->id_cqes.qes_max) < sizeof (nvme_cqe_t)))
		goto fail;

	/*
	 * Check for the presence of a Volatile Write Cache. If present,
	 * enable or disable based on the value of the property
	 * volatile-write-cache-enable (default is enabled).
	 */
	nvme->n_write_cache_present =
	    nvme->n_idctl->id_vwc.vwc_present == 0 ? B_FALSE : B_TRUE;

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "volatile-write-cache-present",
	    nvme->n_write_cache_present ? 1 : 0);

	if (!nvme->n_write_cache_present) {
		nvme->n_write_cache_enabled = B_FALSE;
	} else if (nvme_write_cache_set(nvme, nvme->n_write_cache_enabled)
	    != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to %sable volatile write cache",
		    nvme->n_write_cache_enabled ? "en" : "dis");

		/*
		 * Assume the cache is (still) enabled.
		 */
		nvme->n_write_cache_enabled = B_TRUE;
	}

	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip,
	    "volatile-write-cache-enable",
	    nvme->n_write_cache_enabled ? 1 : 0);

	/*
	 * Assume LBA Range Type feature is supported. If it isn't this
	 * will be set to B_FALSE by nvme_get_features().
	 */
	nvme->n_lba_range_supported = B_TRUE;

	/*
	 * Check support for Autonomous Power State Transition.
	 */
	if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1))
		nvme->n_auto_pst_supported =
		    nvme->n_idctl->id_apsta.ap_sup == 0 ? B_FALSE : B_TRUE;

	/*
	 * Assume Software Progress Marker feature is supported. If it isn't
	 * this will be set to B_FALSE by nvme_get_features().
	 */
	nvme->n_progress_supported = B_TRUE;
	/*
	 * Identify Namespaces
	 */
	nvme->n_namespace_count = nvme->n_idctl->id_nn;

	if (nvme->n_namespace_count == 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!controllers without namespaces are not supported");
		goto fail;
	}

	if (nvme->n_namespace_count > NVME_MINOR_MAX) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!too many namespaces: %d, limiting to %d\n",
		    nvme->n_namespace_count, NVME_MINOR_MAX);
		nvme->n_namespace_count = NVME_MINOR_MAX;
	}

	nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) *
	    nvme->n_namespace_count, KM_SLEEP);

	for (i = 0; i != nvme->n_namespace_count; i++) {
		mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER,
		    NULL);
		if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS)
			goto fail;
	}
	/*
	 * Try to set up MSI/MSI-X interrupts.
	 */
	if ((nvme->n_intr_types & (DDI_INTR_TYPE_MSI | DDI_INTR_TYPE_MSIX))
	    != 0) {
		nvme_release_interrupts(nvme);

		nqueues = MIN(UINT16_MAX, ncpus);

		if ((nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSIX,
		    nqueues) != DDI_SUCCESS) &&
		    (nvme_setup_interrupts(nvme, DDI_INTR_TYPE_MSI,
		    nqueues) != DDI_SUCCESS)) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to setup MSI/MSI-X interrupts");
			goto fail;
		}
	}

	nqueues = nvme->n_intr_cnt;

	/*
	 * Create I/O queue pairs.
	 */
	if (nvme_set_nqueues(nvme, &nqueues) != 0) {
		dev_err(nvme->n_dip, CE_WARN,
		    "!failed to set number of I/O queues to %d",
		    nvme->n_intr_cnt);
		goto fail;
	}

	/*
	 * Reallocate I/O queue array
	 */
	kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
	nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
	    (nqueues + 1), KM_SLEEP);
	nvme->n_ioq[0] = nvme->n_adminq;

	nvme->n_ioq_count = nqueues;

	/*
	 * If we got fewer queues than we asked for we might as well give
	 * some of the interrupt vectors back to the system.
	 */
	if (nvme->n_ioq_count < nvme->n_intr_cnt) {
		nvme_release_interrupts(nvme);

		if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
		    nvme->n_ioq_count) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!failed to reduce number of interrupts");
			goto fail;
		}
	}

	/*
	 * Alloc & register I/O queue pairs
	 */
	nvme->n_io_queue_len =
	    MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
	(void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
	    nvme->n_io_queue_len);

	for (i = 1; i != nvme->n_ioq_count + 1; i++) {
		if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
		    &nvme->n_ioq[i], i) != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to allocate I/O qpair %d", i);
			goto fail;
		}

		if (nvme_create_io_qpair(nvme, nvme->n_ioq[i], i) != 0) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!unable to create I/O qpair %d", i);
			goto fail;
		}
	}

	/*
	 * Post more asynchronous event commands to reduce event reporting
	 * latency as suggested by the spec.
	 */
	if (nvme->n_async_event_supported) {
		for (i = 1; i != nvme->n_async_event_limit; i++)
			nvme_async_event(nvme);
	}

	return (DDI_SUCCESS);

fail:
	(void) nvme_reset(nvme, B_FALSE);
	return (DDI_FAILURE);
}
static uint_t
nvme_intr(caddr_t arg1, caddr_t arg2)
{
	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
	nvme_t *nvme = (nvme_t *)arg1;
	int inum = (int)(uintptr_t)arg2;
	int ccnt = 0;
	int qnum;
	nvme_cmd_t *cmd;

	if (inum >= nvme->n_intr_cnt)
		return (DDI_INTR_UNCLAIMED);

	if (nvme->n_dead)
		return (nvme->n_intr_type == DDI_INTR_TYPE_FIXED ?
		    DDI_INTR_UNCLAIMED : DDI_INTR_CLAIMED);

	/*
	 * The interrupt vector a queue uses is calculated as queue_idx %
	 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
	 * in steps of n_intr_cnt to process all queues using this vector.
	 */
	for (qnum = inum;
	    qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
	    qnum += nvme->n_intr_cnt) {
		while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
			taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
			    cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
			ccnt++;
		}
	}

	return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
}
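
/*
 * Example of the iteration above: with n_intr_cnt = 3 and
 * n_ioq_count = 8, the handler for vector 1 visits queues 1, 4 and 7;
 * vector 0 visits the admin queue (index 0) and queues 3 and 6.
 */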
static void
nvme_release_interrupts(nvme_t *nvme)
{
	int i;

	for (i = 0; i < nvme->n_intr_cnt; i++) {
		if (nvme->n_inth[i] == NULL)
			break;

		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
			(void) ddi_intr_block_disable(&nvme->n_inth[i], 1);
		else
			(void) ddi_intr_disable(nvme->n_inth[i]);

		(void) ddi_intr_remove_handler(nvme->n_inth[i]);
		(void) ddi_intr_free(nvme->n_inth[i]);
	}

	kmem_free(nvme->n_inth, nvme->n_inth_sz);
	nvme->n_inth = NULL;
	nvme->n_inth_sz = 0;

	nvme->n_progress &= ~NVME_INTERRUPTS;
}
static int
nvme_setup_interrupts(nvme_t *nvme, int intr_type, int nqpairs)
{
	int nintrs, navail, count;
	int ret;
	int i;

	if (nvme->n_intr_types == 0) {
		ret = ddi_intr_get_supported_types(nvme->n_dip,
		    &nvme->n_intr_types);
		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: ddi_intr_get_supported types failed",
			    __func__);
			return (ret);
		}
#ifdef __x86
		if (get_hwenv() == HW_VMWARE)
			nvme->n_intr_types &= ~DDI_INTR_TYPE_MSIX;
#endif
	}

	if ((nvme->n_intr_types & intr_type) == 0)
		return (DDI_FAILURE);

	ret = ddi_intr_get_nintrs(nvme->n_dip, intr_type, &nintrs);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_nintrs failed",
		    __func__);
		return (ret);
	}

	ret = ddi_intr_get_navail(nvme->n_dip, intr_type, &navail);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_navail failed",
		    __func__);
		return (ret);
	}

	/* We want at most one interrupt per queue pair. */
	if (navail > nqpairs)
		navail = nqpairs;

	nvme->n_inth_sz = sizeof (ddi_intr_handle_t) * navail;
	nvme->n_inth = kmem_zalloc(nvme->n_inth_sz, KM_SLEEP);

	ret = ddi_intr_alloc(nvme->n_dip, nvme->n_inth, intr_type, 0, navail,
	    &count, 0);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_alloc failed",
		    __func__);
		goto fail;
	}

	nvme->n_intr_cnt = count;

	ret = ddi_intr_get_pri(nvme->n_inth[0], &nvme->n_intr_pri);
	if (ret != DDI_SUCCESS) {
		dev_err(nvme->n_dip, CE_WARN, "!%s: ddi_intr_get_pri failed",
		    __func__);
		goto fail;
	}

	for (i = 0; i < count; i++) {
		ret = ddi_intr_add_handler(nvme->n_inth[i], nvme_intr,
		    (void *)nvme, (void *)(uintptr_t)i);
		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: ddi_intr_add_handler failed", __func__);
			goto fail;
		}
	}

	(void) ddi_intr_get_cap(nvme->n_inth[0], &nvme->n_intr_cap);

	for (i = 0; i < count; i++) {
		if (nvme->n_intr_cap & DDI_INTR_FLAG_BLOCK)
			ret = ddi_intr_block_enable(&nvme->n_inth[i], 1);
		else
			ret = ddi_intr_enable(nvme->n_inth[i]);

		if (ret != DDI_SUCCESS) {
			dev_err(nvme->n_dip, CE_WARN,
			    "!%s: enabling interrupt %d failed", __func__, i);
			goto fail;
		}
	}

	nvme->n_intr_type = intr_type;

	nvme->n_progress |= NVME_INTERRUPTS;

	return (DDI_SUCCESS);

fail:
	nvme_release_interrupts(nvme);

	return (ret);
}
static int
nvme_fm_errcb(dev_info_t *dip, ddi_fm_error_t *fm_error, const void *arg)
{
	_NOTE(ARGUNUSED(arg));

	pci_ereport_post(dip, fm_error, NULL);
	return (fm_error->fme_status);
}
static int
nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	nvme_t *nvme;
	int instance;
	int nregs;
	off_t regsize;
	int i;
	char name[32];

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	if (ddi_soft_state_zalloc(nvme_state, instance) != DDI_SUCCESS)
		return (DDI_FAILURE);

	nvme = ddi_get_soft_state(nvme_state, instance);
	ddi_set_driver_private(dip, nvme);
	nvme->n_dip = dip;

	mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL);

	nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE;
	nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY,
	    dip, DDI_PROP_DONTPASS, "ignore-unknown-vendor-status", 0) == 1 ?
	    B_TRUE : B_FALSE;
	nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
	nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN);
	nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "async-event-limit",
	    NVME_DEFAULT_ASYNC_EVENT_LIMIT);
	nvme->n_write_cache_enabled = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "volatile-write-cache-enable", 1) != 0 ?
	    B_TRUE : B_FALSE;
	nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
	    DDI_PROP_DONTPASS, "min-phys-block-size",
	    NVME_DEFAULT_MIN_BLOCK_SIZE);

	if (!ISP2(nvme->n_min_block_size) ||
	    (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
		dev_err(dip, CE_WARN, "!min-phys-block-size %s, "
		    "using default %d", ISP2(nvme->n_min_block_size) ?
		    "too low" : "not a power of 2",
		    NVME_DEFAULT_MIN_BLOCK_SIZE);
		nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
	}
	if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
		nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
	else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
		nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;

	if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN)
		nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN;

	if (nvme->n_async_event_limit < 1)
		nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;

	nvme->n_reg_acc_attr = nvme_reg_acc_attr;
	nvme->n_queue_dma_attr = nvme_queue_dma_attr;
	nvme->n_prp_dma_attr = nvme_prp_dma_attr;
	nvme->n_sgl_dma_attr = nvme_sgl_dma_attr;
	/*
	 * Setup FMA support.
	 */
	nvme->n_fm_cap = ddi_getprop(DDI_DEV_T_ANY, dip,
	    DDI_PROP_CANSLEEP | DDI_PROP_DONTPASS, "fm-capable",
	    DDI_FM_EREPORT_CAPABLE | DDI_FM_ACCCHK_CAPABLE |
	    DDI_FM_DMACHK_CAPABLE | DDI_FM_ERRCB_CAPABLE);

	ddi_fm_init(dip, &nvme->n_fm_cap, &nvme->n_fm_ibc);

	if (nvme->n_fm_cap) {
		if (nvme->n_fm_cap & DDI_FM_ACCCHK_CAPABLE)
			nvme->n_reg_acc_attr.devacc_attr_access =
			    DDI_FLAGERR_ACC;

		if (nvme->n_fm_cap & DDI_FM_DMACHK_CAPABLE) {
			nvme->n_prp_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
			nvme->n_sgl_dma_attr.dma_attr_flags |= DDI_DMA_FLAGERR;
		}

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_setup(dip);

		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_register(dip, nvme_fm_errcb,
			    (void *)nvme);
	}

	nvme->n_progress |= NVME_FMA_INIT;
	/*
	 * The spec defines several register sets. Only the controller
	 * registers (set 1) are currently used.
	 */
	if (ddi_dev_nregs(dip, &nregs) == DDI_FAILURE ||
	    nregs < 2 ||
	    ddi_dev_regsize(dip, 1, &regsize) == DDI_FAILURE)
		goto fail;

	if (ddi_regs_map_setup(dip, 1, &nvme->n_regs, 0, regsize,
	    &nvme->n_reg_acc_attr, &nvme->n_regh) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "!failed to map regset 1");
		goto fail;
	}

	nvme->n_progress |= NVME_REGS_MAPPED;
	/*
	 * Create taskq for command completion.
	 */
	(void) snprintf(name, sizeof (name), "%s%d_cmd_taskq",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	nvme->n_cmd_taskq = ddi_taskq_create(dip, name, MIN(UINT16_MAX, ncpus),
	    TASKQ_DEFAULTPRI, 0);
	if (nvme->n_cmd_taskq == NULL) {
		dev_err(dip, CE_WARN, "!failed to create cmd taskq");
		goto fail;
	}

	/*
	 * Create PRP DMA cache
	 */
	(void) snprintf(name, sizeof (name), "%s%d_prp_cache",
	    ddi_driver_name(dip), ddi_get_instance(dip));
	nvme->n_prp_cache = kmem_cache_create(name, sizeof (nvme_dma_t),
	    0, nvme_prp_dma_constructor, nvme_prp_dma_destructor,
	    NULL, (void *)nvme, NULL, 0);

	if (nvme_init(nvme) != DDI_SUCCESS)
		goto fail;
	/*
	 * Attach the blkdev driver for each namespace.
	 */
	for (i = 0; i != nvme->n_namespace_count; i++) {
		if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name,
		    S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1),
		    DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) {
			dev_err(dip, CE_WARN,
			    "!failed to create minor node for namespace %d", i);
			goto fail;
		}

		if (nvme->n_ns[i].ns_ignore)
			continue;

		nvme->n_ns[i].ns_bd_hdl = bd_alloc_handle(&nvme->n_ns[i],
		    &nvme_bd_ops, &nvme->n_prp_dma_attr, KM_SLEEP);

		if (nvme->n_ns[i].ns_bd_hdl == NULL) {
			dev_err(dip, CE_WARN,
			    "!failed to get blkdev handle for namespace %d", i);
			goto fail;
		}

		if (bd_attach_handle(dip, nvme->n_ns[i].ns_bd_hdl)
		    != DDI_SUCCESS) {
			dev_err(dip, CE_WARN,
			    "!failed to attach blkdev handle for namespace %d",
			    i);
			goto fail;
		}
	}

	if (ddi_create_minor_node(dip, "devctl", S_IFCHR,
	    NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0)
	    != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "nvme_attach: "
		    "cannot create devctl minor node");
		goto fail;
	}

	return (DDI_SUCCESS);

fail:
	/* attach successful anyway so that FMA can retire the device */
	if (nvme->n_dead)
		return (DDI_SUCCESS);

	(void) nvme_detach(dip, DDI_DETACH);

	return (DDI_FAILURE);
}
static int
nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int instance, i;
	nvme_t *nvme;

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	ddi_remove_minor_node(dip, "devctl");
	mutex_destroy(&nvme->n_minor.nm_mutex);

	if (nvme->n_ns) {
		for (i = 0; i != nvme->n_namespace_count; i++) {
			ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name);
			mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex);

			if (nvme->n_ns[i].ns_bd_hdl) {
				(void) bd_detach_handle(
				    nvme->n_ns[i].ns_bd_hdl);
				bd_free_handle(nvme->n_ns[i].ns_bd_hdl);
			}

			if (nvme->n_ns[i].ns_idns)
				kmem_free(nvme->n_ns[i].ns_idns,
				    sizeof (nvme_identify_nsid_t));
			if (nvme->n_ns[i].ns_devid)
				strfree(nvme->n_ns[i].ns_devid);
		}

		kmem_free(nvme->n_ns, sizeof (nvme_namespace_t) *
		    nvme->n_namespace_count);
	}
	if (nvme->n_progress & NVME_INTERRUPTS)
		nvme_release_interrupts(nvme);

	if (nvme->n_cmd_taskq)
		ddi_taskq_wait(nvme->n_cmd_taskq);

	if (nvme->n_ioq_count > 0) {
		for (i = 1; i != nvme->n_ioq_count + 1; i++) {
			if (nvme->n_ioq[i] != NULL) {
				/* TODO: send destroy queue commands */
				nvme_free_qpair(nvme->n_ioq[i]);
			}
		}

		kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *) *
		    (nvme->n_ioq_count + 1));
	}

	if (nvme->n_prp_cache != NULL) {
		kmem_cache_destroy(nvme->n_prp_cache);
	}

	if (nvme->n_progress & NVME_REGS_MAPPED) {
		nvme_shutdown(nvme, NVME_CC_SHN_NORMAL, B_FALSE);
		(void) nvme_reset(nvme, B_FALSE);
	}

	if (nvme->n_cmd_taskq)
		ddi_taskq_destroy(nvme->n_cmd_taskq);

	if (nvme->n_progress & NVME_CTRL_LIMITS)
		sema_destroy(&nvme->n_abort_sema);

	if (nvme->n_progress & NVME_ADMIN_QUEUE)
		nvme_free_qpair(nvme->n_adminq);

	if (nvme->n_idctl)
		kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);

	if (nvme->n_progress & NVME_REGS_MAPPED)
		ddi_regs_map_free(&nvme->n_regh);

	if (nvme->n_progress & NVME_FMA_INIT) {
		if (DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			ddi_fm_handler_unregister(nvme->n_dip);

		if (DDI_FM_EREPORT_CAP(nvme->n_fm_cap) ||
		    DDI_FM_ERRCB_CAP(nvme->n_fm_cap))
			pci_ereport_teardown(nvme->n_dip);

		ddi_fm_fini(nvme->n_dip);
	}

	if (nvme->n_vendor != NULL)
		strfree(nvme->n_vendor);

	if (nvme->n_product != NULL)
		strfree(nvme->n_product);

	ddi_soft_state_free(nvme_state, instance);

	return (DDI_SUCCESS);
}
static int
nvme_quiesce(dev_info_t *dip)
{
	int instance;
	nvme_t *nvme;

	instance = ddi_get_instance(dip);

	nvme = ddi_get_soft_state(nvme_state, instance);

	if (nvme == NULL)
		return (DDI_FAILURE);

	nvme_shutdown(nvme, NVME_CC_SHN_ABRUPT, B_TRUE);

	(void) nvme_reset(nvme, B_TRUE);

	return (DDI_FAILURE);
}
static int
nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer)
{
	nvme_t *nvme = cmd->nc_nvme;
	int nprp_page, nprp;
	uint64_t *prp;

	if (xfer->x_ndmac == 0)
		return (DDI_FAILURE);

	cmd->nc_sqe.sqe_dptr.d_prp[0] = xfer->x_dmac.dmac_laddress;
	ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);

	if (xfer->x_ndmac == 1) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = 0;
		return (DDI_SUCCESS);
	} else if (xfer->x_ndmac == 2) {
		cmd->nc_sqe.sqe_dptr.d_prp[1] = xfer->x_dmac.dmac_laddress;
		return (DDI_SUCCESS);
	}

	xfer->x_ndmac--;

	nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1;
	ASSERT(nprp_page > 0);
	nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page;

	/*
	 * We currently don't support chained PRPs and set up our DMA
	 * attributes to reflect that. If we still get an I/O request
	 * that needs a chained PRP something is very wrong.
	 */
	VERIFY(nprp == 1);

	cmd->nc_dma = kmem_cache_alloc(nvme->n_prp_cache, KM_SLEEP);
	bzero(cmd->nc_dma->nd_memp, cmd->nc_dma->nd_len);

	cmd->nc_sqe.sqe_dptr.d_prp[1] = cmd->nc_dma->nd_cookie.dmac_laddress;

	/*LINTED: E_PTR_BAD_CAST_ALIGN*/
	for (prp = (uint64_t *)cmd->nc_dma->nd_memp;
	    xfer->x_ndmac > 0;
	    prp++, xfer->x_ndmac--) {
		*prp = xfer->x_dmac.dmac_laddress;
		ddi_dma_nextcookie(xfer->x_dmah, &xfer->x_dmac);
	}

	(void) ddi_dma_sync(cmd->nc_dma->nd_dmah, 0, cmd->nc_dma->nd_len,
	    DDI_DMA_SYNC_FORDEV);
	return (DDI_SUCCESS);
}
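
/*
 * Worked example: with a 4k page nprp_page is 511. A transfer with
 * 12 DMA cookies puts the first cookie in d_prp[0] and the remaining
 * 11 into the PRP list page pointed to by d_prp[1]; nprp works out to
 * (11 + 510) / 511 = 1, so the VERIFY above holds.
 */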
static nvme_cmd_t *
nvme_create_nvm_cmd(nvme_namespace_t *ns, uint8_t opc, bd_xfer_t *xfer)
{
	nvme_t *nvme = ns->ns_nvme;
	nvme_cmd_t *cmd;

	/*
	 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
	 */
	cmd = nvme_alloc_cmd(nvme, (xfer->x_flags & BD_XFER_POLL) ?
	    KM_NOSLEEP : KM_SLEEP);

	if (cmd == NULL)
		return (NULL);

	cmd->nc_sqe.sqe_opc = opc;
	cmd->nc_callback = nvme_bd_xfer_done;
	cmd->nc_xfer = xfer;

	switch (opc) {
	case NVME_OPC_NVM_WRITE:
	case NVME_OPC_NVM_READ:
		VERIFY(xfer->x_nblks <= 0x10000);

		cmd->nc_sqe.sqe_nsid = ns->ns_id;

		cmd->nc_sqe.sqe_cdw10 = xfer->x_blkno & 0xffffffffu;
		cmd->nc_sqe.sqe_cdw11 = (xfer->x_blkno >> 32);
		cmd->nc_sqe.sqe_cdw12 = (uint16_t)(xfer->x_nblks - 1);

		if (nvme_fill_prp(cmd, xfer) != DDI_SUCCESS)
			goto fail;
		break;

	case NVME_OPC_NVM_FLUSH:
		cmd->nc_sqe.sqe_nsid = ns->ns_id;
		break;

	default:
		goto fail;
	}

	return (cmd);

fail:
	nvme_free_cmd(cmd);
	return (NULL);
}
static void
nvme_bd_xfer_done(void *arg)
{
	nvme_cmd_t *cmd = arg;
	bd_xfer_t *xfer = cmd->nc_xfer;
	int error = 0;

	error = nvme_check_cmd_status(cmd);
	nvme_free_cmd(cmd);

	bd_xfer_done(xfer, error);
}
static void
nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
{
	nvme_namespace_t *ns = arg;
	nvme_t *nvme = ns->ns_nvme;

	/*
	 * blkdev maintains one queue size per instance (namespace),
	 * but all namespaces share the I/O queues.
	 * TODO: need to figure out a sane default, or use per-NS I/O queues,
	 * or change blkdev to handle EAGAIN
	 */
	drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len
	    / nvme->n_namespace_count;

	/*
	 * d_maxxfer is not set, which means the value is taken from the DMA
	 * attributes specified to bd_alloc_handle.
	 */

	drive->d_removable = B_FALSE;
	drive->d_hotpluggable = B_FALSE;

	bcopy(ns->ns_eui64, drive->d_eui64, sizeof (drive->d_eui64));
	drive->d_target = ns->ns_id;
	drive->d_lun = 0;

	drive->d_model = nvme->n_idctl->id_model;
	drive->d_model_len = sizeof (nvme->n_idctl->id_model);
	drive->d_vendor = nvme->n_vendor;
	drive->d_vendor_len = strlen(nvme->n_vendor);
	drive->d_product = nvme->n_product;
	drive->d_product_len = strlen(nvme->n_product);
	drive->d_serial = nvme->n_idctl->id_serial;
	drive->d_serial_len = sizeof (nvme->n_idctl->id_serial);
	drive->d_revision = nvme->n_idctl->id_fwrev;
	drive->d_revision_len = sizeof (nvme->n_idctl->id_fwrev);
}
static int
nvme_bd_mediainfo(void *arg, bd_media_t *media)
{
	nvme_namespace_t *ns = arg;

	media->m_nblks = ns->ns_block_count;
	media->m_blksize = ns->ns_block_size;
	media->m_readonly = B_FALSE;
	media->m_solidstate = B_TRUE;

	media->m_pblksize = ns->ns_best_block_size;

	return (0);
}
static int
nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
{
	nvme_t *nvme = ns->ns_nvme;
	nvme_cmd_t *cmd;
	nvme_qpair_t *ioq;
	boolean_t poll;
	int ret;

	if (nvme->n_dead)
		return (EIO);

	cmd = nvme_create_nvm_cmd(ns, opc, xfer);
	if (cmd == NULL)
		return (ENOMEM);

	cmd->nc_sqid = (CPU->cpu_id % nvme->n_ioq_count) + 1;
	ASSERT(cmd->nc_sqid <= nvme->n_ioq_count);
	ioq = nvme->n_ioq[cmd->nc_sqid];

	/*
	 * Get the polling flag before submitting the command. The command may
	 * complete immediately after it was submitted, which means we must
	 * treat both cmd and xfer as if they have been freed already.
	 */
	poll = (xfer->x_flags & BD_XFER_POLL) != 0;

	ret = nvme_submit_io_cmd(ioq, cmd);

	if (ret != 0)
		return (ret);

	if (!poll)
		return (0);

	do {
		cmd = nvme_retrieve_cmd(nvme, ioq);
		if (cmd != NULL)
			nvme_bd_xfer_done(cmd);
		else
			drv_usecwait(10);
	} while (ioq->nq_active_cmds != 0);

	return (0);
}
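
/*
 * Example of the queue selection in nvme_bd_cmd(): with
 * n_ioq_count = 4, a thread running on CPU 6 submits to I/O queue
 * 6 % 4 + 1 = 3. The + 1 keeps the admin queue at index 0 out of the
 * I/O path.
 */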
static int
nvme_bd_read(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_READ));
}

static int
nvme_bd_write(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_WRITE));
}
static int
nvme_bd_sync(void *arg, bd_xfer_t *xfer)
{
	nvme_namespace_t *ns = arg;

	if (ns->ns_nvme->n_dead)
		return (EIO);

	/*
	 * If the volatile write cache is not present or not enabled the FLUSH
	 * command is a no-op, so we can take a shortcut here.
	 */
	if (!ns->ns_nvme->n_write_cache_present) {
		bd_xfer_done(xfer, ENOTSUP);
		return (0);
	}

	if (!ns->ns_nvme->n_write_cache_enabled) {
		bd_xfer_done(xfer, 0);
		return (0);
	}

	return (nvme_bd_cmd(ns, xfer, NVME_OPC_NVM_FLUSH));
}
static int
nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid)
{
	nvme_namespace_t *ns = arg;

	/*LINTED: E_BAD_PTR_CAST_ALIGN*/
	if (*(uint64_t *)ns->ns_eui64 != 0) {
		return (ddi_devid_init(devinfo, DEVID_SCSI3_WWN,
		    sizeof (ns->ns_eui64), ns->ns_eui64, devid));
	} else {
		return (ddi_devid_init(devinfo, DEVID_ENCAP,
		    strlen(ns->ns_devid), ns->ns_devid, devid));
	}
}
static int
nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(cred_p));
#endif
	minor_t minor = getminor(*devp);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	nvme_minor_state_t *nm;
	int rv = 0;

	if (otyp != OTYP_CHR)
		return (EINVAL);

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	if (nvme->n_dead)
		return (EIO);

	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;

	mutex_enter(&nm->nm_mutex);
	if (nm->nm_oexcl) {
		rv = EBUSY;
		goto out;
	}

	if (flag & FEXCL) {
		if (nm->nm_ocnt != 0) {
			rv = EBUSY;
			goto out;
		}
		nm->nm_oexcl = B_TRUE;
	}

	nm->nm_ocnt++;

out:
	mutex_exit(&nm->nm_mutex);
	return (rv);
}
static int
nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(cred_p));
	_NOTE(ARGUNUSED(flag));
#endif
	minor_t minor = getminor(dev);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	nvme_minor_state_t *nm;

	if (otyp != OTYP_CHR)
		return (ENXIO);

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor;

	mutex_enter(&nm->nm_mutex);
	if (nm->nm_oexcl)
		nm->nm_oexcl = B_FALSE;

	ASSERT(nm->nm_ocnt > 0);
	nm->nm_ocnt--;
	mutex_exit(&nm->nm_mutex);

	return (0);
}
static int
nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	int rv = 0;
	void *idctl;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < NVME_IDENTIFY_BUFSIZE)
		return (EINVAL);

	if ((rv = nvme_identify(nvme, nsid, (void **)&idctl)) != 0)
		return (rv);

	if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode)
	    != 0)
		rv = EFAULT;

	kmem_free(idctl, NVME_IDENTIFY_BUFSIZE);

	return (rv);
}
static int
nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));
	int rv = 0;
	nvme_reg_cap_t cap = { 0 };
	nvme_capabilities_t nc;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < sizeof (nc))
		return (EINVAL);

	cap.r = nvme_get64(nvme, NVME_REG_CAP);

	/*
	 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
	 * specify the base page size of 4k (1<<12), so add 12 here to
	 * get the real page size value.
	 */
	nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax);
	nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin);

	if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0)
		rv = EFAULT;

	return (rv);
}
static int
nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	void *log = NULL;
	size_t bufsize = 0;
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	switch (nioc->n_arg) {
	case NVME_LOGPAGE_ERROR:
		if (nsid != 0)
			return (EINVAL);
		break;
	case NVME_LOGPAGE_HEALTH:
		if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0)
			return (EINVAL);

		if (nsid == 0)
			nsid = (uint32_t)-1;

		break;
	case NVME_LOGPAGE_FWSLOT:
		if (nsid != 0)
			return (EINVAL);
		break;
	default:
		return (EINVAL);
	}

	if (nvme_get_logpage(nvme, &log, &bufsize, nioc->n_arg, nsid)
	    != DDI_SUCCESS)
		return (EIO);

	if (nioc->n_len < bufsize) {
		kmem_free(log, bufsize);
		return (EINVAL);
	}

	if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0)
		rv = EFAULT;

	nioc->n_len = bufsize;
	kmem_free(log, bufsize);

	return (rv);
}
static int
nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc,
    int mode, cred_t *cred_p)
{
	_NOTE(ARGUNUSED(cred_p));
	void *buf = NULL;
	size_t bufsize = 0;
	uint32_t res = 0;
	uint8_t feature;
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if ((nioc->n_arg >> 32) > 0xff)
		return (EINVAL);

	feature = (uint8_t)(nioc->n_arg >> 32);

	switch (feature) {
	case NVME_FEAT_ARBITRATION:
	case NVME_FEAT_POWER_MGMT:
	case NVME_FEAT_TEMPERATURE:
	case NVME_FEAT_ERROR:
	case NVME_FEAT_NQUEUES:
	case NVME_FEAT_INTR_COAL:
	case NVME_FEAT_WRITE_ATOM:
	case NVME_FEAT_ASYNC_EVENT:
	case NVME_FEAT_PROGRESS:
		if (nsid != 0)
			return (EINVAL);
		break;

	case NVME_FEAT_INTR_VECT:
		if (nsid != 0)
			return (EINVAL);

		res = nioc->n_arg & 0xffffffffUL;
		if (res >= nvme->n_intr_cnt)
			return (EINVAL);
		break;

	case NVME_FEAT_LBA_RANGE:
		if (nvme->n_lba_range_supported == B_FALSE)
			return (EINVAL);

		if (nsid == 0 ||
		    nsid > nvme->n_namespace_count)
			return (EINVAL);

		break;

	case NVME_FEAT_WRITE_CACHE:
		if (nsid != 0)
			return (EINVAL);

		if (!nvme->n_write_cache_present)
			return (EINVAL);

		break;

	case NVME_FEAT_AUTO_PST:
		if (nsid != 0)
			return (EINVAL);

		if (!nvme->n_auto_pst_supported)
			return (EINVAL);

		break;

	default:
		return (EINVAL);
	}

	rv = nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize);
	if (rv != 0)
		return (rv);

	if (nioc->n_len < bufsize) {
		kmem_free(buf, bufsize);
		return (EINVAL);
	}

	if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0)
		rv = EFAULT;

	kmem_free(buf, bufsize);
	nioc->n_arg = res;
	nioc->n_len = bufsize;

	return (rv);
}
static int
nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, mode, cred_p));

	if ((mode & FREAD) == 0)
		return (EPERM);

	nioc->n_arg = nvme->n_intr_cnt;
	return (0);
}
static int
nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nsid, cred_p));
	int rv = 0;

	if ((mode & FREAD) == 0)
		return (EPERM);

	if (nioc->n_len < sizeof (nvme->n_version))
		return (EINVAL);

	if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf,
	    sizeof (nvme->n_version), mode) != 0)
		rv = EFAULT;

	return (rv);
}
static int
nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(mode));
	nvme_format_nvm_t frmt = { 0 };
	int c_nsid = nsid != 0 ? nsid - 1 : 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	frmt.r = nioc->n_arg & 0xffffffff;

	/*
	 * Check whether the FORMAT NVM command is supported.
	 */
	if (nvme->n_idctl->id_oacs.oa_format == 0)
		return (EINVAL);

	/*
	 * Don't allow format or secure erase of individual namespace if that
	 * would cause a format or secure erase of all namespaces.
	 */
	if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0)
		return (EINVAL);

	if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE &&
	    nvme->n_idctl->id_fna.fn_sec_erase != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting with Protection Information.
	 */
	if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting using an illegal LBA format, or any LBA
	 * format that uses metadata.
	 */
	if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf ||
	    nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0)
		return (EINVAL);

	/*
	 * Don't allow formatting using an illegal Secure Erase setting.
	 */
	if (frmt.b.fm_ses > NVME_FRMT_MAX_SES ||
	    (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO &&
	    nvme->n_idctl->id_fna.fn_crypt_erase == 0))
		return (EINVAL);

	if (nsid == 0)
		nsid = (uint32_t)-1;

	return (nvme_format_nvm(nvme, nsid, frmt.b.fm_lbaf, B_FALSE, 0, B_FALSE,
	    frmt.b.fm_ses));
}
static int
nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nioc, mode));
	int rv = 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid == 0)
		return (EINVAL);

	rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl);
	if (rv != DDI_SUCCESS)
		rv = EBUSY;

	return (rv);
}
static int
nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode,
    cred_t *cred_p)
{
	_NOTE(ARGUNUSED(nioc, mode));
	nvme_identify_nsid_t *idns;
	int rv = 0;

	if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0)
		return (EPERM);

	if (nsid == 0)
		return (EINVAL);

	/*
	 * Identify namespace again, free old identify data.
	 */
	idns = nvme->n_ns[nsid - 1].ns_idns;
	if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS)
		return (EIO);

	kmem_free(idns, sizeof (nvme_identify_nsid_t));

	rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl);
	if (rv != DDI_SUCCESS)
		rv = EBUSY;

	return (rv);
}
static int
nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p,
    int *rval_p)
{
#ifndef __lock_lint
	_NOTE(ARGUNUSED(rval_p));
#endif
	minor_t minor = getminor(dev);
	nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor));
	int nsid = NVME_MINOR_NSID(minor);
	int rv = 0;
	nvme_ioctl_t nioc;

	int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = {
		NULL,
		nvme_ioctl_identify,
		nvme_ioctl_identify,
		nvme_ioctl_capabilities,
		nvme_ioctl_get_logpage,
		nvme_ioctl_get_features,
		nvme_ioctl_intr_cnt,
		nvme_ioctl_version,
		nvme_ioctl_format,
		nvme_ioctl_attach,
		nvme_ioctl_detach
	};

	if (nvme == NULL)
		return (ENXIO);

	if (nsid > nvme->n_namespace_count)
		return (ENXIO);

	if (IS_DEVCTL(cmd))
		return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0));

#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_ioctl32_t nioc32;
		if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t),
		    mode) != 0)
			return (EFAULT);
		nioc.n_len = nioc32.n_len;
		nioc.n_buf = nioc32.n_buf;
		nioc.n_arg = nioc32.n_arg;
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode)
		    != 0)
			return (EFAULT);
#ifdef _MULTI_DATAMODEL
		break;
	}
#endif

	if (nvme->n_dead && cmd != NVME_IOC_DETACH)
		return (EIO);

	if (cmd == NVME_IOC_IDENTIFY_CTRL) {
		/*
		 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
		 * attachment point nodes.
		 */
		nsid = 0;
	} else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) {
		/*
		 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it
		 * will always return identify data for namespace 1.
		 */
		nsid = 1;
	}

	if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL)
		rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode,
		    cred_p);
	else
		rv = EINVAL;

#ifdef _MULTI_DATAMODEL
	switch (ddi_model_convert_from(mode & FMODELS)) {
	case DDI_MODEL_ILP32: {
		nvme_ioctl32_t nioc32;

		nioc32.n_len = (size32_t)nioc.n_len;
		nioc32.n_buf = (uintptr32_t)nioc.n_buf;
		nioc32.n_arg = nioc.n_arg;

		if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t),
		    mode) != 0 && rv == 0)
			rv = EFAULT;
		break;
	}
	case DDI_MODEL_NONE:
#endif
		if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode)
		    != 0 && rv == 0)
			rv = EFAULT;
#ifdef _MULTI_DATAMODEL