2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
13 * Copyright 2018 Nexenta Systems, Inc.
14 * Copyright 2016 Tegile Systems, Inc. All rights reserved.
15 * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
16 * Copyright 2017 Joyent, Inc.
20 * blkdev driver for NVMe compliant storage devices
22 * This driver was written to conform to version 1.2.1 of the NVMe
23 * specification. It may work with newer versions, but that is completely
24 * untested and disabled by default.
26 * The driver has only been tested on x86 systems and will not work on big-
27 * endian systems without changes to the code accessing registers and data
28 * structures used by the hardware.
33 * The driver will use a single interrupt while configuring the device as the
34 * specification requires, but contrary to the specification it will try to use
35 * a single-message MSI(-X) or FIXED interrupt. Later in the attach process it
36 * will switch to multiple-message MSI(-X) if supported. The driver wants to
37 * have one interrupt vector per CPU, but it will work correctly if less are
38 * available. Interrupts can be shared by queues, the interrupt handler will
39 * iterate through the I/O queue array by steps of n_intr_cnt. Usually only
40 * the admin queue will share an interrupt with one I/O queue. The interrupt
41 * handler will retrieve completed commands from all queues sharing an interrupt
42 * vector and will post them to a taskq for completion processing.
47 * NVMe devices can have up to 65535 I/O queue pairs, with each queue holding up
48 * to 65536 I/O commands. The driver will configure one I/O queue pair per
49 * available interrupt vector, with the queue length usually much smaller than
50 * the maximum of 65536. If the hardware doesn't provide enough queues, fewer
51 * interrupt vectors will be used.
53 * Additionally the hardware provides a single special admin queue pair that can
54 * hold up to 4096 admin commands.
56 * From the hardware perspective both queues of a queue pair are independent,
57 * but they share some driver state: the command array (holding pointers to
58 * commands currently being processed by the hardware) and the active command
59 * counter. Access to a queue pair and the shared state is protected by
62 * When a command is submitted to a queue pair the active command counter is
63 * incremented and a pointer to the command is stored in the command array. The
64 * array index is used as command identifier (CID) in the submission queue
65 * entry. Some commands may take a very long time to complete, and if the queue
66 * wraps around in that time a submission may find the next array slot to still
67 * be used by a long-running command. In this case the array is sequentially
68 * searched for the next free slot. The length of the command array is the same
69 * as the configured queue length. Queue overrun is prevented by the semaphore,
70 * so a command submission may block if the queue is full.
75 * For kernel core dump support the driver can do polled I/O. As interrupts are
76 * turned off while dumping the driver will just submit a command in the regular
77 * way, and then repeatedly attempt a command retrieval until it gets the
83 * NVMe devices can have multiple namespaces, each being an independent data
84 * store. The driver supports multiple namespaces and creates a blkdev interface
85 * for each namespace found. Namespaces can have various attributes to support
86 * protection information. This driver does not support any of this and ignores
87 * namespaces that have these attributes.
89 * As of NVMe 1.1 namespaces can have a 64bit Extended Unique Identifier
90 * (EUI64). This driver uses the EUI64 if present to generate the devid and
91 * passes it to blkdev to use it in the device node names. As this is currently
92 * untested namespaces with EUI64 are ignored by default.
94 * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a
95 * single controller. This is an artificial limit imposed by the driver to be
96 * able to address a reasonable number of controllers and namespaces using a
97 * 32bit minor node number.
102 * For each NVMe device the driver exposes one minor node for the controller and
103 * one minor node for each namespace. The only operations supported by those
104 * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the
105 * interface for the nvmeadm(8) utility.
110 * This driver uses blkdev to do all the heavy lifting involved with presenting
111 * a disk device to the system. As a result, the processing of I/O requests is
112 * relatively simple as blkdev takes care of partitioning, boundary checks, DMA
113 * setup, and splitting of transfers into manageable chunks.
115 * I/O requests coming in from blkdev are turned into NVM commands and posted to
116 * an I/O queue. The queue is selected by taking the CPU id modulo the number of
117 * queues. There is currently no timeout handling of I/O commands.
119 * Blkdev also supports querying device/media information and generating a
120 * devid. The driver reports the best block size as determined by the namespace
121 * format back to blkdev as physical block size to support partition and block
122 * alignment. The devid is either based on the namespace EUI64, if present, or
123 * composed using the device vendor ID, model number, serial number, and the
129 * Error handling is currently limited to detecting fatal hardware errors,
130 * either by asynchronous events, or synchronously through command status or
131 * admin command timeouts. In case of severe errors the device is fenced off,
132 * all further requests will return EIO. FMA is then called to fault the device.
134 * The hardware has a limit for outstanding asynchronous event requests. Before
135 * this limit is known the driver assumes it is at least 1 and posts a single
136 * asynchronous request. Later when the limit is known more asynchronous event
137 * requests are posted to allow quicker reception of error information. When an
138 * asynchronous event is posted by the hardware the driver will parse the error
139 * status fields and log information or fault the device, depending on the
140 * severity of the asynchronous event. The asynchronous event request is then
141 * reused and posted to the admin queue again.
143 * On command completion the command status is checked for errors. In case of
144 * errors indicating a driver bug the driver panics. Almost all other error
145 * status values just cause EIO to be returned.
147 * Command timeouts are currently detected for all admin commands except
148 * asynchronous event requests. If a command times out and the hardware appears
149 * to be healthy the driver attempts to abort the command. The original command
150 * timeout is also applied to the abort command. If the abort times out too the
151 * driver assumes the device to be dead, fences it off, and calls FMA to retire
152 * it. In all other cases the aborted command should return immediately with a
153 * status indicating it was aborted, and the driver will wait indefinitely for
154 * that to happen. No timeout handling of normal I/O commands is presently done.
156 * Any command that times out due to the controller dropping dead will be put on
157 * nvme_lost_cmds list if it references DMA memory. This will prevent the DMA
158 * memory being reused by the system and later be written to by a "dead" NVMe
164 * Each queue pair has its own nq_mutex, which must be held when accessing the
165 * associated queue registers or the shared state of the queue pair. Callers of
166 * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
167 * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
170 * Each command also has its own nc_mutex, which is associated with the
171 * condition variable nc_cv. It is only used on admin commands which are run
172 * synchronously. In that case it must be held across calls to
173 * nvme_submit_{admin,io}_cmd() and nvme_wait_cmd(), which is taken care of by
174 * nvme_admin_cmd(). It must also be held whenever the completion state of the
175 * command is changed or while an admin command timeout is handled.
177 * If both nc_mutex and nq_mutex must be held, nc_mutex must be acquired first.
178 * More than one nc_mutex may only be held when aborting commands. In this case,
179 * the nc_mutex of the command to be aborted must be held across the call to
180 * nvme_abort_cmd() to prevent the command from completing while the abort is in
183 * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
184 * and exclusive-open flag nm_oexcl.
187 * Quiesce / Fast Reboot:
189 * The driver currently does not support fast reboot. A quiesce(9E) entry point
190 * is still provided which is used to send a shutdown notification to the
194 * Driver Configuration:
196 * The following driver properties can be changed to control some aspects of the
198 * - strict-version: can be set to 0 to allow devices conforming to newer
199 * versions or namespaces with EUI64 to be used
200 * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
201 * specific command status as a fatal error leading device faulting
202 * - admin-queue-len: the maximum length of the admin queue (16-4096)
203 * - io-queue-len: the maximum length of the I/O queues (16-65536)
204 * - async-event-limit: the maximum number of asynchronous event requests to be
205 * posted by the driver
206 * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
208 * - min-phys-block-size: the minimum physical block size to report to blkdev,
209 * which is among other things the basis for ZFS vdev ashift
213 * - figure out sane default for I/O queue depth reported to blkdev
214 * - FMA handling of media errors
215 * - support for devices supporting very large I/O requests using chained PRPs
216 * - support for configuring hardware parameters like interrupt coalescing
217 * - support for media formatting and hard partitioning into namespaces
218 * - support for big-endian systems
219 * - support for fast reboot
220 * - support for firmware updates
221 * - support for NVMe Subsystem Reset (1.1)
222 * - support for Scatter/Gather lists (1.1)
223 * - support for Reservations (1.1)
224 * - support for power management
227 #include <sys/byteorder.h>
229 #error nvme driver needs porting for big-endian platforms
232 #include <sys/modctl.h>
233 #include <sys/conf.h>
234 #include <sys/devops.h>
236 #include <sys/sunddi.h>
237 #include <sys/sunndi.h>
238 #include <sys/bitmap.h>
239 #include <sys/sysmacros.h>
240 #include <sys/param.h>
241 #include <sys/varargs.h>
242 #include <sys/cpuvar.h>
243 #include <sys/disp.h>
244 #include <sys/blkdev.h>
245 #include <sys/atomic.h>
246 #include <sys/archsystm.h>
247 #include <sys/sata/sata_hba.h>
248 #include <sys/stat.h>
249 #include <sys/policy.h>
250 #include <sys/list.h>
252 #include <sys/nvme.h>
255 #include <sys/x86_archext.h>
258 #include "nvme_reg.h"
259 #include "nvme_var.h"
262 /* NVMe spec version supported */
263 static const int nvme_version_major
= 1;
264 static const int nvme_version_minor
= 2;
266 /* tunable for admin command timeout in seconds, default is 1s */
267 int nvme_admin_cmd_timeout
= 1;
269 /* tunable for FORMAT NVM command timeout in seconds, default is 600s */
270 int nvme_format_cmd_timeout
= 600;
272 static int nvme_attach(dev_info_t
*, ddi_attach_cmd_t
);
273 static int nvme_detach(dev_info_t
*, ddi_detach_cmd_t
);
274 static int nvme_quiesce(dev_info_t
*);
275 static int nvme_fm_errcb(dev_info_t
*, ddi_fm_error_t
*, const void *);
276 static int nvme_setup_interrupts(nvme_t
*, int, int);
277 static void nvme_release_interrupts(nvme_t
*);
278 static uint_t
nvme_intr(caddr_t
, caddr_t
);
280 static void nvme_shutdown(nvme_t
*, int, boolean_t
);
281 static boolean_t
nvme_reset(nvme_t
*, boolean_t
);
282 static int nvme_init(nvme_t
*);
283 static nvme_cmd_t
*nvme_alloc_cmd(nvme_t
*, int);
284 static void nvme_free_cmd(nvme_cmd_t
*);
285 static nvme_cmd_t
*nvme_create_nvm_cmd(nvme_namespace_t
*, uint8_t,
287 static void nvme_admin_cmd(nvme_cmd_t
*, int);
288 static void nvme_submit_admin_cmd(nvme_qpair_t
*, nvme_cmd_t
*);
289 static int nvme_submit_io_cmd(nvme_qpair_t
*, nvme_cmd_t
*);
290 static void nvme_submit_cmd_common(nvme_qpair_t
*, nvme_cmd_t
*);
291 static nvme_cmd_t
*nvme_unqueue_cmd(nvme_t
*, nvme_qpair_t
*, int);
292 static nvme_cmd_t
*nvme_retrieve_cmd(nvme_t
*, nvme_qpair_t
*);
293 static void nvme_wait_cmd(nvme_cmd_t
*, uint_t
);
294 static void nvme_wakeup_cmd(void *);
295 static void nvme_async_event_task(void *);
297 static int nvme_check_unknown_cmd_status(nvme_cmd_t
*);
298 static int nvme_check_vendor_cmd_status(nvme_cmd_t
*);
299 static int nvme_check_integrity_cmd_status(nvme_cmd_t
*);
300 static int nvme_check_specific_cmd_status(nvme_cmd_t
*);
301 static int nvme_check_generic_cmd_status(nvme_cmd_t
*);
302 static inline int nvme_check_cmd_status(nvme_cmd_t
*);
304 static int nvme_abort_cmd(nvme_cmd_t
*, uint_t
);
305 static void nvme_async_event(nvme_t
*);
306 static int nvme_format_nvm(nvme_t
*, uint32_t, uint8_t, boolean_t
, uint8_t,
308 static int nvme_get_logpage(nvme_t
*, void **, size_t *, uint8_t, ...);
309 static int nvme_identify(nvme_t
*, uint32_t, void **);
310 static int nvme_set_features(nvme_t
*, uint32_t, uint8_t, uint32_t,
312 static int nvme_get_features(nvme_t
*, uint32_t, uint8_t, uint32_t *,
314 static int nvme_write_cache_set(nvme_t
*, boolean_t
);
315 static int nvme_set_nqueues(nvme_t
*, uint16_t *);
317 static void nvme_free_dma(nvme_dma_t
*);
318 static int nvme_zalloc_dma(nvme_t
*, size_t, uint_t
, ddi_dma_attr_t
*,
320 static int nvme_zalloc_queue_dma(nvme_t
*, uint32_t, uint16_t, uint_t
,
322 static void nvme_free_qpair(nvme_qpair_t
*);
323 static int nvme_alloc_qpair(nvme_t
*, uint32_t, nvme_qpair_t
**, int);
324 static int nvme_create_io_qpair(nvme_t
*, nvme_qpair_t
*, uint16_t);
326 static inline void nvme_put64(nvme_t
*, uintptr_t, uint64_t);
327 static inline void nvme_put32(nvme_t
*, uintptr_t, uint32_t);
328 static inline uint64_t nvme_get64(nvme_t
*, uintptr_t);
329 static inline uint32_t nvme_get32(nvme_t
*, uintptr_t);
331 static boolean_t
nvme_check_regs_hdl(nvme_t
*);
332 static boolean_t
nvme_check_dma_hdl(nvme_dma_t
*);
334 static int nvme_fill_prp(nvme_cmd_t
*, bd_xfer_t
*);
336 static void nvme_bd_xfer_done(void *);
337 static void nvme_bd_driveinfo(void *, bd_drive_t
*);
338 static int nvme_bd_mediainfo(void *, bd_media_t
*);
339 static int nvme_bd_cmd(nvme_namespace_t
*, bd_xfer_t
*, uint8_t);
340 static int nvme_bd_read(void *, bd_xfer_t
*);
341 static int nvme_bd_write(void *, bd_xfer_t
*);
342 static int nvme_bd_sync(void *, bd_xfer_t
*);
343 static int nvme_bd_devid(void *, dev_info_t
*, ddi_devid_t
*);
345 static int nvme_prp_dma_constructor(void *, void *, int);
346 static void nvme_prp_dma_destructor(void *, void *);
348 static void nvme_prepare_devid(nvme_t
*, uint32_t);
350 static int nvme_open(dev_t
*, int, int, cred_t
*);
351 static int nvme_close(dev_t
, int, int, cred_t
*);
352 static int nvme_ioctl(dev_t
, int, intptr_t, int, cred_t
*, int *);
354 #define NVME_MINOR_INST_SHIFT 9
355 #define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid))
356 #define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT)
357 #define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1))
358 #define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2)
360 static void *nvme_state
;
361 static kmem_cache_t
*nvme_cmd_cache
;
364 * DMA attributes for queue DMA memory
366 * Queue DMA memory must be page aligned. The maximum length of a queue is
367 * 65536 entries, and an entry can be 64 bytes long.
369 static ddi_dma_attr_t nvme_queue_dma_attr
= {
370 .dma_attr_version
= DMA_ATTR_V0
,
371 .dma_attr_addr_lo
= 0,
372 .dma_attr_addr_hi
= 0xffffffffffffffffULL
,
373 .dma_attr_count_max
= (UINT16_MAX
+ 1) * sizeof (nvme_sqe_t
) - 1,
374 .dma_attr_align
= 0x1000,
375 .dma_attr_burstsizes
= 0x7ff,
376 .dma_attr_minxfer
= 0x1000,
377 .dma_attr_maxxfer
= (UINT16_MAX
+ 1) * sizeof (nvme_sqe_t
),
378 .dma_attr_seg
= 0xffffffffffffffffULL
,
379 .dma_attr_sgllen
= 1,
380 .dma_attr_granular
= 1,
385 * DMA attributes for transfers using Physical Region Page (PRP) entries
387 * A PRP entry describes one page of DMA memory using the page size specified
388 * in the controller configuration's memory page size register (CC.MPS). It uses
389 * a 64bit base address aligned to this page size. There is no limitation on
390 * chaining PRPs together for arbitrarily large DMA transfers.
392 static ddi_dma_attr_t nvme_prp_dma_attr
= {
393 .dma_attr_version
= DMA_ATTR_V0
,
394 .dma_attr_addr_lo
= 0,
395 .dma_attr_addr_hi
= 0xffffffffffffffffULL
,
396 .dma_attr_count_max
= 0xfff,
397 .dma_attr_align
= 0x1000,
398 .dma_attr_burstsizes
= 0x7ff,
399 .dma_attr_minxfer
= 0x1000,
400 .dma_attr_maxxfer
= 0x1000,
401 .dma_attr_seg
= 0xfff,
402 .dma_attr_sgllen
= -1,
403 .dma_attr_granular
= 1,
408 * DMA attributes for transfers using scatter/gather lists
410 * A SGL entry describes a chunk of DMA memory using a 64bit base address and a
411 * 32bit length field. SGL Segment and SGL Last Segment entries require the
412 * length to be a multiple of 16 bytes.
414 static ddi_dma_attr_t nvme_sgl_dma_attr
= {
415 .dma_attr_version
= DMA_ATTR_V0
,
416 .dma_attr_addr_lo
= 0,
417 .dma_attr_addr_hi
= 0xffffffffffffffffULL
,
418 .dma_attr_count_max
= 0xffffffffUL
,
420 .dma_attr_burstsizes
= 0x7ff,
421 .dma_attr_minxfer
= 0x10,
422 .dma_attr_maxxfer
= 0xfffffffffULL
,
423 .dma_attr_seg
= 0xffffffffffffffffULL
,
424 .dma_attr_sgllen
= -1,
425 .dma_attr_granular
= 0x10,
429 static ddi_device_acc_attr_t nvme_reg_acc_attr
= {
430 .devacc_attr_version
= DDI_DEVICE_ATTR_V0
,
431 .devacc_attr_endian_flags
= DDI_STRUCTURE_LE_ACC
,
432 .devacc_attr_dataorder
= DDI_STRICTORDER_ACC
435 static struct cb_ops nvme_cb_ops
= {
436 .cb_open
= nvme_open
,
437 .cb_close
= nvme_close
,
438 .cb_strategy
= nodev
,
443 .cb_ioctl
= nvme_ioctl
,
447 .cb_chpoll
= nochpoll
,
448 .cb_prop_op
= ddi_prop_op
,
450 .cb_flag
= D_NEW
| D_MP
,
456 static struct dev_ops nvme_dev_ops
= {
457 .devo_rev
= DEVO_REV
,
459 .devo_getinfo
= ddi_no_info
,
460 .devo_identify
= nulldev
,
461 .devo_probe
= nulldev
,
462 .devo_attach
= nvme_attach
,
463 .devo_detach
= nvme_detach
,
465 .devo_cb_ops
= &nvme_cb_ops
,
466 .devo_bus_ops
= NULL
,
468 .devo_quiesce
= nvme_quiesce
,
471 static struct modldrv nvme_modldrv
= {
472 .drv_modops
= &mod_driverops
,
473 .drv_linkinfo
= "NVMe v1.1b",
474 .drv_dev_ops
= &nvme_dev_ops
477 static struct modlinkage nvme_modlinkage
= {
479 .ml_linkage
= { &nvme_modldrv
, NULL
}
482 static bd_ops_t nvme_bd_ops
= {
483 .o_version
= BD_OPS_VERSION_0
,
484 .o_drive_info
= nvme_bd_driveinfo
,
485 .o_media_info
= nvme_bd_mediainfo
,
486 .o_devid_init
= nvme_bd_devid
,
487 .o_sync_cache
= nvme_bd_sync
,
488 .o_read
= nvme_bd_read
,
489 .o_write
= nvme_bd_write
,
493 * This list will hold commands that have timed out and couldn't be aborted.
494 * As we don't know what the hardware may still do with the DMA memory we can't
495 * free them, so we'll keep them forever on this list where we can easily look
498 static struct list nvme_lost_cmds
;
499 static kmutex_t nvme_lc_mutex
;
506 error
= ddi_soft_state_init(&nvme_state
, sizeof (nvme_t
), 1);
507 if (error
!= DDI_SUCCESS
)
510 nvme_cmd_cache
= kmem_cache_create("nvme_cmd_cache",
511 sizeof (nvme_cmd_t
), 64, NULL
, NULL
, NULL
, NULL
, NULL
, 0);
513 mutex_init(&nvme_lc_mutex
, NULL
, MUTEX_DRIVER
, NULL
);
514 list_create(&nvme_lost_cmds
, sizeof (nvme_cmd_t
),
515 offsetof(nvme_cmd_t
, nc_list
));
517 bd_mod_init(&nvme_dev_ops
);
519 error
= mod_install(&nvme_modlinkage
);
520 if (error
!= DDI_SUCCESS
) {
521 ddi_soft_state_fini(&nvme_state
);
522 mutex_destroy(&nvme_lc_mutex
);
523 list_destroy(&nvme_lost_cmds
);
524 bd_mod_fini(&nvme_dev_ops
);
535 if (!list_is_empty(&nvme_lost_cmds
))
536 return (DDI_FAILURE
);
538 error
= mod_remove(&nvme_modlinkage
);
539 if (error
== DDI_SUCCESS
) {
540 ddi_soft_state_fini(&nvme_state
);
541 kmem_cache_destroy(nvme_cmd_cache
);
542 mutex_destroy(&nvme_lc_mutex
);
543 list_destroy(&nvme_lost_cmds
);
544 bd_mod_fini(&nvme_dev_ops
);
551 _info(struct modinfo
*modinfop
)
553 return (mod_info(&nvme_modlinkage
, modinfop
));
557 nvme_put64(nvme_t
*nvme
, uintptr_t reg
, uint64_t val
)
559 ASSERT(((uintptr_t)(nvme
->n_regs
+ reg
) & 0x7) == 0);
561 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
562 ddi_put64(nvme
->n_regh
, (uint64_t *)(nvme
->n_regs
+ reg
), val
);
566 nvme_put32(nvme_t
*nvme
, uintptr_t reg
, uint32_t val
)
568 ASSERT(((uintptr_t)(nvme
->n_regs
+ reg
) & 0x3) == 0);
570 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
571 ddi_put32(nvme
->n_regh
, (uint32_t *)(nvme
->n_regs
+ reg
), val
);
574 static inline uint64_t
575 nvme_get64(nvme_t
*nvme
, uintptr_t reg
)
579 ASSERT(((uintptr_t)(nvme
->n_regs
+ reg
) & 0x7) == 0);
581 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
582 val
= ddi_get64(nvme
->n_regh
, (uint64_t *)(nvme
->n_regs
+ reg
));
587 static inline uint32_t
588 nvme_get32(nvme_t
*nvme
, uintptr_t reg
)
592 ASSERT(((uintptr_t)(nvme
->n_regs
+ reg
) & 0x3) == 0);
594 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
595 val
= ddi_get32(nvme
->n_regh
, (uint32_t *)(nvme
->n_regs
+ reg
));
601 nvme_check_regs_hdl(nvme_t
*nvme
)
603 ddi_fm_error_t error
;
605 ddi_fm_acc_err_get(nvme
->n_regh
, &error
, DDI_FME_VERSION
);
607 if (error
.fme_status
!= DDI_FM_OK
)
614 nvme_check_dma_hdl(nvme_dma_t
*dma
)
616 ddi_fm_error_t error
;
621 ddi_fm_dma_err_get(dma
->nd_dmah
, &error
, DDI_FME_VERSION
);
623 if (error
.fme_status
!= DDI_FM_OK
)
630 nvme_free_dma_common(nvme_dma_t
*dma
)
632 if (dma
->nd_dmah
!= NULL
)
633 (void) ddi_dma_unbind_handle(dma
->nd_dmah
);
634 if (dma
->nd_acch
!= NULL
)
635 ddi_dma_mem_free(&dma
->nd_acch
);
636 if (dma
->nd_dmah
!= NULL
)
637 ddi_dma_free_handle(&dma
->nd_dmah
);
641 nvme_free_dma(nvme_dma_t
*dma
)
643 nvme_free_dma_common(dma
);
644 kmem_free(dma
, sizeof (*dma
));
649 nvme_prp_dma_destructor(void *buf
, void *private)
651 nvme_dma_t
*dma
= (nvme_dma_t
*)buf
;
653 nvme_free_dma_common(dma
);
657 nvme_alloc_dma_common(nvme_t
*nvme
, nvme_dma_t
*dma
,
658 size_t len
, uint_t flags
, ddi_dma_attr_t
*dma_attr
)
660 if (ddi_dma_alloc_handle(nvme
->n_dip
, dma_attr
, DDI_DMA_SLEEP
, NULL
,
661 &dma
->nd_dmah
) != DDI_SUCCESS
) {
663 * Due to DDI_DMA_SLEEP this can't be DDI_DMA_NORESOURCES, and
664 * the only other possible error is DDI_DMA_BADATTR which
665 * indicates a driver bug which should cause a panic.
667 dev_err(nvme
->n_dip
, CE_PANIC
,
668 "!failed to get DMA handle, check DMA attributes");
669 return (DDI_FAILURE
);
673 * ddi_dma_mem_alloc() can only fail when DDI_DMA_NOSLEEP is specified
674 * or the flags are conflicting, which isn't the case here.
676 (void) ddi_dma_mem_alloc(dma
->nd_dmah
, len
, &nvme
->n_reg_acc_attr
,
677 DDI_DMA_CONSISTENT
, DDI_DMA_SLEEP
, NULL
, &dma
->nd_memp
,
678 &dma
->nd_len
, &dma
->nd_acch
);
680 if (ddi_dma_addr_bind_handle(dma
->nd_dmah
, NULL
, dma
->nd_memp
,
681 dma
->nd_len
, flags
| DDI_DMA_CONSISTENT
, DDI_DMA_SLEEP
, NULL
,
682 &dma
->nd_cookie
, &dma
->nd_ncookie
) != DDI_DMA_MAPPED
) {
683 dev_err(nvme
->n_dip
, CE_WARN
,
684 "!failed to bind DMA memory");
685 atomic_inc_32(&nvme
->n_dma_bind_err
);
686 nvme_free_dma_common(dma
);
687 return (DDI_FAILURE
);
690 return (DDI_SUCCESS
);
694 nvme_zalloc_dma(nvme_t
*nvme
, size_t len
, uint_t flags
,
695 ddi_dma_attr_t
*dma_attr
, nvme_dma_t
**ret
)
697 nvme_dma_t
*dma
= kmem_zalloc(sizeof (nvme_dma_t
), KM_SLEEP
);
699 if (nvme_alloc_dma_common(nvme
, dma
, len
, flags
, dma_attr
) !=
702 kmem_free(dma
, sizeof (nvme_dma_t
));
703 return (DDI_FAILURE
);
706 bzero(dma
->nd_memp
, dma
->nd_len
);
709 return (DDI_SUCCESS
);
714 nvme_prp_dma_constructor(void *buf
, void *private, int flags
)
716 nvme_dma_t
*dma
= (nvme_dma_t
*)buf
;
717 nvme_t
*nvme
= (nvme_t
*)private;
722 if (nvme_alloc_dma_common(nvme
, dma
, nvme
->n_pagesize
,
723 DDI_DMA_READ
, &nvme
->n_prp_dma_attr
) != DDI_SUCCESS
) {
727 ASSERT(dma
->nd_ncookie
== 1);
729 dma
->nd_cached
= B_TRUE
;
735 nvme_zalloc_queue_dma(nvme_t
*nvme
, uint32_t nentry
, uint16_t qe_len
,
736 uint_t flags
, nvme_dma_t
**dma
)
738 uint32_t len
= nentry
* qe_len
;
739 ddi_dma_attr_t q_dma_attr
= nvme
->n_queue_dma_attr
;
741 len
= roundup(len
, nvme
->n_pagesize
);
743 q_dma_attr
.dma_attr_minxfer
= len
;
745 if (nvme_zalloc_dma(nvme
, len
, flags
, &q_dma_attr
, dma
)
747 dev_err(nvme
->n_dip
, CE_WARN
,
748 "!failed to get DMA memory for queue");
752 if ((*dma
)->nd_ncookie
!= 1) {
753 dev_err(nvme
->n_dip
, CE_WARN
,
754 "!got too many cookies for queue DMA");
758 return (DDI_SUCCESS
);
766 return (DDI_FAILURE
);
770 nvme_free_qpair(nvme_qpair_t
*qp
)
774 mutex_destroy(&qp
->nq_mutex
);
775 sema_destroy(&qp
->nq_sema
);
777 if (qp
->nq_sqdma
!= NULL
)
778 nvme_free_dma(qp
->nq_sqdma
);
779 if (qp
->nq_cqdma
!= NULL
)
780 nvme_free_dma(qp
->nq_cqdma
);
782 if (qp
->nq_active_cmds
> 0)
783 for (i
= 0; i
!= qp
->nq_nentry
; i
++)
784 if (qp
->nq_cmd
[i
] != NULL
)
785 nvme_free_cmd(qp
->nq_cmd
[i
]);
787 if (qp
->nq_cmd
!= NULL
)
788 kmem_free(qp
->nq_cmd
, sizeof (nvme_cmd_t
*) * qp
->nq_nentry
);
790 kmem_free(qp
, sizeof (nvme_qpair_t
));
794 nvme_alloc_qpair(nvme_t
*nvme
, uint32_t nentry
, nvme_qpair_t
**nqp
,
797 nvme_qpair_t
*qp
= kmem_zalloc(sizeof (*qp
), KM_SLEEP
);
799 mutex_init(&qp
->nq_mutex
, NULL
, MUTEX_DRIVER
,
800 DDI_INTR_PRI(nvme
->n_intr_pri
));
801 sema_init(&qp
->nq_sema
, nentry
, NULL
, SEMA_DRIVER
, NULL
);
803 if (nvme_zalloc_queue_dma(nvme
, nentry
, sizeof (nvme_sqe_t
),
804 DDI_DMA_WRITE
, &qp
->nq_sqdma
) != DDI_SUCCESS
)
807 if (nvme_zalloc_queue_dma(nvme
, nentry
, sizeof (nvme_cqe_t
),
808 DDI_DMA_READ
, &qp
->nq_cqdma
) != DDI_SUCCESS
)
811 qp
->nq_sq
= (nvme_sqe_t
*)qp
->nq_sqdma
->nd_memp
;
812 qp
->nq_cq
= (nvme_cqe_t
*)qp
->nq_cqdma
->nd_memp
;
813 qp
->nq_nentry
= nentry
;
815 qp
->nq_sqtdbl
= NVME_REG_SQTDBL(nvme
, idx
);
816 qp
->nq_cqhdbl
= NVME_REG_CQHDBL(nvme
, idx
);
818 qp
->nq_cmd
= kmem_zalloc(sizeof (nvme_cmd_t
*) * nentry
, KM_SLEEP
);
822 return (DDI_SUCCESS
);
828 return (DDI_FAILURE
);
832 nvme_alloc_cmd(nvme_t
*nvme
, int kmflag
)
834 nvme_cmd_t
*cmd
= kmem_cache_alloc(nvme_cmd_cache
, kmflag
);
839 bzero(cmd
, sizeof (nvme_cmd_t
));
843 mutex_init(&cmd
->nc_mutex
, NULL
, MUTEX_DRIVER
,
844 DDI_INTR_PRI(nvme
->n_intr_pri
));
845 cv_init(&cmd
->nc_cv
, NULL
, CV_DRIVER
, NULL
);
851 nvme_free_cmd(nvme_cmd_t
*cmd
)
853 /* Don't free commands on the lost commands list. */
854 if (list_link_active(&cmd
->nc_list
))
858 if (cmd
->nc_dma
->nd_cached
)
859 kmem_cache_free(cmd
->nc_nvme
->n_prp_cache
,
862 nvme_free_dma(cmd
->nc_dma
);
866 cv_destroy(&cmd
->nc_cv
);
867 mutex_destroy(&cmd
->nc_mutex
);
869 kmem_cache_free(nvme_cmd_cache
, cmd
);
873 nvme_submit_admin_cmd(nvme_qpair_t
*qp
, nvme_cmd_t
*cmd
)
875 sema_p(&qp
->nq_sema
);
876 nvme_submit_cmd_common(qp
, cmd
);
880 nvme_submit_io_cmd(nvme_qpair_t
*qp
, nvme_cmd_t
*cmd
)
882 if (sema_tryp(&qp
->nq_sema
) == 0)
885 nvme_submit_cmd_common(qp
, cmd
);
890 nvme_submit_cmd_common(nvme_qpair_t
*qp
, nvme_cmd_t
*cmd
)
892 nvme_reg_sqtdbl_t tail
= { 0 };
894 mutex_enter(&qp
->nq_mutex
);
895 cmd
->nc_completed
= B_FALSE
;
898 * Try to insert the cmd into the active cmd array at the nq_next_cmd
899 * slot. If the slot is already occupied advance to the next slot and
900 * try again. This can happen for long running commands like async event
903 while (qp
->nq_cmd
[qp
->nq_next_cmd
] != NULL
)
904 qp
->nq_next_cmd
= (qp
->nq_next_cmd
+ 1) % qp
->nq_nentry
;
905 qp
->nq_cmd
[qp
->nq_next_cmd
] = cmd
;
907 qp
->nq_active_cmds
++;
909 cmd
->nc_sqe
.sqe_cid
= qp
->nq_next_cmd
;
910 bcopy(&cmd
->nc_sqe
, &qp
->nq_sq
[qp
->nq_sqtail
], sizeof (nvme_sqe_t
));
911 (void) ddi_dma_sync(qp
->nq_sqdma
->nd_dmah
,
912 sizeof (nvme_sqe_t
) * qp
->nq_sqtail
,
913 sizeof (nvme_sqe_t
), DDI_DMA_SYNC_FORDEV
);
914 qp
->nq_next_cmd
= (qp
->nq_next_cmd
+ 1) % qp
->nq_nentry
;
916 tail
.b
.sqtdbl_sqt
= qp
->nq_sqtail
= (qp
->nq_sqtail
+ 1) % qp
->nq_nentry
;
917 nvme_put32(cmd
->nc_nvme
, qp
->nq_sqtdbl
, tail
.r
);
919 mutex_exit(&qp
->nq_mutex
);
923 nvme_unqueue_cmd(nvme_t
*nvme
, nvme_qpair_t
*qp
, int cid
)
927 ASSERT(mutex_owned(&qp
->nq_mutex
));
928 ASSERT3S(cid
, <, qp
->nq_nentry
);
930 cmd
= qp
->nq_cmd
[cid
];
931 qp
->nq_cmd
[cid
] = NULL
;
932 ASSERT3U(qp
->nq_active_cmds
, >, 0);
933 qp
->nq_active_cmds
--;
934 sema_v(&qp
->nq_sema
);
936 ASSERT3P(cmd
, !=, NULL
);
937 ASSERT3P(cmd
->nc_nvme
, ==, nvme
);
938 ASSERT3S(cmd
->nc_sqe
.sqe_cid
, ==, cid
);
944 nvme_retrieve_cmd(nvme_t
*nvme
, nvme_qpair_t
*qp
)
946 nvme_reg_cqhdbl_t head
= { 0 };
951 (void) ddi_dma_sync(qp
->nq_cqdma
->nd_dmah
, 0,
952 sizeof (nvme_cqe_t
) * qp
->nq_nentry
, DDI_DMA_SYNC_FORKERNEL
);
954 mutex_enter(&qp
->nq_mutex
);
955 cqe
= &qp
->nq_cq
[qp
->nq_cqhead
];
957 /* Check phase tag of CQE. Hardware inverts it for new entries. */
958 if (cqe
->cqe_sf
.sf_p
== qp
->nq_phase
) {
959 mutex_exit(&qp
->nq_mutex
);
963 ASSERT(nvme
->n_ioq
[cqe
->cqe_sqid
] == qp
);
965 cmd
= nvme_unqueue_cmd(nvme
, qp
, cqe
->cqe_cid
);
967 ASSERT(cmd
->nc_sqid
== cqe
->cqe_sqid
);
968 bcopy(cqe
, &cmd
->nc_cqe
, sizeof (nvme_cqe_t
));
970 qp
->nq_sqhead
= cqe
->cqe_sqhd
;
972 head
.b
.cqhdbl_cqh
= qp
->nq_cqhead
= (qp
->nq_cqhead
+ 1) % qp
->nq_nentry
;
974 /* Toggle phase on wrap-around. */
975 if (qp
->nq_cqhead
== 0)
976 qp
->nq_phase
= qp
->nq_phase
? 0 : 1;
978 nvme_put32(cmd
->nc_nvme
, qp
->nq_cqhdbl
, head
.r
);
979 mutex_exit(&qp
->nq_mutex
);
985 nvme_check_unknown_cmd_status(nvme_cmd_t
*cmd
)
987 nvme_cqe_t
*cqe
= &cmd
->nc_cqe
;
989 dev_err(cmd
->nc_nvme
->n_dip
, CE_WARN
,
990 "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
991 "sc = %x, sct = %x, dnr = %d, m = %d", cmd
->nc_sqe
.sqe_opc
,
992 cqe
->cqe_sqid
, cqe
->cqe_cid
, cqe
->cqe_sf
.sf_sc
, cqe
->cqe_sf
.sf_sct
,
993 cqe
->cqe_sf
.sf_dnr
, cqe
->cqe_sf
.sf_m
);
995 if (cmd
->nc_xfer
!= NULL
)
996 bd_error(cmd
->nc_xfer
, BD_ERR_ILLRQ
);
998 if (cmd
->nc_nvme
->n_strict_version
) {
999 cmd
->nc_nvme
->n_dead
= B_TRUE
;
1000 ddi_fm_service_impact(cmd
->nc_nvme
->n_dip
, DDI_SERVICE_LOST
);
1007 nvme_check_vendor_cmd_status(nvme_cmd_t
*cmd
)
1009 nvme_cqe_t
*cqe
= &cmd
->nc_cqe
;
1011 dev_err(cmd
->nc_nvme
->n_dip
, CE_WARN
,
1012 "!unknown command status received: opc = %x, sqid = %d, cid = %d, "
1013 "sc = %x, sct = %x, dnr = %d, m = %d", cmd
->nc_sqe
.sqe_opc
,
1014 cqe
->cqe_sqid
, cqe
->cqe_cid
, cqe
->cqe_sf
.sf_sc
, cqe
->cqe_sf
.sf_sct
,
1015 cqe
->cqe_sf
.sf_dnr
, cqe
->cqe_sf
.sf_m
);
1016 if (!cmd
->nc_nvme
->n_ignore_unknown_vendor_status
) {
1017 cmd
->nc_nvme
->n_dead
= B_TRUE
;
1018 ddi_fm_service_impact(cmd
->nc_nvme
->n_dip
, DDI_SERVICE_LOST
);
1025 nvme_check_integrity_cmd_status(nvme_cmd_t
*cmd
)
1027 nvme_cqe_t
*cqe
= &cmd
->nc_cqe
;
1029 switch (cqe
->cqe_sf
.sf_sc
) {
1030 case NVME_CQE_SC_INT_NVM_WRITE
:
1032 /* TODO: post ereport */
1033 if (cmd
->nc_xfer
!= NULL
)
1034 bd_error(cmd
->nc_xfer
, BD_ERR_MEDIA
);
1037 case NVME_CQE_SC_INT_NVM_READ
:
1039 /* TODO: post ereport */
1040 if (cmd
->nc_xfer
!= NULL
)
1041 bd_error(cmd
->nc_xfer
, BD_ERR_MEDIA
);
1045 return (nvme_check_unknown_cmd_status(cmd
));
1050 nvme_check_generic_cmd_status(nvme_cmd_t
*cmd
)
1052 nvme_cqe_t
*cqe
= &cmd
->nc_cqe
;
1054 switch (cqe
->cqe_sf
.sf_sc
) {
1055 case NVME_CQE_SC_GEN_SUCCESS
:
1059 * Errors indicating a bug in the driver should cause a panic.
1061 case NVME_CQE_SC_GEN_INV_OPC
:
1062 /* Invalid Command Opcode */
1063 if (!cmd
->nc_dontpanic
)
1064 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
,
1065 "programming error: invalid opcode in cmd %p",
1069 case NVME_CQE_SC_GEN_INV_FLD
:
1070 /* Invalid Field in Command */
1071 if (!cmd
->nc_dontpanic
)
1072 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
,
1073 "programming error: invalid field in cmd %p",
1077 case NVME_CQE_SC_GEN_ID_CNFL
:
1078 /* Command ID Conflict */
1079 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
, "programming error: "
1080 "cmd ID conflict in cmd %p", (void *)cmd
);
1083 case NVME_CQE_SC_GEN_INV_NS
:
1084 /* Invalid Namespace or Format */
1085 if (!cmd
->nc_dontpanic
)
1086 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
,
1087 "programming error: invalid NS/format in cmd %p",
1091 case NVME_CQE_SC_GEN_NVM_LBA_RANGE
:
1092 /* LBA Out Of Range */
1093 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
, "programming error: "
1094 "LBA out of range in cmd %p", (void *)cmd
);
1098 * Non-fatal errors, handle gracefully.
1100 case NVME_CQE_SC_GEN_DATA_XFR_ERR
:
1101 /* Data Transfer Error (DMA) */
1102 /* TODO: post ereport */
1103 atomic_inc_32(&cmd
->nc_nvme
->n_data_xfr_err
);
1104 if (cmd
->nc_xfer
!= NULL
)
1105 bd_error(cmd
->nc_xfer
, BD_ERR_NTRDY
);
1108 case NVME_CQE_SC_GEN_INTERNAL_ERR
:
1110 * Internal Error. The spec (v1.0, section 4.5.1.2) says
1111 * detailed error information is returned as async event,
1112 * so we pretty much ignore the error here and handle it
1113 * in the async event handler.
1115 atomic_inc_32(&cmd
->nc_nvme
->n_internal_err
);
1116 if (cmd
->nc_xfer
!= NULL
)
1117 bd_error(cmd
->nc_xfer
, BD_ERR_NTRDY
);
1120 case NVME_CQE_SC_GEN_ABORT_REQUEST
:
1122 * Command Abort Requested. This normally happens only when a
1123 * command times out.
1125 /* TODO: post ereport or change blkdev to handle this? */
1126 atomic_inc_32(&cmd
->nc_nvme
->n_abort_rq_err
);
1129 case NVME_CQE_SC_GEN_ABORT_PWRLOSS
:
1130 /* Command Aborted due to Power Loss Notification */
1131 ddi_fm_service_impact(cmd
->nc_nvme
->n_dip
, DDI_SERVICE_LOST
);
1132 cmd
->nc_nvme
->n_dead
= B_TRUE
;
1135 case NVME_CQE_SC_GEN_ABORT_SQ_DEL
:
1136 /* Command Aborted due to SQ Deletion */
1137 atomic_inc_32(&cmd
->nc_nvme
->n_abort_sq_del
);
1140 case NVME_CQE_SC_GEN_NVM_CAP_EXC
:
1141 /* Capacity Exceeded */
1142 atomic_inc_32(&cmd
->nc_nvme
->n_nvm_cap_exc
);
1143 if (cmd
->nc_xfer
!= NULL
)
1144 bd_error(cmd
->nc_xfer
, BD_ERR_MEDIA
);
1147 case NVME_CQE_SC_GEN_NVM_NS_NOTRDY
:
1148 /* Namespace Not Ready */
1149 atomic_inc_32(&cmd
->nc_nvme
->n_nvm_ns_notrdy
);
1150 if (cmd
->nc_xfer
!= NULL
)
1151 bd_error(cmd
->nc_xfer
, BD_ERR_NTRDY
);
1155 return (nvme_check_unknown_cmd_status(cmd
));
1160 nvme_check_specific_cmd_status(nvme_cmd_t
*cmd
)
1162 nvme_cqe_t
*cqe
= &cmd
->nc_cqe
;
1164 switch (cqe
->cqe_sf
.sf_sc
) {
1165 case NVME_CQE_SC_SPC_INV_CQ
:
1166 /* Completion Queue Invalid */
1167 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_CREATE_SQUEUE
);
1168 atomic_inc_32(&cmd
->nc_nvme
->n_inv_cq_err
);
1171 case NVME_CQE_SC_SPC_INV_QID
:
1172 /* Invalid Queue Identifier */
1173 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_CREATE_SQUEUE
||
1174 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_DELETE_SQUEUE
||
1175 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_CREATE_CQUEUE
||
1176 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_DELETE_CQUEUE
);
1177 atomic_inc_32(&cmd
->nc_nvme
->n_inv_qid_err
);
1180 case NVME_CQE_SC_SPC_MAX_QSZ_EXC
:
1181 /* Max Queue Size Exceeded */
1182 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_CREATE_SQUEUE
||
1183 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_CREATE_CQUEUE
);
1184 atomic_inc_32(&cmd
->nc_nvme
->n_max_qsz_exc
);
1187 case NVME_CQE_SC_SPC_ABRT_CMD_EXC
:
1188 /* Abort Command Limit Exceeded */
1189 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_ABORT
);
1190 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
, "programming error: "
1191 "abort command limit exceeded in cmd %p", (void *)cmd
);
1194 case NVME_CQE_SC_SPC_ASYNC_EVREQ_EXC
:
1195 /* Async Event Request Limit Exceeded */
1196 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_ASYNC_EVENT
);
1197 dev_err(cmd
->nc_nvme
->n_dip
, CE_PANIC
, "programming error: "
1198 "async event request limit exceeded in cmd %p",
1202 case NVME_CQE_SC_SPC_INV_INT_VECT
:
1203 /* Invalid Interrupt Vector */
1204 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_CREATE_CQUEUE
);
1205 atomic_inc_32(&cmd
->nc_nvme
->n_inv_int_vect
);
1208 case NVME_CQE_SC_SPC_INV_LOG_PAGE
:
1209 /* Invalid Log Page */
1210 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_GET_LOG_PAGE
);
1211 atomic_inc_32(&cmd
->nc_nvme
->n_inv_log_page
);
1214 case NVME_CQE_SC_SPC_INV_FORMAT
:
1215 /* Invalid Format */
1216 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_FORMAT
);
1217 atomic_inc_32(&cmd
->nc_nvme
->n_inv_format
);
1218 if (cmd
->nc_xfer
!= NULL
)
1219 bd_error(cmd
->nc_xfer
, BD_ERR_ILLRQ
);
1222 case NVME_CQE_SC_SPC_INV_Q_DEL
:
1223 /* Invalid Queue Deletion */
1224 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_DELETE_CQUEUE
);
1225 atomic_inc_32(&cmd
->nc_nvme
->n_inv_q_del
);
1228 case NVME_CQE_SC_SPC_NVM_CNFL_ATTR
:
1229 /* Conflicting Attributes */
1230 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_DSET_MGMT
||
1231 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_READ
||
1232 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_WRITE
);
1233 atomic_inc_32(&cmd
->nc_nvme
->n_cnfl_attr
);
1234 if (cmd
->nc_xfer
!= NULL
)
1235 bd_error(cmd
->nc_xfer
, BD_ERR_ILLRQ
);
1238 case NVME_CQE_SC_SPC_NVM_INV_PROT
:
1239 /* Invalid Protection Information */
1240 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_COMPARE
||
1241 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_READ
||
1242 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_WRITE
);
1243 atomic_inc_32(&cmd
->nc_nvme
->n_inv_prot
);
1244 if (cmd
->nc_xfer
!= NULL
)
1245 bd_error(cmd
->nc_xfer
, BD_ERR_ILLRQ
);
1248 case NVME_CQE_SC_SPC_NVM_READONLY
:
1249 /* Write to Read Only Range */
1250 ASSERT(cmd
->nc_sqe
.sqe_opc
== NVME_OPC_NVM_WRITE
);
1251 atomic_inc_32(&cmd
->nc_nvme
->n_readonly
);
1252 if (cmd
->nc_xfer
!= NULL
)
1253 bd_error(cmd
->nc_xfer
, BD_ERR_ILLRQ
);
1257 return (nvme_check_unknown_cmd_status(cmd
));
1262 nvme_check_cmd_status(nvme_cmd_t
*cmd
)
1264 nvme_cqe_t
*cqe
= &cmd
->nc_cqe
;
1267 * Take a shortcut if the controller is dead, or if
1268 * command status indicates no error.
1270 if (cmd
->nc_nvme
->n_dead
)
1273 if (cqe
->cqe_sf
.sf_sct
== NVME_CQE_SCT_GENERIC
&&
1274 cqe
->cqe_sf
.sf_sc
== NVME_CQE_SC_GEN_SUCCESS
)
1277 if (cqe
->cqe_sf
.sf_sct
== NVME_CQE_SCT_GENERIC
)
1278 return (nvme_check_generic_cmd_status(cmd
));
1279 else if (cqe
->cqe_sf
.sf_sct
== NVME_CQE_SCT_SPECIFIC
)
1280 return (nvme_check_specific_cmd_status(cmd
));
1281 else if (cqe
->cqe_sf
.sf_sct
== NVME_CQE_SCT_INTEGRITY
)
1282 return (nvme_check_integrity_cmd_status(cmd
));
1283 else if (cqe
->cqe_sf
.sf_sct
== NVME_CQE_SCT_VENDOR
)
1284 return (nvme_check_vendor_cmd_status(cmd
));
1286 return (nvme_check_unknown_cmd_status(cmd
));
1290 nvme_abort_cmd(nvme_cmd_t
*abort_cmd
, uint_t sec
)
1292 nvme_t
*nvme
= abort_cmd
->nc_nvme
;
1293 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1294 nvme_abort_cmd_t ac
= { 0 };
1297 sema_p(&nvme
->n_abort_sema
);
1299 ac
.b
.ac_cid
= abort_cmd
->nc_sqe
.sqe_cid
;
1300 ac
.b
.ac_sqid
= abort_cmd
->nc_sqid
;
1303 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_ABORT
;
1304 cmd
->nc_callback
= nvme_wakeup_cmd
;
1305 cmd
->nc_sqe
.sqe_cdw10
= ac
.r
;
1308 * Send the ABORT to the hardware. The ABORT command will return _after_
1309 * the aborted command has completed (aborted or otherwise), but since
1310 * we still hold the aborted command's mutex its callback hasn't been
1313 nvme_admin_cmd(cmd
, sec
);
1314 sema_v(&nvme
->n_abort_sema
);
1316 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
1317 dev_err(nvme
->n_dip
, CE_WARN
,
1318 "!ABORT failed with sct = %x, sc = %x",
1319 cmd
->nc_cqe
.cqe_sf
.sf_sct
, cmd
->nc_cqe
.cqe_sf
.sf_sc
);
1320 atomic_inc_32(&nvme
->n_abort_failed
);
1322 dev_err(nvme
->n_dip
, CE_WARN
,
1323 "!ABORT of command %d/%d %ssuccessful",
1324 abort_cmd
->nc_sqe
.sqe_cid
, abort_cmd
->nc_sqid
,
1325 cmd
->nc_cqe
.cqe_dw0
& 1 ? "un" : "");
1326 if ((cmd
->nc_cqe
.cqe_dw0
& 1) == 0)
1327 atomic_inc_32(&nvme
->n_cmd_aborted
);
1335 * nvme_wait_cmd -- wait for command completion or timeout
1337 * In case of a serious error or a timeout of the abort command the hardware
1338 * will be declared dead and FMA will be notified.
1341 nvme_wait_cmd(nvme_cmd_t
*cmd
, uint_t sec
)
1343 clock_t timeout
= ddi_get_lbolt() + drv_usectohz(sec
* MICROSEC
);
1344 nvme_t
*nvme
= cmd
->nc_nvme
;
1345 nvme_reg_csts_t csts
;
1348 ASSERT(mutex_owned(&cmd
->nc_mutex
));
1350 while (!cmd
->nc_completed
) {
1351 if (cv_timedwait(&cmd
->nc_cv
, &cmd
->nc_mutex
, timeout
) == -1)
1355 if (cmd
->nc_completed
)
1359 * The command timed out.
1361 * Check controller for fatal status, any errors associated with the
1362 * register or DMA handle, or for a double timeout (abort command timed
1363 * out). If necessary log a warning and call FMA.
1365 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
1366 dev_err(nvme
->n_dip
, CE_WARN
, "!command %d/%d timeout, "
1367 "OPC = %x, CFS = %d", cmd
->nc_sqe
.sqe_cid
, cmd
->nc_sqid
,
1368 cmd
->nc_sqe
.sqe_opc
, csts
.b
.csts_cfs
);
1369 atomic_inc_32(&nvme
->n_cmd_timeout
);
1371 if (csts
.b
.csts_cfs
||
1372 nvme_check_regs_hdl(nvme
) ||
1373 nvme_check_dma_hdl(cmd
->nc_dma
) ||
1374 cmd
->nc_sqe
.sqe_opc
== NVME_OPC_ABORT
) {
1375 ddi_fm_service_impact(nvme
->n_dip
, DDI_SERVICE_LOST
);
1376 nvme
->n_dead
= B_TRUE
;
1377 } else if (nvme_abort_cmd(cmd
, sec
) == 0) {
1379 * If the abort succeeded the command should complete
1380 * immediately with an appropriate status.
1382 while (!cmd
->nc_completed
)
1383 cv_wait(&cmd
->nc_cv
, &cmd
->nc_mutex
);
1388 qp
= nvme
->n_ioq
[cmd
->nc_sqid
];
1390 mutex_enter(&qp
->nq_mutex
);
1391 (void) nvme_unqueue_cmd(nvme
, qp
, cmd
->nc_sqe
.sqe_cid
);
1392 mutex_exit(&qp
->nq_mutex
);
1395 * As we don't know what the presumed dead hardware might still do with
1396 * the DMA memory, we'll put the command on the lost commands list if it
1397 * has any DMA memory.
1399 if (cmd
->nc_dma
!= NULL
) {
1400 mutex_enter(&nvme_lc_mutex
);
1401 list_insert_head(&nvme_lost_cmds
, cmd
);
1402 mutex_exit(&nvme_lc_mutex
);
1407 nvme_wakeup_cmd(void *arg
)
1409 nvme_cmd_t
*cmd
= arg
;
1411 mutex_enter(&cmd
->nc_mutex
);
1412 cmd
->nc_completed
= B_TRUE
;
1413 cv_signal(&cmd
->nc_cv
);
1414 mutex_exit(&cmd
->nc_mutex
);
1418 nvme_async_event_task(void *arg
)
1420 nvme_cmd_t
*cmd
= arg
;
1421 nvme_t
*nvme
= cmd
->nc_nvme
;
1422 nvme_error_log_entry_t
*error_log
= NULL
;
1423 nvme_health_log_t
*health_log
= NULL
;
1425 nvme_async_event_t event
;
1428 * Check for errors associated with the async request itself. The only
1429 * command-specific error is "async event limit exceeded", which
1430 * indicates a programming error in the driver and causes a panic in
1431 * nvme_check_cmd_status().
1433 * Other possible errors are various scenarios where the async request
1434 * was aborted, or internal errors in the device. Internal errors are
1435 * reported to FMA, the command aborts need no special handling here.
1437 * And finally, at least qemu nvme does not support async events,
1438 * and will return NVME_CQE_SC_GEN_INV_OPC | DNR. If so, we
1439 * will avoid posting async events.
1442 if (nvme_check_cmd_status(cmd
) != 0) {
1443 dev_err(cmd
->nc_nvme
->n_dip
, CE_WARN
,
1444 "!async event request returned failure, sct = %x, "
1445 "sc = %x, dnr = %d, m = %d", cmd
->nc_cqe
.cqe_sf
.sf_sct
,
1446 cmd
->nc_cqe
.cqe_sf
.sf_sc
, cmd
->nc_cqe
.cqe_sf
.sf_dnr
,
1447 cmd
->nc_cqe
.cqe_sf
.sf_m
);
1449 if (cmd
->nc_cqe
.cqe_sf
.sf_sct
== NVME_CQE_SCT_GENERIC
&&
1450 cmd
->nc_cqe
.cqe_sf
.sf_sc
== NVME_CQE_SC_GEN_INTERNAL_ERR
) {
1451 cmd
->nc_nvme
->n_dead
= B_TRUE
;
1452 ddi_fm_service_impact(cmd
->nc_nvme
->n_dip
,
1456 if (cmd
->nc_cqe
.cqe_sf
.sf_sct
== NVME_CQE_SCT_GENERIC
&&
1457 cmd
->nc_cqe
.cqe_sf
.sf_sc
== NVME_CQE_SC_GEN_INV_OPC
&&
1458 cmd
->nc_cqe
.cqe_sf
.sf_dnr
== 1) {
1459 nvme
->n_async_event_supported
= B_FALSE
;
1467 event
.r
= cmd
->nc_cqe
.cqe_dw0
;
1469 /* Clear CQE and re-submit the async request. */
1470 bzero(&cmd
->nc_cqe
, sizeof (nvme_cqe_t
));
1471 nvme_submit_admin_cmd(nvme
->n_adminq
, cmd
);
1473 switch (event
.b
.ae_type
) {
1474 case NVME_ASYNC_TYPE_ERROR
:
1475 if (event
.b
.ae_logpage
== NVME_LOGPAGE_ERROR
) {
1476 (void) nvme_get_logpage(nvme
, (void **)&error_log
,
1477 &logsize
, event
.b
.ae_logpage
);
1479 dev_err(nvme
->n_dip
, CE_WARN
, "!wrong logpage in "
1480 "async event reply: %d", event
.b
.ae_logpage
);
1481 atomic_inc_32(&nvme
->n_wrong_logpage
);
1484 switch (event
.b
.ae_info
) {
1485 case NVME_ASYNC_ERROR_INV_SQ
:
1486 dev_err(nvme
->n_dip
, CE_PANIC
, "programming error: "
1487 "invalid submission queue");
1490 case NVME_ASYNC_ERROR_INV_DBL
:
1491 dev_err(nvme
->n_dip
, CE_PANIC
, "programming error: "
1492 "invalid doorbell write value");
1495 case NVME_ASYNC_ERROR_DIAGFAIL
:
1496 dev_err(nvme
->n_dip
, CE_WARN
, "!diagnostic failure");
1497 ddi_fm_service_impact(nvme
->n_dip
, DDI_SERVICE_LOST
);
1498 nvme
->n_dead
= B_TRUE
;
1499 atomic_inc_32(&nvme
->n_diagfail_event
);
1502 case NVME_ASYNC_ERROR_PERSISTENT
:
1503 dev_err(nvme
->n_dip
, CE_WARN
, "!persistent internal "
1505 ddi_fm_service_impact(nvme
->n_dip
, DDI_SERVICE_LOST
);
1506 nvme
->n_dead
= B_TRUE
;
1507 atomic_inc_32(&nvme
->n_persistent_event
);
1510 case NVME_ASYNC_ERROR_TRANSIENT
:
1511 dev_err(nvme
->n_dip
, CE_WARN
, "!transient internal "
1513 /* TODO: send ereport */
1514 atomic_inc_32(&nvme
->n_transient_event
);
1517 case NVME_ASYNC_ERROR_FW_LOAD
:
1518 dev_err(nvme
->n_dip
, CE_WARN
,
1519 "!firmware image load error");
1520 atomic_inc_32(&nvme
->n_fw_load_event
);
1525 case NVME_ASYNC_TYPE_HEALTH
:
1526 if (event
.b
.ae_logpage
== NVME_LOGPAGE_HEALTH
) {
1527 (void) nvme_get_logpage(nvme
, (void **)&health_log
,
1528 &logsize
, event
.b
.ae_logpage
, -1);
1530 dev_err(nvme
->n_dip
, CE_WARN
, "!wrong logpage in "
1531 "async event reply: %d", event
.b
.ae_logpage
);
1532 atomic_inc_32(&nvme
->n_wrong_logpage
);
1535 switch (event
.b
.ae_info
) {
1536 case NVME_ASYNC_HEALTH_RELIABILITY
:
1537 dev_err(nvme
->n_dip
, CE_WARN
,
1538 "!device reliability compromised");
1539 /* TODO: send ereport */
1540 atomic_inc_32(&nvme
->n_reliability_event
);
1543 case NVME_ASYNC_HEALTH_TEMPERATURE
:
1544 dev_err(nvme
->n_dip
, CE_WARN
,
1545 "!temperature above threshold");
1546 /* TODO: send ereport */
1547 atomic_inc_32(&nvme
->n_temperature_event
);
1550 case NVME_ASYNC_HEALTH_SPARE
:
1551 dev_err(nvme
->n_dip
, CE_WARN
,
1552 "!spare space below threshold");
1553 /* TODO: send ereport */
1554 atomic_inc_32(&nvme
->n_spare_event
);
1559 case NVME_ASYNC_TYPE_VENDOR
:
1560 dev_err(nvme
->n_dip
, CE_WARN
, "!vendor specific async event "
1561 "received, info = %x, logpage = %x", event
.b
.ae_info
,
1562 event
.b
.ae_logpage
);
1563 atomic_inc_32(&nvme
->n_vendor_event
);
1567 dev_err(nvme
->n_dip
, CE_WARN
, "!unknown async event received, "
1568 "type = %x, info = %x, logpage = %x", event
.b
.ae_type
,
1569 event
.b
.ae_info
, event
.b
.ae_logpage
);
1570 atomic_inc_32(&nvme
->n_unknown_event
);
1575 kmem_free(error_log
, logsize
);
1578 kmem_free(health_log
, logsize
);
1582 nvme_admin_cmd(nvme_cmd_t
*cmd
, int sec
)
1584 mutex_enter(&cmd
->nc_mutex
);
1585 nvme_submit_admin_cmd(cmd
->nc_nvme
->n_adminq
, cmd
);
1586 nvme_wait_cmd(cmd
, sec
);
1587 mutex_exit(&cmd
->nc_mutex
);
1591 nvme_async_event(nvme_t
*nvme
)
1595 cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1597 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_ASYNC_EVENT
;
1598 cmd
->nc_callback
= nvme_async_event_task
;
1599 cmd
->nc_dontpanic
= B_TRUE
;
1601 nvme_submit_admin_cmd(nvme
->n_adminq
, cmd
);
1605 nvme_format_nvm(nvme_t
*nvme
, uint32_t nsid
, uint8_t lbaf
, boolean_t ms
,
1606 uint8_t pi
, boolean_t pil
, uint8_t ses
)
1608 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1609 nvme_format_nvm_t format_nvm
= { 0 };
1612 format_nvm
.b
.fm_lbaf
= lbaf
& 0xf;
1613 format_nvm
.b
.fm_ms
= ms
? 1 : 0;
1614 format_nvm
.b
.fm_pi
= pi
& 0x7;
1615 format_nvm
.b
.fm_pil
= pil
? 1 : 0;
1616 format_nvm
.b
.fm_ses
= ses
& 0x7;
1619 cmd
->nc_callback
= nvme_wakeup_cmd
;
1620 cmd
->nc_sqe
.sqe_nsid
= nsid
;
1621 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_NVM_FORMAT
;
1622 cmd
->nc_sqe
.sqe_cdw10
= format_nvm
.r
;
1625 * Some devices like Samsung SM951 don't allow formatting of all
1626 * namespaces in one command. Handle that gracefully.
1628 if (nsid
== (uint32_t)-1)
1629 cmd
->nc_dontpanic
= B_TRUE
;
1631 nvme_admin_cmd(cmd
, nvme_format_cmd_timeout
);
1633 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
1634 dev_err(nvme
->n_dip
, CE_WARN
,
1635 "!FORMAT failed with sct = %x, sc = %x",
1636 cmd
->nc_cqe
.cqe_sf
.sf_sct
, cmd
->nc_cqe
.cqe_sf
.sf_sc
);
1644 nvme_get_logpage(nvme_t
*nvme
, void **buf
, size_t *bufsize
, uint8_t logpage
,
1647 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1648 nvme_getlogpage_t getlogpage
= { 0 };
1652 va_start(ap
, logpage
);
1655 cmd
->nc_callback
= nvme_wakeup_cmd
;
1656 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_GET_LOG_PAGE
;
1658 getlogpage
.b
.lp_lid
= logpage
;
1661 case NVME_LOGPAGE_ERROR
:
1662 cmd
->nc_sqe
.sqe_nsid
= (uint32_t)-1;
1664 * The GET LOG PAGE command can use at most 2 pages to return
1665 * data, PRP lists are not supported.
1667 *bufsize
= MIN(2 * nvme
->n_pagesize
,
1668 nvme
->n_error_log_len
* sizeof (nvme_error_log_entry_t
));
1671 case NVME_LOGPAGE_HEALTH
:
1672 cmd
->nc_sqe
.sqe_nsid
= va_arg(ap
, uint32_t);
1673 *bufsize
= sizeof (nvme_health_log_t
);
1676 case NVME_LOGPAGE_FWSLOT
:
1677 cmd
->nc_sqe
.sqe_nsid
= (uint32_t)-1;
1678 *bufsize
= sizeof (nvme_fwslot_log_t
);
1682 dev_err(nvme
->n_dip
, CE_WARN
, "!unknown log page requested: %d",
1684 atomic_inc_32(&nvme
->n_unknown_logpage
);
1691 getlogpage
.b
.lp_numd
= *bufsize
/ sizeof (uint32_t) - 1;
1693 cmd
->nc_sqe
.sqe_cdw10
= getlogpage
.r
;
1695 if (nvme_zalloc_dma(nvme
, getlogpage
.b
.lp_numd
* sizeof (uint32_t),
1696 DDI_DMA_READ
, &nvme
->n_prp_dma_attr
, &cmd
->nc_dma
) != DDI_SUCCESS
) {
1697 dev_err(nvme
->n_dip
, CE_WARN
,
1698 "!nvme_zalloc_dma failed for GET LOG PAGE");
1703 if (cmd
->nc_dma
->nd_ncookie
> 2) {
1704 dev_err(nvme
->n_dip
, CE_WARN
,
1705 "!too many DMA cookies for GET LOG PAGE");
1706 atomic_inc_32(&nvme
->n_too_many_cookies
);
1711 cmd
->nc_sqe
.sqe_dptr
.d_prp
[0] = cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
1712 if (cmd
->nc_dma
->nd_ncookie
> 1) {
1713 ddi_dma_nextcookie(cmd
->nc_dma
->nd_dmah
,
1714 &cmd
->nc_dma
->nd_cookie
);
1715 cmd
->nc_sqe
.sqe_dptr
.d_prp
[1] =
1716 cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
1719 nvme_admin_cmd(cmd
, nvme_admin_cmd_timeout
);
1721 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
1722 dev_err(nvme
->n_dip
, CE_WARN
,
1723 "!GET LOG PAGE failed with sct = %x, sc = %x",
1724 cmd
->nc_cqe
.cqe_sf
.sf_sct
, cmd
->nc_cqe
.cqe_sf
.sf_sc
);
1728 *buf
= kmem_alloc(*bufsize
, KM_SLEEP
);
1729 bcopy(cmd
->nc_dma
->nd_memp
, *buf
, *bufsize
);
1738 nvme_identify(nvme_t
*nvme
, uint32_t nsid
, void **buf
)
1740 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1747 cmd
->nc_callback
= nvme_wakeup_cmd
;
1748 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_IDENTIFY
;
1749 cmd
->nc_sqe
.sqe_nsid
= nsid
;
1750 cmd
->nc_sqe
.sqe_cdw10
= nsid
? NVME_IDENTIFY_NSID
: NVME_IDENTIFY_CTRL
;
1752 if (nvme_zalloc_dma(nvme
, NVME_IDENTIFY_BUFSIZE
, DDI_DMA_READ
,
1753 &nvme
->n_prp_dma_attr
, &cmd
->nc_dma
) != DDI_SUCCESS
) {
1754 dev_err(nvme
->n_dip
, CE_WARN
,
1755 "!nvme_zalloc_dma failed for IDENTIFY");
1760 if (cmd
->nc_dma
->nd_ncookie
> 2) {
1761 dev_err(nvme
->n_dip
, CE_WARN
,
1762 "!too many DMA cookies for IDENTIFY");
1763 atomic_inc_32(&nvme
->n_too_many_cookies
);
1768 cmd
->nc_sqe
.sqe_dptr
.d_prp
[0] = cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
1769 if (cmd
->nc_dma
->nd_ncookie
> 1) {
1770 ddi_dma_nextcookie(cmd
->nc_dma
->nd_dmah
,
1771 &cmd
->nc_dma
->nd_cookie
);
1772 cmd
->nc_sqe
.sqe_dptr
.d_prp
[1] =
1773 cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
1776 nvme_admin_cmd(cmd
, nvme_admin_cmd_timeout
);
1778 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
1779 dev_err(nvme
->n_dip
, CE_WARN
,
1780 "!IDENTIFY failed with sct = %x, sc = %x",
1781 cmd
->nc_cqe
.cqe_sf
.sf_sct
, cmd
->nc_cqe
.cqe_sf
.sf_sc
);
1785 *buf
= kmem_alloc(NVME_IDENTIFY_BUFSIZE
, KM_SLEEP
);
1786 bcopy(cmd
->nc_dma
->nd_memp
, *buf
, NVME_IDENTIFY_BUFSIZE
);
1795 nvme_set_features(nvme_t
*nvme
, uint32_t nsid
, uint8_t feature
, uint32_t val
,
1798 _NOTE(ARGUNUSED(nsid
));
1799 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1802 ASSERT(res
!= NULL
);
1805 cmd
->nc_callback
= nvme_wakeup_cmd
;
1806 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_SET_FEATURES
;
1807 cmd
->nc_sqe
.sqe_cdw10
= feature
;
1808 cmd
->nc_sqe
.sqe_cdw11
= val
;
1811 case NVME_FEAT_WRITE_CACHE
:
1812 if (!nvme
->n_write_cache_present
)
1816 case NVME_FEAT_NQUEUES
:
1823 nvme_admin_cmd(cmd
, nvme_admin_cmd_timeout
);
1825 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
1826 dev_err(nvme
->n_dip
, CE_WARN
,
1827 "!SET FEATURES %d failed with sct = %x, sc = %x",
1828 feature
, cmd
->nc_cqe
.cqe_sf
.sf_sct
,
1829 cmd
->nc_cqe
.cqe_sf
.sf_sc
);
1833 *res
= cmd
->nc_cqe
.cqe_dw0
;
1841 nvme_get_features(nvme_t
*nvme
, uint32_t nsid
, uint8_t feature
, uint32_t *res
,
1842 void **buf
, size_t *bufsize
)
1844 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
1847 ASSERT(res
!= NULL
);
1849 if (bufsize
!= NULL
)
1853 cmd
->nc_callback
= nvme_wakeup_cmd
;
1854 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_GET_FEATURES
;
1855 cmd
->nc_sqe
.sqe_cdw10
= feature
;
1856 cmd
->nc_sqe
.sqe_cdw11
= *res
;
1859 * For some of the optional features there doesn't seem to be a method
1860 * of detecting whether it is supported other than using it. This will
1861 * cause "Invalid Field in Command" error, which is normally considered
1862 * a programming error. Set the nc_dontpanic flag to override the panic
1863 * in nvme_check_generic_cmd_status().
1866 case NVME_FEAT_ARBITRATION
:
1867 case NVME_FEAT_POWER_MGMT
:
1868 case NVME_FEAT_TEMPERATURE
:
1869 case NVME_FEAT_ERROR
:
1870 case NVME_FEAT_NQUEUES
:
1871 case NVME_FEAT_INTR_COAL
:
1872 case NVME_FEAT_INTR_VECT
:
1873 case NVME_FEAT_WRITE_ATOM
:
1874 case NVME_FEAT_ASYNC_EVENT
:
1877 case NVME_FEAT_WRITE_CACHE
:
1878 if (!nvme
->n_write_cache_present
)
1882 case NVME_FEAT_LBA_RANGE
:
1883 if (!nvme
->n_lba_range_supported
)
1886 cmd
->nc_dontpanic
= B_TRUE
;
1887 cmd
->nc_sqe
.sqe_nsid
= nsid
;
1888 ASSERT(bufsize
!= NULL
);
1889 *bufsize
= NVME_LBA_RANGE_BUFSIZE
;
1892 case NVME_FEAT_AUTO_PST
:
1893 if (!nvme
->n_auto_pst_supported
)
1896 ASSERT(bufsize
!= NULL
);
1897 *bufsize
= NVME_AUTO_PST_BUFSIZE
;
1900 case NVME_FEAT_PROGRESS
:
1901 if (!nvme
->n_progress_supported
)
1904 cmd
->nc_dontpanic
= B_TRUE
;
1911 if (bufsize
!= NULL
&& *bufsize
!= 0) {
1912 if (nvme_zalloc_dma(nvme
, *bufsize
, DDI_DMA_READ
,
1913 &nvme
->n_prp_dma_attr
, &cmd
->nc_dma
) != DDI_SUCCESS
) {
1914 dev_err(nvme
->n_dip
, CE_WARN
,
1915 "!nvme_zalloc_dma failed for GET FEATURES");
1920 if (cmd
->nc_dma
->nd_ncookie
> 2) {
1921 dev_err(nvme
->n_dip
, CE_WARN
,
1922 "!too many DMA cookies for GET FEATURES");
1923 atomic_inc_32(&nvme
->n_too_many_cookies
);
1928 cmd
->nc_sqe
.sqe_dptr
.d_prp
[0] =
1929 cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
1930 if (cmd
->nc_dma
->nd_ncookie
> 1) {
1931 ddi_dma_nextcookie(cmd
->nc_dma
->nd_dmah
,
1932 &cmd
->nc_dma
->nd_cookie
);
1933 cmd
->nc_sqe
.sqe_dptr
.d_prp
[1] =
1934 cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
1938 nvme_admin_cmd(cmd
, nvme_admin_cmd_timeout
);
1940 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
1941 boolean_t known
= B_TRUE
;
1943 /* Check if this is unsupported optional feature */
1944 if (cmd
->nc_cqe
.cqe_sf
.sf_sct
== NVME_CQE_SCT_GENERIC
&&
1945 cmd
->nc_cqe
.cqe_sf
.sf_sc
== NVME_CQE_SC_GEN_INV_FLD
) {
1947 case NVME_FEAT_LBA_RANGE
:
1948 nvme
->n_lba_range_supported
= B_FALSE
;
1950 case NVME_FEAT_PROGRESS
:
1951 nvme
->n_progress_supported
= B_FALSE
;
1961 /* Report the error otherwise */
1963 dev_err(nvme
->n_dip
, CE_WARN
,
1964 "!GET FEATURES %d failed with sct = %x, sc = %x",
1965 feature
, cmd
->nc_cqe
.cqe_sf
.sf_sct
,
1966 cmd
->nc_cqe
.cqe_sf
.sf_sc
);
1972 if (bufsize
!= NULL
&& *bufsize
!= 0) {
1973 ASSERT(buf
!= NULL
);
1974 *buf
= kmem_alloc(*bufsize
, KM_SLEEP
);
1975 bcopy(cmd
->nc_dma
->nd_memp
, *buf
, *bufsize
);
1978 *res
= cmd
->nc_cqe
.cqe_dw0
;
1986 nvme_write_cache_set(nvme_t
*nvme
, boolean_t enable
)
1988 nvme_write_cache_t nwc
= { 0 };
1993 return (nvme_set_features(nvme
, 0, NVME_FEAT_WRITE_CACHE
, nwc
.r
,
1998 nvme_set_nqueues(nvme_t
*nvme
, uint16_t *nqueues
)
2000 nvme_nqueues_t nq
= { 0 };
2003 nq
.b
.nq_nsq
= nq
.b
.nq_ncq
= *nqueues
- 1;
2005 ret
= nvme_set_features(nvme
, 0, NVME_FEAT_NQUEUES
, nq
.r
, &nq
.r
);
2009 * Always use the same number of submission and completion
2010 * queues, and never use more than the requested number of
2013 *nqueues
= MIN(*nqueues
, MIN(nq
.b
.nq_nsq
, nq
.b
.nq_ncq
) + 1);
2020 nvme_create_io_qpair(nvme_t
*nvme
, nvme_qpair_t
*qp
, uint16_t idx
)
2022 nvme_cmd_t
*cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
2023 nvme_create_queue_dw10_t dw10
= { 0 };
2024 nvme_create_cq_dw11_t c_dw11
= { 0 };
2025 nvme_create_sq_dw11_t s_dw11
= { 0 };
2029 dw10
.b
.q_qsize
= qp
->nq_nentry
- 1;
2032 c_dw11
.b
.cq_ien
= 1;
2033 c_dw11
.b
.cq_iv
= idx
% nvme
->n_intr_cnt
;
2036 cmd
->nc_callback
= nvme_wakeup_cmd
;
2037 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_CREATE_CQUEUE
;
2038 cmd
->nc_sqe
.sqe_cdw10
= dw10
.r
;
2039 cmd
->nc_sqe
.sqe_cdw11
= c_dw11
.r
;
2040 cmd
->nc_sqe
.sqe_dptr
.d_prp
[0] = qp
->nq_cqdma
->nd_cookie
.dmac_laddress
;
2042 nvme_admin_cmd(cmd
, nvme_admin_cmd_timeout
);
2044 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
2045 dev_err(nvme
->n_dip
, CE_WARN
,
2046 "!CREATE CQUEUE failed with sct = %x, sc = %x",
2047 cmd
->nc_cqe
.cqe_sf
.sf_sct
, cmd
->nc_cqe
.cqe_sf
.sf_sc
);
2054 s_dw11
.b
.sq_cqid
= idx
;
2056 cmd
= nvme_alloc_cmd(nvme
, KM_SLEEP
);
2058 cmd
->nc_callback
= nvme_wakeup_cmd
;
2059 cmd
->nc_sqe
.sqe_opc
= NVME_OPC_CREATE_SQUEUE
;
2060 cmd
->nc_sqe
.sqe_cdw10
= dw10
.r
;
2061 cmd
->nc_sqe
.sqe_cdw11
= s_dw11
.r
;
2062 cmd
->nc_sqe
.sqe_dptr
.d_prp
[0] = qp
->nq_sqdma
->nd_cookie
.dmac_laddress
;
2064 nvme_admin_cmd(cmd
, nvme_admin_cmd_timeout
);
2066 if ((ret
= nvme_check_cmd_status(cmd
)) != 0) {
2067 dev_err(nvme
->n_dip
, CE_WARN
,
2068 "!CREATE SQUEUE failed with sct = %x, sc = %x",
2069 cmd
->nc_cqe
.cqe_sf
.sf_sct
, cmd
->nc_cqe
.cqe_sf
.sf_sc
);
2080 nvme_reset(nvme_t
*nvme
, boolean_t quiesce
)
2082 nvme_reg_csts_t csts
;
2085 nvme_put32(nvme
, NVME_REG_CC
, 0);
2087 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
2088 if (csts
.b
.csts_rdy
== 1) {
2089 nvme_put32(nvme
, NVME_REG_CC
, 0);
2090 for (i
= 0; i
!= nvme
->n_timeout
* 10; i
++) {
2091 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
2092 if (csts
.b
.csts_rdy
== 0)
2096 drv_usecwait(50000);
2098 delay(drv_usectohz(50000));
2102 nvme_put32(nvme
, NVME_REG_AQA
, 0);
2103 nvme_put32(nvme
, NVME_REG_ASQ
, 0);
2104 nvme_put32(nvme
, NVME_REG_ACQ
, 0);
2106 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
2107 return (csts
.b
.csts_rdy
== 0 ? B_TRUE
: B_FALSE
);
2111 nvme_shutdown(nvme_t
*nvme
, int mode
, boolean_t quiesce
)
2114 nvme_reg_csts_t csts
;
2117 ASSERT(mode
== NVME_CC_SHN_NORMAL
|| mode
== NVME_CC_SHN_ABRUPT
);
2119 cc
.r
= nvme_get32(nvme
, NVME_REG_CC
);
2120 cc
.b
.cc_shn
= mode
& 0x3;
2121 nvme_put32(nvme
, NVME_REG_CC
, cc
.r
);
2123 for (i
= 0; i
!= 10; i
++) {
2124 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
2125 if (csts
.b
.csts_shst
== NVME_CSTS_SHN_COMPLETE
)
2129 drv_usecwait(100000);
2131 delay(drv_usectohz(100000));
2137 nvme_prepare_devid(nvme_t
*nvme
, uint32_t nsid
)
2140 * Section 7.7 of the spec describes how to get a unique ID for
2141 * the controller: the vendor ID, the model name and the serial
2142 * number shall be unique when combined.
2144 * If a namespace has no EUI64 we use the above and add the hex
2145 * namespace ID to get a unique ID for the namespace.
2147 char model
[sizeof (nvme
->n_idctl
->id_model
) + 1];
2148 char serial
[sizeof (nvme
->n_idctl
->id_serial
) + 1];
2150 bcopy(nvme
->n_idctl
->id_model
, model
, sizeof (nvme
->n_idctl
->id_model
));
2151 bcopy(nvme
->n_idctl
->id_serial
, serial
,
2152 sizeof (nvme
->n_idctl
->id_serial
));
2154 model
[sizeof (nvme
->n_idctl
->id_model
)] = '\0';
2155 serial
[sizeof (nvme
->n_idctl
->id_serial
)] = '\0';
2157 nvme
->n_ns
[nsid
- 1].ns_devid
= kmem_asprintf("%4X-%s-%s-%X",
2158 nvme
->n_idctl
->id_vid
, model
, serial
, nsid
);
2162 nvme_init_ns(nvme_t
*nvme
, int nsid
)
2164 nvme_namespace_t
*ns
= &nvme
->n_ns
[nsid
- 1];
2165 nvme_identify_nsid_t
*idns
;
2170 if (nvme_identify(nvme
, nsid
, (void **)&idns
) != 0) {
2171 dev_err(nvme
->n_dip
, CE_WARN
,
2172 "!failed to identify namespace %d", nsid
);
2173 return (DDI_FAILURE
);
2178 ns
->ns_block_count
= idns
->id_nsize
;
2180 1 << idns
->id_lbaf
[idns
->id_flbas
.lba_format
].lbaf_lbads
;
2181 ns
->ns_best_block_size
= ns
->ns_block_size
;
2184 * Get the EUI64 if present. Use it for devid and device node names.
2186 if (NVME_VERSION_ATLEAST(&nvme
->n_version
, 1, 1))
2187 bcopy(idns
->id_eui64
, ns
->ns_eui64
, sizeof (ns
->ns_eui64
));
2189 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
2190 if (*(uint64_t *)ns
->ns_eui64
!= 0) {
2191 uint8_t *eui64
= ns
->ns_eui64
;
2193 (void) snprintf(ns
->ns_name
, sizeof (ns
->ns_name
),
2194 "%02x%02x%02x%02x%02x%02x%02x%02x",
2195 eui64
[0], eui64
[1], eui64
[2], eui64
[3],
2196 eui64
[4], eui64
[5], eui64
[6], eui64
[7]);
2198 (void) snprintf(ns
->ns_name
, sizeof (ns
->ns_name
), "%d",
2201 nvme_prepare_devid(nvme
, ns
->ns_id
);
2205 * Find the LBA format with no metadata and the best relative
2206 * performance. A value of 3 means "degraded", 0 is best.
2209 for (int j
= 0; j
<= idns
->id_nlbaf
; j
++) {
2210 if (idns
->id_lbaf
[j
].lbaf_lbads
== 0)
2212 if (idns
->id_lbaf
[j
].lbaf_ms
!= 0)
2214 if (idns
->id_lbaf
[j
].lbaf_rp
>= last_rp
)
2216 last_rp
= idns
->id_lbaf
[j
].lbaf_rp
;
2217 ns
->ns_best_block_size
=
2218 1 << idns
->id_lbaf
[j
].lbaf_lbads
;
2221 if (ns
->ns_best_block_size
< nvme
->n_min_block_size
)
2222 ns
->ns_best_block_size
= nvme
->n_min_block_size
;
2225 * We currently don't support namespaces that use either:
2226 * - protection information
2227 * - illegal block size (< 512)
2229 if (idns
->id_dps
.dp_pinfo
) {
2230 dev_err(nvme
->n_dip
, CE_WARN
,
2231 "!ignoring namespace %d, unsupported feature: "
2232 "pinfo = %d", nsid
, idns
->id_dps
.dp_pinfo
);
2233 ns
->ns_ignore
= B_TRUE
;
2234 } else if (ns
->ns_block_size
< 512) {
2235 dev_err(nvme
->n_dip
, CE_WARN
,
2236 "!ignoring namespace %d, unsupported block size %"PRIu64
,
2237 nsid
, (uint64_t)ns
->ns_block_size
);
2238 ns
->ns_ignore
= B_TRUE
;
2240 ns
->ns_ignore
= B_FALSE
;
2243 return (DDI_SUCCESS
);
2247 nvme_init(nvme_t
*nvme
)
2249 nvme_reg_cc_t cc
= { 0 };
2250 nvme_reg_aqa_t aqa
= { 0 };
2251 nvme_reg_asq_t asq
= { 0 };
2252 nvme_reg_acq_t acq
= { 0 };
2255 nvme_reg_csts_t csts
;
2258 char model
[sizeof (nvme
->n_idctl
->id_model
) + 1];
2259 char *vendor
, *product
;
2261 /* Check controller version */
2262 vs
.r
= nvme_get32(nvme
, NVME_REG_VS
);
2263 nvme
->n_version
.v_major
= vs
.b
.vs_mjr
;
2264 nvme
->n_version
.v_minor
= vs
.b
.vs_mnr
;
2265 dev_err(nvme
->n_dip
, CE_CONT
, "?NVMe spec version %d.%d",
2266 nvme
->n_version
.v_major
, nvme
->n_version
.v_minor
);
2268 if (NVME_VERSION_HIGHER(&nvme
->n_version
,
2269 nvme_version_major
, nvme_version_minor
)) {
2270 dev_err(nvme
->n_dip
, CE_WARN
, "!no support for version > %d.%d",
2271 nvme_version_major
, nvme_version_minor
);
2272 if (nvme
->n_strict_version
)
2276 /* retrieve controller configuration */
2277 cap
.r
= nvme_get64(nvme
, NVME_REG_CAP
);
2279 if ((cap
.b
.cap_css
& NVME_CAP_CSS_NVM
) == 0) {
2280 dev_err(nvme
->n_dip
, CE_WARN
,
2281 "!NVM command set not supported by hardware");
2285 nvme
->n_nssr_supported
= cap
.b
.cap_nssrs
;
2286 nvme
->n_doorbell_stride
= 4 << cap
.b
.cap_dstrd
;
2287 nvme
->n_timeout
= cap
.b
.cap_to
;
2288 nvme
->n_arbitration_mechanisms
= cap
.b
.cap_ams
;
2289 nvme
->n_cont_queues_reqd
= cap
.b
.cap_cqr
;
2290 nvme
->n_max_queue_entries
= cap
.b
.cap_mqes
+ 1;
2293 * The MPSMIN and MPSMAX fields in the CAP register use 0 to specify
2294 * the base page size of 4k (1<<12), so add 12 here to get the real
2297 nvme
->n_pageshift
= MIN(MAX(cap
.b
.cap_mpsmin
+ 12, PAGESHIFT
),
2298 cap
.b
.cap_mpsmax
+ 12);
2299 nvme
->n_pagesize
= 1UL << (nvme
->n_pageshift
);
2302 * Set up Queue DMA to transfer at least 1 page-aligned page at a time.
2304 nvme
->n_queue_dma_attr
.dma_attr_align
= nvme
->n_pagesize
;
2305 nvme
->n_queue_dma_attr
.dma_attr_minxfer
= nvme
->n_pagesize
;
2308 * Set up PRP DMA to transfer 1 page-aligned page at a time.
2309 * Maxxfer may be increased after we identified the controller limits.
2311 nvme
->n_prp_dma_attr
.dma_attr_maxxfer
= nvme
->n_pagesize
;
2312 nvme
->n_prp_dma_attr
.dma_attr_minxfer
= nvme
->n_pagesize
;
2313 nvme
->n_prp_dma_attr
.dma_attr_align
= nvme
->n_pagesize
;
2314 nvme
->n_prp_dma_attr
.dma_attr_seg
= nvme
->n_pagesize
- 1;
2317 * Reset controller if it's still in ready state.
2319 if (nvme_reset(nvme
, B_FALSE
) == B_FALSE
) {
2320 dev_err(nvme
->n_dip
, CE_WARN
, "!unable to reset controller");
2321 ddi_fm_service_impact(nvme
->n_dip
, DDI_SERVICE_LOST
);
2322 nvme
->n_dead
= B_TRUE
;
2327 * Create the admin queue pair.
2329 if (nvme_alloc_qpair(nvme
, nvme
->n_admin_queue_len
, &nvme
->n_adminq
, 0)
2331 dev_err(nvme
->n_dip
, CE_WARN
,
2332 "!unable to allocate admin qpair");
2335 nvme
->n_ioq
= kmem_alloc(sizeof (nvme_qpair_t
*), KM_SLEEP
);
2336 nvme
->n_ioq
[0] = nvme
->n_adminq
;
2338 nvme
->n_progress
|= NVME_ADMIN_QUEUE
;
2340 (void) ddi_prop_update_int(DDI_DEV_T_NONE
, nvme
->n_dip
,
2341 "admin-queue-len", nvme
->n_admin_queue_len
);
2343 aqa
.b
.aqa_asqs
= aqa
.b
.aqa_acqs
= nvme
->n_admin_queue_len
- 1;
2344 asq
= nvme
->n_adminq
->nq_sqdma
->nd_cookie
.dmac_laddress
;
2345 acq
= nvme
->n_adminq
->nq_cqdma
->nd_cookie
.dmac_laddress
;
2347 ASSERT((asq
& (nvme
->n_pagesize
- 1)) == 0);
2348 ASSERT((acq
& (nvme
->n_pagesize
- 1)) == 0);
2350 nvme_put32(nvme
, NVME_REG_AQA
, aqa
.r
);
2351 nvme_put64(nvme
, NVME_REG_ASQ
, asq
);
2352 nvme_put64(nvme
, NVME_REG_ACQ
, acq
);
2354 cc
.b
.cc_ams
= 0; /* use Round-Robin arbitration */
2355 cc
.b
.cc_css
= 0; /* use NVM command set */
2356 cc
.b
.cc_mps
= nvme
->n_pageshift
- 12;
2357 cc
.b
.cc_shn
= 0; /* no shutdown in progress */
2358 cc
.b
.cc_en
= 1; /* enable controller */
2359 cc
.b
.cc_iosqes
= 6; /* submission queue entry is 2^6 bytes long */
2360 cc
.b
.cc_iocqes
= 4; /* completion queue entry is 2^4 bytes long */
2362 nvme_put32(nvme
, NVME_REG_CC
, cc
.r
);
2365 * Wait for the controller to become ready.
2367 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
2368 if (csts
.b
.csts_rdy
== 0) {
2369 for (i
= 0; i
!= nvme
->n_timeout
* 10; i
++) {
2370 delay(drv_usectohz(50000));
2371 csts
.r
= nvme_get32(nvme
, NVME_REG_CSTS
);
2373 if (csts
.b
.csts_cfs
== 1) {
2374 dev_err(nvme
->n_dip
, CE_WARN
,
2375 "!controller fatal status at init");
2376 ddi_fm_service_impact(nvme
->n_dip
,
2378 nvme
->n_dead
= B_TRUE
;
2382 if (csts
.b
.csts_rdy
== 1)
2387 if (csts
.b
.csts_rdy
== 0) {
2388 dev_err(nvme
->n_dip
, CE_WARN
, "!controller not ready");
2389 ddi_fm_service_impact(nvme
->n_dip
, DDI_SERVICE_LOST
);
2390 nvme
->n_dead
= B_TRUE
;
2395 * Assume an abort command limit of 1. We'll destroy and re-init
2396 * that later when we know the true abort command limit.
2398 sema_init(&nvme
->n_abort_sema
, 1, NULL
, SEMA_DRIVER
, NULL
);
2401 * Setup initial interrupt for admin queue.
2403 if ((nvme_setup_interrupts(nvme
, DDI_INTR_TYPE_MSIX
, 1)
2405 (nvme_setup_interrupts(nvme
, DDI_INTR_TYPE_MSI
, 1)
2407 (nvme_setup_interrupts(nvme
, DDI_INTR_TYPE_FIXED
, 1)
2409 dev_err(nvme
->n_dip
, CE_WARN
,
2410 "!failed to setup initial interrupt");
2415 * Post an asynchronous event command to catch errors.
2416 * We assume the asynchronous events are supported as required by
2417 * specification (Figure 40 in section 5 of NVMe 1.2).
2418 * However, since at least qemu does not follow the specification,
2419 * we need a mechanism to protect ourselves.
2421 nvme
->n_async_event_supported
= B_TRUE
;
2422 nvme_async_event(nvme
);
2425 * Identify Controller
2427 if (nvme_identify(nvme
, 0, (void **)&nvme
->n_idctl
) != 0) {
2428 dev_err(nvme
->n_dip
, CE_WARN
,
2429 "!failed to identify controller");
2434 * Get Vendor & Product ID
2436 bcopy(nvme
->n_idctl
->id_model
, model
, sizeof (nvme
->n_idctl
->id_model
));
2437 model
[sizeof (nvme
->n_idctl
->id_model
)] = '\0';
2438 sata_split_model(model
, &vendor
, &product
);
2441 nvme
->n_vendor
= strdup("NVMe");
2443 nvme
->n_vendor
= strdup(vendor
);
2445 nvme
->n_product
= strdup(product
);
2448 * Get controller limits.
2450 nvme
->n_async_event_limit
= MAX(NVME_MIN_ASYNC_EVENT_LIMIT
,
2451 MIN(nvme
->n_admin_queue_len
/ 10,
2452 MIN(nvme
->n_idctl
->id_aerl
+ 1, nvme
->n_async_event_limit
)));
2454 (void) ddi_prop_update_int(DDI_DEV_T_NONE
, nvme
->n_dip
,
2455 "async-event-limit", nvme
->n_async_event_limit
);
2457 nvme
->n_abort_command_limit
= nvme
->n_idctl
->id_acl
+ 1;
2460 * Reinitialize the semaphore with the true abort command limit
2461 * supported by the hardware. It's not necessary to disable interrupts
2462 * as only command aborts use the semaphore, and no commands are
2463 * executed or aborted while we're here.
2465 sema_destroy(&nvme
->n_abort_sema
);
2466 sema_init(&nvme
->n_abort_sema
, nvme
->n_abort_command_limit
- 1, NULL
,
2469 nvme
->n_progress
|= NVME_CTRL_LIMITS
;
2471 if (nvme
->n_idctl
->id_mdts
== 0)
2472 nvme
->n_max_data_transfer_size
= nvme
->n_pagesize
* 65536;
2474 nvme
->n_max_data_transfer_size
=
2475 1ull << (nvme
->n_pageshift
+ nvme
->n_idctl
->id_mdts
);
2477 nvme
->n_error_log_len
= nvme
->n_idctl
->id_elpe
+ 1;
2480 * Limit n_max_data_transfer_size to what we can handle in one PRP.
2481 * Chained PRPs are currently unsupported.
2483 * This is a no-op on hardware which doesn't support a transfer size
2484 * big enough to require chained PRPs.
2486 nvme
->n_max_data_transfer_size
= MIN(nvme
->n_max_data_transfer_size
,
2487 (nvme
->n_pagesize
/ sizeof (uint64_t) * nvme
->n_pagesize
));
2489 nvme
->n_prp_dma_attr
.dma_attr_maxxfer
= nvme
->n_max_data_transfer_size
;
2492 * Make sure the minimum/maximum queue entry sizes are not
2493 * larger/smaller than the default.
2496 if (((1 << nvme
->n_idctl
->id_sqes
.qes_min
) > sizeof (nvme_sqe_t
)) ||
2497 ((1 << nvme
->n_idctl
->id_sqes
.qes_max
) < sizeof (nvme_sqe_t
)) ||
2498 ((1 << nvme
->n_idctl
->id_cqes
.qes_min
) > sizeof (nvme_cqe_t
)) ||
2499 ((1 << nvme
->n_idctl
->id_cqes
.qes_max
) < sizeof (nvme_cqe_t
)))
2503 * Check for the presence of a Volatile Write Cache. If present,
2504 * enable or disable based on the value of the property
2505 * volatile-write-cache-enable (default is enabled).
2507 nvme
->n_write_cache_present
=
2508 nvme
->n_idctl
->id_vwc
.vwc_present
== 0 ? B_FALSE
: B_TRUE
;
2510 (void) ddi_prop_update_int(DDI_DEV_T_NONE
, nvme
->n_dip
,
2511 "volatile-write-cache-present",
2512 nvme
->n_write_cache_present
? 1 : 0);
2514 if (!nvme
->n_write_cache_present
) {
2515 nvme
->n_write_cache_enabled
= B_FALSE
;
2516 } else if (nvme_write_cache_set(nvme
, nvme
->n_write_cache_enabled
)
2518 dev_err(nvme
->n_dip
, CE_WARN
,
2519 "!failed to %sable volatile write cache",
2520 nvme
->n_write_cache_enabled
? "en" : "dis");
2522 * Assume the cache is (still) enabled.
2524 nvme
->n_write_cache_enabled
= B_TRUE
;
2527 (void) ddi_prop_update_int(DDI_DEV_T_NONE
, nvme
->n_dip
,
2528 "volatile-write-cache-enable",
2529 nvme
->n_write_cache_enabled
? 1 : 0);
2532 * Assume LBA Range Type feature is supported. If it isn't this
2533 * will be set to B_FALSE by nvme_get_features().
2535 nvme
->n_lba_range_supported
= B_TRUE
;
2538 * Check support for Autonomous Power State Transition.
2540 if (NVME_VERSION_ATLEAST(&nvme
->n_version
, 1, 1))
2541 nvme
->n_auto_pst_supported
=
2542 nvme
->n_idctl
->id_apsta
.ap_sup
== 0 ? B_FALSE
: B_TRUE
;
2545 * Assume Software Progress Marker feature is supported. If it isn't
2546 * this will be set to B_FALSE by nvme_get_features().
2548 nvme
->n_progress_supported
= B_TRUE
;
2551 * Identify Namespaces
2553 nvme
->n_namespace_count
= nvme
->n_idctl
->id_nn
;
2555 if (nvme
->n_namespace_count
== 0) {
2556 dev_err(nvme
->n_dip
, CE_WARN
,
2557 "!controllers without namespaces are not supported");
2561 if (nvme
->n_namespace_count
> NVME_MINOR_MAX
) {
2562 dev_err(nvme
->n_dip
, CE_WARN
,
2563 "!too many namespaces: %d, limiting to %d\n",
2564 nvme
->n_namespace_count
, NVME_MINOR_MAX
);
2565 nvme
->n_namespace_count
= NVME_MINOR_MAX
;
2568 nvme
->n_ns
= kmem_zalloc(sizeof (nvme_namespace_t
) *
2569 nvme
->n_namespace_count
, KM_SLEEP
);
2571 for (i
= 0; i
!= nvme
->n_namespace_count
; i
++) {
2572 mutex_init(&nvme
->n_ns
[i
].ns_minor
.nm_mutex
, NULL
, MUTEX_DRIVER
,
2574 if (nvme_init_ns(nvme
, i
+ 1) != DDI_SUCCESS
)
2579 * Try to set up MSI/MSI-X interrupts.
2581 if ((nvme
->n_intr_types
& (DDI_INTR_TYPE_MSI
| DDI_INTR_TYPE_MSIX
))
2583 nvme_release_interrupts(nvme
);
2585 nqueues
= MIN(UINT16_MAX
, ncpus
);
2587 if ((nvme_setup_interrupts(nvme
, DDI_INTR_TYPE_MSIX
,
2588 nqueues
) != DDI_SUCCESS
) &&
2589 (nvme_setup_interrupts(nvme
, DDI_INTR_TYPE_MSI
,
2590 nqueues
) != DDI_SUCCESS
)) {
2591 dev_err(nvme
->n_dip
, CE_WARN
,
2592 "!failed to setup MSI/MSI-X interrupts");
2597 nqueues
= nvme
->n_intr_cnt
;
2600 * Create I/O queue pairs.
2603 if (nvme_set_nqueues(nvme
, &nqueues
) != 0) {
2604 dev_err(nvme
->n_dip
, CE_WARN
,
2605 "!failed to set number of I/O queues to %d",
2611 * Reallocate I/O queue array
2613 kmem_free(nvme
->n_ioq
, sizeof (nvme_qpair_t
*));
2614 nvme
->n_ioq
= kmem_zalloc(sizeof (nvme_qpair_t
*) *
2615 (nqueues
+ 1), KM_SLEEP
);
2616 nvme
->n_ioq
[0] = nvme
->n_adminq
;
2618 nvme
->n_ioq_count
= nqueues
;
2621 * If we got less queues than we asked for we might as well give
2622 * some of the interrupt vectors back to the system.
2624 if (nvme
->n_ioq_count
< nvme
->n_intr_cnt
) {
2625 nvme_release_interrupts(nvme
);
2627 if (nvme_setup_interrupts(nvme
, nvme
->n_intr_type
,
2628 nvme
->n_ioq_count
) != DDI_SUCCESS
) {
2629 dev_err(nvme
->n_dip
, CE_WARN
,
2630 "!failed to reduce number of interrupts");
2636 * Alloc & register I/O queue pairs
2638 nvme
->n_io_queue_len
=
2639 MIN(nvme
->n_io_queue_len
, nvme
->n_max_queue_entries
);
2640 (void) ddi_prop_update_int(DDI_DEV_T_NONE
, nvme
->n_dip
, "io-queue-len",
2641 nvme
->n_io_queue_len
);
2643 for (i
= 1; i
!= nvme
->n_ioq_count
+ 1; i
++) {
2644 if (nvme_alloc_qpair(nvme
, nvme
->n_io_queue_len
,
2645 &nvme
->n_ioq
[i
], i
) != DDI_SUCCESS
) {
2646 dev_err(nvme
->n_dip
, CE_WARN
,
2647 "!unable to allocate I/O qpair %d", i
);
2651 if (nvme_create_io_qpair(nvme
, nvme
->n_ioq
[i
], i
) != 0) {
2652 dev_err(nvme
->n_dip
, CE_WARN
,
2653 "!unable to create I/O qpair %d", i
);
2659 * Post more asynchronous events commands to reduce event reporting
2660 * latency as suggested by the spec.
2662 if (nvme
->n_async_event_supported
) {
2663 for (i
= 1; i
!= nvme
->n_async_event_limit
; i
++)
2664 nvme_async_event(nvme
);
2667 return (DDI_SUCCESS
);
2670 (void) nvme_reset(nvme
, B_FALSE
);
2671 return (DDI_FAILURE
);
2675 nvme_intr(caddr_t arg1
, caddr_t arg2
)
2677 /*LINTED: E_PTR_BAD_CAST_ALIGN*/
2678 nvme_t
*nvme
= (nvme_t
*)arg1
;
2679 int inum
= (int)(uintptr_t)arg2
;
2684 if (inum
>= nvme
->n_intr_cnt
)
2685 return (DDI_INTR_UNCLAIMED
);
2688 return (nvme
->n_intr_type
== DDI_INTR_TYPE_FIXED
?
2689 DDI_INTR_UNCLAIMED
: DDI_INTR_CLAIMED
);
2692 * The interrupt vector a queue uses is calculated as queue_idx %
2693 * intr_cnt in nvme_create_io_qpair(). Iterate through the queue array
2694 * in steps of n_intr_cnt to process all queues using this vector.
2697 qnum
< nvme
->n_ioq_count
+ 1 && nvme
->n_ioq
[qnum
] != NULL
;
2698 qnum
+= nvme
->n_intr_cnt
) {
2699 while ((cmd
= nvme_retrieve_cmd(nvme
, nvme
->n_ioq
[qnum
]))) {
2700 taskq_dispatch_ent((taskq_t
*)cmd
->nc_nvme
->n_cmd_taskq
,
2701 cmd
->nc_callback
, cmd
, TQ_NOSLEEP
, &cmd
->nc_tqent
);
2706 return (ccnt
> 0 ? DDI_INTR_CLAIMED
: DDI_INTR_UNCLAIMED
);
2710 nvme_release_interrupts(nvme_t
*nvme
)
2714 for (i
= 0; i
< nvme
->n_intr_cnt
; i
++) {
2715 if (nvme
->n_inth
[i
] == NULL
)
2718 if (nvme
->n_intr_cap
& DDI_INTR_FLAG_BLOCK
)
2719 (void) ddi_intr_block_disable(&nvme
->n_inth
[i
], 1);
2721 (void) ddi_intr_disable(nvme
->n_inth
[i
]);
2723 (void) ddi_intr_remove_handler(nvme
->n_inth
[i
]);
2724 (void) ddi_intr_free(nvme
->n_inth
[i
]);
2727 kmem_free(nvme
->n_inth
, nvme
->n_inth_sz
);
2728 nvme
->n_inth
= NULL
;
2729 nvme
->n_inth_sz
= 0;
2731 nvme
->n_progress
&= ~NVME_INTERRUPTS
;
2735 nvme_setup_interrupts(nvme_t
*nvme
, int intr_type
, int nqpairs
)
2737 int nintrs
, navail
, count
;
2741 if (nvme
->n_intr_types
== 0) {
2742 ret
= ddi_intr_get_supported_types(nvme
->n_dip
,
2743 &nvme
->n_intr_types
);
2744 if (ret
!= DDI_SUCCESS
) {
2745 dev_err(nvme
->n_dip
, CE_WARN
,
2746 "!%s: ddi_intr_get_supported types failed",
2751 if (get_hwenv() == HW_VMWARE
)
2752 nvme
->n_intr_types
&= ~DDI_INTR_TYPE_MSIX
;
2756 if ((nvme
->n_intr_types
& intr_type
) == 0)
2757 return (DDI_FAILURE
);
2759 ret
= ddi_intr_get_nintrs(nvme
->n_dip
, intr_type
, &nintrs
);
2760 if (ret
!= DDI_SUCCESS
) {
2761 dev_err(nvme
->n_dip
, CE_WARN
, "!%s: ddi_intr_get_nintrs failed",
2766 ret
= ddi_intr_get_navail(nvme
->n_dip
, intr_type
, &navail
);
2767 if (ret
!= DDI_SUCCESS
) {
2768 dev_err(nvme
->n_dip
, CE_WARN
, "!%s: ddi_intr_get_navail failed",
2773 /* We want at most one interrupt per queue pair. */
2774 if (navail
> nqpairs
)
2777 nvme
->n_inth_sz
= sizeof (ddi_intr_handle_t
) * navail
;
2778 nvme
->n_inth
= kmem_zalloc(nvme
->n_inth_sz
, KM_SLEEP
);
2780 ret
= ddi_intr_alloc(nvme
->n_dip
, nvme
->n_inth
, intr_type
, 0, navail
,
2782 if (ret
!= DDI_SUCCESS
) {
2783 dev_err(nvme
->n_dip
, CE_WARN
, "!%s: ddi_intr_alloc failed",
2788 nvme
->n_intr_cnt
= count
;
2790 ret
= ddi_intr_get_pri(nvme
->n_inth
[0], &nvme
->n_intr_pri
);
2791 if (ret
!= DDI_SUCCESS
) {
2792 dev_err(nvme
->n_dip
, CE_WARN
, "!%s: ddi_intr_get_pri failed",
2797 for (i
= 0; i
< count
; i
++) {
2798 ret
= ddi_intr_add_handler(nvme
->n_inth
[i
], nvme_intr
,
2799 (void *)nvme
, (void *)(uintptr_t)i
);
2800 if (ret
!= DDI_SUCCESS
) {
2801 dev_err(nvme
->n_dip
, CE_WARN
,
2802 "!%s: ddi_intr_add_handler failed", __func__
);
2807 (void) ddi_intr_get_cap(nvme
->n_inth
[0], &nvme
->n_intr_cap
);
2809 for (i
= 0; i
< count
; i
++) {
2810 if (nvme
->n_intr_cap
& DDI_INTR_FLAG_BLOCK
)
2811 ret
= ddi_intr_block_enable(&nvme
->n_inth
[i
], 1);
2813 ret
= ddi_intr_enable(nvme
->n_inth
[i
]);
2815 if (ret
!= DDI_SUCCESS
) {
2816 dev_err(nvme
->n_dip
, CE_WARN
,
2817 "!%s: enabling interrupt %d failed", __func__
, i
);
2822 nvme
->n_intr_type
= intr_type
;
2824 nvme
->n_progress
|= NVME_INTERRUPTS
;
2826 return (DDI_SUCCESS
);
2829 nvme_release_interrupts(nvme
);
2835 nvme_fm_errcb(dev_info_t
*dip
, ddi_fm_error_t
*fm_error
, const void *arg
)
2837 _NOTE(ARGUNUSED(arg
));
2839 pci_ereport_post(dip
, fm_error
, NULL
);
2840 return (fm_error
->fme_status
);
2844 nvme_attach(dev_info_t
*dip
, ddi_attach_cmd_t cmd
)
2853 if (cmd
!= DDI_ATTACH
)
2854 return (DDI_FAILURE
);
2856 instance
= ddi_get_instance(dip
);
2858 if (ddi_soft_state_zalloc(nvme_state
, instance
) != DDI_SUCCESS
)
2859 return (DDI_FAILURE
);
2861 nvme
= ddi_get_soft_state(nvme_state
, instance
);
2862 ddi_set_driver_private(dip
, nvme
);
2865 mutex_init(&nvme
->n_minor
.nm_mutex
, NULL
, MUTEX_DRIVER
, NULL
);
2867 nvme
->n_strict_version
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
2868 DDI_PROP_DONTPASS
, "strict-version", 1) == 1 ? B_TRUE
: B_FALSE
;
2869 nvme
->n_ignore_unknown_vendor_status
= ddi_prop_get_int(DDI_DEV_T_ANY
,
2870 dip
, DDI_PROP_DONTPASS
, "ignore-unknown-vendor-status", 0) == 1 ?
2872 nvme
->n_admin_queue_len
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
2873 DDI_PROP_DONTPASS
, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN
);
2874 nvme
->n_io_queue_len
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
2875 DDI_PROP_DONTPASS
, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN
);
2876 nvme
->n_async_event_limit
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
2877 DDI_PROP_DONTPASS
, "async-event-limit",
2878 NVME_DEFAULT_ASYNC_EVENT_LIMIT
);
2879 nvme
->n_write_cache_enabled
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
2880 DDI_PROP_DONTPASS
, "volatile-write-cache-enable", 1) != 0 ?
2882 nvme
->n_min_block_size
= ddi_prop_get_int(DDI_DEV_T_ANY
, dip
,
2883 DDI_PROP_DONTPASS
, "min-phys-block-size",
2884 NVME_DEFAULT_MIN_BLOCK_SIZE
);
2886 if (!ISP2(nvme
->n_min_block_size
) ||
2887 (nvme
->n_min_block_size
< NVME_DEFAULT_MIN_BLOCK_SIZE
)) {
2888 dev_err(dip
, CE_WARN
, "!min-phys-block-size %s, "
2889 "using default %d", ISP2(nvme
->n_min_block_size
) ?
2890 "too low" : "not a power of 2",
2891 NVME_DEFAULT_MIN_BLOCK_SIZE
);
2892 nvme
->n_min_block_size
= NVME_DEFAULT_MIN_BLOCK_SIZE
;
2895 if (nvme
->n_admin_queue_len
< NVME_MIN_ADMIN_QUEUE_LEN
)
2896 nvme
->n_admin_queue_len
= NVME_MIN_ADMIN_QUEUE_LEN
;
2897 else if (nvme
->n_admin_queue_len
> NVME_MAX_ADMIN_QUEUE_LEN
)
2898 nvme
->n_admin_queue_len
= NVME_MAX_ADMIN_QUEUE_LEN
;
2900 if (nvme
->n_io_queue_len
< NVME_MIN_IO_QUEUE_LEN
)
2901 nvme
->n_io_queue_len
= NVME_MIN_IO_QUEUE_LEN
;
2903 if (nvme
->n_async_event_limit
< 1)
2904 nvme
->n_async_event_limit
= NVME_DEFAULT_ASYNC_EVENT_LIMIT
;
2906 nvme
->n_reg_acc_attr
= nvme_reg_acc_attr
;
2907 nvme
->n_queue_dma_attr
= nvme_queue_dma_attr
;
2908 nvme
->n_prp_dma_attr
= nvme_prp_dma_attr
;
2909 nvme
->n_sgl_dma_attr
= nvme_sgl_dma_attr
;
2912 * Setup FMA support.
2914 nvme
->n_fm_cap
= ddi_getprop(DDI_DEV_T_ANY
, dip
,
2915 DDI_PROP_CANSLEEP
| DDI_PROP_DONTPASS
, "fm-capable",
2916 DDI_FM_EREPORT_CAPABLE
| DDI_FM_ACCCHK_CAPABLE
|
2917 DDI_FM_DMACHK_CAPABLE
| DDI_FM_ERRCB_CAPABLE
);
2919 ddi_fm_init(dip
, &nvme
->n_fm_cap
, &nvme
->n_fm_ibc
);
2921 if (nvme
->n_fm_cap
) {
2922 if (nvme
->n_fm_cap
& DDI_FM_ACCCHK_CAPABLE
)
2923 nvme
->n_reg_acc_attr
.devacc_attr_access
=
2926 if (nvme
->n_fm_cap
& DDI_FM_DMACHK_CAPABLE
) {
2927 nvme
->n_prp_dma_attr
.dma_attr_flags
|= DDI_DMA_FLAGERR
;
2928 nvme
->n_sgl_dma_attr
.dma_attr_flags
|= DDI_DMA_FLAGERR
;
2931 if (DDI_FM_EREPORT_CAP(nvme
->n_fm_cap
) ||
2932 DDI_FM_ERRCB_CAP(nvme
->n_fm_cap
))
2933 pci_ereport_setup(dip
);
2935 if (DDI_FM_ERRCB_CAP(nvme
->n_fm_cap
))
2936 ddi_fm_handler_register(dip
, nvme_fm_errcb
,
2940 nvme
->n_progress
|= NVME_FMA_INIT
;
2943 * The spec defines several register sets. Only the controller
2944 * registers (set 1) are currently used.
2946 if (ddi_dev_nregs(dip
, &nregs
) == DDI_FAILURE
||
2948 ddi_dev_regsize(dip
, 1, ®size
) == DDI_FAILURE
)
2951 if (ddi_regs_map_setup(dip
, 1, &nvme
->n_regs
, 0, regsize
,
2952 &nvme
->n_reg_acc_attr
, &nvme
->n_regh
) != DDI_SUCCESS
) {
2953 dev_err(dip
, CE_WARN
, "!failed to map regset 1");
2957 nvme
->n_progress
|= NVME_REGS_MAPPED
;
2960 * Create taskq for command completion.
2962 (void) snprintf(name
, sizeof (name
), "%s%d_cmd_taskq",
2963 ddi_driver_name(dip
), ddi_get_instance(dip
));
2964 nvme
->n_cmd_taskq
= ddi_taskq_create(dip
, name
, MIN(UINT16_MAX
, ncpus
),
2965 TASKQ_DEFAULTPRI
, 0);
2966 if (nvme
->n_cmd_taskq
== NULL
) {
2967 dev_err(dip
, CE_WARN
, "!failed to create cmd taskq");
2972 * Create PRP DMA cache
2974 (void) snprintf(name
, sizeof (name
), "%s%d_prp_cache",
2975 ddi_driver_name(dip
), ddi_get_instance(dip
));
2976 nvme
->n_prp_cache
= kmem_cache_create(name
, sizeof (nvme_dma_t
),
2977 0, nvme_prp_dma_constructor
, nvme_prp_dma_destructor
,
2978 NULL
, (void *)nvme
, NULL
, 0);
2980 if (nvme_init(nvme
) != DDI_SUCCESS
)
2984 * Attach the blkdev driver for each namespace.
2986 for (i
= 0; i
!= nvme
->n_namespace_count
; i
++) {
2987 if (ddi_create_minor_node(nvme
->n_dip
, nvme
->n_ns
[i
].ns_name
,
2988 S_IFCHR
, NVME_MINOR(ddi_get_instance(nvme
->n_dip
), i
+ 1),
2989 DDI_NT_NVME_ATTACHMENT_POINT
, 0) != DDI_SUCCESS
) {
2990 dev_err(dip
, CE_WARN
,
2991 "!failed to create minor node for namespace %d", i
);
2995 if (nvme
->n_ns
[i
].ns_ignore
)
2998 nvme
->n_ns
[i
].ns_bd_hdl
= bd_alloc_handle(&nvme
->n_ns
[i
],
2999 &nvme_bd_ops
, &nvme
->n_prp_dma_attr
, KM_SLEEP
);
3001 if (nvme
->n_ns
[i
].ns_bd_hdl
== NULL
) {
3002 dev_err(dip
, CE_WARN
,
3003 "!failed to get blkdev handle for namespace %d", i
);
3007 if (bd_attach_handle(dip
, nvme
->n_ns
[i
].ns_bd_hdl
)
3009 dev_err(dip
, CE_WARN
,
3010 "!failed to attach blkdev handle for namespace %d",
3016 if (ddi_create_minor_node(dip
, "devctl", S_IFCHR
,
3017 NVME_MINOR(ddi_get_instance(dip
), 0), DDI_NT_NVME_NEXUS
, 0)
3019 dev_err(dip
, CE_WARN
, "nvme_attach: "
3020 "cannot create devctl minor node");
3024 return (DDI_SUCCESS
);
3027 /* attach successful anyway so that FMA can retire the device */
3029 return (DDI_SUCCESS
);
3031 (void) nvme_detach(dip
, DDI_DETACH
);
3033 return (DDI_FAILURE
);
3037 nvme_detach(dev_info_t
*dip
, ddi_detach_cmd_t cmd
)
3042 if (cmd
!= DDI_DETACH
)
3043 return (DDI_FAILURE
);
3045 instance
= ddi_get_instance(dip
);
3047 nvme
= ddi_get_soft_state(nvme_state
, instance
);
3050 return (DDI_FAILURE
);
3052 ddi_remove_minor_node(dip
, "devctl");
3053 mutex_destroy(&nvme
->n_minor
.nm_mutex
);
3056 for (i
= 0; i
!= nvme
->n_namespace_count
; i
++) {
3057 ddi_remove_minor_node(dip
, nvme
->n_ns
[i
].ns_name
);
3058 mutex_destroy(&nvme
->n_ns
[i
].ns_minor
.nm_mutex
);
3060 if (nvme
->n_ns
[i
].ns_bd_hdl
) {
3061 (void) bd_detach_handle(
3062 nvme
->n_ns
[i
].ns_bd_hdl
);
3063 bd_free_handle(nvme
->n_ns
[i
].ns_bd_hdl
);
3066 if (nvme
->n_ns
[i
].ns_idns
)
3067 kmem_free(nvme
->n_ns
[i
].ns_idns
,
3068 sizeof (nvme_identify_nsid_t
));
3069 if (nvme
->n_ns
[i
].ns_devid
)
3070 strfree(nvme
->n_ns
[i
].ns_devid
);
3073 kmem_free(nvme
->n_ns
, sizeof (nvme_namespace_t
) *
3074 nvme
->n_namespace_count
);
3077 if (nvme
->n_progress
& NVME_INTERRUPTS
)
3078 nvme_release_interrupts(nvme
);
3080 if (nvme
->n_cmd_taskq
)
3081 ddi_taskq_wait(nvme
->n_cmd_taskq
);
3083 if (nvme
->n_ioq_count
> 0) {
3084 for (i
= 1; i
!= nvme
->n_ioq_count
+ 1; i
++) {
3085 if (nvme
->n_ioq
[i
] != NULL
) {
3086 /* TODO: send destroy queue commands */
3087 nvme_free_qpair(nvme
->n_ioq
[i
]);
3091 kmem_free(nvme
->n_ioq
, sizeof (nvme_qpair_t
*) *
3092 (nvme
->n_ioq_count
+ 1));
3095 if (nvme
->n_prp_cache
!= NULL
) {
3096 kmem_cache_destroy(nvme
->n_prp_cache
);
3099 if (nvme
->n_progress
& NVME_REGS_MAPPED
) {
3100 nvme_shutdown(nvme
, NVME_CC_SHN_NORMAL
, B_FALSE
);
3101 (void) nvme_reset(nvme
, B_FALSE
);
3104 if (nvme
->n_cmd_taskq
)
3105 ddi_taskq_destroy(nvme
->n_cmd_taskq
);
3107 if (nvme
->n_progress
& NVME_CTRL_LIMITS
)
3108 sema_destroy(&nvme
->n_abort_sema
);
3110 if (nvme
->n_progress
& NVME_ADMIN_QUEUE
)
3111 nvme_free_qpair(nvme
->n_adminq
);
3114 kmem_free(nvme
->n_idctl
, NVME_IDENTIFY_BUFSIZE
);
3116 if (nvme
->n_progress
& NVME_REGS_MAPPED
)
3117 ddi_regs_map_free(&nvme
->n_regh
);
3119 if (nvme
->n_progress
& NVME_FMA_INIT
) {
3120 if (DDI_FM_ERRCB_CAP(nvme
->n_fm_cap
))
3121 ddi_fm_handler_unregister(nvme
->n_dip
);
3123 if (DDI_FM_EREPORT_CAP(nvme
->n_fm_cap
) ||
3124 DDI_FM_ERRCB_CAP(nvme
->n_fm_cap
))
3125 pci_ereport_teardown(nvme
->n_dip
);
3127 ddi_fm_fini(nvme
->n_dip
);
3130 if (nvme
->n_vendor
!= NULL
)
3131 strfree(nvme
->n_vendor
);
3133 if (nvme
->n_product
!= NULL
)
3134 strfree(nvme
->n_product
);
3136 ddi_soft_state_free(nvme_state
, instance
);
3138 return (DDI_SUCCESS
);
3142 nvme_quiesce(dev_info_t
*dip
)
3147 instance
= ddi_get_instance(dip
);
3149 nvme
= ddi_get_soft_state(nvme_state
, instance
);
3152 return (DDI_FAILURE
);
3154 nvme_shutdown(nvme
, NVME_CC_SHN_ABRUPT
, B_TRUE
);
3156 (void) nvme_reset(nvme
, B_TRUE
);
3158 return (DDI_FAILURE
);
3162 nvme_fill_prp(nvme_cmd_t
*cmd
, bd_xfer_t
*xfer
)
3164 nvme_t
*nvme
= cmd
->nc_nvme
;
3165 int nprp_page
, nprp
;
3168 if (xfer
->x_ndmac
== 0)
3169 return (DDI_FAILURE
);
3171 cmd
->nc_sqe
.sqe_dptr
.d_prp
[0] = xfer
->x_dmac
.dmac_laddress
;
3172 ddi_dma_nextcookie(xfer
->x_dmah
, &xfer
->x_dmac
);
3174 if (xfer
->x_ndmac
== 1) {
3175 cmd
->nc_sqe
.sqe_dptr
.d_prp
[1] = 0;
3176 return (DDI_SUCCESS
);
3177 } else if (xfer
->x_ndmac
== 2) {
3178 cmd
->nc_sqe
.sqe_dptr
.d_prp
[1] = xfer
->x_dmac
.dmac_laddress
;
3179 return (DDI_SUCCESS
);
3184 nprp_page
= nvme
->n_pagesize
/ sizeof (uint64_t) - 1;
3185 ASSERT(nprp_page
> 0);
3186 nprp
= (xfer
->x_ndmac
+ nprp_page
- 1) / nprp_page
;
3189 * We currently don't support chained PRPs and set up our DMA
3190 * attributes to reflect that. If we still get an I/O request
3191 * that needs a chained PRP something is very wrong.
3195 cmd
->nc_dma
= kmem_cache_alloc(nvme
->n_prp_cache
, KM_SLEEP
);
3196 bzero(cmd
->nc_dma
->nd_memp
, cmd
->nc_dma
->nd_len
);
3198 cmd
->nc_sqe
.sqe_dptr
.d_prp
[1] = cmd
->nc_dma
->nd_cookie
.dmac_laddress
;
3200 /*LINTED: E_PTR_BAD_CAST_ALIGN*/
3201 for (prp
= (uint64_t *)cmd
->nc_dma
->nd_memp
;
3203 prp
++, xfer
->x_ndmac
--) {
3204 *prp
= xfer
->x_dmac
.dmac_laddress
;
3205 ddi_dma_nextcookie(xfer
->x_dmah
, &xfer
->x_dmac
);
3208 (void) ddi_dma_sync(cmd
->nc_dma
->nd_dmah
, 0, cmd
->nc_dma
->nd_len
,
3209 DDI_DMA_SYNC_FORDEV
);
3210 return (DDI_SUCCESS
);
3214 nvme_create_nvm_cmd(nvme_namespace_t
*ns
, uint8_t opc
, bd_xfer_t
*xfer
)
3216 nvme_t
*nvme
= ns
->ns_nvme
;
3220 * Blkdev only sets BD_XFER_POLL when dumping, so don't sleep.
3222 cmd
= nvme_alloc_cmd(nvme
, (xfer
->x_flags
& BD_XFER_POLL
) ?
3223 KM_NOSLEEP
: KM_SLEEP
);
3228 cmd
->nc_sqe
.sqe_opc
= opc
;
3229 cmd
->nc_callback
= nvme_bd_xfer_done
;
3230 cmd
->nc_xfer
= xfer
;
3233 case NVME_OPC_NVM_WRITE
:
3234 case NVME_OPC_NVM_READ
:
3235 VERIFY(xfer
->x_nblks
<= 0x10000);
3237 cmd
->nc_sqe
.sqe_nsid
= ns
->ns_id
;
3239 cmd
->nc_sqe
.sqe_cdw10
= xfer
->x_blkno
& 0xffffffffu
;
3240 cmd
->nc_sqe
.sqe_cdw11
= (xfer
->x_blkno
>> 32);
3241 cmd
->nc_sqe
.sqe_cdw12
= (uint16_t)(xfer
->x_nblks
- 1);
3243 if (nvme_fill_prp(cmd
, xfer
) != DDI_SUCCESS
)
3247 case NVME_OPC_NVM_FLUSH
:
3248 cmd
->nc_sqe
.sqe_nsid
= ns
->ns_id
;
3263 nvme_bd_xfer_done(void *arg
)
3265 nvme_cmd_t
*cmd
= arg
;
3266 bd_xfer_t
*xfer
= cmd
->nc_xfer
;
3269 error
= nvme_check_cmd_status(cmd
);
3272 bd_xfer_done(xfer
, error
);
3276 nvme_bd_driveinfo(void *arg
, bd_drive_t
*drive
)
3278 nvme_namespace_t
*ns
= arg
;
3279 nvme_t
*nvme
= ns
->ns_nvme
;
3282 * blkdev maintains one queue size per instance (namespace),
3283 * but all namespace share the I/O queues.
3284 * TODO: need to figure out a sane default, or use per-NS I/O queues,
3285 * or change blkdev to handle EAGAIN
3287 drive
->d_qsize
= nvme
->n_ioq_count
* nvme
->n_io_queue_len
3288 / nvme
->n_namespace_count
;
3291 * d_maxxfer is not set, which means the value is taken from the DMA
3292 * attributes specified to bd_alloc_handle.
3295 drive
->d_removable
= B_FALSE
;
3296 drive
->d_hotpluggable
= B_FALSE
;
3298 bcopy(ns
->ns_eui64
, drive
->d_eui64
, sizeof (drive
->d_eui64
));
3299 drive
->d_target
= ns
->ns_id
;
3302 drive
->d_model
= nvme
->n_idctl
->id_model
;
3303 drive
->d_model_len
= sizeof (nvme
->n_idctl
->id_model
);
3304 drive
->d_vendor
= nvme
->n_vendor
;
3305 drive
->d_vendor_len
= strlen(nvme
->n_vendor
);
3306 drive
->d_product
= nvme
->n_product
;
3307 drive
->d_product_len
= strlen(nvme
->n_product
);
3308 drive
->d_serial
= nvme
->n_idctl
->id_serial
;
3309 drive
->d_serial_len
= sizeof (nvme
->n_idctl
->id_serial
);
3310 drive
->d_revision
= nvme
->n_idctl
->id_fwrev
;
3311 drive
->d_revision_len
= sizeof (nvme
->n_idctl
->id_fwrev
);
3315 nvme_bd_mediainfo(void *arg
, bd_media_t
*media
)
3317 nvme_namespace_t
*ns
= arg
;
3319 media
->m_nblks
= ns
->ns_block_count
;
3320 media
->m_blksize
= ns
->ns_block_size
;
3321 media
->m_readonly
= B_FALSE
;
3322 media
->m_solidstate
= B_TRUE
;
3324 media
->m_pblksize
= ns
->ns_best_block_size
;
3330 nvme_bd_cmd(nvme_namespace_t
*ns
, bd_xfer_t
*xfer
, uint8_t opc
)
3332 nvme_t
*nvme
= ns
->ns_nvme
;
3341 cmd
= nvme_create_nvm_cmd(ns
, opc
, xfer
);
3345 cmd
->nc_sqid
= (CPU
->cpu_id
% nvme
->n_ioq_count
) + 1;
3346 ASSERT(cmd
->nc_sqid
<= nvme
->n_ioq_count
);
3347 ioq
= nvme
->n_ioq
[cmd
->nc_sqid
];
3350 * Get the polling flag before submitting the command. The command may
3351 * complete immediately after it was submitted, which means we must
3352 * treat both cmd and xfer as if they have been freed already.
3354 poll
= (xfer
->x_flags
& BD_XFER_POLL
) != 0;
3356 ret
= nvme_submit_io_cmd(ioq
, cmd
);
3365 cmd
= nvme_retrieve_cmd(nvme
, ioq
);
3367 nvme_bd_xfer_done(cmd
);
3370 } while (ioq
->nq_active_cmds
!= 0);
3376 nvme_bd_read(void *arg
, bd_xfer_t
*xfer
)
3378 nvme_namespace_t
*ns
= arg
;
3380 return (nvme_bd_cmd(ns
, xfer
, NVME_OPC_NVM_READ
));
3384 nvme_bd_write(void *arg
, bd_xfer_t
*xfer
)
3386 nvme_namespace_t
*ns
= arg
;
3388 return (nvme_bd_cmd(ns
, xfer
, NVME_OPC_NVM_WRITE
));
3392 nvme_bd_sync(void *arg
, bd_xfer_t
*xfer
)
3394 nvme_namespace_t
*ns
= arg
;
3396 if (ns
->ns_nvme
->n_dead
)
3400 * If the volatile write cache is not present or not enabled the FLUSH
3401 * command is a no-op, so we can take a shortcut here.
3403 if (!ns
->ns_nvme
->n_write_cache_present
) {
3404 bd_xfer_done(xfer
, ENOTSUP
);
3408 if (!ns
->ns_nvme
->n_write_cache_enabled
) {
3409 bd_xfer_done(xfer
, 0);
3413 return (nvme_bd_cmd(ns
, xfer
, NVME_OPC_NVM_FLUSH
));
3417 nvme_bd_devid(void *arg
, dev_info_t
*devinfo
, ddi_devid_t
*devid
)
3419 nvme_namespace_t
*ns
= arg
;
3421 /*LINTED: E_BAD_PTR_CAST_ALIGN*/
3422 if (*(uint64_t *)ns
->ns_eui64
!= 0) {
3423 return (ddi_devid_init(devinfo
, DEVID_SCSI3_WWN
,
3424 sizeof (ns
->ns_eui64
), ns
->ns_eui64
, devid
));
3426 return (ddi_devid_init(devinfo
, DEVID_ENCAP
,
3427 strlen(ns
->ns_devid
), ns
->ns_devid
, devid
));
3432 nvme_open(dev_t
*devp
, int flag
, int otyp
, cred_t
*cred_p
)
3435 _NOTE(ARGUNUSED(cred_p
));
3437 minor_t minor
= getminor(*devp
);
3438 nvme_t
*nvme
= ddi_get_soft_state(nvme_state
, NVME_MINOR_INST(minor
));
3439 int nsid
= NVME_MINOR_NSID(minor
);
3440 nvme_minor_state_t
*nm
;
3443 if (otyp
!= OTYP_CHR
)
3449 if (nsid
> nvme
->n_namespace_count
)
3455 nm
= nsid
== 0 ? &nvme
->n_minor
: &nvme
->n_ns
[nsid
- 1].ns_minor
;
3457 mutex_enter(&nm
->nm_mutex
);
3464 if (nm
->nm_ocnt
!= 0) {
3468 nm
->nm_oexcl
= B_TRUE
;
3474 mutex_exit(&nm
->nm_mutex
);
3480 nvme_close(dev_t dev
, int flag
, int otyp
, cred_t
*cred_p
)
3483 _NOTE(ARGUNUSED(cred_p
));
3484 _NOTE(ARGUNUSED(flag
));
3486 minor_t minor
= getminor(dev
);
3487 nvme_t
*nvme
= ddi_get_soft_state(nvme_state
, NVME_MINOR_INST(minor
));
3488 int nsid
= NVME_MINOR_NSID(minor
);
3489 nvme_minor_state_t
*nm
;
3491 if (otyp
!= OTYP_CHR
)
3497 if (nsid
> nvme
->n_namespace_count
)
3500 nm
= nsid
== 0 ? &nvme
->n_minor
: &nvme
->n_ns
[nsid
- 1].ns_minor
;
3502 mutex_enter(&nm
->nm_mutex
);
3504 nm
->nm_oexcl
= B_FALSE
;
3506 ASSERT(nm
->nm_ocnt
> 0);
3508 mutex_exit(&nm
->nm_mutex
);
3514 nvme_ioctl_identify(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
, int mode
,
3517 _NOTE(ARGUNUSED(cred_p
));
3521 if ((mode
& FREAD
) == 0)
3524 if (nioc
->n_len
< NVME_IDENTIFY_BUFSIZE
)
3527 if ((rv
= nvme_identify(nvme
, nsid
, (void **)&idctl
)) != 0)
3530 if (ddi_copyout(idctl
, (void *)nioc
->n_buf
, NVME_IDENTIFY_BUFSIZE
, mode
)
3534 kmem_free(idctl
, NVME_IDENTIFY_BUFSIZE
);
3540 nvme_ioctl_capabilities(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
,
3541 int mode
, cred_t
*cred_p
)
3543 _NOTE(ARGUNUSED(nsid
, cred_p
));
3545 nvme_reg_cap_t cap
= { 0 };
3546 nvme_capabilities_t nc
;
3548 if ((mode
& FREAD
) == 0)
3551 if (nioc
->n_len
< sizeof (nc
))
3554 cap
.r
= nvme_get64(nvme
, NVME_REG_CAP
);
3557 * The MPSMIN and MPSMAX fields in the CAP register use 0 to
3558 * specify the base page size of 4k (1<<12), so add 12 here to
3559 * get the real page size value.
3561 nc
.mpsmax
= 1 << (12 + cap
.b
.cap_mpsmax
);
3562 nc
.mpsmin
= 1 << (12 + cap
.b
.cap_mpsmin
);
3564 if (ddi_copyout(&nc
, (void *)nioc
->n_buf
, sizeof (nc
), mode
) != 0)
3571 nvme_ioctl_get_logpage(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
,
3572 int mode
, cred_t
*cred_p
)
3574 _NOTE(ARGUNUSED(cred_p
));
3579 if ((mode
& FREAD
) == 0)
3582 switch (nioc
->n_arg
) {
3583 case NVME_LOGPAGE_ERROR
:
3587 case NVME_LOGPAGE_HEALTH
:
3588 if (nsid
!= 0 && nvme
->n_idctl
->id_lpa
.lp_smart
== 0)
3592 nsid
= (uint32_t)-1;
3595 case NVME_LOGPAGE_FWSLOT
:
3603 if (nvme_get_logpage(nvme
, &log
, &bufsize
, nioc
->n_arg
, nsid
)
3607 if (nioc
->n_len
< bufsize
) {
3608 kmem_free(log
, bufsize
);
3612 if (ddi_copyout(log
, (void *)nioc
->n_buf
, bufsize
, mode
) != 0)
3615 nioc
->n_len
= bufsize
;
3616 kmem_free(log
, bufsize
);
3622 nvme_ioctl_get_features(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
,
3623 int mode
, cred_t
*cred_p
)
3625 _NOTE(ARGUNUSED(cred_p
));
3632 if ((mode
& FREAD
) == 0)
3635 if ((nioc
->n_arg
>> 32) > 0xff)
3638 feature
= (uint8_t)(nioc
->n_arg
>> 32);
3641 case NVME_FEAT_ARBITRATION
:
3642 case NVME_FEAT_POWER_MGMT
:
3643 case NVME_FEAT_TEMPERATURE
:
3644 case NVME_FEAT_ERROR
:
3645 case NVME_FEAT_NQUEUES
:
3646 case NVME_FEAT_INTR_COAL
:
3647 case NVME_FEAT_WRITE_ATOM
:
3648 case NVME_FEAT_ASYNC_EVENT
:
3649 case NVME_FEAT_PROGRESS
:
3654 case NVME_FEAT_INTR_VECT
:
3658 res
= nioc
->n_arg
& 0xffffffffUL
;
3659 if (res
>= nvme
->n_intr_cnt
)
3663 case NVME_FEAT_LBA_RANGE
:
3664 if (nvme
->n_lba_range_supported
== B_FALSE
)
3668 nsid
> nvme
->n_namespace_count
)
3673 case NVME_FEAT_WRITE_CACHE
:
3677 if (!nvme
->n_write_cache_present
)
3682 case NVME_FEAT_AUTO_PST
:
3686 if (!nvme
->n_auto_pst_supported
)
3695 rv
= nvme_get_features(nvme
, nsid
, feature
, &res
, &buf
, &bufsize
);
3699 if (nioc
->n_len
< bufsize
) {
3700 kmem_free(buf
, bufsize
);
3704 if (buf
&& ddi_copyout(buf
, (void*)nioc
->n_buf
, bufsize
, mode
) != 0)
3707 kmem_free(buf
, bufsize
);
3709 nioc
->n_len
= bufsize
;
3715 nvme_ioctl_intr_cnt(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
, int mode
,
3718 _NOTE(ARGUNUSED(nsid
, mode
, cred_p
));
3720 if ((mode
& FREAD
) == 0)
3723 nioc
->n_arg
= nvme
->n_intr_cnt
;
3728 nvme_ioctl_version(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
, int mode
,
3731 _NOTE(ARGUNUSED(nsid
, cred_p
));
3734 if ((mode
& FREAD
) == 0)
3737 if (nioc
->n_len
< sizeof (nvme
->n_version
))
3740 if (ddi_copyout(&nvme
->n_version
, (void *)nioc
->n_buf
,
3741 sizeof (nvme
->n_version
), mode
) != 0)
3748 nvme_ioctl_format(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
, int mode
,
3751 _NOTE(ARGUNUSED(mode
));
3752 nvme_format_nvm_t frmt
= { 0 };
3753 int c_nsid
= nsid
!= 0 ? nsid
- 1 : 0;
3755 if ((mode
& FWRITE
) == 0 || secpolicy_sys_config(cred_p
, B_FALSE
) != 0)
3758 frmt
.r
= nioc
->n_arg
& 0xffffffff;
3761 * Check whether the FORMAT NVM command is supported.
3763 if (nvme
->n_idctl
->id_oacs
.oa_format
== 0)
3767 * Don't allow format or secure erase of individual namespace if that
3768 * would cause a format or secure erase of all namespaces.
3770 if (nsid
!= 0 && nvme
->n_idctl
->id_fna
.fn_format
!= 0)
3773 if (nsid
!= 0 && frmt
.b
.fm_ses
!= NVME_FRMT_SES_NONE
&&
3774 nvme
->n_idctl
->id_fna
.fn_sec_erase
!= 0)
3778 * Don't allow formatting with Protection Information.
3780 if (frmt
.b
.fm_pi
!= 0 || frmt
.b
.fm_pil
!= 0 || frmt
.b
.fm_ms
!= 0)
3784 * Don't allow formatting using an illegal LBA format, or any LBA format
3785 * that uses metadata.
3787 if (frmt
.b
.fm_lbaf
> nvme
->n_ns
[c_nsid
].ns_idns
->id_nlbaf
||
3788 nvme
->n_ns
[c_nsid
].ns_idns
->id_lbaf
[frmt
.b
.fm_lbaf
].lbaf_ms
!= 0)
3792 * Don't allow formatting using an illegal Secure Erase setting.
3794 if (frmt
.b
.fm_ses
> NVME_FRMT_MAX_SES
||
3795 (frmt
.b
.fm_ses
== NVME_FRMT_SES_CRYPTO
&&
3796 nvme
->n_idctl
->id_fna
.fn_crypt_erase
== 0))
3800 nsid
= (uint32_t)-1;
3802 return (nvme_format_nvm(nvme
, nsid
, frmt
.b
.fm_lbaf
, B_FALSE
, 0, B_FALSE
,
3807 nvme_ioctl_detach(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
, int mode
,
3810 _NOTE(ARGUNUSED(nioc
, mode
));
3813 if ((mode
& FWRITE
) == 0 || secpolicy_sys_config(cred_p
, B_FALSE
) != 0)
3819 rv
= bd_detach_handle(nvme
->n_ns
[nsid
- 1].ns_bd_hdl
);
3820 if (rv
!= DDI_SUCCESS
)
3827 nvme_ioctl_attach(nvme_t
*nvme
, int nsid
, nvme_ioctl_t
*nioc
, int mode
,
3830 _NOTE(ARGUNUSED(nioc
, mode
));
3831 nvme_identify_nsid_t
*idns
;
3834 if ((mode
& FWRITE
) == 0 || secpolicy_sys_config(cred_p
, B_FALSE
) != 0)
3841 * Identify namespace again, free old identify data.
3843 idns
= nvme
->n_ns
[nsid
- 1].ns_idns
;
3844 if (nvme_init_ns(nvme
, nsid
) != DDI_SUCCESS
)
3847 kmem_free(idns
, sizeof (nvme_identify_nsid_t
));
3849 rv
= bd_attach_handle(nvme
->n_dip
, nvme
->n_ns
[nsid
- 1].ns_bd_hdl
);
3850 if (rv
!= DDI_SUCCESS
)
3857 nvme_ioctl(dev_t dev
, int cmd
, intptr_t arg
, int mode
, cred_t
*cred_p
,
3861 _NOTE(ARGUNUSED(rval_p
));
3863 minor_t minor
= getminor(dev
);
3864 nvme_t
*nvme
= ddi_get_soft_state(nvme_state
, NVME_MINOR_INST(minor
));
3865 int nsid
= NVME_MINOR_NSID(minor
);
3869 int (*nvme_ioctl
[])(nvme_t
*, int, nvme_ioctl_t
*, int, cred_t
*) = {
3871 nvme_ioctl_identify
,
3872 nvme_ioctl_identify
,
3873 nvme_ioctl_capabilities
,
3874 nvme_ioctl_get_logpage
,
3875 nvme_ioctl_get_features
,
3876 nvme_ioctl_intr_cnt
,
3886 if (nsid
> nvme
->n_namespace_count
)
3890 return (ndi_devctl_ioctl(nvme
->n_dip
, cmd
, arg
, mode
, 0));
3892 #ifdef _MULTI_DATAMODEL
3893 switch (ddi_model_convert_from(mode
& FMODELS
)) {
3894 case DDI_MODEL_ILP32
: {
3895 nvme_ioctl32_t nioc32
;
3896 if (ddi_copyin((void*)arg
, &nioc32
, sizeof (nvme_ioctl32_t
),
3899 nioc
.n_len
= nioc32
.n_len
;
3900 nioc
.n_buf
= nioc32
.n_buf
;
3901 nioc
.n_arg
= nioc32
.n_arg
;
3904 case DDI_MODEL_NONE
:
3906 if (ddi_copyin((void*)arg
, &nioc
, sizeof (nvme_ioctl_t
), mode
)
3909 #ifdef _MULTI_DATAMODEL
3914 if (nvme
->n_dead
&& cmd
!= NVME_IOC_DETACH
)
3918 if (cmd
== NVME_IOC_IDENTIFY_CTRL
) {
3920 * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and
3921 * attachment point nodes.
3924 } else if (cmd
== NVME_IOC_IDENTIFY_NSID
&& nsid
== 0) {
3926 * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it
3927 * will always return identify data for namespace 1.
3932 if (IS_NVME_IOC(cmd
) && nvme_ioctl
[NVME_IOC_CMD(cmd
)] != NULL
)
3933 rv
= nvme_ioctl
[NVME_IOC_CMD(cmd
)](nvme
, nsid
, &nioc
, mode
,
3938 #ifdef _MULTI_DATAMODEL
3939 switch (ddi_model_convert_from(mode
& FMODELS
)) {
3940 case DDI_MODEL_ILP32
: {
3941 nvme_ioctl32_t nioc32
;
3943 nioc32
.n_len
= (size32_t
)nioc
.n_len
;
3944 nioc32
.n_buf
= (uintptr32_t
)nioc
.n_buf
;
3945 nioc32
.n_arg
= nioc
.n_arg
;
3947 if (ddi_copyout(&nioc32
, (void *)arg
, sizeof (nvme_ioctl32_t
),
3952 case DDI_MODEL_NONE
:
3954 if (ddi_copyout(&nioc
, (void *)arg
, sizeof (nvme_ioctl_t
), mode
)
3957 #ifdef _MULTI_DATAMODEL