2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
6 * Copyright (c) 2020 Chuck Tuffli
8 * Function crc16 Copyright (c) 2017, Fedor Uporov
9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * bhyve PCIe-NVMe device emulation.
37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
44 * maxq = max number of queues
45 * qsz = max elements in each queue
46 * ioslots = max number of concurrent io requests
47 * sectsz = sector size (defaults to blockif sector size)
48 * ser = serial number (20-chars max)
49 * eui64 = IEEE Extended Unique Identifier (8 byte value)
50 * dsm = DataSet Management support. Option is one of auto, enable,disable
55 - create async event for smart and log
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
84 #include <dev/nvme/nvme.h>
93 static int nvme_debug
= 0;
94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
97 /* defaults; can be overridden */
98 #define NVME_MSIX_BAR 4
100 #define NVME_IOSLOTS 8
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN (1 << 14)
105 #define NVME_QUEUES 16
106 #define NVME_MAX_QENTRIES 2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define NVME_MPSMIN 0
109 /* MPSMIN converted to bytes */
110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN))
112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1)
116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS 0xffff
120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS)
122 /* Reported temperature in Kelvin (i.e. room temperature) */
123 #define NVME_TEMPERATURE 296
127 /* Convert a zero-based value into a one-based value */
128 #define ONE_BASED(zero) ((zero) + 1)
129 /* Convert a one-based value into a zero-based value */
130 #define ZERO_BASED(one) ((one) - 1)
132 /* Encode number of SQ's and CQ's for Set/Get Features */
133 #define NVME_FEATURE_NUM_QUEUES(sc) \
134 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
135 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
137 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
139 enum nvme_controller_register_offsets
{
140 NVME_CR_CAP_LOW
= 0x00,
141 NVME_CR_CAP_HI
= 0x04,
143 NVME_CR_INTMS
= 0x0c,
144 NVME_CR_INTMC
= 0x10,
149 NVME_CR_ASQ_LOW
= 0x28,
150 NVME_CR_ASQ_HI
= 0x2c,
151 NVME_CR_ACQ_LOW
= 0x30,
152 NVME_CR_ACQ_HI
= 0x34,
155 enum nvme_cmd_cdw11
{
156 NVME_CMD_CDW11_PC
= 0x0001,
157 NVME_CMD_CDW11_IEN
= 0x0002,
158 NVME_CMD_CDW11_IV
= 0xFFFF0000,
166 #define NVME_CQ_INTEN 0x01
167 #define NVME_CQ_INTCOAL 0x02
169 struct nvme_completion_queue
{
170 struct nvme_completion
*qbase
;
173 uint16_t tail
; /* nvme progress */
174 uint16_t head
; /* guest progress */
179 struct nvme_submission_queue
{
180 struct nvme_command
*qbase
;
183 uint16_t head
; /* nvme progress */
184 uint16_t tail
; /* guest progress */
185 uint16_t cqid
; /* completion queue id */
189 enum nvme_storage_type
{
190 NVME_STOR_BLOCKIF
= 0,
194 struct pci_nvme_blockstore
{
195 enum nvme_storage_type type
;
199 uint32_t sectsz_bits
;
201 uint32_t deallocate
:1;
205 * Calculate the number of additional page descriptors for guest IO requests
206 * based on the advertised Max Data Transfer (MDTS) and given the number of
207 * default iovec's in a struct blockif_req.
209 #define MDTS_PAD_SIZE \
210 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
211 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : \
214 struct pci_nvme_ioreq
{
215 struct pci_nvme_softc
*sc
;
216 STAILQ_ENTRY(pci_nvme_ioreq
) link
;
217 struct nvme_submission_queue
*nvme_sq
;
220 /* command information */
225 uint64_t prev_gpaddr
;
229 struct blockif_req io_req
;
231 struct iovec iovpadding
[MDTS_PAD_SIZE
];
235 /* Dataset Management bit in ONCS reflects backing storage capability */
236 NVME_DATASET_MANAGEMENT_AUTO
,
237 /* Unconditionally set Dataset Management bit in ONCS */
238 NVME_DATASET_MANAGEMENT_ENABLE
,
239 /* Unconditionally clear Dataset Management bit in ONCS */
240 NVME_DATASET_MANAGEMENT_DISABLE
,
243 struct pci_nvme_softc
;
244 struct nvme_feature_obj
;
246 typedef void (*nvme_feature_cb
)(struct pci_nvme_softc
*,
247 struct nvme_feature_obj
*,
248 struct nvme_command
*,
249 struct nvme_completion
*);
251 struct nvme_feature_obj
{
255 bool namespace_specific
;
258 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
261 PCI_NVME_AE_TYPE_ERROR
= 0,
262 PCI_NVME_AE_TYPE_SMART
,
263 PCI_NVME_AE_TYPE_NOTICE
,
264 PCI_NVME_AE_TYPE_IO_CMD
= 6,
265 PCI_NVME_AE_TYPE_VENDOR
= 7,
266 PCI_NVME_AE_TYPE_MAX
/* Must be last */
267 } pci_nvme_async_type
;
269 /* Asynchronous Event Requests */
270 struct pci_nvme_aer
{
271 STAILQ_ENTRY(pci_nvme_aer
) link
;
272 uint16_t cid
; /* Command ID of the submitted AER */
275 /** Asynchronous Event Information - Notice */
277 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED
= 0,
278 PCI_NVME_AEI_NOTICE_FW_ACTIVATION
,
279 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE
,
280 PCI_NVME_AEI_NOTICE_ANA_CHANGE
,
281 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE
,
282 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT
,
283 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE
,
284 PCI_NVME_AEI_NOTICE_MAX
,
285 } pci_nvme_async_event_info_notice
;
287 #define PCI_NVME_AEI_NOTICE_SHIFT 8
288 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
290 /* Asynchronous Event Notifications */
291 struct pci_nvme_aen
{
292 pci_nvme_async_type atype
;
298 * By default, enable all Asynchronous Event Notifications:
299 * SMART / Health Critical Warnings
300 * Namespace Attribute Notices
302 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f
305 NVME_CNTRLTYPE_IO
= 1,
306 NVME_CNTRLTYPE_DISCOVERY
= 2,
307 NVME_CNTRLTYPE_ADMIN
= 3,
308 } pci_nvme_cntrl_type
;
310 struct pci_nvme_softc
{
311 struct pci_devinst
*nsc_pi
;
315 struct nvme_registers regs
;
317 struct nvme_namespace_data nsdata
;
318 struct nvme_controller_data ctrldata
;
319 struct nvme_error_information_entry err_log
;
320 struct nvme_health_information_page health_log
;
321 struct nvme_firmware_page fw_log
;
322 struct nvme_ns_list ns_log
;
324 struct pci_nvme_blockstore nvstore
;
326 uint16_t max_qentries
; /* max entries per queue */
327 uint32_t max_queues
; /* max number of IO SQ's or CQ's */
328 uint32_t num_cqueues
;
329 uint32_t num_squeues
;
330 bool num_q_is_set
; /* Has host set Number of Queues */
332 struct pci_nvme_ioreq
*ioreqs
;
333 STAILQ_HEAD(, pci_nvme_ioreq
) ioreqs_free
; /* free list of ioreqs */
334 uint32_t pending_ios
;
339 * Memory mapped Submission and Completion queues
340 * Each array includes both Admin and IO queues
342 struct nvme_completion_queue
*compl_queues
;
343 struct nvme_submission_queue
*submit_queues
;
345 struct nvme_feature_obj feat
[NVME_FID_MAX
];
347 enum nvme_dsm_type dataset_management
;
349 /* Accounting for SMART data */
350 __uint128_t read_data_units
;
351 __uint128_t write_data_units
;
352 __uint128_t read_commands
;
353 __uint128_t write_commands
;
354 uint32_t read_dunits_remainder
;
355 uint32_t write_dunits_remainder
;
357 STAILQ_HEAD(, pci_nvme_aer
) aer_list
;
358 pthread_mutex_t aer_mtx
;
360 struct pci_nvme_aen aen
[PCI_NVME_AE_TYPE_MAX
];
362 pthread_mutex_t aen_mtx
;
363 pthread_cond_t aen_cond
;
367 static void pci_nvme_cq_update(struct pci_nvme_softc
*sc
,
368 struct nvme_completion_queue
*cq
,
373 static struct pci_nvme_ioreq
*pci_nvme_get_ioreq(struct pci_nvme_softc
*);
374 static void pci_nvme_release_ioreq(struct pci_nvme_softc
*, struct pci_nvme_ioreq
*);
375 static void pci_nvme_io_done(struct blockif_req
*, int);
377 /* Controller Configuration utils */
378 #define NVME_CC_GET_EN(cc) \
379 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
380 #define NVME_CC_GET_CSS(cc) \
381 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
382 #define NVME_CC_GET_SHN(cc) \
383 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
384 #define NVME_CC_GET_IOSQES(cc) \
385 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
386 #define NVME_CC_GET_IOCQES(cc) \
387 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
389 #define NVME_CC_WRITE_MASK \
390 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
391 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
392 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
394 #define NVME_CC_NEN_WRITE_MASK \
395 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
396 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
397 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
399 /* Controller Status utils */
400 #define NVME_CSTS_GET_RDY(sts) \
401 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
403 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT)
404 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT)
406 /* Completion Queue status word utils */
407 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT)
408 #define NVME_STATUS_MASK \
409 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
410 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
412 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
413 NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
415 static void nvme_feature_invalid_cb(struct pci_nvme_softc
*,
416 struct nvme_feature_obj
*,
417 struct nvme_command
*,
418 struct nvme_completion
*);
419 static void nvme_feature_temperature(struct pci_nvme_softc
*,
420 struct nvme_feature_obj
*,
421 struct nvme_command
*,
422 struct nvme_completion
*);
423 static void nvme_feature_num_queues(struct pci_nvme_softc
*,
424 struct nvme_feature_obj
*,
425 struct nvme_command
*,
426 struct nvme_completion
*);
427 static void nvme_feature_iv_config(struct pci_nvme_softc
*,
428 struct nvme_feature_obj
*,
429 struct nvme_command
*,
430 struct nvme_completion
*);
431 static void nvme_feature_async_event(struct pci_nvme_softc
*,
432 struct nvme_feature_obj
*,
433 struct nvme_command
*,
434 struct nvme_completion
*);
436 static void *aen_thr(void *arg
);
/*
 * Copy src into dst, padding the remainder of dst with the pad character.
 * At most dst_size bytes are written and the result is NOT NUL-terminated:
 * NVMe identify strings (serial, model, firmware rev) are fixed-width,
 * space-padded ASCII fields, not C strings.
 */
static void
cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
{
	size_t len;

	len = strnlen(src, dst_size);
	memset(dst, pad, dst_size);
	memcpy(dst, src, len);
}
449 pci_nvme_status_tc(uint16_t *status
, uint16_t type
, uint16_t code
)
452 *status
&= ~NVME_STATUS_MASK
;
453 *status
|= (type
& NVME_STATUS_SCT_MASK
) << NVME_STATUS_SCT_SHIFT
|
454 (code
& NVME_STATUS_SC_MASK
) << NVME_STATUS_SC_SHIFT
;
458 pci_nvme_status_genc(uint16_t *status
, uint16_t code
)
461 pci_nvme_status_tc(status
, NVME_SCT_GENERIC
, code
);
465 * Initialize the requested number of IO Submission and Completion Queues.
466 * Admin queues are allocated implicitly.
469 pci_nvme_init_queues(struct pci_nvme_softc
*sc
, uint32_t nsq
, uint32_t ncq
)
474 * Allocate and initialize the Submission Queues
476 if (nsq
> NVME_QUEUES
) {
477 WPRINTF("%s: clamping number of SQ from %u to %u",
478 __func__
, nsq
, NVME_QUEUES
);
482 sc
->num_squeues
= nsq
;
484 sc
->submit_queues
= calloc(sc
->num_squeues
+ 1,
485 sizeof(struct nvme_submission_queue
));
486 if (sc
->submit_queues
== NULL
) {
487 WPRINTF("%s: SQ allocation failed", __func__
);
490 struct nvme_submission_queue
*sq
= sc
->submit_queues
;
492 for (i
= 0; i
< sc
->num_squeues
+ 1; i
++)
493 pthread_mutex_init(&sq
[i
].mtx
, NULL
);
497 * Allocate and initialize the Completion Queues
499 if (ncq
> NVME_QUEUES
) {
500 WPRINTF("%s: clamping number of CQ from %u to %u",
501 __func__
, ncq
, NVME_QUEUES
);
505 sc
->num_cqueues
= ncq
;
507 sc
->compl_queues
= calloc(sc
->num_cqueues
+ 1,
508 sizeof(struct nvme_completion_queue
));
509 if (sc
->compl_queues
== NULL
) {
510 WPRINTF("%s: CQ allocation failed", __func__
);
513 struct nvme_completion_queue
*cq
= sc
->compl_queues
;
515 for (i
= 0; i
< sc
->num_cqueues
+ 1; i
++)
516 pthread_mutex_init(&cq
[i
].mtx
, NULL
);
521 pci_nvme_init_ctrldata(struct pci_nvme_softc
*sc
)
523 struct nvme_controller_data
*cd
= &sc
->ctrldata
;
528 cpywithpad((char *)cd
->mn
, sizeof(cd
->mn
), "bhyve-NVMe", ' ');
529 cpywithpad((char *)cd
->fr
, sizeof(cd
->fr
), "1.0", ' ');
531 /* Num of submission commands that we can handle at a time (2^rab) */
541 cd
->mdts
= NVME_MDTS
; /* max data transfer size (2^mdts * CAP.MPSMIN) */
543 cd
->ver
= NVME_REV(1,4);
545 cd
->cntrltype
= NVME_CNTRLTYPE_IO
;
546 cd
->oacs
= 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT
;
547 cd
->oaes
= NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR
);
551 /* Advertise 1, Read-only firmware slot */
552 cd
->frmw
= NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO
) |
553 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT
);
554 cd
->lpa
= 0; /* TODO: support some simple things like SMART */
555 cd
->elpe
= 0; /* max error log page entries */
557 * Report a single power state (zero-based value)
558 * power_state[] values are left as zero to indicate "Not reported"
562 /* Warning Composite Temperature Threshold */
566 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
567 cd
->sanicap
= (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO
<<
568 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT
);
570 cd
->sqes
= (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT
) |
571 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT
);
572 cd
->cqes
= (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT
) |
573 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT
);
574 cd
->nn
= 1; /* number of namespaces */
577 switch (sc
->dataset_management
) {
578 case NVME_DATASET_MANAGEMENT_AUTO
:
579 if (sc
->nvstore
.deallocate
)
580 cd
->oncs
|= NVME_ONCS_DSM
;
582 case NVME_DATASET_MANAGEMENT_ENABLE
:
583 cd
->oncs
|= NVME_ONCS_DSM
;
589 cd
->fna
= NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK
<<
590 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT
;
592 cd
->vwc
= NVME_CTRLR_DATA_VWC_ALL_NO
<< NVME_CTRLR_DATA_VWC_ALL_SHIFT
;
/*
 * Calculate the CRC-16 of the given buffer
 * See copyright attribution at top of file
 *
 * This is the reflected table-driven CRC-16 (a.k.a. CRC-16/ARC); the check
 * value for "123456789" is 0xBB3D. Pass the previous return value as 'crc'
 * to continue a running CRC across multiple buffers (start with 0).
 */
static uint16_t
crc16(uint16_t crc, const void *buffer, unsigned int len)
{
	const unsigned char *cp = buffer;
	/* CRC table for the CRC-16. The poly is 0x8005 (x16 + x15 + x2 + 1). */
	static uint16_t const crc16_table[256] = {
		0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
		0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
		0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
		0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
		0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
		0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
		0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
		0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
		0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
		0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
		0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
		0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
		0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
		0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
		0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
		0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
		0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
		0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
		0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
		0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
		0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
		0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
		0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
		0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
		0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
		0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
		0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
		0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
		0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
		0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
		0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
		0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
	};

	while (len--)
		crc = (((crc >> 8) & 0xffU) ^
		    crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
	return (crc);
}
646 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore
*nvstore
,
647 struct nvme_namespace_data
*nd
)
650 /* Get capacity and block size information from backing store */
651 nd
->nsze
= nvstore
->size
/ nvstore
->sectsz
;
657 pci_nvme_init_nsdata(struct pci_nvme_softc
*sc
,
658 struct nvme_namespace_data
*nd
, uint32_t nsid
,
659 struct pci_nvme_blockstore
*nvstore
)
662 pci_nvme_init_nsdata_size(nvstore
, nd
);
664 if (nvstore
->type
== NVME_STOR_BLOCKIF
)
665 nvstore
->deallocate
= blockif_candelete(nvstore
->ctx
);
667 nd
->nlbaf
= 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */
670 /* Create an EUI-64 if user did not provide one */
671 if (nvstore
->eui64
== 0) {
673 uint64_t eui64
= nvstore
->eui64
;
675 asprintf(&data
, "%s%u%u%u", get_config_value("name"),
676 sc
->nsc_pi
->pi_bus
, sc
->nsc_pi
->pi_slot
,
677 sc
->nsc_pi
->pi_func
);
680 eui64
= OUI_FREEBSD_NVME_LOW
| crc16(0, data
, strlen(data
));
683 nvstore
->eui64
= (eui64
<< 16) | (nsid
& 0xffff);
685 be64enc(nd
->eui64
, nvstore
->eui64
);
687 /* LBA data-sz = 2^lbads */
688 nd
->lbaf
[0] = nvstore
->sectsz_bits
<< NVME_NS_DATA_LBAF_LBADS_SHIFT
;
692 pci_nvme_init_logpages(struct pci_nvme_softc
*sc
)
694 __uint128_t power_cycles
= 1;
696 memset(&sc
->err_log
, 0, sizeof(sc
->err_log
));
697 memset(&sc
->health_log
, 0, sizeof(sc
->health_log
));
698 memset(&sc
->fw_log
, 0, sizeof(sc
->fw_log
));
699 memset(&sc
->ns_log
, 0, sizeof(sc
->ns_log
));
701 /* Set read/write remainder to round up according to spec */
702 sc
->read_dunits_remainder
= 999;
703 sc
->write_dunits_remainder
= 999;
705 /* Set nominal Health values checked by implementations */
706 sc
->health_log
.temperature
= NVME_TEMPERATURE
;
707 sc
->health_log
.available_spare
= 100;
708 sc
->health_log
.available_spare_threshold
= 10;
710 /* Set Active Firmware Info to slot 1 */
711 sc
->fw_log
.afi
= (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT
);
712 memcpy(&sc
->fw_log
.revision
[0], sc
->ctrldata
.fr
,
713 sizeof(sc
->fw_log
.revision
[0]));
715 memcpy(&sc
->health_log
.power_cycles
, &power_cycles
,
716 sizeof(sc
->health_log
.power_cycles
));
720 pci_nvme_init_features(struct pci_nvme_softc
*sc
)
722 enum nvme_feature fid
;
724 for (fid
= 0; fid
< NVME_FID_MAX
; fid
++) {
726 case NVME_FEAT_ARBITRATION
:
727 case NVME_FEAT_POWER_MANAGEMENT
:
728 case NVME_FEAT_INTERRUPT_COALESCING
: //XXX
729 case NVME_FEAT_WRITE_ATOMICITY
:
730 /* Mandatory but no special handling required */
731 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
732 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
733 // this returns a data buffer
735 case NVME_FEAT_TEMPERATURE_THRESHOLD
:
736 sc
->feat
[fid
].set
= nvme_feature_temperature
;
738 case NVME_FEAT_ERROR_RECOVERY
:
739 sc
->feat
[fid
].namespace_specific
= true;
741 case NVME_FEAT_NUMBER_OF_QUEUES
:
742 sc
->feat
[fid
].set
= nvme_feature_num_queues
;
744 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION
:
745 sc
->feat
[fid
].set
= nvme_feature_iv_config
;
747 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION
:
748 sc
->feat
[fid
].set
= nvme_feature_async_event
;
749 /* Enable all AENs by default */
750 sc
->feat
[fid
].cdw11
= PCI_NVME_AEN_DEFAULT_MASK
;
753 sc
->feat
[fid
].set
= nvme_feature_invalid_cb
;
754 sc
->feat
[fid
].get
= nvme_feature_invalid_cb
;
760 pci_nvme_aer_reset(struct pci_nvme_softc
*sc
)
763 STAILQ_INIT(&sc
->aer_list
);
768 pci_nvme_aer_init(struct pci_nvme_softc
*sc
)
771 pthread_mutex_init(&sc
->aer_mtx
, NULL
);
772 pci_nvme_aer_reset(sc
);
776 pci_nvme_aer_destroy(struct pci_nvme_softc
*sc
)
778 struct pci_nvme_aer
*aer
= NULL
;
780 pthread_mutex_lock(&sc
->aer_mtx
);
781 while (!STAILQ_EMPTY(&sc
->aer_list
)) {
782 aer
= STAILQ_FIRST(&sc
->aer_list
);
783 STAILQ_REMOVE_HEAD(&sc
->aer_list
, link
);
786 pthread_mutex_unlock(&sc
->aer_mtx
);
788 pci_nvme_aer_reset(sc
);
792 pci_nvme_aer_available(struct pci_nvme_softc
*sc
)
795 return (sc
->aer_count
!= 0);
799 pci_nvme_aer_limit_reached(struct pci_nvme_softc
*sc
)
801 struct nvme_controller_data
*cd
= &sc
->ctrldata
;
803 /* AERL is a zero based value while aer_count is one's based */
804 return (sc
->aer_count
== (cd
->aerl
+ 1U));
808 * Add an Async Event Request
810 * Stores an AER to be returned later if the Controller needs to notify the
812 * Note that while the NVMe spec doesn't require Controllers to return AER's
813 * in order, this implementation does preserve the order.
816 pci_nvme_aer_add(struct pci_nvme_softc
*sc
, uint16_t cid
)
818 struct pci_nvme_aer
*aer
= NULL
;
820 aer
= calloc(1, sizeof(struct pci_nvme_aer
));
824 /* Save the Command ID for use in the completion message */
827 pthread_mutex_lock(&sc
->aer_mtx
);
829 STAILQ_INSERT_TAIL(&sc
->aer_list
, aer
, link
);
830 pthread_mutex_unlock(&sc
->aer_mtx
);
836 * Get an Async Event Request structure
838 * Returns a pointer to an AER previously submitted by the host or NULL if
839 * no AER's exist. Caller is responsible for freeing the returned struct.
841 static struct pci_nvme_aer
*
842 pci_nvme_aer_get(struct pci_nvme_softc
*sc
)
844 struct pci_nvme_aer
*aer
= NULL
;
846 pthread_mutex_lock(&sc
->aer_mtx
);
847 aer
= STAILQ_FIRST(&sc
->aer_list
);
849 STAILQ_REMOVE_HEAD(&sc
->aer_list
, link
);
852 pthread_mutex_unlock(&sc
->aer_mtx
);
858 pci_nvme_aen_reset(struct pci_nvme_softc
*sc
)
862 memset(sc
->aen
, 0, PCI_NVME_AE_TYPE_MAX
* sizeof(struct pci_nvme_aen
));
864 for (atype
= 0; atype
< PCI_NVME_AE_TYPE_MAX
; atype
++) {
865 sc
->aen
[atype
].atype
= atype
;
870 pci_nvme_aen_init(struct pci_nvme_softc
*sc
)
874 pci_nvme_aen_reset(sc
);
876 pthread_mutex_init(&sc
->aen_mtx
, NULL
);
877 pthread_create(&sc
->aen_tid
, NULL
, aen_thr
, sc
);
878 snprintf(nstr
, sizeof(nstr
), "nvme-aen-%d:%d", sc
->nsc_pi
->pi_slot
,
879 sc
->nsc_pi
->pi_func
);
880 pthread_set_name_np(sc
->aen_tid
, nstr
);
/* Tear down AEN state on controller reset (drops any pending events). */
static void
pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
{

	pci_nvme_aen_reset(sc);
}
890 /* Notify the AEN thread of pending work */
892 pci_nvme_aen_notify(struct pci_nvme_softc
*sc
)
895 pthread_cond_signal(&sc
->aen_cond
);
899 * Post an Asynchronous Event Notification
902 pci_nvme_aen_post(struct pci_nvme_softc
*sc
, pci_nvme_async_type atype
,
905 struct pci_nvme_aen
*aen
;
907 if (atype
>= PCI_NVME_AE_TYPE_MAX
) {
911 pthread_mutex_lock(&sc
->aen_mtx
);
912 aen
= &sc
->aen
[atype
];
914 /* Has the controller already posted an event of this type? */
916 pthread_mutex_unlock(&sc
->aen_mtx
);
920 aen
->event_data
= event_data
;
922 pthread_mutex_unlock(&sc
->aen_mtx
);
924 pci_nvme_aen_notify(sc
);
930 pci_nvme_aen_process(struct pci_nvme_softc
*sc
)
932 struct pci_nvme_aer
*aer
;
933 struct pci_nvme_aen
*aen
;
934 pci_nvme_async_type atype
;
943 assert(pthread_mutex_isowned_np(&sc
->aen_mtx
));
944 for (atype
= 0; atype
< PCI_NVME_AE_TYPE_MAX
; atype
++) {
945 aen
= &sc
->aen
[atype
];
946 /* Previous iterations may have depleted the available AER's */
947 if (!pci_nvme_aer_available(sc
)) {
948 DPRINTF("%s: no AER", __func__
);
953 DPRINTF("%s: no AEN posted for atype=%#x", __func__
, atype
);
957 status
= NVME_SC_SUCCESS
;
959 /* Is the event masked? */
961 sc
->feat
[NVME_FEAT_ASYNC_EVENT_CONFIGURATION
].cdw11
;
963 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__
, atype
, mask
, aen
->event_data
);
965 case PCI_NVME_AE_TYPE_ERROR
:
966 lid
= NVME_LOG_ERROR
;
968 case PCI_NVME_AE_TYPE_SMART
:
970 if ((mask
& aen
->event_data
) == 0)
972 lid
= NVME_LOG_HEALTH_INFORMATION
;
974 case PCI_NVME_AE_TYPE_NOTICE
:
975 if (aen
->event_data
>= PCI_NVME_AEI_NOTICE_MAX
) {
976 EPRINTLN("%s unknown AEN notice type %u",
977 __func__
, aen
->event_data
);
978 status
= NVME_SC_INTERNAL_DEVICE_ERROR
;
982 if ((PCI_NVME_AEI_NOTICE_MASK(aen
->event_data
) & mask
) == 0)
984 switch (aen
->event_data
) {
985 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED
:
986 lid
= NVME_LOG_CHANGED_NAMESPACE
;
988 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION
:
989 lid
= NVME_LOG_FIRMWARE_SLOT
;
991 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE
:
992 lid
= NVME_LOG_TELEMETRY_CONTROLLER_INITIATED
;
994 case PCI_NVME_AEI_NOTICE_ANA_CHANGE
:
995 lid
= NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS
;
997 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE
:
998 lid
= NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE
;
1000 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT
:
1001 lid
= NVME_LOG_LBA_STATUS_INFORMATION
;
1003 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE
:
1004 lid
= NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE
;
1012 EPRINTLN("%s unknown AEN type %u", __func__
, atype
);
1013 status
= NVME_SC_INTERNAL_DEVICE_ERROR
;
1018 aer
= pci_nvme_aer_get(sc
);
1019 assert(aer
!= NULL
);
1021 DPRINTF("%s: CID=%#x CDW0=%#x", __func__
, aer
->cid
, (lid
<< 16) | (aen
->event_data
<< 8) | atype
);
1022 pci_nvme_cq_update(sc
, &sc
->compl_queues
[0],
1023 (lid
<< 16) | (aen
->event_data
<< 8) | atype
, /* cdw0 */
1028 aen
->event_data
= 0;
1029 aen
->posted
= false;
1031 pci_generate_msix(sc
->nsc_pi
, 0);
1038 struct pci_nvme_softc
*sc
;
1042 pthread_mutex_lock(&sc
->aen_mtx
);
1044 pci_nvme_aen_process(sc
);
1045 pthread_cond_wait(&sc
->aen_cond
, &sc
->aen_mtx
);
1047 #ifdef __FreeBSD__ /* Smatch spots unreachable code */
1048 pthread_mutex_unlock(&sc
->aen_mtx
);
1056 pci_nvme_reset_locked(struct pci_nvme_softc
*sc
)
1060 DPRINTF("%s", __func__
);
1062 sc
->regs
.cap_lo
= (ZERO_BASED(sc
->max_qentries
) & NVME_CAP_LO_REG_MQES_MASK
) |
1063 (1 << NVME_CAP_LO_REG_CQR_SHIFT
) |
1064 (60 << NVME_CAP_LO_REG_TO_SHIFT
);
1066 sc
->regs
.cap_hi
= 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT
;
1068 sc
->regs
.vs
= NVME_REV(1,4); /* NVMe v1.4 */
1072 assert(sc
->submit_queues
!= NULL
);
1074 for (i
= 0; i
< sc
->num_squeues
+ 1; i
++) {
1075 sc
->submit_queues
[i
].qbase
= NULL
;
1076 sc
->submit_queues
[i
].size
= 0;
1077 sc
->submit_queues
[i
].cqid
= 0;
1078 sc
->submit_queues
[i
].tail
= 0;
1079 sc
->submit_queues
[i
].head
= 0;
1082 assert(sc
->compl_queues
!= NULL
);
1084 for (i
= 0; i
< sc
->num_cqueues
+ 1; i
++) {
1085 sc
->compl_queues
[i
].qbase
= NULL
;
1086 sc
->compl_queues
[i
].size
= 0;
1087 sc
->compl_queues
[i
].tail
= 0;
1088 sc
->compl_queues
[i
].head
= 0;
1091 sc
->num_q_is_set
= false;
1093 pci_nvme_aer_destroy(sc
);
1094 pci_nvme_aen_destroy(sc
);
1097 * Clear CSTS.RDY last to prevent the host from enabling Controller
1098 * before cleanup completes
1104 pci_nvme_reset(struct pci_nvme_softc
*sc
)
1106 pthread_mutex_lock(&sc
->mtx
);
1107 pci_nvme_reset_locked(sc
);
1108 pthread_mutex_unlock(&sc
->mtx
);
1112 pci_nvme_init_controller(struct vmctx
*ctx
, struct pci_nvme_softc
*sc
)
1114 uint16_t acqs
, asqs
;
1116 DPRINTF("%s", __func__
);
1119 * NVMe 2.0 states that "enabling a controller while this field is
1120 * cleared to 0h produces undefined results" for both ACQS and
1121 * ASQS. If zero, set CFS and do not become ready.
1123 asqs
= ONE_BASED(sc
->regs
.aqa
& NVME_AQA_REG_ASQS_MASK
);
1125 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__
,
1126 asqs
- 1, sc
->regs
.aqa
);
1127 sc
->regs
.csts
|= NVME_CSTS_CFS
;
1130 sc
->submit_queues
[0].size
= asqs
;
1131 sc
->submit_queues
[0].qbase
= vm_map_gpa(ctx
, sc
->regs
.asq
,
1132 sizeof(struct nvme_command
) * asqs
);
1133 if (sc
->submit_queues
[0].qbase
== NULL
) {
1134 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__
,
1136 sc
->regs
.csts
|= NVME_CSTS_CFS
;
1140 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1141 __func__
, sc
->regs
.asq
, sc
->submit_queues
[0].qbase
);
1143 acqs
= ONE_BASED((sc
->regs
.aqa
>> NVME_AQA_REG_ACQS_SHIFT
) &
1144 NVME_AQA_REG_ACQS_MASK
);
1146 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__
,
1147 acqs
- 1, sc
->regs
.aqa
);
1148 sc
->regs
.csts
|= NVME_CSTS_CFS
;
1151 sc
->compl_queues
[0].size
= acqs
;
1152 sc
->compl_queues
[0].qbase
= vm_map_gpa(ctx
, sc
->regs
.acq
,
1153 sizeof(struct nvme_completion
) * acqs
);
1154 if (sc
->compl_queues
[0].qbase
== NULL
) {
1155 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__
,
1157 sc
->regs
.csts
|= NVME_CSTS_CFS
;
1160 sc
->compl_queues
[0].intr_en
= NVME_CQ_INTEN
;
1162 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1163 __func__
, sc
->regs
.acq
, sc
->compl_queues
[0].qbase
);
1169 nvme_prp_memcpy(struct vmctx
*ctx
, uint64_t prp1
, uint64_t prp2
, uint8_t *b
,
1170 size_t len
, enum nvme_copy_dir dir
)
1175 if (len
> (8 * 1024)) {
1179 /* Copy from the start of prp1 to the end of the physical page */
1180 bytes
= PAGE_SIZE
- (prp1
& PAGE_MASK
);
1181 bytes
= MIN(bytes
, len
);
1183 p
= vm_map_gpa(ctx
, prp1
, bytes
);
1188 if (dir
== NVME_COPY_TO_PRP
)
1189 memcpy(p
, b
, bytes
);
1191 memcpy(b
, p
, bytes
);
1200 len
= MIN(len
, PAGE_SIZE
);
1202 p
= vm_map_gpa(ctx
, prp2
, len
);
1207 if (dir
== NVME_COPY_TO_PRP
)
1216 * Write a Completion Queue Entry update
1218 * Write the completion and update the doorbell value
1221 pci_nvme_cq_update(struct pci_nvme_softc
*sc
,
1222 struct nvme_completion_queue
*cq
,
1228 struct nvme_submission_queue
*sq
= &sc
->submit_queues
[sqid
];
1229 struct nvme_completion
*cqe
;
1231 assert(cq
->qbase
!= NULL
);
1233 pthread_mutex_lock(&cq
->mtx
);
1235 cqe
= &cq
->qbase
[cq
->tail
];
1237 /* Flip the phase bit */
1238 status
|= (cqe
->status
^ NVME_STATUS_P
) & NVME_STATUS_P_MASK
;
1241 cqe
->sqhd
= sq
->head
;
1244 cqe
->status
= status
;
1247 if (cq
->tail
>= cq
->size
) {
1251 pthread_mutex_unlock(&cq
->mtx
);
1255 nvme_opc_delete_io_sq(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1256 struct nvme_completion
* compl)
1258 uint16_t qid
= command
->cdw10
& 0xffff;
1260 DPRINTF("%s DELETE_IO_SQ %u", __func__
, qid
);
1261 if (qid
== 0 || qid
> sc
->num_squeues
||
1262 (sc
->submit_queues
[qid
].qbase
== NULL
)) {
1263 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1264 __func__
, qid
, sc
->num_squeues
);
1265 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1266 NVME_SC_INVALID_QUEUE_IDENTIFIER
);
1270 sc
->submit_queues
[qid
].qbase
= NULL
;
1271 sc
->submit_queues
[qid
].cqid
= 0;
1272 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1277 nvme_opc_create_io_sq(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1278 struct nvme_completion
* compl)
1280 if (command
->cdw11
& NVME_CMD_CDW11_PC
) {
1281 uint16_t qid
= command
->cdw10
& 0xffff;
1282 struct nvme_submission_queue
*nsq
;
1284 if ((qid
== 0) || (qid
> sc
->num_squeues
) ||
1285 (sc
->submit_queues
[qid
].qbase
!= NULL
)) {
1286 WPRINTF("%s queue index %u > num_squeues %u",
1287 __func__
, qid
, sc
->num_squeues
);
1288 pci_nvme_status_tc(&compl->status
,
1289 NVME_SCT_COMMAND_SPECIFIC
,
1290 NVME_SC_INVALID_QUEUE_IDENTIFIER
);
1294 nsq
= &sc
->submit_queues
[qid
];
1295 nsq
->size
= ONE_BASED((command
->cdw10
>> 16) & 0xffff);
1296 DPRINTF("%s size=%u (max=%u)", __func__
, nsq
->size
, sc
->max_qentries
);
1297 if ((nsq
->size
< 2) || (nsq
->size
> sc
->max_qentries
)) {
1299 * Queues must specify at least two entries
1300 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1301 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1303 pci_nvme_status_tc(&compl->status
,
1304 NVME_SCT_COMMAND_SPECIFIC
,
1305 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED
);
1308 nsq
->head
= nsq
->tail
= 0;
1310 nsq
->cqid
= (command
->cdw11
>> 16) & 0xffff;
1311 if ((nsq
->cqid
== 0) || (nsq
->cqid
> sc
->num_cqueues
)) {
1312 pci_nvme_status_tc(&compl->status
,
1313 NVME_SCT_COMMAND_SPECIFIC
,
1314 NVME_SC_INVALID_QUEUE_IDENTIFIER
);
1318 if (sc
->compl_queues
[nsq
->cqid
].qbase
== NULL
) {
1319 pci_nvme_status_tc(&compl->status
,
1320 NVME_SCT_COMMAND_SPECIFIC
,
1321 NVME_SC_COMPLETION_QUEUE_INVALID
);
1325 nsq
->qpriority
= (command
->cdw11
>> 1) & 0x03;
1327 nsq
->qbase
= vm_map_gpa(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1328 sizeof(struct nvme_command
) * (size_t)nsq
->size
);
1330 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__
,
1331 qid
, nsq
->size
, nsq
->qbase
, nsq
->cqid
);
1333 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1335 DPRINTF("%s completed creating IOSQ qid %u",
1339 * Guest sent non-cont submission queue request.
1340 * This setting is unsupported by this emulation.
1342 WPRINTF("%s unsupported non-contig (list-based) "
1343 "create i/o submission queue", __func__
);
1345 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1351 nvme_opc_delete_io_cq(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1352 struct nvme_completion
* compl)
1354 uint16_t qid
= command
->cdw10
& 0xffff;
1357 DPRINTF("%s DELETE_IO_CQ %u", __func__
, qid
);
1358 if (qid
== 0 || qid
> sc
->num_cqueues
||
1359 (sc
->compl_queues
[qid
].qbase
== NULL
)) {
1360 WPRINTF("%s queue index %u / num_cqueues %u",
1361 __func__
, qid
, sc
->num_cqueues
);
1362 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1363 NVME_SC_INVALID_QUEUE_IDENTIFIER
);
1367 /* Deleting an Active CQ is an error */
1368 for (sqid
= 1; sqid
< sc
->num_squeues
+ 1; sqid
++)
1369 if (sc
->submit_queues
[sqid
].cqid
== qid
) {
1370 pci_nvme_status_tc(&compl->status
,
1371 NVME_SCT_COMMAND_SPECIFIC
,
1372 NVME_SC_INVALID_QUEUE_DELETION
);
1376 sc
->compl_queues
[qid
].qbase
= NULL
;
1377 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1382 nvme_opc_create_io_cq(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1383 struct nvme_completion
* compl)
1385 struct nvme_completion_queue
*ncq
;
1386 uint16_t qid
= command
->cdw10
& 0xffff;
1388 /* Only support Physically Contiguous queues */
1389 if ((command
->cdw11
& NVME_CMD_CDW11_PC
) == 0) {
1390 WPRINTF("%s unsupported non-contig (list-based) "
1391 "create i/o completion queue",
1394 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1398 if ((qid
== 0) || (qid
> sc
->num_cqueues
) ||
1399 (sc
->compl_queues
[qid
].qbase
!= NULL
)) {
1400 WPRINTF("%s queue index %u > num_cqueues %u",
1401 __func__
, qid
, sc
->num_cqueues
);
1402 pci_nvme_status_tc(&compl->status
,
1403 NVME_SCT_COMMAND_SPECIFIC
,
1404 NVME_SC_INVALID_QUEUE_IDENTIFIER
);
1408 ncq
= &sc
->compl_queues
[qid
];
1409 ncq
->intr_en
= (command
->cdw11
& NVME_CMD_CDW11_IEN
) >> 1;
1410 ncq
->intr_vec
= (command
->cdw11
>> 16) & 0xffff;
1411 if (ncq
->intr_vec
> (sc
->max_queues
+ 1)) {
1412 pci_nvme_status_tc(&compl->status
,
1413 NVME_SCT_COMMAND_SPECIFIC
,
1414 NVME_SC_INVALID_INTERRUPT_VECTOR
);
1418 ncq
->size
= ONE_BASED((command
->cdw10
>> 16) & 0xffff);
1419 if ((ncq
->size
< 2) || (ncq
->size
> sc
->max_qentries
)) {
1421 * Queues must specify at least two entries
1422 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1423 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1425 pci_nvme_status_tc(&compl->status
,
1426 NVME_SCT_COMMAND_SPECIFIC
,
1427 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED
);
1430 ncq
->head
= ncq
->tail
= 0;
1431 ncq
->qbase
= vm_map_gpa(sc
->nsc_pi
->pi_vmctx
,
1433 sizeof(struct nvme_command
) * (size_t)ncq
->size
);
1435 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1442 nvme_opc_get_log_page(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1443 struct nvme_completion
* compl)
1449 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1452 * Command specifies the number of dwords to return in fields NUMDU
1453 * and NUMDL. This is a zero-based value.
1455 logpage
= command
->cdw10
& 0xFF;
1456 logsize
= ((command
->cdw11
<< 16) | (command
->cdw10
>> 16)) + 1;
1457 logsize
*= sizeof(uint32_t);
1458 logoff
= ((uint64_t)(command
->cdw13
) << 32) | command
->cdw12
;
1460 DPRINTF("%s log page %u len %u", __func__
, logpage
, logsize
);
1463 case NVME_LOG_ERROR
:
1464 if (logoff
>= sizeof(sc
->err_log
)) {
1465 pci_nvme_status_genc(&compl->status
,
1466 NVME_SC_INVALID_FIELD
);
1470 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1471 command
->prp2
, (uint8_t *)&sc
->err_log
+ logoff
,
1472 MIN(logsize
- logoff
, sizeof(sc
->err_log
)),
1475 case NVME_LOG_HEALTH_INFORMATION
:
1476 if (logoff
>= sizeof(sc
->health_log
)) {
1477 pci_nvme_status_genc(&compl->status
,
1478 NVME_SC_INVALID_FIELD
);
1482 pthread_mutex_lock(&sc
->mtx
);
1483 memcpy(&sc
->health_log
.data_units_read
, &sc
->read_data_units
,
1484 sizeof(sc
->health_log
.data_units_read
));
1485 memcpy(&sc
->health_log
.data_units_written
, &sc
->write_data_units
,
1486 sizeof(sc
->health_log
.data_units_written
));
1487 memcpy(&sc
->health_log
.host_read_commands
, &sc
->read_commands
,
1488 sizeof(sc
->health_log
.host_read_commands
));
1489 memcpy(&sc
->health_log
.host_write_commands
, &sc
->write_commands
,
1490 sizeof(sc
->health_log
.host_write_commands
));
1491 pthread_mutex_unlock(&sc
->mtx
);
1493 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1494 command
->prp2
, (uint8_t *)&sc
->health_log
+ logoff
,
1495 MIN(logsize
- logoff
, sizeof(sc
->health_log
)),
1498 case NVME_LOG_FIRMWARE_SLOT
:
1499 if (logoff
>= sizeof(sc
->fw_log
)) {
1500 pci_nvme_status_genc(&compl->status
,
1501 NVME_SC_INVALID_FIELD
);
1505 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1506 command
->prp2
, (uint8_t *)&sc
->fw_log
+ logoff
,
1507 MIN(logsize
- logoff
, sizeof(sc
->fw_log
)),
1510 case NVME_LOG_CHANGED_NAMESPACE
:
1511 if (logoff
>= sizeof(sc
->ns_log
)) {
1512 pci_nvme_status_genc(&compl->status
,
1513 NVME_SC_INVALID_FIELD
);
1517 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1518 command
->prp2
, (uint8_t *)&sc
->ns_log
+ logoff
,
1519 MIN(logsize
- logoff
, sizeof(sc
->ns_log
)),
1521 memset(&sc
->ns_log
, 0, sizeof(sc
->ns_log
));
1524 DPRINTF("%s get log page %x command not supported",
1527 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1528 NVME_SC_INVALID_LOG_PAGE
);
1535 nvme_opc_identify(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1536 struct nvme_completion
* compl)
1541 DPRINTF("%s identify 0x%x nsid 0x%x", __func__
,
1542 command
->cdw10
& 0xFF, command
->nsid
);
1545 pci_nvme_status_genc(&status
, NVME_SC_SUCCESS
);
1547 switch (command
->cdw10
& 0xFF) {
1548 case 0x00: /* return Identify Namespace data structure */
1549 /* Global NS only valid with NS Management */
1550 if (command
->nsid
== NVME_GLOBAL_NAMESPACE_TAG
) {
1551 pci_nvme_status_genc(&status
,
1552 NVME_SC_INVALID_NAMESPACE_OR_FORMAT
);
1555 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1556 command
->prp2
, (uint8_t *)&sc
->nsdata
, sizeof(sc
->nsdata
),
1559 case 0x01: /* return Identify Controller data structure */
1560 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1561 command
->prp2
, (uint8_t *)&sc
->ctrldata
,
1562 sizeof(sc
->ctrldata
),
1565 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1566 dest
= vm_map_gpa(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1567 sizeof(uint32_t) * 1024);
1568 /* All unused entries shall be zero */
1569 memset(dest
, 0, sizeof(uint32_t) * 1024);
1570 ((uint32_t *)dest
)[0] = 1;
1572 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1573 if (command
->nsid
!= 1) {
1574 pci_nvme_status_genc(&status
,
1575 NVME_SC_INVALID_NAMESPACE_OR_FORMAT
);
1578 dest
= vm_map_gpa(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1579 sizeof(uint32_t) * 1024);
1580 /* All bytes after the descriptor shall be zero */
1581 memset(dest
, 0, sizeof(uint32_t) * 1024);
1583 /* Return NIDT=1 (i.e. EUI64) descriptor */
1584 ((uint8_t *)dest
)[0] = 1;
1585 ((uint8_t *)dest
)[1] = sizeof(uint64_t);
1586 memcpy(((uint8_t *)dest
) + 4, sc
->nsdata
.eui64
, sizeof(uint64_t));
1590 * Controller list is optional but used by UNH tests. Return
1591 * a valid but empty list.
1593 dest
= vm_map_gpa(sc
->nsc_pi
->pi_vmctx
, command
->prp1
,
1594 sizeof(uint16_t) * 2048);
1595 memset(dest
, 0, sizeof(uint16_t) * 2048);
1598 DPRINTF("%s unsupported identify command requested 0x%x",
1599 __func__
, command
->cdw10
& 0xFF);
1600 pci_nvme_status_genc(&status
, NVME_SC_INVALID_FIELD
);
1604 compl->status
= status
;
1609 nvme_fid_to_name(uint8_t fid
)
1614 case NVME_FEAT_ARBITRATION
:
1615 name
= "Arbitration";
1617 case NVME_FEAT_POWER_MANAGEMENT
:
1618 name
= "Power Management";
1620 case NVME_FEAT_LBA_RANGE_TYPE
:
1621 name
= "LBA Range Type";
1623 case NVME_FEAT_TEMPERATURE_THRESHOLD
:
1624 name
= "Temperature Threshold";
1626 case NVME_FEAT_ERROR_RECOVERY
:
1627 name
= "Error Recovery";
1629 case NVME_FEAT_VOLATILE_WRITE_CACHE
:
1630 name
= "Volatile Write Cache";
1632 case NVME_FEAT_NUMBER_OF_QUEUES
:
1633 name
= "Number of Queues";
1635 case NVME_FEAT_INTERRUPT_COALESCING
:
1636 name
= "Interrupt Coalescing";
1638 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION
:
1639 name
= "Interrupt Vector Configuration";
1641 case NVME_FEAT_WRITE_ATOMICITY
:
1642 name
= "Write Atomicity Normal";
1644 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION
:
1645 name
= "Asynchronous Event Configuration";
1647 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION
:
1648 name
= "Autonomous Power State Transition";
1650 case NVME_FEAT_HOST_MEMORY_BUFFER
:
1651 name
= "Host Memory Buffer";
1653 case NVME_FEAT_TIMESTAMP
:
1656 case NVME_FEAT_KEEP_ALIVE_TIMER
:
1657 name
= "Keep Alive Timer";
1659 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT
:
1660 name
= "Host Controlled Thermal Management";
1662 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG
:
1663 name
= "Non-Operation Power State Config";
1665 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG
:
1666 name
= "Read Recovery Level Config";
1668 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG
:
1669 name
= "Predictable Latency Mode Config";
1671 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW
:
1672 name
= "Predictable Latency Mode Window";
1674 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES
:
1675 name
= "LBA Status Information Report Interval";
1677 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT
:
1678 name
= "Host Behavior Support";
1680 case NVME_FEAT_SANITIZE_CONFIG
:
1681 name
= "Sanitize Config";
1683 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION
:
1684 name
= "Endurance Group Event Configuration";
1686 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER
:
1687 name
= "Software Progress Marker";
1689 case NVME_FEAT_HOST_IDENTIFIER
:
1690 name
= "Host Identifier";
1692 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK
:
1693 name
= "Reservation Notification Mask";
1695 case NVME_FEAT_RESERVATION_PERSISTENCE
:
1696 name
= "Reservation Persistence";
1698 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG
:
1699 name
= "Namespace Write Protection Config";
1710 nvme_feature_invalid_cb(struct pci_nvme_softc
*sc __unused
,
1711 struct nvme_feature_obj
*feat __unused
,
1712 struct nvme_command
*command __unused
,
1713 struct nvme_completion
*compl)
1715 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1719 nvme_feature_iv_config(struct pci_nvme_softc
*sc
,
1720 struct nvme_feature_obj
*feat __unused
,
1721 struct nvme_command
*command
,
1722 struct nvme_completion
*compl)
1725 uint32_t cdw11
= command
->cdw11
;
1729 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1731 iv
= cdw11
& 0xffff;
1732 cd
= cdw11
& (1 << 16);
1734 if (iv
> (sc
->max_queues
+ 1)) {
1738 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1739 if ((iv
== 0) && !cd
)
1742 /* Requested Interrupt Vector must be used by a CQ */
1743 for (i
= 0; i
< sc
->num_cqueues
+ 1; i
++) {
1744 if (sc
->compl_queues
[i
].intr_vec
== iv
) {
1745 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1750 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000)
1752 nvme_feature_async_event(struct pci_nvme_softc
*sc __unused
,
1753 struct nvme_feature_obj
*feat __unused
,
1754 struct nvme_command
*command
,
1755 struct nvme_completion
*compl)
1757 if (command
->cdw11
& NVME_ASYNC_EVENT_ENDURANCE_GROUP
)
1758 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1761 #define NVME_TEMP_THRESH_OVER 0
1762 #define NVME_TEMP_THRESH_UNDER 1
1764 nvme_feature_temperature(struct pci_nvme_softc
*sc
,
1765 struct nvme_feature_obj
*feat __unused
,
1766 struct nvme_command
*command
,
1767 struct nvme_completion
*compl)
1769 uint16_t tmpth
; /* Temperature Threshold */
1770 uint8_t tmpsel
; /* Threshold Temperature Select */
1771 uint8_t thsel
; /* Threshold Type Select */
1772 bool set_crit
= false;
1775 tmpth
= command
->cdw11
& 0xffff;
1776 tmpsel
= (command
->cdw11
>> 16) & 0xf;
1777 thsel
= (command
->cdw11
>> 20) & 0x3;
1779 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__
, tmpth
, tmpsel
, thsel
);
1781 /* Check for unsupported values */
1782 if (((tmpsel
!= 0) && (tmpsel
!= 0xf)) ||
1783 (thsel
> NVME_TEMP_THRESH_UNDER
)) {
1784 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1788 if (((thsel
== NVME_TEMP_THRESH_OVER
) && (NVME_TEMPERATURE
>= tmpth
)) ||
1789 ((thsel
== NVME_TEMP_THRESH_UNDER
) && (NVME_TEMPERATURE
<= tmpth
)))
1792 pthread_mutex_lock(&sc
->mtx
);
1794 sc
->health_log
.critical_warning
|=
1795 NVME_CRIT_WARN_ST_TEMPERATURE
;
1797 sc
->health_log
.critical_warning
&=
1798 ~NVME_CRIT_WARN_ST_TEMPERATURE
;
1799 pthread_mutex_unlock(&sc
->mtx
);
1801 report_crit
= sc
->feat
[NVME_FEAT_ASYNC_EVENT_CONFIGURATION
].cdw11
&
1802 NVME_CRIT_WARN_ST_TEMPERATURE
;
1804 if (set_crit
&& report_crit
)
1805 pci_nvme_aen_post(sc
, PCI_NVME_AE_TYPE_SMART
,
1806 sc
->health_log
.critical_warning
);
1808 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__
, set_crit
? 'T':'F', sc
->health_log
.critical_warning
, compl->status
);
1812 nvme_feature_num_queues(struct pci_nvme_softc
*sc
,
1813 struct nvme_feature_obj
*feat __unused
,
1814 struct nvme_command
*command
,
1815 struct nvme_completion
*compl)
1817 uint16_t nqr
; /* Number of Queues Requested */
1819 if (sc
->num_q_is_set
) {
1820 WPRINTF("%s: Number of Queues already set", __func__
);
1821 pci_nvme_status_genc(&compl->status
,
1822 NVME_SC_COMMAND_SEQUENCE_ERROR
);
1826 nqr
= command
->cdw11
& 0xFFFF;
1827 if (nqr
== 0xffff) {
1828 WPRINTF("%s: Illegal NSQR value %#x", __func__
, nqr
);
1829 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1833 sc
->num_squeues
= ONE_BASED(nqr
);
1834 if (sc
->num_squeues
> sc
->max_queues
) {
1835 DPRINTF("NSQR=%u is greater than max %u", sc
->num_squeues
,
1837 sc
->num_squeues
= sc
->max_queues
;
1840 nqr
= (command
->cdw11
>> 16) & 0xFFFF;
1841 if (nqr
== 0xffff) {
1842 WPRINTF("%s: Illegal NCQR value %#x", __func__
, nqr
);
1843 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1847 sc
->num_cqueues
= ONE_BASED(nqr
);
1848 if (sc
->num_cqueues
> sc
->max_queues
) {
1849 DPRINTF("NCQR=%u is greater than max %u", sc
->num_cqueues
,
1851 sc
->num_cqueues
= sc
->max_queues
;
1854 /* Patch the command value which will be saved on callback's return */
1855 command
->cdw11
= NVME_FEATURE_NUM_QUEUES(sc
);
1856 compl->cdw0
= NVME_FEATURE_NUM_QUEUES(sc
);
1858 sc
->num_q_is_set
= true;
1862 nvme_opc_set_features(struct pci_nvme_softc
*sc
, struct nvme_command
*command
,
1863 struct nvme_completion
*compl)
1865 struct nvme_feature_obj
*feat
;
1866 uint32_t nsid
= command
->nsid
;
1867 uint8_t fid
= NVMEV(NVME_FEAT_SET_FID
, command
->cdw10
);
1868 bool sv
= NVMEV(NVME_FEAT_SET_SV
, command
->cdw10
);
1870 DPRINTF("%s: Feature ID 0x%x (%s)", __func__
, fid
, nvme_fid_to_name(fid
));
1872 if (fid
>= NVME_FID_MAX
) {
1873 DPRINTF("%s invalid feature 0x%x", __func__
, fid
);
1874 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1879 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1880 NVME_SC_FEATURE_NOT_SAVEABLE
);
1884 feat
= &sc
->feat
[fid
];
1886 if (feat
->namespace_specific
&& (nsid
== NVME_GLOBAL_NAMESPACE_TAG
)) {
1887 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1891 if (!feat
->namespace_specific
&&
1892 !((nsid
== 0) || (nsid
== NVME_GLOBAL_NAMESPACE_TAG
))) {
1893 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1894 NVME_SC_FEATURE_NOT_NS_SPECIFIC
);
1899 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1902 feat
->set(sc
, feat
, command
, compl);
1904 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1905 NVME_SC_FEATURE_NOT_CHANGEABLE
);
1909 DPRINTF("%s: status=%#x cdw11=%#x", __func__
, compl->status
, command
->cdw11
);
1910 if (compl->status
== NVME_SC_SUCCESS
) {
1911 feat
->cdw11
= command
->cdw11
;
1912 if ((fid
== NVME_FEAT_ASYNC_EVENT_CONFIGURATION
) &&
1913 (command
->cdw11
!= 0))
1914 pci_nvme_aen_notify(sc
);
1920 #define NVME_FEATURES_SEL_SUPPORTED 0x3
1921 #define NVME_FEATURES_NS_SPECIFIC (1 << 1)
1924 nvme_opc_get_features(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1925 struct nvme_completion
* compl)
1927 struct nvme_feature_obj
*feat
;
1928 uint8_t fid
= command
->cdw10
& 0xFF;
1929 uint8_t sel
= (command
->cdw10
>> 8) & 0x7;
1931 DPRINTF("%s: Feature ID 0x%x (%s)", __func__
, fid
, nvme_fid_to_name(fid
));
1933 if (fid
>= NVME_FID_MAX
) {
1934 DPRINTF("%s invalid feature 0x%x", __func__
, fid
);
1935 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1940 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1942 feat
= &sc
->feat
[fid
];
1944 feat
->get(sc
, feat
, command
, compl);
1947 if (compl->status
== NVME_SC_SUCCESS
) {
1948 if ((sel
== NVME_FEATURES_SEL_SUPPORTED
) && feat
->namespace_specific
)
1949 compl->cdw0
= NVME_FEATURES_NS_SPECIFIC
;
1951 compl->cdw0
= feat
->cdw11
;
1958 nvme_opc_format_nvm(struct pci_nvme_softc
* sc
, struct nvme_command
* command
,
1959 struct nvme_completion
* compl)
1961 uint8_t ses
, lbaf
, pi
;
1963 /* Only supports Secure Erase Setting - User Data Erase */
1964 ses
= (command
->cdw10
>> 9) & 0x7;
1966 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1970 /* Only supports a single LBA Format */
1971 lbaf
= command
->cdw10
& 0xf;
1973 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
1974 NVME_SC_INVALID_FORMAT
);
1978 /* Doesn't support Protection Infomation */
1979 pi
= (command
->cdw10
>> 5) & 0x7;
1981 pci_nvme_status_genc(&compl->status
, NVME_SC_INVALID_FIELD
);
1985 if (sc
->nvstore
.type
== NVME_STOR_RAM
) {
1986 if (sc
->nvstore
.ctx
)
1987 free(sc
->nvstore
.ctx
);
1988 sc
->nvstore
.ctx
= calloc(1, sc
->nvstore
.size
);
1989 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
1991 struct pci_nvme_ioreq
*req
;
1994 req
= pci_nvme_get_ioreq(sc
);
1996 pci_nvme_status_genc(&compl->status
,
1997 NVME_SC_INTERNAL_DEVICE_ERROR
);
1998 WPRINTF("%s: unable to allocate IO req", __func__
);
2001 req
->nvme_sq
= &sc
->submit_queues
[0];
2003 req
->opc
= command
->opc
;
2004 req
->cid
= command
->cid
;
2005 req
->nsid
= command
->nsid
;
2007 req
->io_req
.br_offset
= 0;
2008 req
->io_req
.br_resid
= sc
->nvstore
.size
;
2009 req
->io_req
.br_callback
= pci_nvme_io_done
;
2011 err
= blockif_delete(sc
->nvstore
.ctx
, &req
->io_req
);
2013 pci_nvme_status_genc(&compl->status
,
2014 NVME_SC_INTERNAL_DEVICE_ERROR
);
2015 pci_nvme_release_ioreq(sc
, req
);
2017 compl->status
= NVME_NO_STATUS
;
2024 nvme_opc_abort(struct pci_nvme_softc
*sc __unused
, struct nvme_command
*command
,
2025 struct nvme_completion
*compl)
2027 DPRINTF("%s submission queue %u, command ID 0x%x", __func__
,
2028 command
->cdw10
& 0xFFFF, (command
->cdw10
>> 16) & 0xFFFF);
2030 /* TODO: search for the command ID and abort it */
2033 pci_nvme_status_genc(&compl->status
, NVME_SC_SUCCESS
);
2038 nvme_opc_async_event_req(struct pci_nvme_softc
* sc
,
2039 struct nvme_command
* command
, struct nvme_completion
* compl)
2041 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__
,
2042 sc
->aer_count
, sc
->ctrldata
.aerl
, command
->cid
);
2044 /* Don't exceed the Async Event Request Limit (AERL). */
2045 if (pci_nvme_aer_limit_reached(sc
)) {
2046 pci_nvme_status_tc(&compl->status
, NVME_SCT_COMMAND_SPECIFIC
,
2047 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED
);
2051 if (pci_nvme_aer_add(sc
, command
->cid
)) {
2052 pci_nvme_status_tc(&compl->status
, NVME_SCT_GENERIC
,
2053 NVME_SC_INTERNAL_DEVICE_ERROR
);
2058 * Raise events when they happen based on the Set Features cmd.
2059 * These events happen async, so only set completion successful if
2060 * there is an event reflective of the request to get event.
2062 compl->status
= NVME_NO_STATUS
;
2063 pci_nvme_aen_notify(sc
);
2069 pci_nvme_handle_admin_cmd(struct pci_nvme_softc
* sc
, uint64_t value
)
2071 struct nvme_completion
compl;
2072 struct nvme_command
*cmd
;
2073 struct nvme_submission_queue
*sq
;
2074 struct nvme_completion_queue
*cq
;
2077 DPRINTF("%s index %u", __func__
, (uint32_t)value
);
2079 sq
= &sc
->submit_queues
[0];
2080 cq
= &sc
->compl_queues
[0];
2082 pthread_mutex_lock(&sq
->mtx
);
2085 DPRINTF("sqhead %u, tail %u", sqhead
, sq
->tail
);
2087 while (sqhead
!= atomic_load_acq_short(&sq
->tail
)) {
2088 cmd
= &(sq
->qbase
)[sqhead
];
2093 case NVME_OPC_DELETE_IO_SQ
:
2094 DPRINTF("%s command DELETE_IO_SQ", __func__
);
2095 nvme_opc_delete_io_sq(sc
, cmd
, &compl);
2097 case NVME_OPC_CREATE_IO_SQ
:
2098 DPRINTF("%s command CREATE_IO_SQ", __func__
);
2099 nvme_opc_create_io_sq(sc
, cmd
, &compl);
2101 case NVME_OPC_DELETE_IO_CQ
:
2102 DPRINTF("%s command DELETE_IO_CQ", __func__
);
2103 nvme_opc_delete_io_cq(sc
, cmd
, &compl);
2105 case NVME_OPC_CREATE_IO_CQ
:
2106 DPRINTF("%s command CREATE_IO_CQ", __func__
);
2107 nvme_opc_create_io_cq(sc
, cmd
, &compl);
2109 case NVME_OPC_GET_LOG_PAGE
:
2110 DPRINTF("%s command GET_LOG_PAGE", __func__
);
2111 nvme_opc_get_log_page(sc
, cmd
, &compl);
2113 case NVME_OPC_IDENTIFY
:
2114 DPRINTF("%s command IDENTIFY", __func__
);
2115 nvme_opc_identify(sc
, cmd
, &compl);
2117 case NVME_OPC_ABORT
:
2118 DPRINTF("%s command ABORT", __func__
);
2119 nvme_opc_abort(sc
, cmd
, &compl);
2121 case NVME_OPC_SET_FEATURES
:
2122 DPRINTF("%s command SET_FEATURES", __func__
);
2123 nvme_opc_set_features(sc
, cmd
, &compl);
2125 case NVME_OPC_GET_FEATURES
:
2126 DPRINTF("%s command GET_FEATURES", __func__
);
2127 nvme_opc_get_features(sc
, cmd
, &compl);
2129 case NVME_OPC_FIRMWARE_ACTIVATE
:
2130 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__
);
2131 pci_nvme_status_tc(&compl.status
,
2132 NVME_SCT_COMMAND_SPECIFIC
,
2133 NVME_SC_INVALID_FIRMWARE_SLOT
);
2135 case NVME_OPC_ASYNC_EVENT_REQUEST
:
2136 DPRINTF("%s command ASYNC_EVENT_REQ", __func__
);
2137 nvme_opc_async_event_req(sc
, cmd
, &compl);
2139 case NVME_OPC_FORMAT_NVM
:
2140 DPRINTF("%s command FORMAT_NVM", __func__
);
2141 if ((sc
->ctrldata
.oacs
&
2142 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT
)) == 0) {
2143 pci_nvme_status_genc(&compl.status
, NVME_SC_INVALID_OPCODE
);
2146 nvme_opc_format_nvm(sc
, cmd
, &compl);
2148 case NVME_OPC_SECURITY_SEND
:
2149 case NVME_OPC_SECURITY_RECEIVE
:
2150 case NVME_OPC_SANITIZE
:
2151 case NVME_OPC_GET_LBA_STATUS
:
2152 DPRINTF("%s command OPC=%#x (unsupported)", __func__
,
2154 /* Valid but unsupported opcodes */
2155 pci_nvme_status_genc(&compl.status
, NVME_SC_INVALID_FIELD
);
2158 DPRINTF("%s command OPC=%#X (not implemented)",
2161 pci_nvme_status_genc(&compl.status
, NVME_SC_INVALID_OPCODE
);
2163 sqhead
= (sqhead
+ 1) % sq
->size
;
2165 if (NVME_COMPLETION_VALID(compl)) {
2166 pci_nvme_cq_update(sc
, &sc
->compl_queues
[0],
2174 DPRINTF("setting sqhead %u", sqhead
);
2177 if (cq
->head
!= cq
->tail
)
2178 pci_generate_msix(sc
->nsc_pi
, 0);
2180 pthread_mutex_unlock(&sq
->mtx
);
2184 * Update the Write and Read statistics reported in SMART data
2186 * NVMe defines "data unit" as thousand's of 512 byte blocks and is rounded up.
2187 * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2188 * 512 byte blocks. Rounding up is acheived by initializing the remainder to 999.
2191 pci_nvme_stats_write_read_update(struct pci_nvme_softc
*sc
, uint8_t opc
,
2192 size_t bytes
, uint16_t status
)
2195 pthread_mutex_lock(&sc
->mtx
);
2197 case NVME_OPC_WRITE
:
2198 sc
->write_commands
++;
2199 if (status
!= NVME_SC_SUCCESS
)
2201 sc
->write_dunits_remainder
+= (bytes
/ 512);
2202 while (sc
->write_dunits_remainder
>= 1000) {
2203 sc
->write_data_units
++;
2204 sc
->write_dunits_remainder
-= 1000;
2208 sc
->read_commands
++;
2209 if (status
!= NVME_SC_SUCCESS
)
2211 sc
->read_dunits_remainder
+= (bytes
/ 512);
2212 while (sc
->read_dunits_remainder
>= 1000) {
2213 sc
->read_data_units
++;
2214 sc
->read_dunits_remainder
-= 1000;
2218 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__
, opc
);
2221 pthread_mutex_unlock(&sc
->mtx
);
2225 * Check if the combination of Starting LBA (slba) and number of blocks
2226 * exceeds the range of the underlying storage.
2228 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2229 * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2233 pci_nvme_out_of_range(struct pci_nvme_blockstore
*nvstore
, uint64_t slba
,
2236 size_t offset
, bytes
;
2238 /* Overflow check of multiplying Starting LBA by the sector size */
2239 if (slba
>> (64 - nvstore
->sectsz_bits
))
2242 offset
= slba
<< nvstore
->sectsz_bits
;
2243 bytes
= nblocks
<< nvstore
->sectsz_bits
;
2245 /* Overflow check of Number of Logical Blocks */
2246 if ((nvstore
->size
<= offset
) || ((nvstore
->size
- offset
) < bytes
))
2253 pci_nvme_append_iov_req(struct pci_nvme_softc
*sc __unused
,
2254 struct pci_nvme_ioreq
*req
, uint64_t gpaddr
, size_t size
, uint64_t offset
)
2257 bool range_is_contiguous
;
2262 if (req
->io_req
.br_iovcnt
== NVME_MAX_IOVEC
) {
2267 * Minimize the number of IOVs by concatenating contiguous address
2268 * ranges. If the IOV count is zero, there is no previous range to
2271 if (req
->io_req
.br_iovcnt
== 0)
2272 range_is_contiguous
= false;
2274 range_is_contiguous
= (req
->prev_gpaddr
+ req
->prev_size
) == gpaddr
;
2276 if (range_is_contiguous
) {
2277 iovidx
= req
->io_req
.br_iovcnt
- 1;
2279 req
->io_req
.br_iov
[iovidx
].iov_base
=
2280 paddr_guest2host(req
->sc
->nsc_pi
->pi_vmctx
,
2281 req
->prev_gpaddr
, size
);
2282 if (req
->io_req
.br_iov
[iovidx
].iov_base
== NULL
)
2285 req
->prev_size
+= size
;
2286 req
->io_req
.br_resid
+= size
;
2288 req
->io_req
.br_iov
[iovidx
].iov_len
= req
->prev_size
;
2290 iovidx
= req
->io_req
.br_iovcnt
;
2292 req
->io_req
.br_offset
= offset
;
2293 req
->io_req
.br_resid
= 0;
2294 req
->io_req
.br_param
= req
;
2297 req
->io_req
.br_iov
[iovidx
].iov_base
=
2298 paddr_guest2host(req
->sc
->nsc_pi
->pi_vmctx
,
2300 if (req
->io_req
.br_iov
[iovidx
].iov_base
== NULL
)
2303 req
->io_req
.br_iov
[iovidx
].iov_len
= size
;
2305 req
->prev_gpaddr
= gpaddr
;
2306 req
->prev_size
= size
;
2307 req
->io_req
.br_resid
+= size
;
2309 req
->io_req
.br_iovcnt
++;
2316 pci_nvme_set_completion(struct pci_nvme_softc
*sc
,
2317 struct nvme_submission_queue
*sq
, int sqid
, uint16_t cid
, uint16_t status
)
2319 struct nvme_completion_queue
*cq
= &sc
->compl_queues
[sq
->cqid
];
2321 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2322 __func__
, sqid
, sq
->cqid
, cid
, NVME_STATUS_GET_SCT(status
),
2323 NVME_STATUS_GET_SC(status
));
2325 pci_nvme_cq_update(sc
, cq
, 0, cid
, sqid
, status
);
2327 if (cq
->head
!= cq
->tail
) {
2328 if (cq
->intr_en
& NVME_CQ_INTEN
) {
2329 pci_generate_msix(sc
->nsc_pi
, cq
->intr_vec
);
2331 DPRINTF("%s: CQ%u interrupt disabled",
2332 __func__
, sq
->cqid
);
2338 pci_nvme_release_ioreq(struct pci_nvme_softc
*sc
, struct pci_nvme_ioreq
*req
)
2341 req
->nvme_sq
= NULL
;
2344 pthread_mutex_lock(&sc
->mtx
);
2346 STAILQ_INSERT_TAIL(&sc
->ioreqs_free
, req
, link
);
2349 /* when no more IO pending, can set to ready if device reset/enabled */
2350 if (sc
->pending_ios
== 0 &&
2351 NVME_CC_GET_EN(sc
->regs
.cc
) && !(NVME_CSTS_GET_RDY(sc
->regs
.csts
)))
2352 sc
->regs
.csts
|= NVME_CSTS_RDY
;
2354 pthread_mutex_unlock(&sc
->mtx
);
2356 sem_post(&sc
->iosemlock
);
2359 static struct pci_nvme_ioreq
*
2360 pci_nvme_get_ioreq(struct pci_nvme_softc
*sc
)
2362 struct pci_nvme_ioreq
*req
= NULL
;
2364 sem_wait(&sc
->iosemlock
);
2365 pthread_mutex_lock(&sc
->mtx
);
2367 req
= STAILQ_FIRST(&sc
->ioreqs_free
);
2368 assert(req
!= NULL
);
2369 STAILQ_REMOVE_HEAD(&sc
->ioreqs_free
, link
);
2375 pthread_mutex_unlock(&sc
->mtx
);
2377 req
->io_req
.br_iovcnt
= 0;
2378 req
->io_req
.br_offset
= 0;
2379 req
->io_req
.br_resid
= 0;
2380 req
->io_req
.br_param
= req
;
2381 req
->prev_gpaddr
= 0;
2388 pci_nvme_io_done(struct blockif_req
*br
, int err
)
2390 struct pci_nvme_ioreq
*req
= br
->br_param
;
2391 struct nvme_submission_queue
*sq
= req
->nvme_sq
;
2392 uint16_t code
, status
;
2394 DPRINTF("%s error %d %s", __func__
, err
, strerror(err
));
2396 /* TODO return correct error */
2397 code
= err
? NVME_SC_DATA_TRANSFER_ERROR
: NVME_SC_SUCCESS
;
2399 pci_nvme_status_genc(&status
, code
);
2401 pci_nvme_set_completion(req
->sc
, sq
, req
->sqid
, req
->cid
, status
);
2402 pci_nvme_stats_write_read_update(req
->sc
, req
->opc
,
2403 req
->bytes
, status
);
2404 pci_nvme_release_ioreq(req
->sc
, req
);
2408 * Implements the Flush command. The specification states:
2409 * If a volatile write cache is not present, Flush commands complete
2410 * successfully and have no effect
2411 * in the description of the Volatile Write Cache (VWC) field of the Identify
2412 * Controller data. Therefore, set status to Success if the command is
2413 * not supported (i.e. RAM or as indicated by the blockif).
2416 nvme_opc_flush(struct pci_nvme_softc
*sc __unused
,
2417 struct nvme_command
*cmd __unused
,
2418 struct pci_nvme_blockstore
*nvstore
,
2419 struct pci_nvme_ioreq
*req
,
2422 bool pending
= false;
2424 if (nvstore
->type
== NVME_STOR_RAM
) {
2425 pci_nvme_status_genc(status
, NVME_SC_SUCCESS
);
2429 req
->io_req
.br_callback
= pci_nvme_io_done
;
2431 err
= blockif_flush(nvstore
->ctx
, &req
->io_req
);
2437 pci_nvme_status_genc(status
, NVME_SC_SUCCESS
);
2440 pci_nvme_status_genc(status
, NVME_SC_INTERNAL_DEVICE_ERROR
);
2448 nvme_write_read_ram(struct pci_nvme_softc
*sc
,
2449 struct pci_nvme_blockstore
*nvstore
,
2450 uint64_t prp1
, uint64_t prp2
,
2451 size_t offset
, uint64_t bytes
,
2454 uint8_t *buf
= nvstore
->ctx
;
2455 enum nvme_copy_dir dir
;
2459 dir
= NVME_COPY_TO_PRP
;
2461 dir
= NVME_COPY_FROM_PRP
;
2464 if (nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, prp1
, prp2
,
2465 buf
+ offset
, bytes
, dir
))
2466 pci_nvme_status_genc(&status
,
2467 NVME_SC_DATA_TRANSFER_ERROR
);
2469 pci_nvme_status_genc(&status
, NVME_SC_SUCCESS
);
/*
 * nvme_write_read_blockif - satisfy a read/write against a blockif
 * backed namespace.  Builds the request's iov list from the guest PRP
 * entries (PRP1, then PRP2 either as a second data pointer when the
 * transfer fits in two pages, or as a pointer to a PRP list), then
 * issues an asynchronous blockif_write()/blockif_read() whose
 * completion runs pci_nvme_io_done.
 *
 * NOTE(review): several original lines are elided in this excerpt
 * (error-path bodies, the loop around the PRP-list walk, the is_write
 * test, and the return of `status`) — confirm against the full file.
 */
2475 nvme_write_read_blockif(struct pci_nvme_softc
*sc
,
2476 struct pci_nvme_blockstore
*nvstore
,
2477 struct pci_nvme_ioreq
*req
,
2478 uint64_t prp1
, uint64_t prp2
,
2479 size_t offset
, uint64_t bytes
,
/* NVME_NO_STATUS signals the caller the completion is asynchronous */
2484 uint16_t status
= NVME_NO_STATUS
;
/* First chunk: from prp1 to the end of its page (or `bytes` if less) */
2486 size
= MIN(PAGE_SIZE
- (prp1
% PAGE_SIZE
), bytes
);
2487 if (pci_nvme_append_iov_req(sc
, req
, prp1
, size
, offset
)) {
/* <= 2 pages total: prp2 is a plain second data pointer */
2497 } else if (bytes
<= PAGE_SIZE
) {
2499 if (pci_nvme_append_iov_req(sc
, req
, prp2
, size
, offset
)) {
/* > 2 pages: prp2 points at a PRP list in guest memory */
2504 void *vmctx
= sc
->nsc_pi
->pi_vmctx
;
2505 uint64_t *prp_list
= &prp2
;
2506 uint64_t *last
= prp_list
;
2508 /* PRP2 is pointer to a physical region page list */
2510 /* Last entry in list points to the next list */
2511 if ((prp_list
== last
) && (bytes
> PAGE_SIZE
)) {
2512 uint64_t prp
= *prp_list
;
/* Map the guest-physical PRP list so it can be walked directly */
2514 prp_list
= paddr_guest2host(vmctx
, prp
,
2515 PAGE_SIZE
- (prp
% PAGE_SIZE
));
2516 if (prp_list
== NULL
) {
2520 last
= prp_list
+ (NVME_PRP2_ITEMS
- 1);
/* Each PRP list entry covers at most one page */
2523 size
= MIN(bytes
, PAGE_SIZE
);
2525 if (pci_nvme_append_iov_req(sc
, req
, *prp_list
, size
,
/* iov list complete: hand off to blockif asynchronously */
2537 req
->io_req
.br_callback
= pci_nvme_io_done
;
2539 err
= blockif_write(nvstore
->ctx
, &req
->io_req
);
2541 err
= blockif_read(nvstore
->ctx
, &req
->io_req
);
/* presumably reached when blockif_{read,write} fails — TODO confirm */
2544 pci_nvme_status_genc(&status
, NVME_SC_DATA_TRANSFER_ERROR
);
/*
 * nvme_opc_write_read - decode and dispatch an NVMe Read or Write
 * command.  Extracts the starting LBA from CDW10/CDW11 and the
 * zero-based block count from CDW12, validates the transfer against
 * MDTS and the namespace LBA range, then hands off to the RAM or
 * blockif backend.
 *
 * NOTE(review): braces, gotos/returns and the `pending` bookkeeping
 * between the statements below are elided in this excerpt — confirm
 * against the full file.
 */
2550 nvme_opc_write_read(struct pci_nvme_softc
*sc
,
2551 struct nvme_command
*cmd
,
2552 struct pci_nvme_blockstore
*nvstore
,
2553 struct pci_nvme_ioreq
*req
,
2556 uint64_t lba
, nblocks
, bytes
;
2558 bool is_write
= cmd
->opc
== NVME_OPC_WRITE
;
2559 bool pending
= false;
/* SLBA is split across CDW11 (high 32 bits) and CDW10 (low 32 bits) */
2561 lba
= ((uint64_t)cmd
->cdw11
<< 32) | cmd
->cdw10
;
/* NLB in CDW12 is a zero-based count */
2562 nblocks
= (cmd
->cdw12
& 0xFFFF) + 1;
2563 bytes
= nblocks
<< nvstore
->sectsz_bits
;
2564 if (bytes
> NVME_MAX_DATA_SIZE
) {
2565 WPRINTF("%s command would exceed MDTS", __func__
);
2566 pci_nvme_status_genc(status
, NVME_SC_INVALID_FIELD
);
2570 if (pci_nvme_out_of_range(nvstore
, lba
, nblocks
)) {
2571 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2572 __func__
, lba
, nblocks
);
2573 pci_nvme_status_genc(status
, NVME_SC_LBA_OUT_OF_RANGE
);
2577 offset
= lba
<< nvstore
->sectsz_bits
;
2580 req
->io_req
.br_offset
= lba
;
2582 /* PRP bits 1:0 must be zero */
2583 cmd
->prp1
&= ~0x3UL
;
2584 cmd
->prp2
&= ~0x3UL
;
2586 if (nvstore
->type
== NVME_STOR_RAM
) {
2587 *status
= nvme_write_read_ram(sc
, nvstore
, cmd
->prp1
,
2588 cmd
->prp2
, offset
, bytes
, is_write
);
2590 *status
= nvme_write_read_blockif(sc
, nvstore
, req
,
2591 cmd
->prp1
, cmd
->prp2
, offset
, bytes
, is_write
);
/* NVME_NO_STATUS means the backend deferred completion */
2593 if (*status
== NVME_NO_STATUS
)
2598 pci_nvme_stats_write_read_update(sc
, cmd
->opc
, bytes
, *status
);
/*
 * pci_nvme_dealloc_sm - blockif completion callback driving the
 * multi-range Dataset Management deallocate (TRIM) state machine.
 * Each invocation either finishes the command (error, or last range
 * done) or issues blockif_delete() for the next range stored in the
 * request's iov array.  prev_gpaddr is reused as the current range
 * index and prev_size as the total number of ranges (see the setup in
 * nvme_opc_dataset_mgmt).
 *
 * NOTE(review): the `if (err)` head, index increment, and the final
 * status argument of pci_nvme_set_completion are elided in this
 * excerpt — confirm against the full file.
 */
2604 pci_nvme_dealloc_sm(struct blockif_req
*br
, int err
)
2606 struct pci_nvme_ioreq
*req
= br
->br_param
;
2607 struct pci_nvme_softc
*sc
= req
->sc
;
/* presumably taken when err != 0 — TODO confirm */
2613 pci_nvme_status_genc(&status
, NVME_SC_INTERNAL_DEVICE_ERROR
);
/* Last range completed: whole DSM command succeeds */
2614 } else if ((req
->prev_gpaddr
+ 1) == (req
->prev_size
)) {
2615 pci_nvme_status_genc(&status
, NVME_SC_SUCCESS
);
2617 struct iovec
*iov
= req
->io_req
.br_iov
;
/* Advance to the next stored range */
2620 iov
+= req
->prev_gpaddr
;
2622 /* The iov_* values already include the sector size */
2623 req
->io_req
.br_offset
= (off_t
)iov
->iov_base
;
2624 req
->io_req
.br_resid
= iov
->iov_len
;
2625 if (blockif_delete(sc
->nvstore
.ctx
, &req
->io_req
)) {
2626 pci_nvme_status_genc(&status
,
2627 NVME_SC_INTERNAL_DEVICE_ERROR
);
/* Command finished (success or error): post completion, free ioreq */
2633 pci_nvme_set_completion(sc
, req
->nvme_sq
, req
->sqid
, req
->cid
,
2635 pci_nvme_release_ioreq(sc
, req
);
/*
 * nvme_opc_dataset_mgmt - handle the NVMe Dataset Management command.
 * Copies the guest's range descriptors locally, validates them, and —
 * when the Deallocate attribute is set and the backing store supports
 * it — issues blockif_delete() for the first range, chaining the rest
 * through the pci_nvme_dealloc_sm completion callback.
 *
 * NOTE(review): this excerpt elides many lines (preprocessor
 * alternatives around the two calloc() forms, braces, `goto out`
 * paths, the single/multi-range branch heads, and the free(range)/
 * return) — confirm control flow against the full file.
 */
2640 nvme_opc_dataset_mgmt(struct pci_nvme_softc
*sc
,
2641 struct nvme_command
*cmd
,
2642 struct pci_nvme_blockstore
*nvstore
,
2643 struct pci_nvme_ioreq
*req
,
2646 struct nvme_dsm_range
*range
= NULL
;
2647 uint32_t nr
, r
, non_zero
, dr
;
2649 bool pending
= false;
/* Reject DSM if the controller does not advertise it in ONCS */
2651 if ((sc
->ctrldata
.oncs
& NVME_ONCS_DSM
) == 0) {
2652 pci_nvme_status_genc(status
, NVME_SC_INVALID_OPCODE
);
/* NR in CDW10 is a zero-based range count */
2656 nr
= cmd
->cdw10
& 0xff;
2658 /* copy locally because a range entry could straddle PRPs */
/* NOTE(review): the two calloc() forms below are presumably alternate
 * branches of an elided preprocessor conditional — TODO confirm */
2660 range
= calloc(1, NVME_MAX_DSM_TRIM
);
2662 _Static_assert(NVME_MAX_DSM_TRIM
% sizeof(struct nvme_dsm_range
) == 0,
2663 "NVME_MAX_DSM_TRIM is not a multiple of struct size");
2664 range
= calloc(NVME_MAX_DSM_TRIM
/ sizeof (*range
), sizeof (*range
));
2666 if (range
== NULL
) {
2667 pci_nvme_status_genc(status
, NVME_SC_INTERNAL_DEVICE_ERROR
);
/* Pull the range descriptors out of guest memory via the PRPs */
2670 nvme_prp_memcpy(sc
->nsc_pi
->pi_vmctx
, cmd
->prp1
, cmd
->prp2
,
2671 (uint8_t *)range
, NVME_MAX_DSM_TRIM
, NVME_COPY_FROM_PRP
);
2673 /* Check for invalid ranges and the number of non-zero lengths */
2675 for (r
= 0; r
<= nr
; r
++) {
2676 if (pci_nvme_out_of_range(nvstore
,
2677 range
[r
].starting_lba
, range
[r
].length
)) {
2678 pci_nvme_status_genc(status
, NVME_SC_LBA_OUT_OF_RANGE
);
2681 if (range
[r
].length
!= 0)
2685 if (cmd
->cdw11
& NVME_DSM_ATTR_DEALLOCATE
) {
2686 size_t offset
, bytes
;
2687 int sectsz_bits
= sc
->nvstore
.sectsz_bits
;
2690 * DSM calls are advisory only, and compliant controllers
2691 * may choose to take no actions (i.e. return Success).
2693 if (!nvstore
->deallocate
) {
2694 pci_nvme_status_genc(status
, NVME_SC_SUCCESS
);
2698 /* If all ranges have a zero length, return Success */
2699 if (non_zero
== 0) {
2700 pci_nvme_status_genc(status
, NVME_SC_SUCCESS
);
/* presumably the req == NULL guard — TODO confirm */
2705 pci_nvme_status_genc(status
, NVME_SC_INTERNAL_DEVICE_ERROR
);
/* Start with the first range (common single-range fast path) */
2709 offset
= range
[0].starting_lba
<< sectsz_bits
;
2710 bytes
= range
[0].length
<< sectsz_bits
;
2713 * If the request is for more than a single range, store
2714 * the ranges in the br_iov. Optimize for the common case
2715 * of a single range.
2717 * Note that NVMe Number of Ranges is a zero based value
2719 req
->io_req
.br_iovcnt
= 0;
2720 req
->io_req
.br_offset
= offset
;
2721 req
->io_req
.br_resid
= bytes
;
/* Single range: plain completion via pci_nvme_io_done */
2724 req
->io_req
.br_callback
= pci_nvme_io_done
;
2726 struct iovec
*iov
= req
->io_req
.br_iov
;
/* Multi-range: stash each non-empty range in the iov array */
2728 for (r
= 0, dr
= 0; r
<= nr
; r
++) {
2729 offset
= range
[r
].starting_lba
<< sectsz_bits
;
2730 bytes
= range
[r
].length
<< sectsz_bits
;
2734 if ((nvstore
->size
- offset
) < bytes
) {
2735 pci_nvme_status_genc(status
,
2736 NVME_SC_LBA_OUT_OF_RANGE
);
/* iov_base/iov_len abused to carry byte offset/length per range */
2739 iov
[dr
].iov_base
= (void *)offset
;
2740 iov
[dr
].iov_len
= bytes
;
/* Chain remaining ranges through the dealloc state machine */
2743 req
->io_req
.br_callback
= pci_nvme_dealloc_sm
;
2746 * Use prev_gpaddr to track the current entry and
2747 * prev_size to track the number of entries
2749 req
->prev_gpaddr
= 0;
2750 req
->prev_size
= dr
;
2753 err
= blockif_delete(nvstore
->ctx
, &req
->io_req
);
2755 pci_nvme_status_genc(status
, NVME_SC_INTERNAL_DEVICE_ERROR
);
/*
 * pci_nvme_handle_io_cmd - drain and dispatch all new entries in the
 * I/O submission queue `idx`, from the current head up to the
 * guest-written tail.  Each command is validated (NSID, ioreq
 * availability) and dispatched by opcode; commands that complete
 * synchronously are completed here, while `pending` ones complete via
 * their blockif callbacks.
 *
 * NOTE(review): the excerpt elides the sqhead initialization, the
 * `switch (cmd->opc)` head, `break`s, the `if (!pending)` guard, and
 * the head write-back — confirm against the full file.
 */
2765 pci_nvme_handle_io_cmd(struct pci_nvme_softc
* sc
, uint16_t idx
)
2767 struct nvme_submission_queue
*sq
;
2771 /* handle all submissions up to sq->tail index */
2772 sq
= &sc
->submit_queues
[idx
];
2774 pthread_mutex_lock(&sq
->mtx
);
2777 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2778 idx
, sqhead
, sq
->tail
, sq
->qbase
);
/* tail is written by the doorbell path; acquire-load pairs with it */
2780 while (sqhead
!= atomic_load_acq_short(&sq
->tail
)) {
2781 struct nvme_command
*cmd
;
2782 struct pci_nvme_ioreq
*req
;
2790 cmd
= &sq
->qbase
[sqhead
];
2791 sqhead
= (sqhead
+ 1) % sq
->size
;
2793 nsid
= le32toh(cmd
->nsid
);
/* Reject NSID 0 or anything beyond the namespace count */
2794 if ((nsid
== 0) || (nsid
> sc
->ctrldata
.nn
)) {
2795 pci_nvme_status_genc(&status
,
2796 NVME_SC_INVALID_NAMESPACE_OR_FORMAT
);
/* Set Do Not Retry: the command can never succeed */
2798 NVME_STATUS_DNR_MASK
<< NVME_STATUS_DNR_SHIFT
;
2802 req
= pci_nvme_get_ioreq(sc
);
/* presumably the req == NULL path — TODO confirm */
2804 pci_nvme_status_genc(&status
,
2805 NVME_SC_INTERNAL_DEVICE_ERROR
);
2806 WPRINTF("%s: unable to allocate IO req", __func__
);
2811 req
->opc
= cmd
->opc
;
2812 req
->cid
= cmd
->cid
;
2813 req
->nsid
= cmd
->nsid
;
2816 case NVME_OPC_FLUSH
:
2817 pending
= nvme_opc_flush(sc
, cmd
, &sc
->nvstore
,
2820 case NVME_OPC_WRITE
:
2822 pending
= nvme_opc_write_read(sc
, cmd
, &sc
->nvstore
,
2825 case NVME_OPC_WRITE_ZEROES
:
2826 /* TODO: write zeroes
2827 WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2828 __func__, lba, cmd->cdw12 & 0xFFFF); */
2829 pci_nvme_status_genc(&status
, NVME_SC_SUCCESS
);
2831 case NVME_OPC_DATASET_MANAGEMENT
:
2832 pending
= nvme_opc_dataset_mgmt(sc
, cmd
, &sc
->nvstore
,
2836 WPRINTF("%s unhandled io command 0x%x",
2837 __func__
, cmd
->opc
);
2838 pci_nvme_status_genc(&status
, NVME_SC_INVALID_OPCODE
);
/* Synchronous completion: post CQE and recycle the ioreq */
2842 pci_nvme_set_completion(sc
, sq
, idx
, cmd
->cid
, status
);
2844 pci_nvme_release_ioreq(sc
, req
);
2850 pthread_mutex_unlock(&sq
->mtx
);
/*
 * pci_nvme_handle_doorbell - act on a guest doorbell write.  For a
 * submission-queue doorbell, record the new tail and kick the admin
 * (queue 0) or I/O command processing; for a completion-queue
 * doorbell, record the new head.  Queue indices beyond the configured
 * counts are logged and ignored.
 *
 * NOTE(review): the `if (is_sq)` / `else` skeleton, the "admin/IO"
 * branch heads, the stored values, and the returns are elided in this
 * excerpt — confirm against the full file.
 */
2854 pci_nvme_handle_doorbell(struct pci_nvme_softc
* sc
,
2855 uint64_t idx
, int is_sq
, uint64_t value
)
2857 DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2858 idx
, is_sq
? "SQ" : "CQ", value
& 0xFFFF);
2861 if (idx
> sc
->num_squeues
) {
2862 WPRINTF("%s queue index %lu overflow from "
2864 __func__
, idx
, sc
->num_squeues
);
/* Publish the new SQ tail for the consumer's acquire-load */
2868 atomic_store_short(&sc
->submit_queues
[idx
].tail
,
/* presumably idx == 0, the admin queue — TODO confirm */
2872 pci_nvme_handle_admin_cmd(sc
, value
);
2874 /* submission queue; handle new entries in SQ */
2875 if (idx
> sc
->num_squeues
) {
2876 WPRINTF("%s SQ index %lu overflow from "
2878 __func__
, idx
, sc
->num_squeues
);
2881 pci_nvme_handle_io_cmd(sc
, (uint16_t)idx
);
/* Completion-queue doorbell path */
2884 if (idx
> sc
->num_cqueues
) {
2885 WPRINTF("%s queue index %lu overflow from "
2887 __func__
, idx
, sc
->num_cqueues
);
2891 atomic_store_short(&sc
->compl_queues
[idx
].head
,
/*
 * pci_nvme_bar0_reg_dumps - debug helper: print the symbolic name of
 * the BAR0 register being accessed, tagged with the calling function
 * and whether the access is a READ or WRITE.  Purely diagnostic; no
 * side effects on device state.
 *
 * NOTE(review): the `switch (offset)` head, several `case` labels for
 * the single-line DPRINTFs, and the `break`s are elided in this
 * excerpt.
 */
2897 pci_nvme_bar0_reg_dumps(const char *func
, uint64_t offset
, int iswrite
)
2899 const char *s
= iswrite
? "WRITE" : "READ";
2902 case NVME_CR_CAP_LOW
:
2903 DPRINTF("%s %s NVME_CR_CAP_LOW", func
, s
);
2905 case NVME_CR_CAP_HI
:
2906 DPRINTF("%s %s NVME_CR_CAP_HI", func
, s
);
2909 DPRINTF("%s %s NVME_CR_VS", func
, s
);
2912 DPRINTF("%s %s NVME_CR_INTMS", func
, s
);
2915 DPRINTF("%s %s NVME_CR_INTMC", func
, s
);
2918 DPRINTF("%s %s NVME_CR_CC", func
, s
);
2921 DPRINTF("%s %s NVME_CR_CSTS", func
, s
);
2924 DPRINTF("%s %s NVME_CR_NSSR", func
, s
);
2927 DPRINTF("%s %s NVME_CR_AQA", func
, s
);
2929 case NVME_CR_ASQ_LOW
:
2930 DPRINTF("%s %s NVME_CR_ASQ_LOW", func
, s
);
2932 case NVME_CR_ASQ_HI
:
2933 DPRINTF("%s %s NVME_CR_ASQ_HI", func
, s
);
2935 case NVME_CR_ACQ_LOW
:
2936 DPRINTF("%s %s NVME_CR_ACQ_LOW", func
, s
);
2938 case NVME_CR_ACQ_HI
:
2939 DPRINTF("%s %s NVME_CR_ACQ_HI", func
, s
);
2942 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset
);
/*
 * pci_nvme_write_bar_0 - handle a guest write to BAR0.  Writes at or
 * above NVME_DOORBELL_OFFSET are decoded as SQ/CQ doorbell rings
 * (validated against RDY, the doorbell window, and queue creation);
 * lower offsets are controller-register writes handled under sc->mtx,
 * including the CC enable/shutdown state machine and the admin queue
 * address/attribute registers.
 *
 * NOTE(review): the excerpt elides the register `switch` head, many
 * `break`s/returns, the high-half OR operands for ASQ_HI/ACQ_HI, and
 * several case labels — confirm against the full file.
 */
2948 pci_nvme_write_bar_0(struct vmctx
*ctx
, struct pci_nvme_softc
* sc
,
2949 uint64_t offset
, int size
, uint64_t value
)
2953 if (offset
>= NVME_DOORBELL_OFFSET
) {
2954 uint64_t belloffset
= offset
- NVME_DOORBELL_OFFSET
;
2955 uint64_t idx
= belloffset
/ 8; /* door bell size = 2*int */
/* SQ doorbell at +0, CQ doorbell at +4 within each 8-byte pair */
2956 int is_sq
= (belloffset
% 8) < 4;
2958 if ((sc
->regs
.csts
& NVME_CSTS_RDY
) == 0) {
2959 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
/* Reject doorbells beyond the advertised queue count */
2964 if (belloffset
> ((sc
->max_queues
+1) * 8 - 4)) {
2965 WPRINTF("guest attempted an overflow write offset "
2966 "0x%lx, val 0x%lx in %s",
2967 offset
, value
, __func__
);
/* Ignore doorbells for queues the guest never created */
2972 if (sc
->submit_queues
[idx
].qbase
== NULL
)
2974 } else if (sc
->compl_queues
[idx
].qbase
== NULL
)
2977 pci_nvme_handle_doorbell(sc
, idx
, is_sq
, value
);
2981 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2982 offset
, size
, value
);
/* presumably reached for sizes other than 4 bytes — TODO confirm */
2985 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2986 "val 0x%lx) to bar0 in %s",
2987 size
, offset
, value
, __func__
);
2988 /* TODO: shutdown device */
2992 pci_nvme_bar0_reg_dumps(__func__
, offset
, 1);
/* Register writes below the doorbells are serialized on sc->mtx */
2994 pthread_mutex_lock(&sc
->mtx
);
2997 case NVME_CR_CAP_LOW
:
2998 case NVME_CR_CAP_HI
:
3005 /* MSI-X, so ignore */
3008 /* MSI-X, so ignore */
3011 ccreg
= (uint32_t)value
;
3013 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
3016 NVME_CC_GET_EN(ccreg
), NVME_CC_GET_CSS(ccreg
),
3017 NVME_CC_GET_SHN(ccreg
), NVME_CC_GET_IOSQES(ccreg
),
3018 NVME_CC_GET_IOCQES(ccreg
));
3020 if (NVME_CC_GET_SHN(ccreg
)) {
3021 /* perform shutdown - flush out data to backend */
3022 sc
->regs
.csts
&= ~(NVME_CSTS_REG_SHST_MASK
<<
3023 NVME_CSTS_REG_SHST_SHIFT
);
3024 sc
->regs
.csts
|= NVME_SHST_COMPLETE
<<
3025 NVME_CSTS_REG_SHST_SHIFT
;
3027 if (NVME_CC_GET_EN(ccreg
) != NVME_CC_GET_EN(sc
->regs
.cc
)) {
3028 if (NVME_CC_GET_EN(ccreg
) == 0)
3029 /* transition 1-> causes controller reset */
3030 pci_nvme_reset_locked(sc
);
/* presumably the EN 0->1 transition — TODO confirm */
3032 pci_nvme_init_controller(ctx
, sc
);
3035 /* Insert the iocqes, iosqes and en bits from the write */
3036 sc
->regs
.cc
&= ~NVME_CC_WRITE_MASK
;
3037 sc
->regs
.cc
|= ccreg
& NVME_CC_WRITE_MASK
;
3038 if (NVME_CC_GET_EN(ccreg
) == 0) {
3039 /* Insert the ams, mps and css bit fields */
3040 sc
->regs
.cc
&= ~NVME_CC_NEN_WRITE_MASK
;
3041 sc
->regs
.cc
|= ccreg
& NVME_CC_NEN_WRITE_MASK
;
3042 sc
->regs
.csts
&= ~NVME_CSTS_RDY
;
/* Only report ready once no I/O is pending and no fatal status */
3043 } else if ((sc
->pending_ios
== 0) &&
3044 !(sc
->regs
.csts
& NVME_CSTS_CFS
)) {
3045 sc
->regs
.csts
|= NVME_CSTS_RDY
;
3051 /* ignore writes; don't support subsystem reset */
3054 sc
->regs
.aqa
= (uint32_t)value
;
/* ASQ/ACQ: page-aligned (low 12 bits masked) 64-bit addresses */
3056 case NVME_CR_ASQ_LOW
:
3057 sc
->regs
.asq
= (sc
->regs
.asq
& (0xFFFFFFFF00000000)) |
3058 (0xFFFFF000 & value
);
3060 case NVME_CR_ASQ_HI
:
3061 sc
->regs
.asq
= (sc
->regs
.asq
& (0x00000000FFFFFFFF)) |
3064 case NVME_CR_ACQ_LOW
:
3065 sc
->regs
.acq
= (sc
->regs
.acq
& (0xFFFFFFFF00000000)) |
3066 (0xFFFFF000 & value
);
3068 case NVME_CR_ACQ_HI
:
3069 sc
->regs
.acq
= (sc
->regs
.acq
& (0x00000000FFFFFFFF)) |
3073 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3074 __func__
, offset
, value
, size
);
3076 pthread_mutex_unlock(&sc
->mtx
);
/*
 * pci_nvme_write - pci_devemu barwrite entry point.  Routes MSI-X
 * table/PBA BAR writes to the generic MSI-X emulation and BAR0 writes
 * to pci_nvme_write_bar_0(); anything else is logged and ignored.
 *
 * NOTE(review): the `switch (baridx)` / `case 0:` skeleton and returns
 * are elided in this excerpt.
 */
3080 pci_nvme_write(struct vmctx
*ctx
, struct pci_devinst
*pi
,
3081 int baridx
, uint64_t offset
, int size
, uint64_t value
)
3083 struct pci_nvme_softc
* sc
= pi
->pi_arg
;
3085 if (baridx
== pci_msix_table_bar(pi
) ||
3086 baridx
== pci_msix_pba_bar(pi
)) {
3087 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3088 " value 0x%lx", baridx
, offset
, size
, value
);
3090 pci_emul_msix_twrite(pi
, offset
, size
, value
);
3096 pci_nvme_write_bar_0(ctx
, sc
, offset
, size
, value
);
3100 DPRINTF("%s unknown baridx %d, val 0x%lx",
3101 __func__
, baridx
, value
);
/*
 * pci_nvme_read_bar_0 - return register contents for a guest read of
 * BAR0.  Register-window reads copy directly out of sc->regs under
 * sc->mtx; reads at or above the doorbell window are invalid (NVMe
 * doorbells are write-only) and are logged.
 *
 * NOTE(review): the value declaration, the size switch that applies
 * the 32-bit mask, and the return are elided in this excerpt.
 */
3105 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc
* sc
,
3106 uint64_t offset
, int size
)
3110 pci_nvme_bar0_reg_dumps(__func__
, offset
, 0);
3112 if (offset
< NVME_DOORBELL_OFFSET
) {
3113 void *p
= &(sc
->regs
);
3114 pthread_mutex_lock(&sc
->mtx
);
/* Registers are laid out to mirror struct nvme_registers */
3115 memcpy(&value
, (void *)((uintptr_t)p
+ offset
), size
);
3116 pthread_mutex_unlock(&sc
->mtx
);
3119 WPRINTF("pci_nvme: read invalid offset %ld", offset
);
/* presumably applied for 4-byte reads only — TODO confirm */
3130 value
&= 0xFFFFFFFF;
3134 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
3135 offset
, size
, (uint32_t)value
);
/*
 * pci_nvme_read - pci_devemu barread entry point.  Routes MSI-X
 * table/PBA BAR reads to the generic MSI-X emulation and BAR0 reads to
 * pci_nvme_read_bar_0(); unknown BARs are logged.
 *
 * NOTE(review): the `switch (baridx)` / `case 0:` skeleton and the
 * fallback return value are elided in this excerpt.
 */
3143 pci_nvme_read(struct vmctx
*ctx __unused
,
3144 struct pci_devinst
*pi
, int baridx
, uint64_t offset
, int size
)
3146 struct pci_nvme_softc
* sc
= pi
->pi_arg
;
3148 if (baridx
== pci_msix_table_bar(pi
) ||
3149 baridx
== pci_msix_pba_bar(pi
)) {
3150 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3151 baridx
, offset
, size
);
3153 return pci_emul_msix_tread(pi
, offset
, size
);
3158 return pci_nvme_read_bar_0(sc
, offset
, size
);
3161 DPRINTF("unknown bar %d, 0x%lx", baridx
, offset
);
/*
 * pci_nvme_parse_config - populate the softc from device options:
 * defaults first, then the nvlist options maxq, qsz, ioslots, sectsz,
 * ser, eui64, dsm, and ram (see the option summary in the file
 * header).  Sets up either a RAM-backed store or a blockif-backed
 * store, derives sectsz/sectsz_bits, and clamps max_queues.
 *
 * NOTE(review): error `return`s after each EPRINTLN, NULL checks for
 * several option reads, and the preprocessor conditional presumably
 * selecting between the two serial-number snprintf() forms are elided
 * in this excerpt — confirm against the full file.
 */
3168 pci_nvme_parse_config(struct pci_nvme_softc
*sc
, nvlist_t
*nvl
)
3170 char bident
[sizeof("XXX:XXX")];
/* Defaults, overridden below by any supplied options */
3174 sc
->max_queues
= NVME_QUEUES
;
3175 sc
->max_qentries
= NVME_MAX_QENTRIES
;
3176 sc
->ioslots
= NVME_IOSLOTS
;
3177 sc
->num_squeues
= sc
->max_queues
;
3178 sc
->num_cqueues
= sc
->max_queues
;
3179 sc
->dataset_management
= NVME_DATASET_MANAGEMENT_AUTO
;
/* Default serial number derived from the PCI slot/function */
3182 snprintf(sc
->ctrldata
.sn
, sizeof(sc
->ctrldata
.sn
),
3183 "NVME-%d-%d", sc
->nsc_pi
->pi_slot
, sc
->nsc_pi
->pi_func
);
3185 snprintf((char *)sc
->ctrldata
.sn
, sizeof(sc
->ctrldata
.sn
),
3186 "NVME-%d-%d", sc
->nsc_pi
->pi_slot
, sc
->nsc_pi
->pi_func
);
3189 value
= get_config_value_node(nvl
, "maxq");
3191 sc
->max_queues
= atoi(value
);
3192 value
= get_config_value_node(nvl
, "qsz");
3193 if (value
!= NULL
) {
3194 sc
->max_qentries
= atoi(value
);
3195 if (sc
->max_qentries
<= 0) {
3196 EPRINTLN("nvme: Invalid qsz option %d",
3201 value
= get_config_value_node(nvl
, "ioslots");
3202 if (value
!= NULL
) {
3203 sc
->ioslots
= atoi(value
);
3204 if (sc
->ioslots
<= 0) {
3205 EPRINTLN("Invalid ioslots option %d", sc
->ioslots
);
3209 value
= get_config_value_node(nvl
, "sectsz");
3211 sectsz
= atoi(value
);
3212 value
= get_config_value_node(nvl
, "ser");
3213 if (value
!= NULL
) {
3215 * This field indicates the Product Serial Number in
3216 * 7-bit ASCII, unused bytes should be space characters.
3219 cpywithpad((char *)sc
->ctrldata
.sn
,
3220 sizeof(sc
->ctrldata
.sn
), value
, ' ');
3222 value
= get_config_value_node(nvl
, "eui64");
/* base 0: accepts decimal, octal, or 0x-prefixed hex */
3224 sc
->nvstore
.eui64
= htobe64(strtoull(value
, NULL
, 0));
3225 value
= get_config_value_node(nvl
, "dsm");
3226 if (value
!= NULL
) {
3227 if (strcmp(value
, "auto") == 0)
3228 sc
->dataset_management
= NVME_DATASET_MANAGEMENT_AUTO
;
3229 else if (strcmp(value
, "enable") == 0)
3230 sc
->dataset_management
= NVME_DATASET_MANAGEMENT_ENABLE
;
3231 else if (strcmp(value
, "disable") == 0)
3232 sc
->dataset_management
= NVME_DATASET_MANAGEMENT_DISABLE
;
/* "ram=<MiB>" selects a calloc'd RAM backing store */
3235 value
= get_config_value_node(nvl
, "ram");
3236 if (value
!= NULL
) {
3237 uint64_t sz
= strtoull(value
, NULL
, 10);
3239 sc
->nvstore
.type
= NVME_STOR_RAM
;
3240 sc
->nvstore
.size
= sz
* 1024 * 1024;
3241 sc
->nvstore
.ctx
= calloc(1, sc
->nvstore
.size
);
3242 sc
->nvstore
.sectsz
= 4096;
3243 sc
->nvstore
.sectsz_bits
= 12;
3244 if (sc
->nvstore
.ctx
== NULL
) {
3245 EPRINTLN("nvme: Unable to allocate RAM");
/* Otherwise open a blockif backing store keyed by slot:func */
3249 snprintf(bident
, sizeof(bident
), "%u:%u",
3250 sc
->nsc_pi
->pi_slot
, sc
->nsc_pi
->pi_func
);
3251 sc
->nvstore
.ctx
= blockif_open(nvl
, bident
);
3252 if (sc
->nvstore
.ctx
== NULL
) {
3253 EPRINTLN("nvme: Could not open backing file: %s",
3257 sc
->nvstore
.type
= NVME_STOR_BLOCKIF
;
3258 sc
->nvstore
.size
= blockif_size(sc
->nvstore
.ctx
);
/* Only 512/4096/8192 are accepted as explicit sector sizes */
3261 if (sectsz
== 512 || sectsz
== 4096 || sectsz
== 8192)
3262 sc
->nvstore
.sectsz
= sectsz
;
3263 else if (sc
->nvstore
.type
!= NVME_STOR_RAM
)
3264 sc
->nvstore
.sectsz
= blockif_sectsz(sc
->nvstore
.ctx
);
/* Derive log2(sectsz), starting from the 512-byte minimum */
3265 for (sc
->nvstore
.sectsz_bits
= 9;
3266 (1U << sc
->nvstore
.sectsz_bits
) < sc
->nvstore
.sectsz
;
3267 sc
->nvstore
.sectsz_bits
++);
3269 if (sc
->max_queues
<= 0 || sc
->max_queues
> NVME_QUEUES
)
3270 sc
->max_queues
= NVME_QUEUES
;
/*
 * pci_nvme_resized - blockif resize callback.  Updates the blockstore
 * and namespace data with the new capacity, records NSID 1 in the
 * changed-namespace log (0 terminates the list), and posts a Namespace
 * Attribute Changed asynchronous event to the guest.
 *
 * NOTE(review): the sc/nd assignments from `arg` and the new_size
 * parameter declaration, plus any locking, are elided in this
 * excerpt — confirm against the full file.
 */
3276 pci_nvme_resized(struct blockif_ctxt
*bctxt __unused
, void *arg
,
3279 struct pci_nvme_softc
*sc
;
3280 struct pci_nvme_blockstore
*nvstore
;
3281 struct nvme_namespace_data
*nd
;
3284 nvstore
= &sc
->nvstore
;
3287 nvstore
->size
= new_size
;
3288 pci_nvme_init_nsdata_size(nvstore
, nd
);
3290 /* Add changed NSID to list */
3291 sc
->ns_log
.ns
[0] = 1;
3292 sc
->ns_log
.ns
[1] = 0;
3294 pci_nvme_aen_post(sc
, PCI_NVME_AE_TYPE_NOTICE
,
3295 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED
);
/*
 * pci_nvme_init - pci_devemu init entry point.  Allocates and
 * configures the softc, the free-ioreq pool, PCI config space (vendor
 * 0xFB5D / device 0x0A0A, NVM storage class), BAR0 (registers +
 * doorbells, >= 16K for the Windows driver), MSI-X and PCIe
 * capabilities, locking, queues, namespace/controller data, log pages,
 * features, and async-event state.
 *
 * NOTE(review): error checks/cleanup between steps (calloc NULL
 * checks, `goto done`, the final return) are elided in this excerpt —
 * confirm against the full file.
 */
3299 pci_nvme_init(struct vmctx
*ctx __unused
, struct pci_devinst
*pi
, nvlist_t
*nvl
)
3301 struct pci_nvme_softc
*sc
;
3302 uint32_t pci_membar_sz
;
3307 sc
= calloc(1, sizeof(struct pci_nvme_softc
));
3311 error
= pci_nvme_parse_config(sc
, nvl
);
/* Pre-allocate the I/O request pool sized by the ioslots option */
3317 STAILQ_INIT(&sc
->ioreqs_free
);
3318 sc
->ioreqs
= calloc(sc
->ioslots
, sizeof(struct pci_nvme_ioreq
));
3319 for (uint32_t i
= 0; i
< sc
->ioslots
; i
++) {
3320 STAILQ_INSERT_TAIL(&sc
->ioreqs_free
, &sc
->ioreqs
[i
], link
);
/* Identify as an NVM-class storage device */
3323 pci_set_cfgdata16(pi
, PCIR_DEVICE
, 0x0A0A);
3324 pci_set_cfgdata16(pi
, PCIR_VENDOR
, 0xFB5D);
3325 pci_set_cfgdata8(pi
, PCIR_CLASS
, PCIC_STORAGE
);
3326 pci_set_cfgdata8(pi
, PCIR_SUBCLASS
, PCIS_STORAGE_NVM
);
3327 pci_set_cfgdata8(pi
, PCIR_PROGIF
,
3328 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0
);
3331 * Allocate size of NVMe registers + doorbell space for all queues.
3333 * The specification requires a minimum memory I/O window size of 16K.
3334 * The Windows driver will refuse to start a device with a smaller
/* SQ+CQ doorbell pair (2 * uint32_t) per queue, incl. the admin queue */
3337 pci_membar_sz
= sizeof(struct nvme_registers
) +
3338 2 * sizeof(uint32_t) * (sc
->max_queues
+ 1);
3339 pci_membar_sz
= MAX(pci_membar_sz
, NVME_MMIO_SPACE_MIN
);
3341 DPRINTF("nvme membar size: %u", pci_membar_sz
);
3343 error
= pci_emul_alloc_bar(pi
, 0, PCIBAR_MEM64
, pci_membar_sz
);
3345 WPRINTF("%s pci alloc mem bar failed", __func__
);
/* One MSI-X vector per queue plus one for the admin queue */
3349 error
= pci_emul_add_msixcap(pi
, sc
->max_queues
+ 1, NVME_MSIX_BAR
);
3351 WPRINTF("%s pci add msixcap failed", __func__
);
3355 error
= pci_emul_add_pciecap(pi
, PCIEM_TYPE_ROOT_INT_EP
);
3357 WPRINTF("%s pci add Express capability failed", __func__
);
3361 pthread_mutex_init(&sc
->mtx
, NULL
);
/* Semaphore limits concurrent I/O to the size of the ioreq pool */
3362 sem_init(&sc
->iosemlock
, 0, sc
->ioslots
);
3363 blockif_register_resize_callback(sc
->nvstore
.ctx
, pci_nvme_resized
, sc
);
3365 pci_nvme_init_queues(sc
, sc
->max_queues
, sc
->max_queues
);
3367 * Controller data depends on Namespace data so initialize Namespace
3370 pci_nvme_init_nsdata(sc
, &sc
->nsdata
, 1, &sc
->nvstore
);
3371 pci_nvme_init_ctrldata(sc
);
3372 pci_nvme_init_logpages(sc
);
3373 pci_nvme_init_features(sc
);
3375 pci_nvme_aer_init(sc
);
3376 pci_nvme_aen_init(sc
);
3380 pci_lintr_request(pi
);
/*
 * pci_nvme_legacy_config - translate the legacy "-s ...,nvme,opts"
 * string into nvlist config.  A leading "ram=<size>" is split off into
 * the "ram" node (duplicating just the value when further
 * comma-separated options follow); everything else is handed to the
 * blockif legacy parser.
 *
 * NOTE(review): the opts == NULL guard, the cp == NULL test around the
 * two set_config_value_node() calls, and free(ram) handling are elided
 * in this excerpt — confirm against the full file.
 */
3387 pci_nvme_legacy_config(nvlist_t
*nvl
, const char *opts
)
3394 if (strncmp(opts
, "ram=", 4) == 0) {
3395 cp
= strchr(opts
, ',');
/* No trailing options: the whole remainder is the ram size */
3397 set_config_value_node(nvl
, "ram", opts
+ 4);
/* Otherwise copy only the size portion before the comma */
3400 ram
= strndup(opts
+ 4, cp
- opts
- 4);
3401 set_config_value_node(nvl
, "ram", ram
);
3403 return (pci_parse_legacy_config(nvl
, cp
+ 1));
3405 return (blockif_legacy_config(nvl
, opts
));
/*
 * Device emulation descriptor registering this NVMe model with the
 * bhyve PCI emulation framework via PCI_EMUL_SET.
 *
 * NOTE(review): the pe_emu name field and the closing brace of the
 * initializer are elided in this excerpt.
 */
3408 static const struct pci_devemu pci_de_nvme
= {
3410 .pe_init
= pci_nvme_init
,
3411 .pe_legacy_config
= pci_nvme_legacy_config
,
3412 .pe_barwrite
= pci_nvme_write
,
3413 .pe_barread
= pci_nvme_read
3415 PCI_EMUL_SET(pci_de_nvme
);