usr/src/cmd/bhyve/pci_nvme.c (illumos-gate, 15325 bhyve upstream sync 2023 January)
1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
4 * Copyright (c) 2017 Shunsuke Mie
5 * Copyright (c) 2018 Leon Dang
6 * Copyright (c) 2020 Chuck Tuffli
8 * Function crc16 Copyright (c) 2017, Fedor Uporov
9 * Obtained from function ext2_crc16() in sys/fs/ext2fs/ext2_csum.c
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
21 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
24 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30 * SUCH DAMAGE.
34 * bhyve PCIe-NVMe device emulation.
36 * options:
37 * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z,eui64=#,dsm=<opt>
39 * accepted devpath:
40 * /dev/blockdev
41 * /path/to/image
42 * ram=size_in_MiB
44 * maxq = max number of queues
45 * qsz = max elements in each queue
46 * ioslots = max number of concurrent io requests
47 * sectsz = sector size (defaults to blockif sector size)
48 * ser = serial number (20-chars max)
49 * eui64 = IEEE Extended Unique Identifier (8 byte value)
 50  *  dsm    = Dataset Management support. Option is one of: auto, enable, disable
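 *
 *  For example (hypothetical backing path and values), a device might be
 *  attached with:
 *    -s 4,nvme,/dev/zvol/rdsk/tank/guestdisk,maxq=4,qsz=512,ioslots=16,ser=NVME0001
 *  or with a 512 MiB RAM-backed namespace:
 *    -s 4,nvme,ram=512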
54 /* TODO:
55 - create async event for smart and log
56 - intr coalesce
59 #include <sys/cdefs.h>
60 __FBSDID("$FreeBSD$");
62 #include <sys/errno.h>
63 #include <sys/types.h>
64 #include <net/ieee_oui.h>
65 #ifndef __FreeBSD__
66 #include <endian.h>
67 #endif
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <semaphore.h>
73 #include <stdbool.h>
74 #include <stddef.h>
75 #include <stdint.h>
76 #include <stdio.h>
77 #include <stdlib.h>
78 #include <string.h>
80 #include <machine/atomic.h>
81 #include <machine/vmm.h>
82 #include <vmmapi.h>
84 #include <dev/nvme/nvme.h>
86 #include "bhyverun.h"
87 #include "block_if.h"
88 #include "config.h"
89 #include "debug.h"
90 #include "pci_emul.h"
93 static int nvme_debug = 0;
94 #define DPRINTF(fmt, args...) if (nvme_debug) PRINTLN(fmt, ##args)
95 #define WPRINTF(fmt, args...) PRINTLN(fmt, ##args)
97 /* defaults; can be overridden */
98 #define NVME_MSIX_BAR 4
100 #define NVME_IOSLOTS 8
102 /* The NVMe spec defines bits 13:4 in BAR0 as reserved */
103 #define NVME_MMIO_SPACE_MIN (1 << 14)
105 #define NVME_QUEUES 16
106 #define NVME_MAX_QENTRIES 2048
107 /* Memory Page size Minimum reported in CAP register */
108 #define NVME_MPSMIN 0
109 /* MPSMIN converted to bytes */
110 #define NVME_MPSMIN_BYTES (1 << (12 + NVME_MPSMIN))
112 #define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t))
113 #define NVME_MDTS 9
114 /* Note the + 1 allows for the initial descriptor to not be page aligned */
115 #define NVME_MAX_IOVEC ((1 << NVME_MDTS) + 1)
116 #define NVME_MAX_DATA_SIZE ((1 << NVME_MDTS) * NVME_MPSMIN_BYTES)
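/*
 * With the defaults above (NVME_MDTS = 9 and NVME_MPSMIN = 0, i.e. 4 KiB
 * pages), NVME_MAX_DATA_SIZE is 512 * 4 KiB = 2 MiB per request and
 * NVME_MAX_IOVEC is 513 page descriptors (512 pages plus one in case the
 * first descriptor is not page aligned).
 */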
118 /* This is a synthetic status code to indicate there is no status */
119 #define NVME_NO_STATUS 0xffff
120 #define NVME_COMPLETION_VALID(c) ((c).status != NVME_NO_STATUS)
122 /* Reported temperature in Kelvin (i.e. room temperature) */
123 #define NVME_TEMPERATURE 296
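/* 296 K is roughly 23 degrees Celsius */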
125 /* helpers */
127 /* Convert a zero-based value into a one-based value */
128 #define ONE_BASED(zero) ((zero) + 1)
129 /* Convert a one-based value into a zero-based value */
130 #define ZERO_BASED(one) ((one) - 1)
132 /* Encode number of SQ's and CQ's for Set/Get Features */
133 #define NVME_FEATURE_NUM_QUEUES(sc) \
134 (ZERO_BASED((sc)->num_squeues) & 0xffff) | \
135 (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16;
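/*
 * For example, num_squeues = 4 and num_cqueues = 2 encode as 0x00010003:
 * 3 (zero-based SQ count) in bits 15:0 and 1 (zero-based CQ count) in
 * bits 31:16.
 */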
137 #define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell)
139 enum nvme_controller_register_offsets {
140 NVME_CR_CAP_LOW = 0x00,
141 NVME_CR_CAP_HI = 0x04,
142 NVME_CR_VS = 0x08,
143 NVME_CR_INTMS = 0x0c,
144 NVME_CR_INTMC = 0x10,
145 NVME_CR_CC = 0x14,
146 NVME_CR_CSTS = 0x1c,
147 NVME_CR_NSSR = 0x20,
148 NVME_CR_AQA = 0x24,
149 NVME_CR_ASQ_LOW = 0x28,
150 NVME_CR_ASQ_HI = 0x2c,
151 NVME_CR_ACQ_LOW = 0x30,
152 NVME_CR_ACQ_HI = 0x34,
155 enum nvme_cmd_cdw11 {
156 NVME_CMD_CDW11_PC = 0x0001,
157 NVME_CMD_CDW11_IEN = 0x0002,
158 NVME_CMD_CDW11_IV = 0xFFFF0000,
161 enum nvme_copy_dir {
162 NVME_COPY_TO_PRP,
163 NVME_COPY_FROM_PRP,
166 #define NVME_CQ_INTEN 0x01
167 #define NVME_CQ_INTCOAL 0x02
169 struct nvme_completion_queue {
170 struct nvme_completion *qbase;
171 pthread_mutex_t mtx;
172 uint32_t size;
173 uint16_t tail; /* nvme progress */
174 uint16_t head; /* guest progress */
175 uint16_t intr_vec;
176 uint32_t intr_en;
179 struct nvme_submission_queue {
180 struct nvme_command *qbase;
181 pthread_mutex_t mtx;
182 uint32_t size;
183 uint16_t head; /* nvme progress */
184 uint16_t tail; /* guest progress */
185 uint16_t cqid; /* completion queue id */
186 int qpriority;
189 enum nvme_storage_type {
190 NVME_STOR_BLOCKIF = 0,
191 NVME_STOR_RAM = 1,
194 struct pci_nvme_blockstore {
195 enum nvme_storage_type type;
196 void *ctx;
197 uint64_t size;
198 uint32_t sectsz;
199 uint32_t sectsz_bits;
200 uint64_t eui64;
201 uint32_t deallocate:1;
205 * Calculate the number of additional page descriptors for guest IO requests
206 * based on the advertised Max Data Transfer (MDTS) and given the number of
207 * default iovec's in a struct blockif_req.
209 #define MDTS_PAD_SIZE \
210 ( NVME_MAX_IOVEC > BLOCKIF_IOV_MAX ? \
 211 NVME_MAX_IOVEC - BLOCKIF_IOV_MAX : 0 )
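/*
 * Assuming the default BLOCKIF_IOV_MAX of 128 from block_if.h, this pads
 * the request with 513 - 128 = 385 additional iovec entries.
 */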
214 struct pci_nvme_ioreq {
215 struct pci_nvme_softc *sc;
216 STAILQ_ENTRY(pci_nvme_ioreq) link;
217 struct nvme_submission_queue *nvme_sq;
218 uint16_t sqid;
220 /* command information */
221 uint16_t opc;
222 uint16_t cid;
223 uint32_t nsid;
225 uint64_t prev_gpaddr;
226 size_t prev_size;
227 size_t bytes;
229 struct blockif_req io_req;
231 struct iovec iovpadding[MDTS_PAD_SIZE];
234 enum nvme_dsm_type {
235 /* Dataset Management bit in ONCS reflects backing storage capability */
236 NVME_DATASET_MANAGEMENT_AUTO,
237 /* Unconditionally set Dataset Management bit in ONCS */
238 NVME_DATASET_MANAGEMENT_ENABLE,
239 /* Unconditionally clear Dataset Management bit in ONCS */
240 NVME_DATASET_MANAGEMENT_DISABLE,
243 struct pci_nvme_softc;
244 struct nvme_feature_obj;
246 typedef void (*nvme_feature_cb)(struct pci_nvme_softc *,
247 struct nvme_feature_obj *,
248 struct nvme_command *,
249 struct nvme_completion *);
251 struct nvme_feature_obj {
252 uint32_t cdw11;
253 nvme_feature_cb set;
254 nvme_feature_cb get;
255 bool namespace_specific;
258 #define NVME_FID_MAX (NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION + 1)
260 typedef enum {
261 PCI_NVME_AE_TYPE_ERROR = 0,
262 PCI_NVME_AE_TYPE_SMART,
263 PCI_NVME_AE_TYPE_NOTICE,
264 PCI_NVME_AE_TYPE_IO_CMD = 6,
265 PCI_NVME_AE_TYPE_VENDOR = 7,
266 PCI_NVME_AE_TYPE_MAX /* Must be last */
267 } pci_nvme_async_type;
269 /* Asynchronous Event Requests */
270 struct pci_nvme_aer {
271 STAILQ_ENTRY(pci_nvme_aer) link;
272 uint16_t cid; /* Command ID of the submitted AER */
275 /** Asynchronous Event Information - Notice */
276 typedef enum {
277 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED = 0,
278 PCI_NVME_AEI_NOTICE_FW_ACTIVATION,
279 PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE,
280 PCI_NVME_AEI_NOTICE_ANA_CHANGE,
281 PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE,
282 PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT,
283 PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE,
284 PCI_NVME_AEI_NOTICE_MAX,
285 } pci_nvme_async_event_info_notice;
287 #define PCI_NVME_AEI_NOTICE_SHIFT 8
288 #define PCI_NVME_AEI_NOTICE_MASK(event) (1 << (event + PCI_NVME_AEI_NOTICE_SHIFT))
290 /* Asynchronous Event Notifications */
291 struct pci_nvme_aen {
292 pci_nvme_async_type atype;
293 uint32_t event_data;
294 bool posted;
 298  * By default, enable all Asynchronous Event Notifications:
299 * SMART / Health Critical Warnings
300 * Namespace Attribute Notices
302 #define PCI_NVME_AEN_DEFAULT_MASK 0x11f
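/*
 * 0x11f = 0x01f (the five SMART / Health critical warning bits) |
 * PCI_NVME_AEI_NOTICE_MASK(PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED), i.e. bit 8.
 */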
304 typedef enum {
305 NVME_CNTRLTYPE_IO = 1,
306 NVME_CNTRLTYPE_DISCOVERY = 2,
307 NVME_CNTRLTYPE_ADMIN = 3,
308 } pci_nvme_cntrl_type;
310 struct pci_nvme_softc {
311 struct pci_devinst *nsc_pi;
313 pthread_mutex_t mtx;
315 struct nvme_registers regs;
317 struct nvme_namespace_data nsdata;
318 struct nvme_controller_data ctrldata;
319 struct nvme_error_information_entry err_log;
320 struct nvme_health_information_page health_log;
321 struct nvme_firmware_page fw_log;
322 struct nvme_ns_list ns_log;
324 struct pci_nvme_blockstore nvstore;
326 uint16_t max_qentries; /* max entries per queue */
327 uint32_t max_queues; /* max number of IO SQ's or CQ's */
328 uint32_t num_cqueues;
329 uint32_t num_squeues;
330 bool num_q_is_set; /* Has host set Number of Queues */
332 struct pci_nvme_ioreq *ioreqs;
333 STAILQ_HEAD(, pci_nvme_ioreq) ioreqs_free; /* free list of ioreqs */
334 uint32_t pending_ios;
335 uint32_t ioslots;
336 sem_t iosemlock;
339 * Memory mapped Submission and Completion queues
340 * Each array includes both Admin and IO queues
342 struct nvme_completion_queue *compl_queues;
343 struct nvme_submission_queue *submit_queues;
345 struct nvme_feature_obj feat[NVME_FID_MAX];
347 enum nvme_dsm_type dataset_management;
349 /* Accounting for SMART data */
350 __uint128_t read_data_units;
351 __uint128_t write_data_units;
352 __uint128_t read_commands;
353 __uint128_t write_commands;
354 uint32_t read_dunits_remainder;
355 uint32_t write_dunits_remainder;
357 STAILQ_HEAD(, pci_nvme_aer) aer_list;
358 pthread_mutex_t aer_mtx;
359 uint32_t aer_count;
360 struct pci_nvme_aen aen[PCI_NVME_AE_TYPE_MAX];
361 pthread_t aen_tid;
362 pthread_mutex_t aen_mtx;
363 pthread_cond_t aen_cond;
367 static void pci_nvme_cq_update(struct pci_nvme_softc *sc,
368 struct nvme_completion_queue *cq,
369 uint32_t cdw0,
370 uint16_t cid,
371 uint16_t sqid,
372 uint16_t status);
373 static struct pci_nvme_ioreq *pci_nvme_get_ioreq(struct pci_nvme_softc *);
374 static void pci_nvme_release_ioreq(struct pci_nvme_softc *, struct pci_nvme_ioreq *);
375 static void pci_nvme_io_done(struct blockif_req *, int);
377 /* Controller Configuration utils */
378 #define NVME_CC_GET_EN(cc) \
379 ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK)
380 #define NVME_CC_GET_CSS(cc) \
381 ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK)
382 #define NVME_CC_GET_SHN(cc) \
383 ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK)
384 #define NVME_CC_GET_IOSQES(cc) \
385 ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK)
386 #define NVME_CC_GET_IOCQES(cc) \
387 ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK)
389 #define NVME_CC_WRITE_MASK \
390 ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \
391 (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \
392 (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT))
394 #define NVME_CC_NEN_WRITE_MASK \
395 ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \
396 (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \
397 (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT))
399 /* Controller Status utils */
400 #define NVME_CSTS_GET_RDY(sts) \
401 ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK)
403 #define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT)
404 #define NVME_CSTS_CFS (1 << NVME_CSTS_REG_CFS_SHIFT)
406 /* Completion Queue status word utils */
407 #define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT)
408 #define NVME_STATUS_MASK \
409 ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\
410 (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT))
412 #define NVME_ONCS_DSM (NVME_CTRLR_DATA_ONCS_DSM_MASK << \
413 NVME_CTRLR_DATA_ONCS_DSM_SHIFT)
415 static void nvme_feature_invalid_cb(struct pci_nvme_softc *,
416 struct nvme_feature_obj *,
417 struct nvme_command *,
418 struct nvme_completion *);
419 static void nvme_feature_temperature(struct pci_nvme_softc *,
420 struct nvme_feature_obj *,
421 struct nvme_command *,
422 struct nvme_completion *);
423 static void nvme_feature_num_queues(struct pci_nvme_softc *,
424 struct nvme_feature_obj *,
425 struct nvme_command *,
426 struct nvme_completion *);
427 static void nvme_feature_iv_config(struct pci_nvme_softc *,
428 struct nvme_feature_obj *,
429 struct nvme_command *,
430 struct nvme_completion *);
431 static void nvme_feature_async_event(struct pci_nvme_softc *,
432 struct nvme_feature_obj *,
433 struct nvme_command *,
434 struct nvme_completion *);
436 static void *aen_thr(void *arg);
438 static __inline void
439 cpywithpad(char *dst, size_t dst_size, const char *src, char pad)
441 size_t len;
443 len = strnlen(src, dst_size);
444 memset(dst, pad, dst_size);
445 memcpy(dst, src, len);
448 static __inline void
449 pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code)
452 *status &= ~NVME_STATUS_MASK;
453 *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT |
454 (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT;
457 static __inline void
458 pci_nvme_status_genc(uint16_t *status, uint16_t code)
461 pci_nvme_status_tc(status, NVME_SCT_GENERIC, code);
 465  * Initialize the requested number of IO Submission and Completion Queues.
466 * Admin queues are allocated implicitly.
468 static void
469 pci_nvme_init_queues(struct pci_nvme_softc *sc, uint32_t nsq, uint32_t ncq)
471 uint32_t i;
474 * Allocate and initialize the Submission Queues
476 if (nsq > NVME_QUEUES) {
477 WPRINTF("%s: clamping number of SQ from %u to %u",
478 __func__, nsq, NVME_QUEUES);
479 nsq = NVME_QUEUES;
482 sc->num_squeues = nsq;
484 sc->submit_queues = calloc(sc->num_squeues + 1,
485 sizeof(struct nvme_submission_queue));
486 if (sc->submit_queues == NULL) {
487 WPRINTF("%s: SQ allocation failed", __func__);
488 sc->num_squeues = 0;
489 } else {
490 struct nvme_submission_queue *sq = sc->submit_queues;
492 for (i = 0; i < sc->num_squeues + 1; i++)
493 pthread_mutex_init(&sq[i].mtx, NULL);
497 * Allocate and initialize the Completion Queues
499 if (ncq > NVME_QUEUES) {
500 WPRINTF("%s: clamping number of CQ from %u to %u",
501 __func__, ncq, NVME_QUEUES);
502 ncq = NVME_QUEUES;
505 sc->num_cqueues = ncq;
507 sc->compl_queues = calloc(sc->num_cqueues + 1,
508 sizeof(struct nvme_completion_queue));
509 if (sc->compl_queues == NULL) {
510 WPRINTF("%s: CQ allocation failed", __func__);
511 sc->num_cqueues = 0;
512 } else {
513 struct nvme_completion_queue *cq = sc->compl_queues;
515 for (i = 0; i < sc->num_cqueues + 1; i++)
516 pthread_mutex_init(&cq[i].mtx, NULL);
520 static void
521 pci_nvme_init_ctrldata(struct pci_nvme_softc *sc)
523 struct nvme_controller_data *cd = &sc->ctrldata;
525 cd->vid = 0xFB5D;
526 cd->ssvid = 0x0000;
528 cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' ');
529 cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' ');
531 /* Num of submission commands that we can handle at a time (2^rab) */
532 cd->rab = 4;
534 /* FreeBSD OUI */
535 cd->ieee[0] = 0x58;
536 cd->ieee[1] = 0x9c;
537 cd->ieee[2] = 0xfc;
539 cd->mic = 0;
541 cd->mdts = NVME_MDTS; /* max data transfer size (2^mdts * CAP.MPSMIN) */
543 cd->ver = NVME_REV(1,4);
545 cd->cntrltype = NVME_CNTRLTYPE_IO;
546 cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT;
547 cd->oaes = NVMEB(NVME_CTRLR_DATA_OAES_NS_ATTR);
548 cd->acl = 2;
549 cd->aerl = 4;
551 /* Advertise 1, Read-only firmware slot */
552 cd->frmw = NVMEB(NVME_CTRLR_DATA_FRMW_SLOT1_RO) |
553 (1 << NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT);
554 cd->lpa = 0; /* TODO: support some simple things like SMART */
555 cd->elpe = 0; /* max error log page entries */
557 * Report a single power state (zero-based value)
558 * power_state[] values are left as zero to indicate "Not reported"
560 cd->npss = 0;
562 /* Warning Composite Temperature Threshold */
563 cd->wctemp = 0x0157;
564 cd->cctemp = 0x0157;
566 /* SANICAP must not be 0 for Revision 1.4 and later NVMe Controllers */
567 cd->sanicap = (NVME_CTRLR_DATA_SANICAP_NODMMAS_NO <<
568 NVME_CTRLR_DATA_SANICAP_NODMMAS_SHIFT);
570 cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) |
571 (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT);
572 cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) |
573 (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT);
574 cd->nn = 1; /* number of namespaces */
576 cd->oncs = 0;
577 switch (sc->dataset_management) {
578 case NVME_DATASET_MANAGEMENT_AUTO:
579 if (sc->nvstore.deallocate)
580 cd->oncs |= NVME_ONCS_DSM;
581 break;
582 case NVME_DATASET_MANAGEMENT_ENABLE:
583 cd->oncs |= NVME_ONCS_DSM;
584 break;
585 default:
586 break;
589 cd->fna = NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK <<
590 NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT;
592 cd->vwc = NVME_CTRLR_DATA_VWC_ALL_NO << NVME_CTRLR_DATA_VWC_ALL_SHIFT;
596 * Calculate the CRC-16 of the given buffer
597 * See copyright attribution at top of file
599 static uint16_t
600 crc16(uint16_t crc, const void *buffer, unsigned int len)
602 const unsigned char *cp = buffer;
 603 /* CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1). */
604 static uint16_t const crc16_table[256] = {
605 0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
606 0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
607 0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
608 0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
609 0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
610 0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
611 0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
612 0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
613 0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
614 0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
615 0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
616 0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
617 0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
618 0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
619 0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
620 0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
621 0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
622 0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
623 0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
624 0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
625 0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
626 0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
627 0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
628 0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
629 0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
630 0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
631 0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
632 0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
633 0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
634 0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
635 0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
636 0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
639 while (len--)
640 crc = (((crc >> 8) & 0xffU) ^
641 crc16_table[(crc ^ *cp++) & 0xffU]) & 0x0000ffffU;
642 return crc;
645 static void
646 pci_nvme_init_nsdata_size(struct pci_nvme_blockstore *nvstore,
647 struct nvme_namespace_data *nd)
650 /* Get capacity and block size information from backing store */
651 nd->nsze = nvstore->size / nvstore->sectsz;
652 nd->ncap = nd->nsze;
653 nd->nuse = nd->nsze;
656 static void
657 pci_nvme_init_nsdata(struct pci_nvme_softc *sc,
658 struct nvme_namespace_data *nd, uint32_t nsid,
659 struct pci_nvme_blockstore *nvstore)
662 pci_nvme_init_nsdata_size(nvstore, nd);
664 if (nvstore->type == NVME_STOR_BLOCKIF)
665 nvstore->deallocate = blockif_candelete(nvstore->ctx);
 667 nd->nlbaf = 0; /* NLBAF is a zero-based value (i.e. 1 LBA Format) */
668 nd->flbas = 0;
670 /* Create an EUI-64 if user did not provide one */
671 if (nvstore->eui64 == 0) {
672 char *data = NULL;
673 uint64_t eui64 = nvstore->eui64;
675 asprintf(&data, "%s%u%u%u", get_config_value("name"),
676 sc->nsc_pi->pi_bus, sc->nsc_pi->pi_slot,
677 sc->nsc_pi->pi_func);
679 if (data != NULL) {
680 eui64 = OUI_FREEBSD_NVME_LOW | crc16(0, data, strlen(data));
681 free(data);
683 nvstore->eui64 = (eui64 << 16) | (nsid & 0xffff);
685 be64enc(nd->eui64, nvstore->eui64);
687 /* LBA data-sz = 2^lbads */
688 nd->lbaf[0] = nvstore->sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT;
691 static void
692 pci_nvme_init_logpages(struct pci_nvme_softc *sc)
694 __uint128_t power_cycles = 1;
696 memset(&sc->err_log, 0, sizeof(sc->err_log));
697 memset(&sc->health_log, 0, sizeof(sc->health_log));
698 memset(&sc->fw_log, 0, sizeof(sc->fw_log));
699 memset(&sc->ns_log, 0, sizeof(sc->ns_log));
701 /* Set read/write remainder to round up according to spec */
702 sc->read_dunits_remainder = 999;
703 sc->write_dunits_remainder = 999;
705 /* Set nominal Health values checked by implementations */
706 sc->health_log.temperature = NVME_TEMPERATURE;
707 sc->health_log.available_spare = 100;
708 sc->health_log.available_spare_threshold = 10;
710 /* Set Active Firmware Info to slot 1 */
711 sc->fw_log.afi = (1 << NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT);
712 memcpy(&sc->fw_log.revision[0], sc->ctrldata.fr,
713 sizeof(sc->fw_log.revision[0]));
715 memcpy(&sc->health_log.power_cycles, &power_cycles,
716 sizeof(sc->health_log.power_cycles));
719 static void
720 pci_nvme_init_features(struct pci_nvme_softc *sc)
722 enum nvme_feature fid;
724 for (fid = 0; fid < NVME_FID_MAX; fid++) {
725 switch (fid) {
726 case NVME_FEAT_ARBITRATION:
727 case NVME_FEAT_POWER_MANAGEMENT:
728 case NVME_FEAT_INTERRUPT_COALESCING: //XXX
729 case NVME_FEAT_WRITE_ATOMICITY:
730 /* Mandatory but no special handling required */
731 //XXX hang - case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
732 //XXX hang - case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
733 // this returns a data buffer
734 break;
735 case NVME_FEAT_TEMPERATURE_THRESHOLD:
736 sc->feat[fid].set = nvme_feature_temperature;
737 break;
738 case NVME_FEAT_ERROR_RECOVERY:
739 sc->feat[fid].namespace_specific = true;
740 break;
741 case NVME_FEAT_NUMBER_OF_QUEUES:
742 sc->feat[fid].set = nvme_feature_num_queues;
743 break;
744 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
745 sc->feat[fid].set = nvme_feature_iv_config;
746 break;
747 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
748 sc->feat[fid].set = nvme_feature_async_event;
749 /* Enable all AENs by default */
750 sc->feat[fid].cdw11 = PCI_NVME_AEN_DEFAULT_MASK;
751 break;
752 default:
753 sc->feat[fid].set = nvme_feature_invalid_cb;
754 sc->feat[fid].get = nvme_feature_invalid_cb;
759 static void
760 pci_nvme_aer_reset(struct pci_nvme_softc *sc)
763 STAILQ_INIT(&sc->aer_list);
764 sc->aer_count = 0;
767 static void
768 pci_nvme_aer_init(struct pci_nvme_softc *sc)
771 pthread_mutex_init(&sc->aer_mtx, NULL);
772 pci_nvme_aer_reset(sc);
775 static void
776 pci_nvme_aer_destroy(struct pci_nvme_softc *sc)
778 struct pci_nvme_aer *aer = NULL;
780 pthread_mutex_lock(&sc->aer_mtx);
781 while (!STAILQ_EMPTY(&sc->aer_list)) {
782 aer = STAILQ_FIRST(&sc->aer_list);
783 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
784 free(aer);
786 pthread_mutex_unlock(&sc->aer_mtx);
788 pci_nvme_aer_reset(sc);
791 static bool
792 pci_nvme_aer_available(struct pci_nvme_softc *sc)
795 return (sc->aer_count != 0);
798 static bool
799 pci_nvme_aer_limit_reached(struct pci_nvme_softc *sc)
801 struct nvme_controller_data *cd = &sc->ctrldata;
 803 /* AERL is a zero-based value while aer_count is one-based */
804 return (sc->aer_count == (cd->aerl + 1U));
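/*
 * For example, with the aerl of 4 advertised in pci_nvme_init_ctrldata(),
 * the limit is reached once five AERs are outstanding.
 */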
808 * Add an Async Event Request
810 * Stores an AER to be returned later if the Controller needs to notify the
811 * host of an event.
812 * Note that while the NVMe spec doesn't require Controllers to return AER's
813 * in order, this implementation does preserve the order.
815 static int
816 pci_nvme_aer_add(struct pci_nvme_softc *sc, uint16_t cid)
818 struct pci_nvme_aer *aer = NULL;
820 aer = calloc(1, sizeof(struct pci_nvme_aer));
821 if (aer == NULL)
822 return (-1);
824 /* Save the Command ID for use in the completion message */
825 aer->cid = cid;
827 pthread_mutex_lock(&sc->aer_mtx);
828 sc->aer_count++;
829 STAILQ_INSERT_TAIL(&sc->aer_list, aer, link);
830 pthread_mutex_unlock(&sc->aer_mtx);
832 return (0);
836 * Get an Async Event Request structure
838 * Returns a pointer to an AER previously submitted by the host or NULL if
839 * no AER's exist. Caller is responsible for freeing the returned struct.
841 static struct pci_nvme_aer *
842 pci_nvme_aer_get(struct pci_nvme_softc *sc)
844 struct pci_nvme_aer *aer = NULL;
846 pthread_mutex_lock(&sc->aer_mtx);
847 aer = STAILQ_FIRST(&sc->aer_list);
848 if (aer != NULL) {
849 STAILQ_REMOVE_HEAD(&sc->aer_list, link);
850 sc->aer_count--;
852 pthread_mutex_unlock(&sc->aer_mtx);
854 return (aer);
857 static void
858 pci_nvme_aen_reset(struct pci_nvme_softc *sc)
860 uint32_t atype;
862 memset(sc->aen, 0, PCI_NVME_AE_TYPE_MAX * sizeof(struct pci_nvme_aen));
864 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
865 sc->aen[atype].atype = atype;
869 static void
870 pci_nvme_aen_init(struct pci_nvme_softc *sc)
872 char nstr[80];
874 pci_nvme_aen_reset(sc);
876 pthread_mutex_init(&sc->aen_mtx, NULL);
877 pthread_create(&sc->aen_tid, NULL, aen_thr, sc);
878 snprintf(nstr, sizeof(nstr), "nvme-aen-%d:%d", sc->nsc_pi->pi_slot,
879 sc->nsc_pi->pi_func);
880 pthread_set_name_np(sc->aen_tid, nstr);
883 static void
884 pci_nvme_aen_destroy(struct pci_nvme_softc *sc)
887 pci_nvme_aen_reset(sc);
890 /* Notify the AEN thread of pending work */
891 static void
892 pci_nvme_aen_notify(struct pci_nvme_softc *sc)
895 pthread_cond_signal(&sc->aen_cond);
899 * Post an Asynchronous Event Notification
901 static int32_t
902 pci_nvme_aen_post(struct pci_nvme_softc *sc, pci_nvme_async_type atype,
903 uint32_t event_data)
905 struct pci_nvme_aen *aen;
907 if (atype >= PCI_NVME_AE_TYPE_MAX) {
908 return(EINVAL);
911 pthread_mutex_lock(&sc->aen_mtx);
912 aen = &sc->aen[atype];
914 /* Has the controller already posted an event of this type? */
915 if (aen->posted) {
916 pthread_mutex_unlock(&sc->aen_mtx);
917 return(EALREADY);
920 aen->event_data = event_data;
921 aen->posted = true;
922 pthread_mutex_unlock(&sc->aen_mtx);
924 pci_nvme_aen_notify(sc);
926 return(0);
929 static void
930 pci_nvme_aen_process(struct pci_nvme_softc *sc)
932 struct pci_nvme_aer *aer;
933 struct pci_nvme_aen *aen;
934 pci_nvme_async_type atype;
935 uint32_t mask;
936 uint16_t status;
937 uint8_t lid;
939 #ifndef __FreeBSD__
940 lid = 0;
941 #endif
943 assert(pthread_mutex_isowned_np(&sc->aen_mtx));
944 for (atype = 0; atype < PCI_NVME_AE_TYPE_MAX; atype++) {
945 aen = &sc->aen[atype];
946 /* Previous iterations may have depleted the available AER's */
947 if (!pci_nvme_aer_available(sc)) {
948 DPRINTF("%s: no AER", __func__);
949 break;
952 if (!aen->posted) {
953 DPRINTF("%s: no AEN posted for atype=%#x", __func__, atype);
954 continue;
957 status = NVME_SC_SUCCESS;
959 /* Is the event masked? */
960 mask =
961 sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11;
963 DPRINTF("%s: atype=%#x mask=%#x event_data=%#x", __func__, atype, mask, aen->event_data);
964 switch (atype) {
965 case PCI_NVME_AE_TYPE_ERROR:
966 lid = NVME_LOG_ERROR;
967 break;
968 case PCI_NVME_AE_TYPE_SMART:
969 mask &= 0xff;
970 if ((mask & aen->event_data) == 0)
971 continue;
972 lid = NVME_LOG_HEALTH_INFORMATION;
973 break;
974 case PCI_NVME_AE_TYPE_NOTICE:
975 if (aen->event_data >= PCI_NVME_AEI_NOTICE_MAX) {
976 EPRINTLN("%s unknown AEN notice type %u",
977 __func__, aen->event_data);
978 status = NVME_SC_INTERNAL_DEVICE_ERROR;
979 lid = 0;
980 break;
982 if ((PCI_NVME_AEI_NOTICE_MASK(aen->event_data) & mask) == 0)
983 continue;
984 switch (aen->event_data) {
985 case PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED:
986 lid = NVME_LOG_CHANGED_NAMESPACE;
987 break;
988 case PCI_NVME_AEI_NOTICE_FW_ACTIVATION:
989 lid = NVME_LOG_FIRMWARE_SLOT;
990 break;
991 case PCI_NVME_AEI_NOTICE_TELEMETRY_CHANGE:
992 lid = NVME_LOG_TELEMETRY_CONTROLLER_INITIATED;
993 break;
994 case PCI_NVME_AEI_NOTICE_ANA_CHANGE:
995 lid = NVME_LOG_ASYMMETRIC_NAMESPACE_ACCESS;
996 break;
997 case PCI_NVME_AEI_NOTICE_PREDICT_LATENCY_CHANGE:
998 lid = NVME_LOG_PREDICTABLE_LATENCY_EVENT_AGGREGATE;
999 break;
1000 case PCI_NVME_AEI_NOTICE_LBA_STATUS_ALERT:
1001 lid = NVME_LOG_LBA_STATUS_INFORMATION;
1002 break;
1003 case PCI_NVME_AEI_NOTICE_ENDURANCE_GROUP_CHANGE:
1004 lid = NVME_LOG_ENDURANCE_GROUP_EVENT_AGGREGATE;
1005 break;
1006 default:
1007 lid = 0;
1009 break;
1010 default:
1011 /* bad type?!? */
1012 EPRINTLN("%s unknown AEN type %u", __func__, atype);
1013 status = NVME_SC_INTERNAL_DEVICE_ERROR;
1014 lid = 0;
1015 break;
1018 aer = pci_nvme_aer_get(sc);
1019 assert(aer != NULL);
1021 DPRINTF("%s: CID=%#x CDW0=%#x", __func__, aer->cid, (lid << 16) | (aen->event_data << 8) | atype);
1022 pci_nvme_cq_update(sc, &sc->compl_queues[0],
1023 (lid << 16) | (aen->event_data << 8) | atype, /* cdw0 */
1024 aer->cid,
1025 0, /* SQID */
1026 status);
1028 aen->event_data = 0;
1029 aen->posted = false;
1031 pci_generate_msix(sc->nsc_pi, 0);
1035 static void *
1036 aen_thr(void *arg)
1038 struct pci_nvme_softc *sc;
1040 sc = arg;
1042 pthread_mutex_lock(&sc->aen_mtx);
1043 for (;;) {
1044 pci_nvme_aen_process(sc);
1045 pthread_cond_wait(&sc->aen_cond, &sc->aen_mtx);
1047 #ifdef __FreeBSD__ /* Smatch spots unreachable code */
1048 pthread_mutex_unlock(&sc->aen_mtx);
1050 pthread_exit(NULL);
1051 #endif
1052 return (NULL);
1055 static void
1056 pci_nvme_reset_locked(struct pci_nvme_softc *sc)
1058 uint32_t i;
1060 DPRINTF("%s", __func__);
1062 sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) |
1063 (1 << NVME_CAP_LO_REG_CQR_SHIFT) |
1064 (60 << NVME_CAP_LO_REG_TO_SHIFT);
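/*
 * CAP.MQES advertises max_qentries (zero-based), CAP.CQR requires physically
 * contiguous queues, and CAP.TO of 60 (in 500 ms units) allows the guest up
 * to 30 seconds for the controller to become ready.
 */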
1066 sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT;
1068 sc->regs.vs = NVME_REV(1,4); /* NVMe v1.4 */
1070 sc->regs.cc = 0;
1072 assert(sc->submit_queues != NULL);
1074 for (i = 0; i < sc->num_squeues + 1; i++) {
1075 sc->submit_queues[i].qbase = NULL;
1076 sc->submit_queues[i].size = 0;
1077 sc->submit_queues[i].cqid = 0;
1078 sc->submit_queues[i].tail = 0;
1079 sc->submit_queues[i].head = 0;
1082 assert(sc->compl_queues != NULL);
1084 for (i = 0; i < sc->num_cqueues + 1; i++) {
1085 sc->compl_queues[i].qbase = NULL;
1086 sc->compl_queues[i].size = 0;
1087 sc->compl_queues[i].tail = 0;
1088 sc->compl_queues[i].head = 0;
1091 sc->num_q_is_set = false;
1093 pci_nvme_aer_destroy(sc);
1094 pci_nvme_aen_destroy(sc);
1097 * Clear CSTS.RDY last to prevent the host from enabling Controller
1098 * before cleanup completes
1100 sc->regs.csts = 0;
1103 static void
1104 pci_nvme_reset(struct pci_nvme_softc *sc)
1106 pthread_mutex_lock(&sc->mtx);
1107 pci_nvme_reset_locked(sc);
1108 pthread_mutex_unlock(&sc->mtx);
1111 static int
1112 pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc)
1114 uint16_t acqs, asqs;
1116 DPRINTF("%s", __func__);
1119 * NVMe 2.0 states that "enabling a controller while this field is
1120 * cleared to 0h produces undefined results" for both ACQS and
1121 * ASQS. If zero, set CFS and do not become ready.
1123 asqs = ONE_BASED(sc->regs.aqa & NVME_AQA_REG_ASQS_MASK);
1124 if (asqs < 2) {
1125 EPRINTLN("%s: illegal ASQS value %#x (aqa=%#x)", __func__,
1126 asqs - 1, sc->regs.aqa);
1127 sc->regs.csts |= NVME_CSTS_CFS;
1128 return (-1);
1130 sc->submit_queues[0].size = asqs;
1131 sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq,
1132 sizeof(struct nvme_command) * asqs);
1133 if (sc->submit_queues[0].qbase == NULL) {
1134 EPRINTLN("%s: ASQ vm_map_gpa(%lx) failed", __func__,
1135 sc->regs.asq);
1136 sc->regs.csts |= NVME_CSTS_CFS;
1137 return (-1);
1140 DPRINTF("%s mapping Admin-SQ guest 0x%lx, host: %p",
1141 __func__, sc->regs.asq, sc->submit_queues[0].qbase);
1143 acqs = ONE_BASED((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) &
1144 NVME_AQA_REG_ACQS_MASK);
1145 if (acqs < 2) {
1146 EPRINTLN("%s: illegal ACQS value %#x (aqa=%#x)", __func__,
1147 acqs - 1, sc->regs.aqa);
1148 sc->regs.csts |= NVME_CSTS_CFS;
1149 return (-1);
1151 sc->compl_queues[0].size = acqs;
1152 sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq,
1153 sizeof(struct nvme_completion) * acqs);
1154 if (sc->compl_queues[0].qbase == NULL) {
1155 EPRINTLN("%s: ACQ vm_map_gpa(%lx) failed", __func__,
1156 sc->regs.acq);
1157 sc->regs.csts |= NVME_CSTS_CFS;
1158 return (-1);
1160 sc->compl_queues[0].intr_en = NVME_CQ_INTEN;
1162 DPRINTF("%s mapping Admin-CQ guest 0x%lx, host: %p",
1163 __func__, sc->regs.acq, sc->compl_queues[0].qbase);
1165 return (0);
1168 static int
1169 nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *b,
1170 size_t len, enum nvme_copy_dir dir)
1172 uint8_t *p;
1173 size_t bytes;
1175 if (len > (8 * 1024)) {
1176 return (-1);
1179 /* Copy from the start of prp1 to the end of the physical page */
1180 bytes = PAGE_SIZE - (prp1 & PAGE_MASK);
1181 bytes = MIN(bytes, len);
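/*
 * For example, with 4 KiB pages and prp1 pointing 3840 bytes into a page,
 * at most 256 bytes are copied via prp1; any remainder (at most one more
 * page) is copied via prp2 below.
 */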
1183 p = vm_map_gpa(ctx, prp1, bytes);
1184 if (p == NULL) {
1185 return (-1);
1188 if (dir == NVME_COPY_TO_PRP)
1189 memcpy(p, b, bytes);
1190 else
1191 memcpy(b, p, bytes);
1193 b += bytes;
1195 len -= bytes;
1196 if (len == 0) {
1197 return (0);
1200 len = MIN(len, PAGE_SIZE);
1202 p = vm_map_gpa(ctx, prp2, len);
1203 if (p == NULL) {
1204 return (-1);
1207 if (dir == NVME_COPY_TO_PRP)
1208 memcpy(p, b, len);
1209 else
1210 memcpy(b, p, len);
1212 return (0);
1216 * Write a Completion Queue Entry update
1218 * Write the completion and update the doorbell value
1220 static void
1221 pci_nvme_cq_update(struct pci_nvme_softc *sc,
1222 struct nvme_completion_queue *cq,
1223 uint32_t cdw0,
1224 uint16_t cid,
1225 uint16_t sqid,
1226 uint16_t status)
1228 struct nvme_submission_queue *sq = &sc->submit_queues[sqid];
1229 struct nvme_completion *cqe;
1231 assert(cq->qbase != NULL);
1233 pthread_mutex_lock(&cq->mtx);
1235 cqe = &cq->qbase[cq->tail];
1237 /* Flip the phase bit */
1238 status |= (cqe->status ^ NVME_STATUS_P) & NVME_STATUS_P_MASK;
1240 cqe->cdw0 = cdw0;
1241 cqe->sqhd = sq->head;
1242 cqe->sqid = sqid;
1243 cqe->cid = cid;
1244 cqe->status = status;
1246 cq->tail++;
1247 if (cq->tail >= cq->size) {
1248 cq->tail = 0;
1251 pthread_mutex_unlock(&cq->mtx);
1254 static int
1255 nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1256 struct nvme_completion* compl)
1258 uint16_t qid = command->cdw10 & 0xffff;
1260 DPRINTF("%s DELETE_IO_SQ %u", __func__, qid);
1261 if (qid == 0 || qid > sc->num_squeues ||
1262 (sc->submit_queues[qid].qbase == NULL)) {
1263 WPRINTF("%s NOT PERMITTED queue id %u / num_squeues %u",
1264 __func__, qid, sc->num_squeues);
1265 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1266 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1267 return (1);
1270 sc->submit_queues[qid].qbase = NULL;
1271 sc->submit_queues[qid].cqid = 0;
1272 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1273 return (1);
1276 static int
1277 nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command,
1278 struct nvme_completion* compl)
1280 if (command->cdw11 & NVME_CMD_CDW11_PC) {
1281 uint16_t qid = command->cdw10 & 0xffff;
1282 struct nvme_submission_queue *nsq;
1284 if ((qid == 0) || (qid > sc->num_squeues) ||
1285 (sc->submit_queues[qid].qbase != NULL)) {
1286 WPRINTF("%s queue index %u > num_squeues %u",
1287 __func__, qid, sc->num_squeues);
1288 pci_nvme_status_tc(&compl->status,
1289 NVME_SCT_COMMAND_SPECIFIC,
1290 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1291 return (1);
1294 nsq = &sc->submit_queues[qid];
1295 nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1296 DPRINTF("%s size=%u (max=%u)", __func__, nsq->size, sc->max_qentries);
1297 if ((nsq->size < 2) || (nsq->size > sc->max_qentries)) {
1299 * Queues must specify at least two entries
1300 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1301 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1303 pci_nvme_status_tc(&compl->status,
1304 NVME_SCT_COMMAND_SPECIFIC,
1305 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1306 return (1);
1308 nsq->head = nsq->tail = 0;
1310 nsq->cqid = (command->cdw11 >> 16) & 0xffff;
1311 if ((nsq->cqid == 0) || (nsq->cqid > sc->num_cqueues)) {
1312 pci_nvme_status_tc(&compl->status,
1313 NVME_SCT_COMMAND_SPECIFIC,
1314 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1315 return (1);
1318 if (sc->compl_queues[nsq->cqid].qbase == NULL) {
1319 pci_nvme_status_tc(&compl->status,
1320 NVME_SCT_COMMAND_SPECIFIC,
1321 NVME_SC_COMPLETION_QUEUE_INVALID);
1322 return (1);
1325 nsq->qpriority = (command->cdw11 >> 1) & 0x03;
1327 nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1328 sizeof(struct nvme_command) * (size_t)nsq->size);
1330 DPRINTF("%s sq %u size %u gaddr %p cqid %u", __func__,
1331 qid, nsq->size, nsq->qbase, nsq->cqid);
1333 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1335 DPRINTF("%s completed creating IOSQ qid %u",
1336 __func__, qid);
1337 } else {
 1339  * Guest sent a non-contiguous submission queue request.
 1340  * This setting is unsupported by this emulation.
1342 WPRINTF("%s unsupported non-contig (list-based) "
1343 "create i/o submission queue", __func__);
1345 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1347 return (1);
1350 static int
1351 nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1352 struct nvme_completion* compl)
1354 uint16_t qid = command->cdw10 & 0xffff;
1355 uint16_t sqid;
1357 DPRINTF("%s DELETE_IO_CQ %u", __func__, qid);
1358 if (qid == 0 || qid > sc->num_cqueues ||
1359 (sc->compl_queues[qid].qbase == NULL)) {
1360 WPRINTF("%s queue index %u / num_cqueues %u",
1361 __func__, qid, sc->num_cqueues);
1362 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1363 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1364 return (1);
1367 /* Deleting an Active CQ is an error */
1368 for (sqid = 1; sqid < sc->num_squeues + 1; sqid++)
1369 if (sc->submit_queues[sqid].cqid == qid) {
1370 pci_nvme_status_tc(&compl->status,
1371 NVME_SCT_COMMAND_SPECIFIC,
1372 NVME_SC_INVALID_QUEUE_DELETION);
1373 return (1);
1376 sc->compl_queues[qid].qbase = NULL;
1377 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1378 return (1);
1381 static int
1382 nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
1383 struct nvme_completion* compl)
1385 struct nvme_completion_queue *ncq;
1386 uint16_t qid = command->cdw10 & 0xffff;
1388 /* Only support Physically Contiguous queues */
1389 if ((command->cdw11 & NVME_CMD_CDW11_PC) == 0) {
1390 WPRINTF("%s unsupported non-contig (list-based) "
1391 "create i/o completion queue",
1392 __func__);
1394 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1395 return (1);
1398 if ((qid == 0) || (qid > sc->num_cqueues) ||
1399 (sc->compl_queues[qid].qbase != NULL)) {
1400 WPRINTF("%s queue index %u > num_cqueues %u",
1401 __func__, qid, sc->num_cqueues);
1402 pci_nvme_status_tc(&compl->status,
1403 NVME_SCT_COMMAND_SPECIFIC,
1404 NVME_SC_INVALID_QUEUE_IDENTIFIER);
1405 return (1);
1408 ncq = &sc->compl_queues[qid];
1409 ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
1410 ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
1411 if (ncq->intr_vec > (sc->max_queues + 1)) {
1412 pci_nvme_status_tc(&compl->status,
1413 NVME_SCT_COMMAND_SPECIFIC,
1414 NVME_SC_INVALID_INTERRUPT_VECTOR);
1415 return (1);
1418 ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
1419 if ((ncq->size < 2) || (ncq->size > sc->max_qentries)) {
1421 * Queues must specify at least two entries
1422 * NOTE: "MAXIMUM QUEUE SIZE EXCEEDED" was renamed to
1423 * "INVALID QUEUE SIZE" in the NVM Express 1.3 Spec
1425 pci_nvme_status_tc(&compl->status,
1426 NVME_SCT_COMMAND_SPECIFIC,
1427 NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED);
1428 return (1);
1430 ncq->head = ncq->tail = 0;
1431 ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
1432 command->prp1,
1433 sizeof(struct nvme_command) * (size_t)ncq->size);
1435 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1438 return (1);
1441 static int
1442 nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command,
1443 struct nvme_completion* compl)
1445 uint64_t logoff;
1446 uint32_t logsize;
1447 uint8_t logpage;
1449 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1452 * Command specifies the number of dwords to return in fields NUMDU
1453 * and NUMDL. This is a zero-based value.
1455 logpage = command->cdw10 & 0xFF;
1456 logsize = ((command->cdw11 << 16) | (command->cdw10 >> 16)) + 1;
1457 logsize *= sizeof(uint32_t);
1458 logoff = ((uint64_t)(command->cdw13) << 32) | command->cdw12;
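/*
 * For example, a host reading the full 512-byte SMART / Health log sets the
 * zero-based dword count to 127, so logsize = (127 + 1) * 4 = 512 bytes.
 */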
1460 DPRINTF("%s log page %u len %u", __func__, logpage, logsize);
1462 switch (logpage) {
1463 case NVME_LOG_ERROR:
1464 if (logoff >= sizeof(sc->err_log)) {
1465 pci_nvme_status_genc(&compl->status,
1466 NVME_SC_INVALID_FIELD);
1467 break;
1470 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1471 command->prp2, (uint8_t *)&sc->err_log + logoff,
1472 MIN(logsize - logoff, sizeof(sc->err_log)),
1473 NVME_COPY_TO_PRP);
1474 break;
1475 case NVME_LOG_HEALTH_INFORMATION:
1476 if (logoff >= sizeof(sc->health_log)) {
1477 pci_nvme_status_genc(&compl->status,
1478 NVME_SC_INVALID_FIELD);
1479 break;
1482 pthread_mutex_lock(&sc->mtx);
1483 memcpy(&sc->health_log.data_units_read, &sc->read_data_units,
1484 sizeof(sc->health_log.data_units_read));
1485 memcpy(&sc->health_log.data_units_written, &sc->write_data_units,
1486 sizeof(sc->health_log.data_units_written));
1487 memcpy(&sc->health_log.host_read_commands, &sc->read_commands,
1488 sizeof(sc->health_log.host_read_commands));
1489 memcpy(&sc->health_log.host_write_commands, &sc->write_commands,
1490 sizeof(sc->health_log.host_write_commands));
1491 pthread_mutex_unlock(&sc->mtx);
1493 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1494 command->prp2, (uint8_t *)&sc->health_log + logoff,
1495 MIN(logsize - logoff, sizeof(sc->health_log)),
1496 NVME_COPY_TO_PRP);
1497 break;
1498 case NVME_LOG_FIRMWARE_SLOT:
1499 if (logoff >= sizeof(sc->fw_log)) {
1500 pci_nvme_status_genc(&compl->status,
1501 NVME_SC_INVALID_FIELD);
1502 break;
1505 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1506 command->prp2, (uint8_t *)&sc->fw_log + logoff,
1507 MIN(logsize - logoff, sizeof(sc->fw_log)),
1508 NVME_COPY_TO_PRP);
1509 break;
1510 case NVME_LOG_CHANGED_NAMESPACE:
1511 if (logoff >= sizeof(sc->ns_log)) {
1512 pci_nvme_status_genc(&compl->status,
1513 NVME_SC_INVALID_FIELD);
1514 break;
1517 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1518 command->prp2, (uint8_t *)&sc->ns_log + logoff,
1519 MIN(logsize - logoff, sizeof(sc->ns_log)),
1520 NVME_COPY_TO_PRP);
1521 memset(&sc->ns_log, 0, sizeof(sc->ns_log));
1522 break;
1523 default:
1524 DPRINTF("%s get log page %x command not supported",
1525 __func__, logpage);
1527 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1528 NVME_SC_INVALID_LOG_PAGE);
1531 return (1);
1534 static int
1535 nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command,
1536 struct nvme_completion* compl)
1538 void *dest;
1539 uint16_t status;
1541 DPRINTF("%s identify 0x%x nsid 0x%x", __func__,
1542 command->cdw10 & 0xFF, command->nsid);
1544 status = 0;
1545 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
1547 switch (command->cdw10 & 0xFF) {
1548 case 0x00: /* return Identify Namespace data structure */
1549 /* Global NS only valid with NS Management */
1550 if (command->nsid == NVME_GLOBAL_NAMESPACE_TAG) {
1551 pci_nvme_status_genc(&status,
1552 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1553 break;
1555 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1556 command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata),
1557 NVME_COPY_TO_PRP);
1558 break;
1559 case 0x01: /* return Identify Controller data structure */
1560 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1,
1561 command->prp2, (uint8_t *)&sc->ctrldata,
1562 sizeof(sc->ctrldata),
1563 NVME_COPY_TO_PRP);
1564 break;
1565 case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */
1566 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1567 sizeof(uint32_t) * 1024);
1568 /* All unused entries shall be zero */
1569 memset(dest, 0, sizeof(uint32_t) * 1024);
1570 ((uint32_t *)dest)[0] = 1;
1571 break;
1572 case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */
1573 if (command->nsid != 1) {
1574 pci_nvme_status_genc(&status,
1575 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
1576 break;
1578 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1579 sizeof(uint32_t) * 1024);
1580 /* All bytes after the descriptor shall be zero */
1581 memset(dest, 0, sizeof(uint32_t) * 1024);
1583 /* Return NIDT=1 (i.e. EUI64) descriptor */
1584 ((uint8_t *)dest)[0] = 1;
1585 ((uint8_t *)dest)[1] = sizeof(uint64_t);
1586 memcpy(((uint8_t *)dest) + 4, sc->nsdata.eui64, sizeof(uint64_t));
1587 break;
1588 case 0x13:
1590 * Controller list is optional but used by UNH tests. Return
1591 * a valid but empty list.
1593 dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
1594 sizeof(uint16_t) * 2048);
1595 memset(dest, 0, sizeof(uint16_t) * 2048);
1596 break;
1597 default:
1598 DPRINTF("%s unsupported identify command requested 0x%x",
1599 __func__, command->cdw10 & 0xFF);
1600 pci_nvme_status_genc(&status, NVME_SC_INVALID_FIELD);
1601 break;
1604 compl->status = status;
1605 return (1);
1608 static const char *
1609 nvme_fid_to_name(uint8_t fid)
1611 const char *name;
1613 switch (fid) {
1614 case NVME_FEAT_ARBITRATION:
1615 name = "Arbitration";
1616 break;
1617 case NVME_FEAT_POWER_MANAGEMENT:
1618 name = "Power Management";
1619 break;
1620 case NVME_FEAT_LBA_RANGE_TYPE:
1621 name = "LBA Range Type";
1622 break;
1623 case NVME_FEAT_TEMPERATURE_THRESHOLD:
1624 name = "Temperature Threshold";
1625 break;
1626 case NVME_FEAT_ERROR_RECOVERY:
1627 name = "Error Recovery";
1628 break;
1629 case NVME_FEAT_VOLATILE_WRITE_CACHE:
1630 name = "Volatile Write Cache";
1631 break;
1632 case NVME_FEAT_NUMBER_OF_QUEUES:
1633 name = "Number of Queues";
1634 break;
1635 case NVME_FEAT_INTERRUPT_COALESCING:
1636 name = "Interrupt Coalescing";
1637 break;
1638 case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION:
1639 name = "Interrupt Vector Configuration";
1640 break;
1641 case NVME_FEAT_WRITE_ATOMICITY:
1642 name = "Write Atomicity Normal";
1643 break;
1644 case NVME_FEAT_ASYNC_EVENT_CONFIGURATION:
1645 name = "Asynchronous Event Configuration";
1646 break;
1647 case NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION:
1648 name = "Autonomous Power State Transition";
1649 break;
1650 case NVME_FEAT_HOST_MEMORY_BUFFER:
1651 name = "Host Memory Buffer";
1652 break;
1653 case NVME_FEAT_TIMESTAMP:
1654 name = "Timestamp";
1655 break;
1656 case NVME_FEAT_KEEP_ALIVE_TIMER:
1657 name = "Keep Alive Timer";
1658 break;
1659 case NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT:
1660 name = "Host Controlled Thermal Management";
1661 break;
1662 case NVME_FEAT_NON_OP_POWER_STATE_CONFIG:
1663 name = "Non-Operation Power State Config";
1664 break;
1665 case NVME_FEAT_READ_RECOVERY_LEVEL_CONFIG:
1666 name = "Read Recovery Level Config";
1667 break;
1668 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_CONFIG:
1669 name = "Predictable Latency Mode Config";
1670 break;
1671 case NVME_FEAT_PREDICTABLE_LATENCY_MODE_WINDOW:
1672 name = "Predictable Latency Mode Window";
1673 break;
1674 case NVME_FEAT_LBA_STATUS_INFORMATION_ATTRIBUTES:
1675 name = "LBA Status Information Report Interval";
1676 break;
1677 case NVME_FEAT_HOST_BEHAVIOR_SUPPORT:
1678 name = "Host Behavior Support";
1679 break;
1680 case NVME_FEAT_SANITIZE_CONFIG:
1681 name = "Sanitize Config";
1682 break;
1683 case NVME_FEAT_ENDURANCE_GROUP_EVENT_CONFIGURATION:
1684 name = "Endurance Group Event Configuration";
1685 break;
1686 case NVME_FEAT_SOFTWARE_PROGRESS_MARKER:
1687 name = "Software Progress Marker";
1688 break;
1689 case NVME_FEAT_HOST_IDENTIFIER:
1690 name = "Host Identifier";
1691 break;
1692 case NVME_FEAT_RESERVATION_NOTIFICATION_MASK:
1693 name = "Reservation Notification Mask";
1694 break;
1695 case NVME_FEAT_RESERVATION_PERSISTENCE:
1696 name = "Reservation Persistence";
1697 break;
1698 case NVME_FEAT_NAMESPACE_WRITE_PROTECTION_CONFIG:
1699 name = "Namespace Write Protection Config";
1700 break;
1701 default:
1702 name = "Unknown";
1703 break;
1706 return (name);
1709 static void
1710 nvme_feature_invalid_cb(struct pci_nvme_softc *sc __unused,
1711 struct nvme_feature_obj *feat __unused,
1712 struct nvme_command *command __unused,
1713 struct nvme_completion *compl)
1715 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1718 static void
1719 nvme_feature_iv_config(struct pci_nvme_softc *sc,
1720 struct nvme_feature_obj *feat __unused,
1721 struct nvme_command *command,
1722 struct nvme_completion *compl)
1724 uint32_t i;
1725 uint32_t cdw11 = command->cdw11;
1726 uint16_t iv;
1727 bool cd;
1729 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1731 iv = cdw11 & 0xffff;
1732 cd = cdw11 & (1 << 16);
1734 if (iv > (sc->max_queues + 1)) {
1735 return;
1738 /* No Interrupt Coalescing (i.e. not Coalescing Disable) for Admin Q */
1739 if ((iv == 0) && !cd)
1740 return;
1742 /* Requested Interrupt Vector must be used by a CQ */
1743 for (i = 0; i < sc->num_cqueues + 1; i++) {
1744 if (sc->compl_queues[i].intr_vec == iv) {
1745 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1750 #define NVME_ASYNC_EVENT_ENDURANCE_GROUP (0x4000)
1751 static void
1752 nvme_feature_async_event(struct pci_nvme_softc *sc __unused,
1753 struct nvme_feature_obj *feat __unused,
1754 struct nvme_command *command,
1755 struct nvme_completion *compl)
1757 if (command->cdw11 & NVME_ASYNC_EVENT_ENDURANCE_GROUP)
1758 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1761 #define NVME_TEMP_THRESH_OVER 0
1762 #define NVME_TEMP_THRESH_UNDER 1
1763 static void
1764 nvme_feature_temperature(struct pci_nvme_softc *sc,
1765 struct nvme_feature_obj *feat __unused,
1766 struct nvme_command *command,
1767 struct nvme_completion *compl)
1769 uint16_t tmpth; /* Temperature Threshold */
1770 uint8_t tmpsel; /* Threshold Temperature Select */
1771 uint8_t thsel; /* Threshold Type Select */
1772 bool set_crit = false;
1773 bool report_crit;
1775 tmpth = command->cdw11 & 0xffff;
1776 tmpsel = (command->cdw11 >> 16) & 0xf;
1777 thsel = (command->cdw11 >> 20) & 0x3;
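/*
 * For example, a Set Features command with cdw11 = 0x00000120 requests an
 * over-temperature threshold of 288 K on the composite temperature; since
 * the emulated NVME_TEMPERATURE of 296 K exceeds it, the critical warning
 * bit is set below.
 */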
1779 DPRINTF("%s: tmpth=%#x tmpsel=%#x thsel=%#x", __func__, tmpth, tmpsel, thsel);
1781 /* Check for unsupported values */
1782 if (((tmpsel != 0) && (tmpsel != 0xf)) ||
1783 (thsel > NVME_TEMP_THRESH_UNDER)) {
1784 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1785 return;
1788 if (((thsel == NVME_TEMP_THRESH_OVER) && (NVME_TEMPERATURE >= tmpth)) ||
1789 ((thsel == NVME_TEMP_THRESH_UNDER) && (NVME_TEMPERATURE <= tmpth)))
1790 set_crit = true;
1792 pthread_mutex_lock(&sc->mtx);
1793 if (set_crit)
1794 sc->health_log.critical_warning |=
1795 NVME_CRIT_WARN_ST_TEMPERATURE;
1796 else
1797 sc->health_log.critical_warning &=
1798 ~NVME_CRIT_WARN_ST_TEMPERATURE;
1799 pthread_mutex_unlock(&sc->mtx);
1801 report_crit = sc->feat[NVME_FEAT_ASYNC_EVENT_CONFIGURATION].cdw11 &
1802 NVME_CRIT_WARN_ST_TEMPERATURE;
1804 if (set_crit && report_crit)
1805 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_SMART,
1806 sc->health_log.critical_warning);
1808 DPRINTF("%s: set_crit=%c critical_warning=%#x status=%#x", __func__, set_crit ? 'T':'F', sc->health_log.critical_warning, compl->status);
1811 static void
1812 nvme_feature_num_queues(struct pci_nvme_softc *sc,
1813 struct nvme_feature_obj *feat __unused,
1814 struct nvme_command *command,
1815 struct nvme_completion *compl)
1817 uint16_t nqr; /* Number of Queues Requested */
1819 if (sc->num_q_is_set) {
1820 WPRINTF("%s: Number of Queues already set", __func__);
1821 pci_nvme_status_genc(&compl->status,
1822 NVME_SC_COMMAND_SEQUENCE_ERROR);
1823 return;
1826 nqr = command->cdw11 & 0xFFFF;
1827 if (nqr == 0xffff) {
1828 WPRINTF("%s: Illegal NSQR value %#x", __func__, nqr);
1829 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1830 return;
1833 sc->num_squeues = ONE_BASED(nqr);
1834 if (sc->num_squeues > sc->max_queues) {
1835 DPRINTF("NSQR=%u is greater than max %u", sc->num_squeues,
1836 sc->max_queues);
1837 sc->num_squeues = sc->max_queues;
1840 nqr = (command->cdw11 >> 16) & 0xFFFF;
1841 if (nqr == 0xffff) {
1842 WPRINTF("%s: Illegal NCQR value %#x", __func__, nqr);
1843 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1844 return;
1847 sc->num_cqueues = ONE_BASED(nqr);
1848 if (sc->num_cqueues > sc->max_queues) {
1849 DPRINTF("NCQR=%u is greater than max %u", sc->num_cqueues,
1850 sc->max_queues);
1851 sc->num_cqueues = sc->max_queues;
1854 /* Patch the command value which will be saved on callback's return */
1855 command->cdw11 = NVME_FEATURE_NUM_QUEUES(sc);
1856 compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc);
1858 sc->num_q_is_set = true;
1861 static int
1862 nvme_opc_set_features(struct pci_nvme_softc *sc, struct nvme_command *command,
1863 struct nvme_completion *compl)
1865 struct nvme_feature_obj *feat;
1866 uint32_t nsid = command->nsid;
1867 uint8_t fid = NVMEV(NVME_FEAT_SET_FID, command->cdw10);
1868 bool sv = NVMEV(NVME_FEAT_SET_SV, command->cdw10);
1870 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1872 if (fid >= NVME_FID_MAX) {
1873 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1874 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1875 return (1);
1878 if (sv) {
1879 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1880 NVME_SC_FEATURE_NOT_SAVEABLE);
1881 return (1);
1884 feat = &sc->feat[fid];
1886 if (feat->namespace_specific && (nsid == NVME_GLOBAL_NAMESPACE_TAG)) {
1887 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1888 return (1);
1891 if (!feat->namespace_specific &&
1892 !((nsid == 0) || (nsid == NVME_GLOBAL_NAMESPACE_TAG))) {
1893 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1894 NVME_SC_FEATURE_NOT_NS_SPECIFIC);
1895 return (1);
1898 compl->cdw0 = 0;
1899 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1901 if (feat->set)
1902 feat->set(sc, feat, command, compl);
1903 else {
1904 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1905 NVME_SC_FEATURE_NOT_CHANGEABLE);
1906 return (1);
1909 DPRINTF("%s: status=%#x cdw11=%#x", __func__, compl->status, command->cdw11);
1910 if (compl->status == NVME_SC_SUCCESS) {
1911 feat->cdw11 = command->cdw11;
1912 if ((fid == NVME_FEAT_ASYNC_EVENT_CONFIGURATION) &&
1913 (command->cdw11 != 0))
1914 pci_nvme_aen_notify(sc);
1917 return (0);
1920 #define NVME_FEATURES_SEL_SUPPORTED 0x3
1921 #define NVME_FEATURES_NS_SPECIFIC (1 << 1)
1923 static int
1924 nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command,
1925 struct nvme_completion* compl)
1927 struct nvme_feature_obj *feat;
1928 uint8_t fid = command->cdw10 & 0xFF;
1929 uint8_t sel = (command->cdw10 >> 8) & 0x7;
1931 DPRINTF("%s: Feature ID 0x%x (%s)", __func__, fid, nvme_fid_to_name(fid));
1933 if (fid >= NVME_FID_MAX) {
1934 DPRINTF("%s invalid feature 0x%x", __func__, fid);
1935 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1936 return (1);
1939 compl->cdw0 = 0;
1940 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1942 feat = &sc->feat[fid];
1943 if (feat->get) {
1944 feat->get(sc, feat, command, compl);
1947 if (compl->status == NVME_SC_SUCCESS) {
1948 if ((sel == NVME_FEATURES_SEL_SUPPORTED) && feat->namespace_specific)
1949 compl->cdw0 = NVME_FEATURES_NS_SPECIFIC;
1950 else
1951 compl->cdw0 = feat->cdw11;
1954 return (0);
1957 static int
1958 nvme_opc_format_nvm(struct pci_nvme_softc* sc, struct nvme_command* command,
1959 struct nvme_completion* compl)
1961 uint8_t ses, lbaf, pi;
1963 /* Only supports Secure Erase Setting - User Data Erase */
1964 ses = (command->cdw10 >> 9) & 0x7;
1965 if (ses > 0x1) {
1966 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1967 return (1);
1970 /* Only supports a single LBA Format */
1971 lbaf = command->cdw10 & 0xf;
1972 if (lbaf != 0) {
1973 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
1974 NVME_SC_INVALID_FORMAT);
1975 return (1);
1979 	/* Doesn't support Protection Information */
1979 pi = (command->cdw10 >> 5) & 0x7;
1980 if (pi != 0) {
1981 pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
1982 return (1);
1985 if (sc->nvstore.type == NVME_STOR_RAM) {
1986 if (sc->nvstore.ctx)
1987 free(sc->nvstore.ctx);
1988 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
1989 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
1990 } else {
1991 struct pci_nvme_ioreq *req;
1992 int err;
1994 req = pci_nvme_get_ioreq(sc);
1995 if (req == NULL) {
1996 pci_nvme_status_genc(&compl->status,
1997 NVME_SC_INTERNAL_DEVICE_ERROR);
1998 WPRINTF("%s: unable to allocate IO req", __func__);
1999 return (1);
2001 req->nvme_sq = &sc->submit_queues[0];
2002 req->sqid = 0;
2003 req->opc = command->opc;
2004 req->cid = command->cid;
2005 req->nsid = command->nsid;
2007 req->io_req.br_offset = 0;
2008 req->io_req.br_resid = sc->nvstore.size;
2009 req->io_req.br_callback = pci_nvme_io_done;
2011 err = blockif_delete(sc->nvstore.ctx, &req->io_req);
2012 if (err) {
2013 pci_nvme_status_genc(&compl->status,
2014 NVME_SC_INTERNAL_DEVICE_ERROR);
2015 pci_nvme_release_ioreq(sc, req);
2016 } else
2017 compl->status = NVME_NO_STATUS;
2020 return (1);
2023 static int
2024 nvme_opc_abort(struct pci_nvme_softc *sc __unused, struct nvme_command *command,
2025 struct nvme_completion *compl)
2027 DPRINTF("%s submission queue %u, command ID 0x%x", __func__,
2028 command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF);
2030 /* TODO: search for the command ID and abort it */
2032 compl->cdw0 = 1;
2033 pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
2034 return (1);
2037 static int
2038 nvme_opc_async_event_req(struct pci_nvme_softc* sc,
2039 struct nvme_command* command, struct nvme_completion* compl)
2041 DPRINTF("%s async event request count=%u aerl=%u cid=%#x", __func__,
2042 sc->aer_count, sc->ctrldata.aerl, command->cid);
2044 /* Don't exceed the Async Event Request Limit (AERL). */
2045 if (pci_nvme_aer_limit_reached(sc)) {
2046 pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
2047 NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED);
2048 return (1);
2051 if (pci_nvme_aer_add(sc, command->cid)) {
2052 pci_nvme_status_tc(&compl->status, NVME_SCT_GENERIC,
2053 NVME_SC_INTERNAL_DEVICE_ERROR);
2054 return (1);
2058 	 * Events are raised as they occur, subject to the Async Event
2059 	 * Configuration set via Set Features. Because they happen
2060 	 * asynchronously, a completion is only posted once a matching event occurs.
2062 compl->status = NVME_NO_STATUS;
2063 pci_nvme_aen_notify(sc);
2065 return (0);
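/*
 * Process Admin commands from the Admin Submission Queue (queue 0).
 * Entries between the saved head and the guest-written tail are handled
 * one at a time; each valid completion is posted to Admin CQ 0 and
 * MSI-X vector 0 is raised if the completion queue advanced.
 */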
2068 static void
2069 pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value)
2071 struct nvme_completion compl;
2072 struct nvme_command *cmd;
2073 struct nvme_submission_queue *sq;
2074 struct nvme_completion_queue *cq;
2075 uint16_t sqhead;
2077 DPRINTF("%s index %u", __func__, (uint32_t)value);
2079 sq = &sc->submit_queues[0];
2080 cq = &sc->compl_queues[0];
2082 pthread_mutex_lock(&sq->mtx);
2084 sqhead = sq->head;
2085 DPRINTF("sqhead %u, tail %u", sqhead, sq->tail);
2087 while (sqhead != atomic_load_acq_short(&sq->tail)) {
2088 cmd = &(sq->qbase)[sqhead];
2089 compl.cdw0 = 0;
2090 compl.status = 0;
2092 switch (cmd->opc) {
2093 case NVME_OPC_DELETE_IO_SQ:
2094 DPRINTF("%s command DELETE_IO_SQ", __func__);
2095 nvme_opc_delete_io_sq(sc, cmd, &compl);
2096 break;
2097 case NVME_OPC_CREATE_IO_SQ:
2098 DPRINTF("%s command CREATE_IO_SQ", __func__);
2099 nvme_opc_create_io_sq(sc, cmd, &compl);
2100 break;
2101 case NVME_OPC_DELETE_IO_CQ:
2102 DPRINTF("%s command DELETE_IO_CQ", __func__);
2103 nvme_opc_delete_io_cq(sc, cmd, &compl);
2104 break;
2105 case NVME_OPC_CREATE_IO_CQ:
2106 DPRINTF("%s command CREATE_IO_CQ", __func__);
2107 nvme_opc_create_io_cq(sc, cmd, &compl);
2108 break;
2109 case NVME_OPC_GET_LOG_PAGE:
2110 DPRINTF("%s command GET_LOG_PAGE", __func__);
2111 nvme_opc_get_log_page(sc, cmd, &compl);
2112 break;
2113 case NVME_OPC_IDENTIFY:
2114 DPRINTF("%s command IDENTIFY", __func__);
2115 nvme_opc_identify(sc, cmd, &compl);
2116 break;
2117 case NVME_OPC_ABORT:
2118 DPRINTF("%s command ABORT", __func__);
2119 nvme_opc_abort(sc, cmd, &compl);
2120 break;
2121 case NVME_OPC_SET_FEATURES:
2122 DPRINTF("%s command SET_FEATURES", __func__);
2123 nvme_opc_set_features(sc, cmd, &compl);
2124 break;
2125 case NVME_OPC_GET_FEATURES:
2126 DPRINTF("%s command GET_FEATURES", __func__);
2127 nvme_opc_get_features(sc, cmd, &compl);
2128 break;
2129 case NVME_OPC_FIRMWARE_ACTIVATE:
2130 DPRINTF("%s command FIRMWARE_ACTIVATE", __func__);
2131 pci_nvme_status_tc(&compl.status,
2132 NVME_SCT_COMMAND_SPECIFIC,
2133 NVME_SC_INVALID_FIRMWARE_SLOT);
2134 break;
2135 case NVME_OPC_ASYNC_EVENT_REQUEST:
2136 DPRINTF("%s command ASYNC_EVENT_REQ", __func__);
2137 nvme_opc_async_event_req(sc, cmd, &compl);
2138 break;
2139 case NVME_OPC_FORMAT_NVM:
2140 DPRINTF("%s command FORMAT_NVM", __func__);
2141 if ((sc->ctrldata.oacs &
2142 (1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT)) == 0) {
2143 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2144 break;
2146 nvme_opc_format_nvm(sc, cmd, &compl);
2147 break;
2148 case NVME_OPC_SECURITY_SEND:
2149 case NVME_OPC_SECURITY_RECEIVE:
2150 case NVME_OPC_SANITIZE:
2151 case NVME_OPC_GET_LBA_STATUS:
2152 DPRINTF("%s command OPC=%#x (unsupported)", __func__,
2153 cmd->opc);
2154 /* Valid but unsupported opcodes */
2155 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_FIELD);
2156 break;
2157 default:
2158 DPRINTF("%s command OPC=%#X (not implemented)",
2159 __func__,
2160 cmd->opc);
2161 pci_nvme_status_genc(&compl.status, NVME_SC_INVALID_OPCODE);
2163 sqhead = (sqhead + 1) % sq->size;
2165 if (NVME_COMPLETION_VALID(compl)) {
2166 pci_nvme_cq_update(sc, &sc->compl_queues[0],
2167 compl.cdw0,
2168 cmd->cid,
2169 0, /* SQID */
2170 compl.status);
2174 DPRINTF("setting sqhead %u", sqhead);
2175 sq->head = sqhead;
2177 if (cq->head != cq->tail)
2178 pci_generate_msix(sc->nsc_pi, 0);
2180 pthread_mutex_unlock(&sq->mtx);
2184 * Update the Write and Read statistics reported in SMART data
2186  * NVMe defines a "data unit" as thousands of 512 byte blocks, rounded up.
2187  * E.g. 1 data unit is 1 - 1,000 512 byte blocks. 3 data units are 2,001 - 3,000
2188  * 512 byte blocks. Rounding up is achieved by initializing the remainder to 999.
2190 static void
2191 pci_nvme_stats_write_read_update(struct pci_nvme_softc *sc, uint8_t opc,
2192 size_t bytes, uint16_t status)
2195 pthread_mutex_lock(&sc->mtx);
2196 switch (opc) {
2197 case NVME_OPC_WRITE:
2198 sc->write_commands++;
2199 if (status != NVME_SC_SUCCESS)
2200 break;
2201 sc->write_dunits_remainder += (bytes / 512);
2202 while (sc->write_dunits_remainder >= 1000) {
2203 sc->write_data_units++;
2204 sc->write_dunits_remainder -= 1000;
2206 break;
2207 case NVME_OPC_READ:
2208 sc->read_commands++;
2209 if (status != NVME_SC_SUCCESS)
2210 break;
2211 sc->read_dunits_remainder += (bytes / 512);
2212 while (sc->read_dunits_remainder >= 1000) {
2213 sc->read_data_units++;
2214 sc->read_dunits_remainder -= 1000;
2216 break;
2217 default:
2218 DPRINTF("%s: Invalid OPC 0x%02x for stats", __func__, opc);
2219 break;
2221 pthread_mutex_unlock(&sc->mtx);
2225 * Check if the combination of Starting LBA (slba) and number of blocks
2226 * exceeds the range of the underlying storage.
2228 * Because NVMe specifies the SLBA in blocks as a uint64_t and blockif stores
2229  * the capacity in bytes as a uint64_t, care must be taken to avoid integer
2230 * overflow.
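 *
 * For example, with 512 byte sectors (sectsz_bits = 9), any SLBA with a
 * bit set in positions 63:55 would overflow the byte offset and is
 * rejected before the shift.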
2232 static bool
2233 pci_nvme_out_of_range(struct pci_nvme_blockstore *nvstore, uint64_t slba,
2234 uint32_t nblocks)
2236 size_t offset, bytes;
2238 /* Overflow check of multiplying Starting LBA by the sector size */
2239 if (slba >> (64 - nvstore->sectsz_bits))
2240 return (true);
2242 offset = slba << nvstore->sectsz_bits;
2243 bytes = nblocks << nvstore->sectsz_bits;
2245 /* Overflow check of Number of Logical Blocks */
2246 if ((nvstore->size <= offset) || ((nvstore->size - offset) < bytes))
2247 return (true);
2249 return (false);
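/*
 * Append a guest-physical range to the request's blockif iov list,
 * merging it into the previous entry when the range is contiguous so the
 * iov count stays within NVME_MAX_IOVEC. Returns -1 if the list is full
 * or the guest address cannot be mapped. For example, two back-to-back
 * 4 KiB PRP entries at (hypothetical) guest addresses 0x10000 and
 * 0x11000 collapse into a single 8 KiB iov entry.
 */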
2252 static int
2253 pci_nvme_append_iov_req(struct pci_nvme_softc *sc __unused,
2254 struct pci_nvme_ioreq *req, uint64_t gpaddr, size_t size, uint64_t offset)
2256 int iovidx;
2257 bool range_is_contiguous;
2259 if (req == NULL)
2260 return (-1);
2262 if (req->io_req.br_iovcnt == NVME_MAX_IOVEC) {
2263 return (-1);
2267 * Minimize the number of IOVs by concatenating contiguous address
2268 * ranges. If the IOV count is zero, there is no previous range to
2269 * concatenate.
2271 if (req->io_req.br_iovcnt == 0)
2272 range_is_contiguous = false;
2273 else
2274 range_is_contiguous = (req->prev_gpaddr + req->prev_size) == gpaddr;
2276 if (range_is_contiguous) {
2277 iovidx = req->io_req.br_iovcnt - 1;
2279 req->io_req.br_iov[iovidx].iov_base =
2280 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2281 req->prev_gpaddr, size);
2282 if (req->io_req.br_iov[iovidx].iov_base == NULL)
2283 return (-1);
2285 req->prev_size += size;
2286 req->io_req.br_resid += size;
2288 req->io_req.br_iov[iovidx].iov_len = req->prev_size;
2289 } else {
2290 iovidx = req->io_req.br_iovcnt;
2291 if (iovidx == 0) {
2292 req->io_req.br_offset = offset;
2293 req->io_req.br_resid = 0;
2294 req->io_req.br_param = req;
2297 req->io_req.br_iov[iovidx].iov_base =
2298 paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
2299 gpaddr, size);
2300 if (req->io_req.br_iov[iovidx].iov_base == NULL)
2301 return (-1);
2303 req->io_req.br_iov[iovidx].iov_len = size;
2305 req->prev_gpaddr = gpaddr;
2306 req->prev_size = size;
2307 req->io_req.br_resid += size;
2309 req->io_req.br_iovcnt++;
2312 return (0);
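/*
 * Post a completion for the given submission queue entry to its paired
 * completion queue and raise that CQ's MSI-X vector if the queue
 * advanced and interrupts are enabled for it.
 */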
2315 static void
2316 pci_nvme_set_completion(struct pci_nvme_softc *sc,
2317 struct nvme_submission_queue *sq, int sqid, uint16_t cid, uint16_t status)
2319 struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
2321 DPRINTF("%s sqid %d cqid %u cid %u status: 0x%x 0x%x",
2322 __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
2323 NVME_STATUS_GET_SC(status));
2325 pci_nvme_cq_update(sc, cq, 0, cid, sqid, status);
2327 if (cq->head != cq->tail) {
2328 if (cq->intr_en & NVME_CQ_INTEN) {
2329 pci_generate_msix(sc->nsc_pi, cq->intr_vec);
2330 } else {
2331 DPRINTF("%s: CQ%u interrupt disabled",
2332 __func__, sq->cqid);
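/*
 * Return an I/O request to the free list. Once the last pending I/O
 * drains while the controller is enabled but not yet ready, CSTS.RDY is
 * set. The semaphore post wakes any thread waiting in
 * pci_nvme_get_ioreq().
 */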
2337 static void
2338 pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
2340 req->sc = NULL;
2341 req->nvme_sq = NULL;
2342 req->sqid = 0;
2344 pthread_mutex_lock(&sc->mtx);
2346 STAILQ_INSERT_TAIL(&sc->ioreqs_free, req, link);
2347 sc->pending_ios--;
2349 	/* When no more I/O is pending, set RDY if the device is enabled but not yet ready */
2350 if (sc->pending_ios == 0 &&
2351 NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
2352 sc->regs.csts |= NVME_CSTS_RDY;
2354 pthread_mutex_unlock(&sc->mtx);
2356 sem_post(&sc->iosemlock);
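/*
 * Allocate an I/O request from the free list. The counting semaphore
 * limits outstanding requests to the configured number of ioslots, so
 * this may block until another request is released.
 */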
2359 static struct pci_nvme_ioreq *
2360 pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
2362 struct pci_nvme_ioreq *req = NULL;
2364 sem_wait(&sc->iosemlock);
2365 pthread_mutex_lock(&sc->mtx);
2367 req = STAILQ_FIRST(&sc->ioreqs_free);
2368 assert(req != NULL);
2369 STAILQ_REMOVE_HEAD(&sc->ioreqs_free, link);
2371 req->sc = sc;
2373 sc->pending_ios++;
2375 pthread_mutex_unlock(&sc->mtx);
2377 req->io_req.br_iovcnt = 0;
2378 req->io_req.br_offset = 0;
2379 req->io_req.br_resid = 0;
2380 req->io_req.br_param = req;
2381 req->prev_gpaddr = 0;
2382 req->prev_size = 0;
2384 return req;
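/*
 * blockif completion callback: translate the blockif result into an NVMe
 * status code, post the completion, update the SMART read/write
 * statistics, and release the request.
 */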
2387 static void
2388 pci_nvme_io_done(struct blockif_req *br, int err)
2390 struct pci_nvme_ioreq *req = br->br_param;
2391 struct nvme_submission_queue *sq = req->nvme_sq;
2392 uint16_t code, status;
2394 DPRINTF("%s error %d %s", __func__, err, strerror(err));
2396 /* TODO return correct error */
2397 code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS;
2398 status = 0;
2399 pci_nvme_status_genc(&status, code);
2401 pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, status);
2402 pci_nvme_stats_write_read_update(req->sc, req->opc,
2403 req->bytes, status);
2404 pci_nvme_release_ioreq(req->sc, req);
2408 * Implements the Flush command. The specification states:
2409 * If a volatile write cache is not present, Flush commands complete
2410 * successfully and have no effect
2411 * in the description of the Volatile Write Cache (VWC) field of the Identify
2412 * Controller data. Therefore, set status to Success if the command is
2413 * not supported (i.e. RAM or as indicated by the blockif).
2415 static bool
2416 nvme_opc_flush(struct pci_nvme_softc *sc __unused,
2417 struct nvme_command *cmd __unused,
2418 struct pci_nvme_blockstore *nvstore,
2419 struct pci_nvme_ioreq *req,
2420 uint16_t *status)
2422 bool pending = false;
2424 if (nvstore->type == NVME_STOR_RAM) {
2425 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2426 } else {
2427 int err;
2429 req->io_req.br_callback = pci_nvme_io_done;
2431 err = blockif_flush(nvstore->ctx, &req->io_req);
2432 switch (err) {
2433 case 0:
2434 pending = true;
2435 break;
2436 case EOPNOTSUPP:
2437 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2438 break;
2439 default:
2440 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2444 return (pending);
2447 static uint16_t
2448 nvme_write_read_ram(struct pci_nvme_softc *sc,
2449 struct pci_nvme_blockstore *nvstore,
2450 uint64_t prp1, uint64_t prp2,
2451 size_t offset, uint64_t bytes,
2452 bool is_write)
2454 uint8_t *buf = nvstore->ctx;
2455 enum nvme_copy_dir dir;
2456 uint16_t status;
2458 if (is_write)
2459 dir = NVME_COPY_TO_PRP;
2460 else
2461 dir = NVME_COPY_FROM_PRP;
2463 status = 0;
2464 if (nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, prp1, prp2,
2465 buf + offset, bytes, dir))
2466 pci_nvme_status_genc(&status,
2467 NVME_SC_DATA_TRANSFER_ERROR);
2468 else
2469 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2471 return (status);
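/*
 * Build the blockif iov list from the command's PRP entries. PRP1 maps
 * the first (possibly page-unaligned) chunk; PRP2 is either the second
 * page directly or, for larger transfers, points to a PRP list whose
 * last entry chains to the next list page.
 */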
2474 static uint16_t
2475 nvme_write_read_blockif(struct pci_nvme_softc *sc,
2476 struct pci_nvme_blockstore *nvstore,
2477 struct pci_nvme_ioreq *req,
2478 uint64_t prp1, uint64_t prp2,
2479 size_t offset, uint64_t bytes,
2480 bool is_write)
2482 uint64_t size;
2483 int err;
2484 uint16_t status = NVME_NO_STATUS;
2486 size = MIN(PAGE_SIZE - (prp1 % PAGE_SIZE), bytes);
2487 if (pci_nvme_append_iov_req(sc, req, prp1, size, offset)) {
2488 err = -1;
2489 goto out;
2492 offset += size;
2493 bytes -= size;
2495 if (bytes == 0) {
2497 } else if (bytes <= PAGE_SIZE) {
2498 size = bytes;
2499 if (pci_nvme_append_iov_req(sc, req, prp2, size, offset)) {
2500 err = -1;
2501 goto out;
2503 } else {
2504 void *vmctx = sc->nsc_pi->pi_vmctx;
2505 uint64_t *prp_list = &prp2;
2506 uint64_t *last = prp_list;
2508 /* PRP2 is pointer to a physical region page list */
2509 while (bytes) {
2510 /* Last entry in list points to the next list */
2511 if ((prp_list == last) && (bytes > PAGE_SIZE)) {
2512 uint64_t prp = *prp_list;
2514 prp_list = paddr_guest2host(vmctx, prp,
2515 PAGE_SIZE - (prp % PAGE_SIZE));
2516 if (prp_list == NULL) {
2517 err = -1;
2518 goto out;
2520 last = prp_list + (NVME_PRP2_ITEMS - 1);
2523 size = MIN(bytes, PAGE_SIZE);
2525 if (pci_nvme_append_iov_req(sc, req, *prp_list, size,
2526 offset)) {
2527 err = -1;
2528 goto out;
2531 offset += size;
2532 bytes -= size;
2534 prp_list++;
2537 req->io_req.br_callback = pci_nvme_io_done;
2538 if (is_write)
2539 err = blockif_write(nvstore->ctx, &req->io_req);
2540 else
2541 err = blockif_read(nvstore->ctx, &req->io_req);
2542 out:
2543 if (err)
2544 pci_nvme_status_genc(&status, NVME_SC_DATA_TRANSFER_ERROR);
2546 return (status);
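/*
 * Handle NVM Read and Write commands: reject transfers that exceed MDTS
 * or fall outside the LBA range, then hand off to the RAM or blockif
 * backend. Returns true when the request will complete asynchronously
 * via the blockif callback.
 */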
2549 static bool
2550 nvme_opc_write_read(struct pci_nvme_softc *sc,
2551 struct nvme_command *cmd,
2552 struct pci_nvme_blockstore *nvstore,
2553 struct pci_nvme_ioreq *req,
2554 uint16_t *status)
2556 uint64_t lba, nblocks, bytes;
2557 size_t offset;
2558 bool is_write = cmd->opc == NVME_OPC_WRITE;
2559 bool pending = false;
2561 lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10;
2562 nblocks = (cmd->cdw12 & 0xFFFF) + 1;
2563 bytes = nblocks << nvstore->sectsz_bits;
2564 if (bytes > NVME_MAX_DATA_SIZE) {
2565 WPRINTF("%s command would exceed MDTS", __func__);
2566 pci_nvme_status_genc(status, NVME_SC_INVALID_FIELD);
2567 goto out;
2570 if (pci_nvme_out_of_range(nvstore, lba, nblocks)) {
2571 WPRINTF("%s command would exceed LBA range(slba=%#lx nblocks=%#lx)",
2572 __func__, lba, nblocks);
2573 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2574 goto out;
2577 offset = lba << nvstore->sectsz_bits;
2579 req->bytes = bytes;
2580 req->io_req.br_offset = lba;
2582 /* PRP bits 1:0 must be zero */
2583 cmd->prp1 &= ~0x3UL;
2584 cmd->prp2 &= ~0x3UL;
2586 if (nvstore->type == NVME_STOR_RAM) {
2587 *status = nvme_write_read_ram(sc, nvstore, cmd->prp1,
2588 cmd->prp2, offset, bytes, is_write);
2589 } else {
2590 *status = nvme_write_read_blockif(sc, nvstore, req,
2591 cmd->prp1, cmd->prp2, offset, bytes, is_write);
2593 if (*status == NVME_NO_STATUS)
2594 pending = true;
2596 out:
2597 if (!pending)
2598 pci_nvme_stats_write_read_update(sc, cmd->opc, bytes, *status);
2600 return (pending);
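/*
 * Deallocate (DSM) completion state machine: each callback invocation
 * issues blockif_delete() for the next saved range (prev_gpaddr tracks
 * the current entry, prev_size the entry count) until every range has
 * been processed or an error occurs.
 */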
2603 static void
2604 pci_nvme_dealloc_sm(struct blockif_req *br, int err)
2606 struct pci_nvme_ioreq *req = br->br_param;
2607 struct pci_nvme_softc *sc = req->sc;
2608 bool done = true;
2609 uint16_t status;
2611 status = 0;
2612 if (err) {
2613 pci_nvme_status_genc(&status, NVME_SC_INTERNAL_DEVICE_ERROR);
2614 } else if ((req->prev_gpaddr + 1) == (req->prev_size)) {
2615 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2616 } else {
2617 struct iovec *iov = req->io_req.br_iov;
2619 req->prev_gpaddr++;
2620 iov += req->prev_gpaddr;
2622 /* The iov_* values already include the sector size */
2623 req->io_req.br_offset = (off_t)iov->iov_base;
2624 req->io_req.br_resid = iov->iov_len;
2625 if (blockif_delete(sc->nvstore.ctx, &req->io_req)) {
2626 pci_nvme_status_genc(&status,
2627 NVME_SC_INTERNAL_DEVICE_ERROR);
2628 } else
2629 done = false;
2632 if (done) {
2633 pci_nvme_set_completion(sc, req->nvme_sq, req->sqid, req->cid,
2634 status);
2635 pci_nvme_release_ioreq(sc, req);
2639 static bool
2640 nvme_opc_dataset_mgmt(struct pci_nvme_softc *sc,
2641 struct nvme_command *cmd,
2642 struct pci_nvme_blockstore *nvstore,
2643 struct pci_nvme_ioreq *req,
2644 uint16_t *status)
2646 struct nvme_dsm_range *range = NULL;
2647 uint32_t nr, r, non_zero, dr;
2648 int err;
2649 bool pending = false;
2651 if ((sc->ctrldata.oncs & NVME_ONCS_DSM) == 0) {
2652 pci_nvme_status_genc(status, NVME_SC_INVALID_OPCODE);
2653 goto out;
2656 nr = cmd->cdw10 & 0xff;
2658 /* copy locally because a range entry could straddle PRPs */
2659 #ifdef __FreeBSD__
2660 range = calloc(1, NVME_MAX_DSM_TRIM);
2661 #else
2662 _Static_assert(NVME_MAX_DSM_TRIM % sizeof(struct nvme_dsm_range) == 0,
2663 "NVME_MAX_DSM_TRIM is not a multiple of struct size");
2664 range = calloc(NVME_MAX_DSM_TRIM / sizeof (*range), sizeof (*range));
2665 #endif
2666 if (range == NULL) {
2667 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2668 goto out;
2670 nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, cmd->prp1, cmd->prp2,
2671 (uint8_t *)range, NVME_MAX_DSM_TRIM, NVME_COPY_FROM_PRP);
2673 /* Check for invalid ranges and the number of non-zero lengths */
2674 non_zero = 0;
2675 for (r = 0; r <= nr; r++) {
2676 if (pci_nvme_out_of_range(nvstore,
2677 range[r].starting_lba, range[r].length)) {
2678 pci_nvme_status_genc(status, NVME_SC_LBA_OUT_OF_RANGE);
2679 goto out;
2681 if (range[r].length != 0)
2682 non_zero++;
2685 if (cmd->cdw11 & NVME_DSM_ATTR_DEALLOCATE) {
2686 size_t offset, bytes;
2687 int sectsz_bits = sc->nvstore.sectsz_bits;
2690 * DSM calls are advisory only, and compliant controllers
2691 * may choose to take no actions (i.e. return Success).
2693 if (!nvstore->deallocate) {
2694 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2695 goto out;
2698 /* If all ranges have a zero length, return Success */
2699 if (non_zero == 0) {
2700 pci_nvme_status_genc(status, NVME_SC_SUCCESS);
2701 goto out;
2704 if (req == NULL) {
2705 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2706 goto out;
2709 offset = range[0].starting_lba << sectsz_bits;
2710 bytes = range[0].length << sectsz_bits;
2713 * If the request is for more than a single range, store
2714 * the ranges in the br_iov. Optimize for the common case
2715 * of a single range.
2717 * Note that NVMe Number of Ranges is a zero based value
2719 req->io_req.br_iovcnt = 0;
2720 req->io_req.br_offset = offset;
2721 req->io_req.br_resid = bytes;
2723 if (nr == 0) {
2724 req->io_req.br_callback = pci_nvme_io_done;
2725 } else {
2726 struct iovec *iov = req->io_req.br_iov;
2728 for (r = 0, dr = 0; r <= nr; r++) {
2729 offset = range[r].starting_lba << sectsz_bits;
2730 bytes = range[r].length << sectsz_bits;
2731 if (bytes == 0)
2732 continue;
2734 if ((nvstore->size - offset) < bytes) {
2735 pci_nvme_status_genc(status,
2736 NVME_SC_LBA_OUT_OF_RANGE);
2737 goto out;
2739 iov[dr].iov_base = (void *)offset;
2740 iov[dr].iov_len = bytes;
2741 dr++;
2743 req->io_req.br_callback = pci_nvme_dealloc_sm;
2746 * Use prev_gpaddr to track the current entry and
2747 * prev_size to track the number of entries
2749 req->prev_gpaddr = 0;
2750 req->prev_size = dr;
2753 err = blockif_delete(nvstore->ctx, &req->io_req);
2754 if (err)
2755 pci_nvme_status_genc(status, NVME_SC_INTERNAL_DEVICE_ERROR);
2756 else
2757 pending = true;
2759 out:
2760 free(range);
2761 return (pending);
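/*
 * Process entries on an I/O Submission Queue. Commands that finish
 * synchronously are completed immediately; commands handed to blockif
 * complete later through pci_nvme_io_done() or pci_nvme_dealloc_sm().
 */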
2764 static void
2765 pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx)
2767 struct nvme_submission_queue *sq;
2768 uint16_t status;
2769 uint16_t sqhead;
2771 /* handle all submissions up to sq->tail index */
2772 sq = &sc->submit_queues[idx];
2774 pthread_mutex_lock(&sq->mtx);
2776 sqhead = sq->head;
2777 DPRINTF("nvme_handle_io qid %u head %u tail %u cmdlist %p",
2778 idx, sqhead, sq->tail, sq->qbase);
2780 while (sqhead != atomic_load_acq_short(&sq->tail)) {
2781 struct nvme_command *cmd;
2782 struct pci_nvme_ioreq *req;
2783 uint32_t nsid;
2784 bool pending;
2786 pending = false;
2787 req = NULL;
2788 status = 0;
2790 cmd = &sq->qbase[sqhead];
2791 sqhead = (sqhead + 1) % sq->size;
2793 nsid = le32toh(cmd->nsid);
2794 if ((nsid == 0) || (nsid > sc->ctrldata.nn)) {
2795 pci_nvme_status_genc(&status,
2796 NVME_SC_INVALID_NAMESPACE_OR_FORMAT);
2797 status |=
2798 NVME_STATUS_DNR_MASK << NVME_STATUS_DNR_SHIFT;
2799 goto complete;
2802 req = pci_nvme_get_ioreq(sc);
2803 if (req == NULL) {
2804 pci_nvme_status_genc(&status,
2805 NVME_SC_INTERNAL_DEVICE_ERROR);
2806 WPRINTF("%s: unable to allocate IO req", __func__);
2807 goto complete;
2809 req->nvme_sq = sq;
2810 req->sqid = idx;
2811 req->opc = cmd->opc;
2812 req->cid = cmd->cid;
2813 req->nsid = cmd->nsid;
2815 switch (cmd->opc) {
2816 case NVME_OPC_FLUSH:
2817 pending = nvme_opc_flush(sc, cmd, &sc->nvstore,
2818 req, &status);
2819 break;
2820 case NVME_OPC_WRITE:
2821 case NVME_OPC_READ:
2822 pending = nvme_opc_write_read(sc, cmd, &sc->nvstore,
2823 req, &status);
2824 break;
2825 case NVME_OPC_WRITE_ZEROES:
2826 /* TODO: write zeroes
2827 WPRINTF("%s write zeroes lba 0x%lx blocks %u",
2828 __func__, lba, cmd->cdw12 & 0xFFFF); */
2829 pci_nvme_status_genc(&status, NVME_SC_SUCCESS);
2830 break;
2831 case NVME_OPC_DATASET_MANAGEMENT:
2832 pending = nvme_opc_dataset_mgmt(sc, cmd, &sc->nvstore,
2833 req, &status);
2834 break;
2835 default:
2836 WPRINTF("%s unhandled io command 0x%x",
2837 __func__, cmd->opc);
2838 pci_nvme_status_genc(&status, NVME_SC_INVALID_OPCODE);
2840 complete:
2841 if (!pending) {
2842 pci_nvme_set_completion(sc, sq, idx, cmd->cid, status);
2843 if (req != NULL)
2844 pci_nvme_release_ioreq(sc, req);
2848 sq->head = sqhead;
2850 pthread_mutex_unlock(&sq->mtx);
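/*
 * Doorbell write handler: after bounds checking the queue index, record
 * the new SQ tail or CQ head value. An Admin SQ (index 0) doorbell kicks
 * the admin command processor; other SQ doorbells kick the I/O command
 * processor.
 */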
2853 static void
2854 pci_nvme_handle_doorbell(struct pci_nvme_softc* sc,
2855 uint64_t idx, int is_sq, uint64_t value)
2857 DPRINTF("nvme doorbell %lu, %s, val 0x%lx",
2858 idx, is_sq ? "SQ" : "CQ", value & 0xFFFF);
2860 if (is_sq) {
2861 if (idx > sc->num_squeues) {
2862 WPRINTF("%s queue index %lu overflow from "
2863 "guest (max %u)",
2864 __func__, idx, sc->num_squeues);
2865 return;
2868 atomic_store_short(&sc->submit_queues[idx].tail,
2869 (uint16_t)value);
2871 if (idx == 0) {
2872 pci_nvme_handle_admin_cmd(sc, value);
2873 } else {
2874 /* submission queue; handle new entries in SQ */
2875 if (idx > sc->num_squeues) {
2876 WPRINTF("%s SQ index %lu overflow from "
2877 "guest (max %u)",
2878 __func__, idx, sc->num_squeues);
2879 return;
2881 pci_nvme_handle_io_cmd(sc, (uint16_t)idx);
2883 } else {
2884 if (idx > sc->num_cqueues) {
2885 WPRINTF("%s queue index %lu overflow from "
2886 "guest (max %u)",
2887 __func__, idx, sc->num_cqueues);
2888 return;
2891 atomic_store_short(&sc->compl_queues[idx].head,
2892 (uint16_t)value);
2896 static void
2897 pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite)
2899 const char *s = iswrite ? "WRITE" : "READ";
2901 switch (offset) {
2902 case NVME_CR_CAP_LOW:
2903 DPRINTF("%s %s NVME_CR_CAP_LOW", func, s);
2904 break;
2905 case NVME_CR_CAP_HI:
2906 DPRINTF("%s %s NVME_CR_CAP_HI", func, s);
2907 break;
2908 case NVME_CR_VS:
2909 DPRINTF("%s %s NVME_CR_VS", func, s);
2910 break;
2911 case NVME_CR_INTMS:
2912 DPRINTF("%s %s NVME_CR_INTMS", func, s);
2913 break;
2914 case NVME_CR_INTMC:
2915 DPRINTF("%s %s NVME_CR_INTMC", func, s);
2916 break;
2917 case NVME_CR_CC:
2918 DPRINTF("%s %s NVME_CR_CC", func, s);
2919 break;
2920 case NVME_CR_CSTS:
2921 DPRINTF("%s %s NVME_CR_CSTS", func, s);
2922 break;
2923 case NVME_CR_NSSR:
2924 DPRINTF("%s %s NVME_CR_NSSR", func, s);
2925 break;
2926 case NVME_CR_AQA:
2927 DPRINTF("%s %s NVME_CR_AQA", func, s);
2928 break;
2929 case NVME_CR_ASQ_LOW:
2930 DPRINTF("%s %s NVME_CR_ASQ_LOW", func, s);
2931 break;
2932 case NVME_CR_ASQ_HI:
2933 DPRINTF("%s %s NVME_CR_ASQ_HI", func, s);
2934 break;
2935 case NVME_CR_ACQ_LOW:
2936 DPRINTF("%s %s NVME_CR_ACQ_LOW", func, s);
2937 break;
2938 case NVME_CR_ACQ_HI:
2939 DPRINTF("%s %s NVME_CR_ACQ_HI", func, s);
2940 break;
2941 default:
2942 DPRINTF("unknown nvme bar-0 offset 0x%lx", offset);
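/*
 * BAR0 write handler. Writes to the doorbell region are decoded into a
 * queue index and SQ/CQ selection (for example, an offset of
 * NVME_DOORBELL_OFFSET + 0x08 is SQ 1's tail and + 0x0c is CQ 1's head).
 * Register writes must be 4 bytes wide and are applied under the softc
 * lock; changes to CC.EN trigger a controller reset or initialization.
 */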
2947 static void
2948 pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc,
2949 uint64_t offset, int size, uint64_t value)
2951 uint32_t ccreg;
2953 if (offset >= NVME_DOORBELL_OFFSET) {
2954 uint64_t belloffset = offset - NVME_DOORBELL_OFFSET;
2955 		uint64_t idx = belloffset / 8; /* SQ tail + CQ head doorbells: 2 * 4 bytes per queue */
2956 int is_sq = (belloffset % 8) < 4;
2958 if ((sc->regs.csts & NVME_CSTS_RDY) == 0) {
2959 WPRINTF("doorbell write prior to RDY (offset=%#lx)\n",
2960 offset);
2961 return;
2964 if (belloffset > ((sc->max_queues+1) * 8 - 4)) {
2965 WPRINTF("guest attempted an overflow write offset "
2966 "0x%lx, val 0x%lx in %s",
2967 offset, value, __func__);
2968 return;
2971 if (is_sq) {
2972 if (sc->submit_queues[idx].qbase == NULL)
2973 return;
2974 } else if (sc->compl_queues[idx].qbase == NULL)
2975 return;
2977 pci_nvme_handle_doorbell(sc, idx, is_sq, value);
2978 return;
2981 DPRINTF("nvme-write offset 0x%lx, size %d, value 0x%lx",
2982 offset, size, value);
2984 if (size != 4) {
2985 WPRINTF("guest wrote invalid size %d (offset 0x%lx, "
2986 "val 0x%lx) to bar0 in %s",
2987 size, offset, value, __func__);
2988 /* TODO: shutdown device */
2989 return;
2992 pci_nvme_bar0_reg_dumps(__func__, offset, 1);
2994 pthread_mutex_lock(&sc->mtx);
2996 switch (offset) {
2997 case NVME_CR_CAP_LOW:
2998 case NVME_CR_CAP_HI:
2999 /* readonly */
3000 break;
3001 case NVME_CR_VS:
3002 /* readonly */
3003 break;
3004 case NVME_CR_INTMS:
3005 /* MSI-X, so ignore */
3006 break;
3007 case NVME_CR_INTMC:
3008 /* MSI-X, so ignore */
3009 break;
3010 case NVME_CR_CC:
3011 ccreg = (uint32_t)value;
3013 DPRINTF("%s NVME_CR_CC en %x css %x shn %x iosqes %u "
3014 "iocqes %u",
3015 __func__,
3016 NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg),
3017 NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg),
3018 NVME_CC_GET_IOCQES(ccreg));
3020 if (NVME_CC_GET_SHN(ccreg)) {
3021 /* perform shutdown - flush out data to backend */
3022 sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK <<
3023 NVME_CSTS_REG_SHST_SHIFT);
3024 sc->regs.csts |= NVME_SHST_COMPLETE <<
3025 NVME_CSTS_REG_SHST_SHIFT;
3027 if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) {
3028 if (NVME_CC_GET_EN(ccreg) == 0)
3029 				/* transition 1->0 causes controller reset */
3030 pci_nvme_reset_locked(sc);
3031 else
3032 pci_nvme_init_controller(ctx, sc);
3035 /* Insert the iocqes, iosqes and en bits from the write */
3036 sc->regs.cc &= ~NVME_CC_WRITE_MASK;
3037 sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK;
3038 if (NVME_CC_GET_EN(ccreg) == 0) {
3039 /* Insert the ams, mps and css bit fields */
3040 sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK;
3041 sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK;
3042 sc->regs.csts &= ~NVME_CSTS_RDY;
3043 } else if ((sc->pending_ios == 0) &&
3044 !(sc->regs.csts & NVME_CSTS_CFS)) {
3045 sc->regs.csts |= NVME_CSTS_RDY;
3047 break;
3048 case NVME_CR_CSTS:
3049 break;
3050 case NVME_CR_NSSR:
3051 /* ignore writes; don't support subsystem reset */
3052 break;
3053 case NVME_CR_AQA:
3054 sc->regs.aqa = (uint32_t)value;
3055 break;
3056 case NVME_CR_ASQ_LOW:
3057 sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) |
3058 (0xFFFFF000 & value);
3059 break;
3060 case NVME_CR_ASQ_HI:
3061 sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) |
3062 (value << 32);
3063 break;
3064 case NVME_CR_ACQ_LOW:
3065 sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) |
3066 (0xFFFFF000 & value);
3067 break;
3068 case NVME_CR_ACQ_HI:
3069 sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) |
3070 (value << 32);
3071 break;
3072 default:
3073 DPRINTF("%s unknown offset 0x%lx, value 0x%lx size %d",
3074 __func__, offset, value, size);
3076 pthread_mutex_unlock(&sc->mtx);
3079 static void
3080 pci_nvme_write(struct vmctx *ctx, struct pci_devinst *pi,
3081 int baridx, uint64_t offset, int size, uint64_t value)
3083 struct pci_nvme_softc* sc = pi->pi_arg;
3085 if (baridx == pci_msix_table_bar(pi) ||
3086 baridx == pci_msix_pba_bar(pi)) {
3087 DPRINTF("nvme-write baridx %d, msix: off 0x%lx, size %d, "
3088 " value 0x%lx", baridx, offset, size, value);
3090 pci_emul_msix_twrite(pi, offset, size, value);
3091 return;
3094 switch (baridx) {
3095 case 0:
3096 pci_nvme_write_bar_0(ctx, sc, offset, size, value);
3097 break;
3099 default:
3100 DPRINTF("%s unknown baridx %d, val 0x%lx",
3101 __func__, baridx, value);
3105 static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc,
3106 uint64_t offset, int size)
3108 uint64_t value;
3110 pci_nvme_bar0_reg_dumps(__func__, offset, 0);
3112 if (offset < NVME_DOORBELL_OFFSET) {
3113 void *p = &(sc->regs);
3114 pthread_mutex_lock(&sc->mtx);
3115 memcpy(&value, (void *)((uintptr_t)p + offset), size);
3116 pthread_mutex_unlock(&sc->mtx);
3117 } else {
3118 value = 0;
3119 WPRINTF("pci_nvme: read invalid offset %ld", offset);
3122 switch (size) {
3123 case 1:
3124 value &= 0xFF;
3125 break;
3126 case 2:
3127 value &= 0xFFFF;
3128 break;
3129 case 4:
3130 value &= 0xFFFFFFFF;
3131 break;
3134 DPRINTF(" nvme-read offset 0x%lx, size %d -> value 0x%x",
3135 offset, size, (uint32_t)value);
3137 return (value);
3142 static uint64_t
3143 pci_nvme_read(struct vmctx *ctx __unused,
3144 struct pci_devinst *pi, int baridx, uint64_t offset, int size)
3146 struct pci_nvme_softc* sc = pi->pi_arg;
3148 if (baridx == pci_msix_table_bar(pi) ||
3149 baridx == pci_msix_pba_bar(pi)) {
3150 DPRINTF("nvme-read bar: %d, msix: regoff 0x%lx, size %d",
3151 baridx, offset, size);
3153 return pci_emul_msix_tread(pi, offset, size);
3156 switch (baridx) {
3157 case 0:
3158 return pci_nvme_read_bar_0(sc, offset, size);
3160 default:
3161 DPRINTF("unknown bar %d, 0x%lx", baridx, offset);
3164 return (0);
3167 static int
3168 pci_nvme_parse_config(struct pci_nvme_softc *sc, nvlist_t *nvl)
3170 char bident[sizeof("XXX:XXX")];
3171 const char *value;
3172 uint32_t sectsz;
3174 sc->max_queues = NVME_QUEUES;
3175 sc->max_qentries = NVME_MAX_QENTRIES;
3176 sc->ioslots = NVME_IOSLOTS;
3177 sc->num_squeues = sc->max_queues;
3178 sc->num_cqueues = sc->max_queues;
3179 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3180 sectsz = 0;
3181 #ifdef __FreeBSD__
3182 snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3183 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3184 #else
3185 snprintf((char *)sc->ctrldata.sn, sizeof(sc->ctrldata.sn),
3186 "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3187 #endif
3189 value = get_config_value_node(nvl, "maxq");
3190 if (value != NULL)
3191 sc->max_queues = atoi(value);
3192 value = get_config_value_node(nvl, "qsz");
3193 if (value != NULL) {
3194 sc->max_qentries = atoi(value);
3195 if (sc->max_qentries <= 0) {
3196 EPRINTLN("nvme: Invalid qsz option %d",
3197 sc->max_qentries);
3198 return (-1);
3201 value = get_config_value_node(nvl, "ioslots");
3202 if (value != NULL) {
3203 sc->ioslots = atoi(value);
3204 if (sc->ioslots <= 0) {
3205 EPRINTLN("Invalid ioslots option %d", sc->ioslots);
3206 return (-1);
3209 value = get_config_value_node(nvl, "sectsz");
3210 if (value != NULL)
3211 sectsz = atoi(value);
3212 value = get_config_value_node(nvl, "ser");
3213 if (value != NULL) {
3215 * This field indicates the Product Serial Number in
3216 * 7-bit ASCII, unused bytes should be space characters.
3217 * Ref: NVMe v1.3c.
3219 cpywithpad((char *)sc->ctrldata.sn,
3220 sizeof(sc->ctrldata.sn), value, ' ');
3222 value = get_config_value_node(nvl, "eui64");
3223 if (value != NULL)
3224 sc->nvstore.eui64 = htobe64(strtoull(value, NULL, 0));
3225 value = get_config_value_node(nvl, "dsm");
3226 if (value != NULL) {
3227 if (strcmp(value, "auto") == 0)
3228 sc->dataset_management = NVME_DATASET_MANAGEMENT_AUTO;
3229 else if (strcmp(value, "enable") == 0)
3230 sc->dataset_management = NVME_DATASET_MANAGEMENT_ENABLE;
3231 else if (strcmp(value, "disable") == 0)
3232 sc->dataset_management = NVME_DATASET_MANAGEMENT_DISABLE;
3235 value = get_config_value_node(nvl, "ram");
3236 if (value != NULL) {
3237 uint64_t sz = strtoull(value, NULL, 10);
3239 sc->nvstore.type = NVME_STOR_RAM;
3240 sc->nvstore.size = sz * 1024 * 1024;
3241 sc->nvstore.ctx = calloc(1, sc->nvstore.size);
3242 sc->nvstore.sectsz = 4096;
3243 sc->nvstore.sectsz_bits = 12;
3244 if (sc->nvstore.ctx == NULL) {
3245 EPRINTLN("nvme: Unable to allocate RAM");
3246 return (-1);
3248 } else {
3249 snprintf(bident, sizeof(bident), "%u:%u",
3250 sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func);
3251 sc->nvstore.ctx = blockif_open(nvl, bident);
3252 if (sc->nvstore.ctx == NULL) {
3253 EPRINTLN("nvme: Could not open backing file: %s",
3254 strerror(errno));
3255 return (-1);
3257 sc->nvstore.type = NVME_STOR_BLOCKIF;
3258 sc->nvstore.size = blockif_size(sc->nvstore.ctx);
3261 if (sectsz == 512 || sectsz == 4096 || sectsz == 8192)
3262 sc->nvstore.sectsz = sectsz;
3263 else if (sc->nvstore.type != NVME_STOR_RAM)
3264 sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx);
3265 for (sc->nvstore.sectsz_bits = 9;
3266 (1U << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz;
3267 sc->nvstore.sectsz_bits++);
3269 if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES)
3270 sc->max_queues = NVME_QUEUES;
3272 return (0);
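/*
 * blockif resize callback: refresh the namespace size in the Identify
 * Namespace data, record NSID 1 in the Changed Namespace List log, and
 * post a Namespace Attribute Changed asynchronous event notice.
 */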
3275 static void
3276 pci_nvme_resized(struct blockif_ctxt *bctxt __unused, void *arg,
3277 size_t new_size)
3279 struct pci_nvme_softc *sc;
3280 struct pci_nvme_blockstore *nvstore;
3281 struct nvme_namespace_data *nd;
3283 sc = arg;
3284 nvstore = &sc->nvstore;
3285 nd = &sc->nsdata;
3287 nvstore->size = new_size;
3288 pci_nvme_init_nsdata_size(nvstore, nd);
3290 /* Add changed NSID to list */
3291 sc->ns_log.ns[0] = 1;
3292 sc->ns_log.ns[1] = 0;
3294 pci_nvme_aen_post(sc, PCI_NVME_AE_TYPE_NOTICE,
3295 PCI_NVME_AEI_NOTICE_NS_ATTR_CHANGED);
3298 static int
3299 pci_nvme_init(struct vmctx *ctx __unused, struct pci_devinst *pi, nvlist_t *nvl)
3301 struct pci_nvme_softc *sc;
3302 uint32_t pci_membar_sz;
3303 int error;
3305 error = 0;
3307 sc = calloc(1, sizeof(struct pci_nvme_softc));
3308 pi->pi_arg = sc;
3309 sc->nsc_pi = pi;
3311 error = pci_nvme_parse_config(sc, nvl);
3312 if (error < 0)
3313 goto done;
3314 else
3315 error = 0;
3317 STAILQ_INIT(&sc->ioreqs_free);
3318 sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq));
3319 for (uint32_t i = 0; i < sc->ioslots; i++) {
3320 STAILQ_INSERT_TAIL(&sc->ioreqs_free, &sc->ioreqs[i], link);
3323 pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A);
3324 pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D);
3325 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
3326 pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM);
3327 pci_set_cfgdata8(pi, PCIR_PROGIF,
3328 PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0);
3331 * Allocate size of NVMe registers + doorbell space for all queues.
3333 * The specification requires a minimum memory I/O window size of 16K.
3334 * The Windows driver will refuse to start a device with a smaller
3335 * window.
3337 pci_membar_sz = sizeof(struct nvme_registers) +
3338 2 * sizeof(uint32_t) * (sc->max_queues + 1);
3339 pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN);
3341 DPRINTF("nvme membar size: %u", pci_membar_sz);
3343 error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz);
3344 if (error) {
3345 WPRINTF("%s pci alloc mem bar failed", __func__);
3346 goto done;
3349 error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR);
3350 if (error) {
3351 WPRINTF("%s pci add msixcap failed", __func__);
3352 goto done;
3355 error = pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_INT_EP);
3356 if (error) {
3357 WPRINTF("%s pci add Express capability failed", __func__);
3358 goto done;
3361 pthread_mutex_init(&sc->mtx, NULL);
3362 sem_init(&sc->iosemlock, 0, sc->ioslots);
3363 blockif_register_resize_callback(sc->nvstore.ctx, pci_nvme_resized, sc);
3365 pci_nvme_init_queues(sc, sc->max_queues, sc->max_queues);
3367 * Controller data depends on Namespace data so initialize Namespace
3368 * data first.
3370 pci_nvme_init_nsdata(sc, &sc->nsdata, 1, &sc->nvstore);
3371 pci_nvme_init_ctrldata(sc);
3372 pci_nvme_init_logpages(sc);
3373 pci_nvme_init_features(sc);
3375 pci_nvme_aer_init(sc);
3376 pci_nvme_aen_init(sc);
3378 pci_nvme_reset(sc);
3380 pci_lintr_request(pi);
3382 done:
3383 return (error);
3386 static int
3387 pci_nvme_legacy_config(nvlist_t *nvl, const char *opts)
3389 char *cp, *ram;
3391 if (opts == NULL)
3392 return (0);
3394 if (strncmp(opts, "ram=", 4) == 0) {
3395 cp = strchr(opts, ',');
3396 if (cp == NULL) {
3397 set_config_value_node(nvl, "ram", opts + 4);
3398 return (0);
3400 ram = strndup(opts + 4, cp - opts - 4);
3401 set_config_value_node(nvl, "ram", ram);
3402 free(ram);
3403 return (pci_parse_legacy_config(nvl, cp + 1));
3404 } else
3405 return (blockif_legacy_config(nvl, opts));
3408 static const struct pci_devemu pci_de_nvme = {
3409 .pe_emu = "nvme",
3410 .pe_init = pci_nvme_init,
3411 .pe_legacy_config = pci_nvme_legacy_config,
3412 .pe_barwrite = pci_nvme_write,
3413 .pe_barread = pci_nvme_read
3415 PCI_EMUL_SET(pci_de_nvme);