Merge remote-tracking branch 'remotes/cohuck-gitlab/tags/s390x-20210316' into staging
[qemu.git] / hw / block / nvme.c
blobd439e44db839160e83f603e0826dcd467d1cb870
1 /*
2 * QEMU NVM Express Controller
4 * Copyright (c) 2012, Intel Corporation
6 * Written by Keith Busch <keith.busch@intel.com>
8 * This code is licensed under the GNU GPL v2 or later.
9 */
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
14 * https://nvmexpress.org/developers/nvme-specification/
17 /**
18 * Usage: add options:
19 * -drive file=<file>,if=none,id=<drive_id>
20 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
21 * -device nvme,serial=<serial>,id=<bus_name>, \
22 * cmb_size_mb=<cmb_size_mb[optional]>, \
23 * [pmrdev=<mem_backend_file_id>,] \
24 * max_ioqpairs=<N[optional]>, \
25 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
26 * mdts=<N[optional]>,zoned.zasl=<N[optional]>, \
27 * subsys=<subsys_id>
28 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
29 * zoned=<true|false[optional]>, \
30 * subsys=<subsys_id>,detached=<true|false[optional]>
32 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
33 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
34 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
35 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
37 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
38 * For example:
39 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
40 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
42 * The PMR will use BAR 4/5 exclusively.
44 * To place controller(s) and namespace(s) in a subsystem, provide an
45 * nvme-subsys device as shown above.
47 * nvme subsystem device parameters
48 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
49 * - `nqn`
50 * This parameter provides the `<nqn_id>` part of the string
51 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
52 * of subsystem controllers. Note that `<nqn_id>` should be unique per
53 * subsystem, but this is not enforced by QEMU. If not specified, it will
54 * default to the value of the `id` parameter (`<subsys_id>`).
56 * nvme device parameters
57 * ~~~~~~~~~~~~~~~~~~~~~~
58 * - `subsys`
59 * Specifying this parameter attaches the controller to the subsystem and
60 * the SUBNQN field in the controller will report the NQN of the subsystem
61 * device. This also enables multi controller capability represented in
62 * Identify Controller data structure in CMIC (Controller Multi-path I/O and
63 * Namespace Sharing Capabilities).
65 * - `aerl`
66 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
67 * of concurrently outstanding Asynchronous Event Request commands supported
68 * by the controller. This is a 0's based value.
70 * - `aer_max_queued`
71 * This is the maximum number of events that the device will enqueue for
72 * completion when there are no outstanding AERs. When the maximum number of
73 * enqueued events are reached, subsequent events will be dropped.
75 * - `mdts`
76 * Indicates the maximum data transfer size for a command that transfers data
77 * between host-accessible memory and the controller. The value is specified
78 * as a power of two (2^n) and is in units of the minimum memory page size
79 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
81 * - `zoned.zasl`
82 * Indicates the maximum data transfer size for the Zone Append command. Like
83 * `mdts`, the value is specified as a power of two (2^n) and is in units of
84 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
85 * defaulting to the value of `mdts`).
87 * nvme namespace device parameters
88 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
89 * - `subsys`
90 * If given, the namespace will be attached to all controllers in the
91 * subsystem. Otherwise, `bus` must be given to attach this namespace to a
92 * specific controller as a non-shared namespace.
94 * - `detached`
95 * This parameter is only valid together with the `subsys` parameter. If left
96 * at the default value (`false/off`), the namespace will be attached to all
97 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
98 * namespace will be available in the subsystem but not attached to any
99 * controllers.
101 * Setting `zoned` to true selects Zoned Command Set at the namespace.
102 * In this case, the following namespace properties are available to configure
103 * zoned operation:
104 * zoned.zone_size=<zone size in bytes, default: 128MiB>
105 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
107 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
108 * The value 0 (default) forces zone capacity to be the same as zone
109 * size. The value of this property may not exceed zone size.
111 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
112 * This value needs to be specified in 64B units. If it is zero,
113 * namespace(s) will not support zone descriptor extensions.
115 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
116 * The default value means there is no limit to the number of
117 * concurrently active zones.
119 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
120 * The default value means there is no limit to the number of
121 * concurrently open zones.
123 * zoned.cross_read=<enable RAZB, default: false>
124 * Setting this property to true enables Read Across Zone Boundaries.
127 #include "qemu/osdep.h"
128 #include "qemu/units.h"
129 #include "qemu/error-report.h"
130 #include "hw/block/block.h"
131 #include "hw/pci/msix.h"
132 #include "hw/pci/pci.h"
133 #include "hw/qdev-properties.h"
134 #include "migration/vmstate.h"
135 #include "sysemu/sysemu.h"
136 #include "qapi/error.h"
137 #include "qapi/visitor.h"
138 #include "sysemu/hostmem.h"
139 #include "sysemu/block-backend.h"
140 #include "exec/memory.h"
141 #include "qemu/log.h"
142 #include "qemu/module.h"
143 #include "qemu/cutils.h"
144 #include "trace.h"
145 #include "nvme.h"
146 #include "nvme-ns.h"
/*
 * Controller-wide constants.
 *
 * NVME_CMB_BIR / NVME_PMR_BIR are the BAR indices used for the CMB and the
 * PMR; the usage notes at the top of this file place the CMB in BAR2 and the
 * PMR in BAR 4/5.
 * NOTE(review): NVME_SPEC_VER presumably encodes specification version 1.4,
 * and the NVME_TEMPERATURE* values are presumably in Kelvin - confirm.
 */
148 #define NVME_MAX_IOQPAIRS 0xffff
149 #define NVME_DB_SIZE 4
150 #define NVME_SPEC_VER 0x00010400
151 #define NVME_CMB_BIR 2
152 #define NVME_PMR_BIR 4
153 #define NVME_TEMPERATURE 0x143
154 #define NVME_TEMPERATURE_WARNING 0x157
155 #define NVME_TEMPERATURE_CRITICAL 0x175
156 #define NVME_NUM_FW_SLOTS 1
/*
 * Report a guest-triggered error: fire the trace event named by 'trace' with
 * the variadic arguments, then log a LOG_GUEST_ERROR message made of the
 * event name, the calling function and the formatted text.
 */
158 #define NVME_GUEST_ERR(trace, fmt, ...) \
159 do { \
160 (trace_##trace)(__VA_ARGS__); \
161 qemu_log_mask(LOG_GUEST_ERROR, #trace \
162 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
163 } while (0)
/*
 * Table indexed by feature identifier; an entry is true for each feature
 * listed here and false (zero-initialized) for every other identifier.
 */
165 static const bool nvme_feature_support[NVME_FID_MAX] = {
166 [NVME_ARBITRATION] = true,
167 [NVME_POWER_MANAGEMENT] = true,
168 [NVME_TEMPERATURE_THRESHOLD] = true,
169 [NVME_ERROR_RECOVERY] = true,
170 [NVME_VOLATILE_WRITE_CACHE] = true,
171 [NVME_NUMBER_OF_QUEUES] = true,
172 [NVME_INTERRUPT_COALESCING] = true,
173 [NVME_INTERRUPT_VECTOR_CONF] = true,
174 [NVME_WRITE_ATOMICITY] = true,
175 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
176 [NVME_TIMESTAMP] = true,
/*
 * Table indexed by feature identifier holding NVME_FEAT_CAP_* flags for the
 * listed features; unlisted identifiers have no flags set.
 * NOTE(review): presumably CHANGE marks a feature as changeable and NS marks
 * it as namespace specific - confirm against the flag definitions.
 */
179 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
180 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
181 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
182 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
183 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
184 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
185 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
/*
 * Per-opcode NVME_CMD_EFF_* flags for admin commands. Each listed opcode has
 * NVME_CMD_EFF_CSUPP set; unlisted opcodes are zero.
 * NOTE(review): presumably this backs the admin part of a commands supported
 * and effects log - the consumer is not visible in this chunk.
 */
188 static const uint32_t nvme_cse_acs[256] = {
189 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
190 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
191 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
192 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
193 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
194 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
195 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
196 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
197 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
198 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
199 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
/* All-zero I/O command table: no I/O opcode carries any flag. */
202 static const uint32_t nvme_cse_iocs_none[256];
/*
 * Per-opcode NVME_CMD_EFF_* flags for the I/O commands listed below. Every
 * listed opcode has CSUPP set; all except READ and COMPARE also have LBCC.
 */
204 static const uint32_t nvme_cse_iocs_nvm[256] = {
205 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
206 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
207 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
208 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
209 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
210 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
211 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
/*
 * Same entries as nvme_cse_iocs_nvm plus the zone commands (ZONE_APPEND,
 * ZONE_MGMT_SEND, ZONE_MGMT_RECV).
 */
214 static const uint32_t nvme_cse_iocs_zoned[256] = {
215 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
216 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
217 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
218 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
219 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
220 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
221 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
222 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
223 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
224 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
227 static void nvme_process_sq(void *opaque);
229 static uint16_t nvme_cid(NvmeRequest *req)
231 if (!req) {
232 return 0xffff;
235 return le16_to_cpu(req->cqe.cid);
238 static uint16_t nvme_sqid(NvmeRequest *req)
240 return le16_to_cpu(req->sq->sqid);
243 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
244 NvmeZoneState state)
246 if (QTAILQ_IN_USE(zone, entry)) {
247 switch (nvme_get_zone_state(zone)) {
248 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
249 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
250 break;
251 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
252 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
253 break;
254 case NVME_ZONE_STATE_CLOSED:
255 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
256 break;
257 case NVME_ZONE_STATE_FULL:
258 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
259 default:
264 nvme_set_zone_state(zone, state);
266 switch (state) {
267 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
268 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
269 break;
270 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
271 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
272 break;
273 case NVME_ZONE_STATE_CLOSED:
274 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
275 break;
276 case NVME_ZONE_STATE_FULL:
277 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
278 case NVME_ZONE_STATE_READ_ONLY:
279 break;
280 default:
281 zone->d.za = 0;
286 * Check if we can open a zone without exceeding open/active limits.
287 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
289 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
291 if (ns->params.max_active_zones != 0 &&
292 ns->nr_active_zones + act > ns->params.max_active_zones) {
293 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
294 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
296 if (ns->params.max_open_zones != 0 &&
297 ns->nr_open_zones + opn > ns->params.max_open_zones) {
298 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
299 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
302 return NVME_SUCCESS;
305 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
307 hwaddr hi, lo;
309 if (!n->cmb.cmse) {
310 return false;
313 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
314 hi = lo + int128_get64(n->cmb.mem.size);
316 return addr >= lo && addr < hi;
319 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
321 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
322 return &n->cmb.buf[addr - base];
325 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
327 hwaddr hi;
329 if (!n->pmr.cmse) {
330 return false;
333 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
335 return addr >= n->pmr.cba && addr < hi;
338 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
340 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
343 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
345 hwaddr hi = addr + size - 1;
346 if (hi < addr) {
347 return 1;
350 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
351 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
352 return 0;
355 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
356 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
357 return 0;
360 return pci_dma_read(&n->parent_obj, addr, buf, size);
363 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
365 return nsid && (nsid == NVME_NSID_BROADCAST || nsid <= n->num_namespaces);
368 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
370 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
373 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
375 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
378 static void nvme_inc_cq_tail(NvmeCQueue *cq)
380 cq->tail++;
381 if (cq->tail >= cq->size) {
382 cq->tail = 0;
383 cq->phase = !cq->phase;
387 static void nvme_inc_sq_head(NvmeSQueue *sq)
389 sq->head = (sq->head + 1) % sq->size;
392 static uint8_t nvme_cq_full(NvmeCQueue *cq)
394 return (cq->tail + 1) % cq->size == cq->head;
397 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
399 return sq->head == sq->tail;
402 static void nvme_irq_check(NvmeCtrl *n)
404 if (msix_enabled(&(n->parent_obj))) {
405 return;
407 if (~n->bar.intms & n->irq_status) {
408 pci_irq_assert(&n->parent_obj);
409 } else {
410 pci_irq_deassert(&n->parent_obj);
414 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
416 if (cq->irq_enabled) {
417 if (msix_enabled(&(n->parent_obj))) {
418 trace_pci_nvme_irq_msix(cq->vector);
419 msix_notify(&(n->parent_obj), cq->vector);
420 } else {
421 trace_pci_nvme_irq_pin();
422 assert(cq->vector < 32);
423 n->irq_status |= 1 << cq->vector;
424 nvme_irq_check(n);
426 } else {
427 trace_pci_nvme_irq_masked();
431 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
433 if (cq->irq_enabled) {
434 if (msix_enabled(&(n->parent_obj))) {
435 return;
436 } else {
437 assert(cq->vector < 32);
438 n->irq_status &= ~(1 << cq->vector);
439 nvme_irq_check(n);
444 static void nvme_req_clear(NvmeRequest *req)
446 req->ns = NULL;
447 req->opaque = NULL;
448 memset(&req->cqe, 0x0, sizeof(req->cqe));
449 req->status = NVME_SUCCESS;
452 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
454 if (dma) {
455 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
456 sg->flags = NVME_SG_DMA;
457 } else {
458 qemu_iovec_init(&sg->iov, 0);
461 sg->flags |= NVME_SG_ALLOC;
464 static inline void nvme_sg_unmap(NvmeSg *sg)
466 if (!(sg->flags & NVME_SG_ALLOC)) {
467 return;
470 if (sg->flags & NVME_SG_DMA) {
471 qemu_sglist_destroy(&sg->qsg);
472 } else {
473 qemu_iovec_destroy(&sg->iov);
476 memset(sg, 0x0, sizeof(*sg));
479 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
480 size_t len)
482 if (!len) {
483 return NVME_SUCCESS;
486 trace_pci_nvme_map_addr_cmb(addr, len);
488 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
489 return NVME_DATA_TRAS_ERROR;
492 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
494 return NVME_SUCCESS;
497 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
498 size_t len)
500 if (!len) {
501 return NVME_SUCCESS;
504 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
505 return NVME_DATA_TRAS_ERROR;
508 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
510 return NVME_SUCCESS;
513 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
515 bool cmb = false, pmr = false;
517 if (!len) {
518 return NVME_SUCCESS;
521 trace_pci_nvme_map_addr(addr, len);
523 if (nvme_addr_is_cmb(n, addr)) {
524 cmb = true;
525 } else if (nvme_addr_is_pmr(n, addr)) {
526 pmr = true;
529 if (cmb || pmr) {
530 if (sg->flags & NVME_SG_DMA) {
531 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
534 if (cmb) {
535 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
536 } else {
537 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
541 if (!(sg->flags & NVME_SG_DMA)) {
542 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
545 qemu_sglist_add(&sg->qsg, addr, len);
547 return NVME_SUCCESS;
550 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
552 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
555 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
556 uint64_t prp2, uint32_t len)
558 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
559 trans_len = MIN(len, trans_len);
560 int num_prps = (len >> n->page_bits) + 1;
561 uint16_t status;
562 int ret;
564 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
566 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
568 status = nvme_map_addr(n, sg, prp1, trans_len);
569 if (status) {
570 goto unmap;
573 len -= trans_len;
574 if (len) {
575 if (len > n->page_size) {
576 uint64_t prp_list[n->max_prp_ents];
577 uint32_t nents, prp_trans;
578 int i = 0;
580 nents = (len + n->page_size - 1) >> n->page_bits;
581 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
582 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
583 if (ret) {
584 trace_pci_nvme_err_addr_read(prp2);
585 status = NVME_DATA_TRAS_ERROR;
586 goto unmap;
588 while (len != 0) {
589 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
591 if (i == n->max_prp_ents - 1 && len > n->page_size) {
592 if (unlikely(prp_ent & (n->page_size - 1))) {
593 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
594 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
595 goto unmap;
598 i = 0;
599 nents = (len + n->page_size - 1) >> n->page_bits;
600 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
601 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
602 prp_trans);
603 if (ret) {
604 trace_pci_nvme_err_addr_read(prp_ent);
605 status = NVME_DATA_TRAS_ERROR;
606 goto unmap;
608 prp_ent = le64_to_cpu(prp_list[i]);
611 if (unlikely(prp_ent & (n->page_size - 1))) {
612 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
613 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
614 goto unmap;
617 trans_len = MIN(len, n->page_size);
618 status = nvme_map_addr(n, sg, prp_ent, trans_len);
619 if (status) {
620 goto unmap;
623 len -= trans_len;
624 i++;
626 } else {
627 if (unlikely(prp2 & (n->page_size - 1))) {
628 trace_pci_nvme_err_invalid_prp2_align(prp2);
629 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
630 goto unmap;
632 status = nvme_map_addr(n, sg, prp2, len);
633 if (status) {
634 goto unmap;
639 return NVME_SUCCESS;
641 unmap:
642 nvme_sg_unmap(sg);
643 return status;
647 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
648 * number of bytes mapped in len.
650 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
651 NvmeSglDescriptor *segment, uint64_t nsgld,
652 size_t *len, NvmeCmd *cmd)
654 dma_addr_t addr, trans_len;
655 uint32_t dlen;
656 uint16_t status;
658 for (int i = 0; i < nsgld; i++) {
659 uint8_t type = NVME_SGL_TYPE(segment[i].type);
661 switch (type) {
662 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
663 if (cmd->opcode == NVME_CMD_WRITE) {
664 continue;
666 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
667 break;
668 case NVME_SGL_DESCR_TYPE_SEGMENT:
669 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
670 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
671 default:
672 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
675 dlen = le32_to_cpu(segment[i].len);
677 if (!dlen) {
678 continue;
681 if (*len == 0) {
683 * All data has been mapped, but the SGL contains additional
684 * segments and/or descriptors. The controller might accept
685 * ignoring the rest of the SGL.
687 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
688 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
689 break;
692 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
693 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
696 trans_len = MIN(*len, dlen);
698 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
699 goto next;
702 addr = le64_to_cpu(segment[i].addr);
704 if (UINT64_MAX - addr < dlen) {
705 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
708 status = nvme_map_addr(n, sg, addr, trans_len);
709 if (status) {
710 return status;
713 next:
714 *len -= trans_len;
717 return NVME_SUCCESS;
720 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
721 size_t len, NvmeCmd *cmd)
724 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
725 * dynamically allocating a potentially huge SGL. The spec allows the SGL
726 * to be larger (as in number of bytes required to describe the SGL
727 * descriptors and segment chain) than the command transfer size, so it is
728 * not bounded by MDTS.
730 const int SEG_CHUNK_SIZE = 256;
732 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
733 uint64_t nsgld;
734 uint32_t seg_len;
735 uint16_t status;
736 hwaddr addr;
737 int ret;
739 sgld = &sgl;
740 addr = le64_to_cpu(sgl.addr);
742 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
744 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
747 * If the entire transfer can be described with a single data block it can
748 * be mapped directly.
750 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
751 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
752 if (status) {
753 goto unmap;
756 goto out;
759 for (;;) {
760 switch (NVME_SGL_TYPE(sgld->type)) {
761 case NVME_SGL_DESCR_TYPE_SEGMENT:
762 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
763 break;
764 default:
765 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
768 seg_len = le32_to_cpu(sgld->len);
770 /* check the length of the (Last) Segment descriptor */
771 if ((!seg_len || seg_len & 0xf) &&
772 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
773 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
776 if (UINT64_MAX - addr < seg_len) {
777 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
780 nsgld = seg_len / sizeof(NvmeSglDescriptor);
782 while (nsgld > SEG_CHUNK_SIZE) {
783 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
784 trace_pci_nvme_err_addr_read(addr);
785 status = NVME_DATA_TRAS_ERROR;
786 goto unmap;
789 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
790 &len, cmd);
791 if (status) {
792 goto unmap;
795 nsgld -= SEG_CHUNK_SIZE;
796 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
799 ret = nvme_addr_read(n, addr, segment, nsgld *
800 sizeof(NvmeSglDescriptor));
801 if (ret) {
802 trace_pci_nvme_err_addr_read(addr);
803 status = NVME_DATA_TRAS_ERROR;
804 goto unmap;
807 last_sgld = &segment[nsgld - 1];
810 * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
811 * then we are done.
813 switch (NVME_SGL_TYPE(last_sgld->type)) {
814 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
815 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
816 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
817 if (status) {
818 goto unmap;
821 goto out;
823 default:
824 break;
828 * If the last descriptor was not a Data Block or Bit Bucket, then the
829 * current segment must not be a Last Segment.
831 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
832 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
833 goto unmap;
836 sgld = last_sgld;
837 addr = le64_to_cpu(sgld->addr);
840 * Do not map the last descriptor; it will be a Segment or Last Segment
841 * descriptor and is handled by the next iteration.
843 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
844 if (status) {
845 goto unmap;
849 out:
850 /* if there is any residual left in len, the SGL was too short */
851 if (len) {
852 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
853 goto unmap;
856 return NVME_SUCCESS;
858 unmap:
859 nvme_sg_unmap(sg);
860 return status;
863 static uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
864 NvmeCmd *cmd)
866 uint64_t prp1, prp2;
868 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
869 case NVME_PSDT_PRP:
870 prp1 = le64_to_cpu(cmd->dptr.prp1);
871 prp2 = le64_to_cpu(cmd->dptr.prp2);
873 return nvme_map_prp(n, sg, prp1, prp2, len);
874 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
875 case NVME_PSDT_SGL_MPTR_SGL:
876 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
877 default:
878 return NVME_INVALID_FIELD;
/*
 * Direction of a buffer transfer performed by nvme_tx(): TO_DEVICE is used
 * by nvme_h2c(), FROM_DEVICE by nvme_c2h().
 */
882 typedef enum NvmeTxDirection {
883 NVME_TX_DIRECTION_TO_DEVICE = 0,
884 NVME_TX_DIRECTION_FROM_DEVICE = 1,
885 } NvmeTxDirection;
887 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
888 NvmeTxDirection dir)
890 assert(sg->flags & NVME_SG_ALLOC);
892 if (sg->flags & NVME_SG_DMA) {
893 uint64_t residual;
895 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
896 residual = dma_buf_write(ptr, len, &sg->qsg);
897 } else {
898 residual = dma_buf_read(ptr, len, &sg->qsg);
901 if (unlikely(residual)) {
902 trace_pci_nvme_err_invalid_dma();
903 return NVME_INVALID_FIELD | NVME_DNR;
905 } else {
906 size_t bytes;
908 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
909 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
910 } else {
911 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
914 if (unlikely(bytes != len)) {
915 trace_pci_nvme_err_invalid_dma();
916 return NVME_INVALID_FIELD | NVME_DNR;
920 return NVME_SUCCESS;
923 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
924 NvmeRequest *req)
926 uint16_t status;
928 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
929 if (status) {
930 return status;
933 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
936 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
937 NvmeRequest *req)
939 uint16_t status;
941 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
942 if (status) {
943 return status;
946 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
949 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
950 BlockCompletionFunc *cb, NvmeRequest *req)
952 assert(req->sg.flags & NVME_SG_ALLOC);
954 if (req->sg.flags & NVME_SG_DMA) {
955 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
956 cb, req);
957 } else {
958 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
962 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
963 BlockCompletionFunc *cb, NvmeRequest *req)
965 assert(req->sg.flags & NVME_SG_ALLOC);
967 if (req->sg.flags & NVME_SG_DMA) {
968 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
969 cb, req);
970 } else {
971 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
975 static void nvme_post_cqes(void *opaque)
977 NvmeCQueue *cq = opaque;
978 NvmeCtrl *n = cq->ctrl;
979 NvmeRequest *req, *next;
980 int ret;
982 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
983 NvmeSQueue *sq;
984 hwaddr addr;
986 if (nvme_cq_full(cq)) {
987 break;
990 sq = req->sq;
991 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
992 req->cqe.sq_id = cpu_to_le16(sq->sqid);
993 req->cqe.sq_head = cpu_to_le16(sq->head);
994 addr = cq->dma_addr + cq->tail * n->cqe_size;
995 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
996 sizeof(req->cqe));
997 if (ret) {
998 trace_pci_nvme_err_addr_write(addr);
999 trace_pci_nvme_err_cfs();
1000 n->bar.csts = NVME_CSTS_FAILED;
1001 break;
1003 QTAILQ_REMOVE(&cq->req_list, req, entry);
1004 nvme_inc_cq_tail(cq);
1005 nvme_sg_unmap(&req->sg);
1006 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1008 if (cq->tail != cq->head) {
1009 nvme_irq_assert(n, cq);
1013 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1015 assert(cq->cqid == req->sq->cqid);
1016 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1017 req->status);
1019 if (req->status) {
1020 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1021 req->status, req->cmd.opcode);
1024 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1025 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1026 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1029 static void nvme_process_aers(void *opaque)
1031 NvmeCtrl *n = opaque;
1032 NvmeAsyncEvent *event, *next;
1034 trace_pci_nvme_process_aers(n->aer_queued);
1036 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1037 NvmeRequest *req;
1038 NvmeAerResult *result;
1040 /* can't post cqe if there is nothing to complete */
1041 if (!n->outstanding_aers) {
1042 trace_pci_nvme_no_outstanding_aers();
1043 break;
1046 /* ignore if masked (cqe posted, but event not cleared) */
1047 if (n->aer_mask & (1 << event->result.event_type)) {
1048 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1049 continue;
1052 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1053 n->aer_queued--;
1055 n->aer_mask |= 1 << event->result.event_type;
1056 n->outstanding_aers--;
1058 req = n->aer_reqs[n->outstanding_aers];
1060 result = (NvmeAerResult *) &req->cqe.result;
1061 result->event_type = event->result.event_type;
1062 result->event_info = event->result.event_info;
1063 result->log_page = event->result.log_page;
1064 g_free(event);
1066 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1067 result->log_page);
1069 nvme_enqueue_req_completion(&n->admin_cq, req);
1073 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1074 uint8_t event_info, uint8_t log_page)
1076 NvmeAsyncEvent *event;
1078 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1080 if (n->aer_queued == n->params.aer_max_queued) {
1081 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1082 return;
1085 event = g_new(NvmeAsyncEvent, 1);
1086 event->result = (NvmeAerResult) {
1087 .event_type = event_type,
1088 .event_info = event_info,
1089 .log_page = log_page,
1092 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1093 n->aer_queued++;
1095 nvme_process_aers(n);
1098 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1100 uint8_t aer_info;
1102 /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
1103 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1104 return;
1107 switch (event) {
1108 case NVME_SMART_SPARE:
1109 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1110 break;
1111 case NVME_SMART_TEMPERATURE:
1112 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1113 break;
1114 case NVME_SMART_RELIABILITY:
1115 case NVME_SMART_MEDIA_READ_ONLY:
1116 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1117 case NVME_SMART_PMR_UNRELIABLE:
1118 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1119 break;
1120 default:
1121 return;
1124 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1127 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1129 n->aer_mask &= ~(1 << event_type);
1130 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1131 nvme_process_aers(n);
1135 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1137 uint8_t mdts = n->params.mdts;
1139 if (mdts && len > n->page_size << mdts) {
1140 trace_pci_nvme_err_mdts(len);
1141 return NVME_INVALID_FIELD | NVME_DNR;
1144 return NVME_SUCCESS;
1147 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1148 uint32_t nlb)
1150 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1152 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1153 return NVME_LBA_RANGE | NVME_DNR;
1156 return NVME_SUCCESS;
1159 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1160 uint32_t nlb)
1162 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1164 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1165 int64_t offset = nvme_l2b(ns, slba);
1166 bool zeroed;
1167 int ret;
1169 Error *local_err = NULL;
1172 * `pnum` holds the number of bytes after offset that shares the same
1173 * allocation status as the byte at offset. If `pnum` is different from
1174 * `bytes`, we should check the allocation status of the next range and
1175 * continue this until all bytes have been checked.
1177 do {
1178 bytes -= pnum;
1180 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1181 if (ret < 0) {
1182 error_setg_errno(&local_err, -ret, "unable to get block status");
1183 error_report_err(local_err);
1185 return NVME_INTERNAL_DEV_ERROR;
1188 zeroed = !!(ret & BDRV_BLOCK_ZERO);
1190 trace_pci_nvme_block_status(offset, bytes, pnum, ret, zeroed);
1192 if (zeroed) {
1193 return NVME_DULB;
1196 offset += pnum;
1197 } while (pnum != bytes);
1199 return NVME_SUCCESS;
1202 static void nvme_aio_err(NvmeRequest *req, int ret)
1204 uint16_t status = NVME_SUCCESS;
1205 Error *local_err = NULL;
1207 switch (req->cmd.opcode) {
1208 case NVME_CMD_READ:
1209 status = NVME_UNRECOVERED_READ;
1210 break;
1211 case NVME_CMD_FLUSH:
1212 case NVME_CMD_WRITE:
1213 case NVME_CMD_WRITE_ZEROES:
1214 case NVME_CMD_ZONE_APPEND:
1215 status = NVME_WRITE_FAULT;
1216 break;
1217 default:
1218 status = NVME_INTERNAL_DEV_ERROR;
1219 break;
1222 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1224 error_setg_errno(&local_err, -ret, "aio failed");
1225 error_report_err(local_err);
1228 * Set the command status code to the first encountered error but allow a
1229 * subsequent Internal Device Error to trump it.
1231 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1232 return;
1235 req->status = status;
1238 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1240 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1241 slba / ns->zone_size;
1244 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1246 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1248 assert(zone_idx < ns->num_zones);
1249 return &ns->zone_array[zone_idx];
1252 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1254 uint64_t zslba = zone->d.zslba;
1256 switch (nvme_get_zone_state(zone)) {
1257 case NVME_ZONE_STATE_EMPTY:
1258 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1259 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1260 case NVME_ZONE_STATE_CLOSED:
1261 return NVME_SUCCESS;
1262 case NVME_ZONE_STATE_FULL:
1263 trace_pci_nvme_err_zone_is_full(zslba);
1264 return NVME_ZONE_FULL;
1265 case NVME_ZONE_STATE_OFFLINE:
1266 trace_pci_nvme_err_zone_is_offline(zslba);
1267 return NVME_ZONE_OFFLINE;
1268 case NVME_ZONE_STATE_READ_ONLY:
1269 trace_pci_nvme_err_zone_is_read_only(zslba);
1270 return NVME_ZONE_READ_ONLY;
1271 default:
1272 assert(false);
1275 return NVME_INTERNAL_DEV_ERROR;
1278 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1279 uint64_t slba, uint32_t nlb)
1281 uint64_t zcap = nvme_zone_wr_boundary(zone);
1282 uint16_t status;
1284 status = nvme_check_zone_state_for_write(zone);
1285 if (status) {
1286 return status;
1289 if (unlikely(slba != zone->w_ptr)) {
1290 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1291 return NVME_ZONE_INVALID_WRITE;
1294 if (unlikely((slba + nlb) > zcap)) {
1295 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1296 return NVME_ZONE_BOUNDARY_ERROR;
1299 return NVME_SUCCESS;
1302 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1304 switch (nvme_get_zone_state(zone)) {
1305 case NVME_ZONE_STATE_EMPTY:
1306 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1307 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1308 case NVME_ZONE_STATE_FULL:
1309 case NVME_ZONE_STATE_CLOSED:
1310 case NVME_ZONE_STATE_READ_ONLY:
1311 return NVME_SUCCESS;
1312 case NVME_ZONE_STATE_OFFLINE:
1313 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1314 return NVME_ZONE_OFFLINE;
1315 default:
1316 assert(false);
1319 return NVME_INTERNAL_DEV_ERROR;
1322 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1323 uint32_t nlb)
1325 NvmeZone *zone = nvme_get_zone_by_slba(ns, slba);
1326 uint64_t bndry = nvme_zone_rd_boundary(ns, zone);
1327 uint64_t end = slba + nlb;
1328 uint16_t status;
1330 status = nvme_check_zone_state_for_read(zone);
1331 if (status) {
1333 } else if (unlikely(end > bndry)) {
1334 if (!ns->params.cross_zone_read) {
1335 status = NVME_ZONE_BOUNDARY_ERROR;
1336 } else {
1338 * Read across zone boundary - check that all subsequent
1339 * zones that are being read have an appropriate state.
1341 do {
1342 zone++;
1343 status = nvme_check_zone_state_for_read(zone);
1344 if (status) {
1345 break;
1347 } while (end > nvme_zone_rd_boundary(ns, zone));
1351 return status;
1354 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1356 switch (nvme_get_zone_state(zone)) {
1357 case NVME_ZONE_STATE_FULL:
1358 return NVME_SUCCESS;
1360 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1361 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1362 nvme_aor_dec_open(ns);
1363 /* fallthrough */
1364 case NVME_ZONE_STATE_CLOSED:
1365 nvme_aor_dec_active(ns);
1366 /* fallthrough */
1367 case NVME_ZONE_STATE_EMPTY:
1368 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1369 return NVME_SUCCESS;
1371 default:
1372 return NVME_ZONE_INVAL_TRANSITION;
1376 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1378 switch (nvme_get_zone_state(zone)) {
1379 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1380 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1381 nvme_aor_dec_open(ns);
1382 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1383 /* fall through */
1384 case NVME_ZONE_STATE_CLOSED:
1385 return NVME_SUCCESS;
1387 default:
1388 return NVME_ZONE_INVAL_TRANSITION;
1392 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1394 NvmeZone *zone;
1396 if (ns->params.max_open_zones &&
1397 ns->nr_open_zones == ns->params.max_open_zones) {
1398 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1399 if (zone) {
1401 * Automatically close this implicitly open zone.
1403 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1404 nvme_zrm_close(ns, zone);
1409 static uint16_t __nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone,
1410 bool implicit)
1412 int act = 0;
1413 uint16_t status;
1415 switch (nvme_get_zone_state(zone)) {
1416 case NVME_ZONE_STATE_EMPTY:
1417 act = 1;
1419 /* fallthrough */
1421 case NVME_ZONE_STATE_CLOSED:
1422 nvme_zrm_auto_transition_zone(ns);
1423 status = nvme_aor_check(ns, act, 1);
1424 if (status) {
1425 return status;
1428 if (act) {
1429 nvme_aor_inc_active(ns);
1432 nvme_aor_inc_open(ns);
1434 if (implicit) {
1435 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1436 return NVME_SUCCESS;
1439 /* fallthrough */
1441 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1442 if (implicit) {
1443 return NVME_SUCCESS;
1446 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1448 /* fallthrough */
1450 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1451 return NVME_SUCCESS;
1453 default:
1454 return NVME_ZONE_INVAL_TRANSITION;
1458 static inline uint16_t nvme_zrm_auto(NvmeNamespace *ns, NvmeZone *zone)
1460 return __nvme_zrm_open(ns, zone, true);
1463 static inline uint16_t nvme_zrm_open(NvmeNamespace *ns, NvmeZone *zone)
1465 return __nvme_zrm_open(ns, zone, false);
1468 static void __nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1469 uint32_t nlb)
1471 zone->d.wp += nlb;
1473 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1474 nvme_zrm_finish(ns, zone);
1478 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1480 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1481 NvmeZone *zone;
1482 uint64_t slba;
1483 uint32_t nlb;
1485 slba = le64_to_cpu(rw->slba);
1486 nlb = le16_to_cpu(rw->nlb) + 1;
1487 zone = nvme_get_zone_by_slba(ns, slba);
1489 __nvme_advance_zone_wp(ns, zone, nlb);
1492 static inline bool nvme_is_write(NvmeRequest *req)
1494 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1496 return rw->opcode == NVME_CMD_WRITE ||
1497 rw->opcode == NVME_CMD_ZONE_APPEND ||
1498 rw->opcode == NVME_CMD_WRITE_ZEROES;
1501 static void nvme_rw_cb(void *opaque, int ret)
1503 NvmeRequest *req = opaque;
1504 NvmeNamespace *ns = req->ns;
1506 BlockBackend *blk = ns->blkconf.blk;
1507 BlockAcctCookie *acct = &req->acct;
1508 BlockAcctStats *stats = blk_get_stats(blk);
1510 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1512 if (ns->params.zoned && nvme_is_write(req)) {
1513 nvme_finalize_zoned_write(ns, req);
1516 if (!ret) {
1517 block_acct_done(stats, acct);
1518 } else {
1519 block_acct_failed(stats, acct);
1520 nvme_aio_err(req, ret);
1523 nvme_enqueue_req_completion(nvme_cq(req), req);
1526 struct nvme_aio_flush_ctx {
1527 NvmeRequest *req;
1528 NvmeNamespace *ns;
1529 BlockAcctCookie acct;
1532 static void nvme_aio_flush_cb(void *opaque, int ret)
1534 struct nvme_aio_flush_ctx *ctx = opaque;
1535 NvmeRequest *req = ctx->req;
1536 uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
1538 BlockBackend *blk = ctx->ns->blkconf.blk;
1539 BlockAcctCookie *acct = &ctx->acct;
1540 BlockAcctStats *stats = blk_get_stats(blk);
1542 trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
1544 if (!ret) {
1545 block_acct_done(stats, acct);
1546 } else {
1547 block_acct_failed(stats, acct);
1548 nvme_aio_err(req, ret);
1551 (*num_flushes)--;
1552 g_free(ctx);
1554 if (*num_flushes) {
1555 return;
1558 nvme_enqueue_req_completion(nvme_cq(req), req);
1561 static void nvme_aio_discard_cb(void *opaque, int ret)
1563 NvmeRequest *req = opaque;
1564 uintptr_t *discards = (uintptr_t *)&req->opaque;
1566 trace_pci_nvme_aio_discard_cb(nvme_cid(req));
1568 if (ret) {
1569 nvme_aio_err(req, ret);
1572 (*discards)--;
1574 if (*discards) {
1575 return;
1578 nvme_enqueue_req_completion(nvme_cq(req), req);
1581 struct nvme_zone_reset_ctx {
1582 NvmeRequest *req;
1583 NvmeZone *zone;
1586 static void nvme_aio_zone_reset_cb(void *opaque, int ret)
1588 struct nvme_zone_reset_ctx *ctx = opaque;
1589 NvmeRequest *req = ctx->req;
1590 NvmeNamespace *ns = req->ns;
1591 NvmeZone *zone = ctx->zone;
1592 uintptr_t *resets = (uintptr_t *)&req->opaque;
1594 g_free(ctx);
1596 trace_pci_nvme_aio_zone_reset_cb(nvme_cid(req), zone->d.zslba);
1598 if (!ret) {
1599 switch (nvme_get_zone_state(zone)) {
1600 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1601 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1602 nvme_aor_dec_open(ns);
1603 /* fall through */
1604 case NVME_ZONE_STATE_CLOSED:
1605 nvme_aor_dec_active(ns);
1606 /* fall through */
1607 case NVME_ZONE_STATE_FULL:
1608 zone->w_ptr = zone->d.zslba;
1609 zone->d.wp = zone->w_ptr;
1610 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1611 /* fall through */
1612 default:
1613 break;
1615 } else {
1616 nvme_aio_err(req, ret);
1619 (*resets)--;
1621 if (*resets) {
1622 return;
1625 nvme_enqueue_req_completion(nvme_cq(req), req);
/* State shared by all stages of a single Copy command (see nvme_copy()). */
struct nvme_copy_ctx {
    int      copies;    /* outstanding source-range reads (1-initialized) */
    uint8_t  *bounce;   /* buffer that collects the data of all ranges */
    uint32_t nlb;       /* total number of logical blocks to copy */
};
1634 struct nvme_copy_in_ctx {
1635 NvmeRequest *req;
1636 QEMUIOVector iov;
1639 static void nvme_copy_cb(void *opaque, int ret)
1641 NvmeRequest *req = opaque;
1642 NvmeNamespace *ns = req->ns;
1643 struct nvme_copy_ctx *ctx = req->opaque;
1645 trace_pci_nvme_copy_cb(nvme_cid(req));
1647 if (ns->params.zoned) {
1648 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
1649 uint64_t sdlba = le64_to_cpu(copy->sdlba);
1650 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
1652 __nvme_advance_zone_wp(ns, zone, ctx->nlb);
1655 if (!ret) {
1656 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
1657 } else {
1658 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
1659 nvme_aio_err(req, ret);
1662 g_free(ctx->bounce);
1663 g_free(ctx);
1665 nvme_enqueue_req_completion(nvme_cq(req), req);
1668 static void nvme_copy_in_complete(NvmeRequest *req)
1670 NvmeNamespace *ns = req->ns;
1671 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
1672 struct nvme_copy_ctx *ctx = req->opaque;
1673 uint64_t sdlba = le64_to_cpu(copy->sdlba);
1674 uint16_t status;
1676 trace_pci_nvme_copy_in_complete(nvme_cid(req));
1678 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
1680 status = nvme_check_bounds(ns, sdlba, ctx->nlb);
1681 if (status) {
1682 trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
1683 goto invalid;
1686 if (ns->params.zoned) {
1687 NvmeZone *zone = nvme_get_zone_by_slba(ns, sdlba);
1689 status = nvme_check_zone_write(ns, zone, sdlba, ctx->nlb);
1690 if (status) {
1691 goto invalid;
1694 status = nvme_zrm_auto(ns, zone);
1695 if (status) {
1696 goto invalid;
1699 zone->w_ptr += ctx->nlb;
1702 qemu_iovec_init(&req->sg.iov, 1);
1703 qemu_iovec_add(&req->sg.iov, ctx->bounce, nvme_l2b(ns, ctx->nlb));
1705 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
1706 BLOCK_ACCT_WRITE);
1708 req->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, sdlba),
1709 &req->sg.iov, 0, nvme_copy_cb, req);
1711 return;
1713 invalid:
1714 req->status = status;
1716 g_free(ctx->bounce);
1717 g_free(ctx);
1719 nvme_enqueue_req_completion(nvme_cq(req), req);
1722 static void nvme_aio_copy_in_cb(void *opaque, int ret)
1724 struct nvme_copy_in_ctx *in_ctx = opaque;
1725 NvmeRequest *req = in_ctx->req;
1726 NvmeNamespace *ns = req->ns;
1727 struct nvme_copy_ctx *ctx = req->opaque;
1729 qemu_iovec_destroy(&in_ctx->iov);
1730 g_free(in_ctx);
1732 trace_pci_nvme_aio_copy_in_cb(nvme_cid(req));
1734 if (ret) {
1735 nvme_aio_err(req, ret);
1738 ctx->copies--;
1740 if (ctx->copies) {
1741 return;
1744 if (req->status) {
1745 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
1747 g_free(ctx->bounce);
1748 g_free(ctx);
1750 nvme_enqueue_req_completion(nvme_cq(req), req);
1752 return;
1755 nvme_copy_in_complete(req);
1758 struct nvme_compare_ctx {
1759 QEMUIOVector iov;
1760 uint8_t *bounce;
1763 static void nvme_compare_cb(void *opaque, int ret)
1765 NvmeRequest *req = opaque;
1766 NvmeNamespace *ns = req->ns;
1767 struct nvme_compare_ctx *ctx = req->opaque;
1768 g_autofree uint8_t *buf = NULL;
1769 uint16_t status;
1771 trace_pci_nvme_compare_cb(nvme_cid(req));
1773 if (!ret) {
1774 block_acct_done(blk_get_stats(ns->blkconf.blk), &req->acct);
1775 } else {
1776 block_acct_failed(blk_get_stats(ns->blkconf.blk), &req->acct);
1777 nvme_aio_err(req, ret);
1778 goto out;
1781 buf = g_malloc(ctx->iov.size);
1783 status = nvme_h2c(nvme_ctrl(req), buf, ctx->iov.size, req);
1784 if (status) {
1785 req->status = status;
1786 goto out;
1789 if (memcmp(buf, ctx->bounce, ctx->iov.size)) {
1790 req->status = NVME_CMP_FAILURE;
1793 out:
1794 qemu_iovec_destroy(&ctx->iov);
1795 g_free(ctx->bounce);
1796 g_free(ctx);
1798 nvme_enqueue_req_completion(nvme_cq(req), req);
1801 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
1803 NvmeNamespace *ns = req->ns;
1804 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
1806 uint32_t attr = le32_to_cpu(dsm->attributes);
1807 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
1809 uint16_t status = NVME_SUCCESS;
1811 trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
1813 if (attr & NVME_DSMGMT_AD) {
1814 int64_t offset;
1815 size_t len;
1816 NvmeDsmRange range[nr];
1817 uintptr_t *discards = (uintptr_t *)&req->opaque;
1819 status = nvme_h2c(n, (uint8_t *)range, sizeof(range), req);
1820 if (status) {
1821 return status;
1825 * AIO callbacks may be called immediately, so initialize discards to 1
1826 * to make sure the the callback does not complete the request before
1827 * all discards have been issued.
1829 *discards = 1;
1831 for (int i = 0; i < nr; i++) {
1832 uint64_t slba = le64_to_cpu(range[i].slba);
1833 uint32_t nlb = le32_to_cpu(range[i].nlb);
1835 if (nvme_check_bounds(ns, slba, nlb)) {
1836 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
1837 ns->id_ns.nsze);
1838 continue;
1841 trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
1842 nlb);
1844 if (nlb > n->dmrsl) {
1845 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
1848 offset = nvme_l2b(ns, slba);
1849 len = nvme_l2b(ns, nlb);
1851 while (len) {
1852 size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
1854 (*discards)++;
1856 blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
1857 nvme_aio_discard_cb, req);
1859 offset += bytes;
1860 len -= bytes;
1864 /* account for the 1-initialization */
1865 (*discards)--;
1867 if (*discards) {
1868 status = NVME_NO_COMPLETE;
1869 } else {
1870 status = req->status;
1874 return status;
1877 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
1879 NvmeNamespace *ns = req->ns;
1880 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
1881 g_autofree NvmeCopySourceRange *range = NULL;
1883 uint16_t nr = copy->nr + 1;
1884 uint8_t format = copy->control[0] & 0xf;
1885 uint32_t nlb = 0;
1887 uint8_t *bounce = NULL, *bouncep = NULL;
1888 struct nvme_copy_ctx *ctx;
1889 uint16_t status;
1890 int i;
1892 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
1894 if (!(n->id_ctrl.ocfs & (1 << format))) {
1895 trace_pci_nvme_err_copy_invalid_format(format);
1896 return NVME_INVALID_FIELD | NVME_DNR;
1899 if (nr > ns->id_ns.msrc + 1) {
1900 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
1903 range = g_new(NvmeCopySourceRange, nr);
1905 status = nvme_h2c(n, (uint8_t *)range, nr * sizeof(NvmeCopySourceRange),
1906 req);
1907 if (status) {
1908 return status;
1911 for (i = 0; i < nr; i++) {
1912 uint64_t slba = le64_to_cpu(range[i].slba);
1913 uint32_t _nlb = le16_to_cpu(range[i].nlb) + 1;
1915 if (_nlb > le16_to_cpu(ns->id_ns.mssrl)) {
1916 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
1919 status = nvme_check_bounds(ns, slba, _nlb);
1920 if (status) {
1921 trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
1922 return status;
1925 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
1926 status = nvme_check_dulbe(ns, slba, _nlb);
1927 if (status) {
1928 return status;
1932 if (ns->params.zoned) {
1933 status = nvme_check_zone_read(ns, slba, _nlb);
1934 if (status) {
1935 return status;
1939 nlb += _nlb;
1942 if (nlb > le32_to_cpu(ns->id_ns.mcl)) {
1943 return NVME_CMD_SIZE_LIMIT | NVME_DNR;
1946 bounce = bouncep = g_malloc(nvme_l2b(ns, nlb));
1948 block_acct_start(blk_get_stats(ns->blkconf.blk), &req->acct, 0,
1949 BLOCK_ACCT_READ);
1951 ctx = g_new(struct nvme_copy_ctx, 1);
1953 ctx->bounce = bounce;
1954 ctx->nlb = nlb;
1955 ctx->copies = 1;
1957 req->opaque = ctx;
1959 for (i = 0; i < nr; i++) {
1960 uint64_t slba = le64_to_cpu(range[i].slba);
1961 uint32_t nlb = le16_to_cpu(range[i].nlb) + 1;
1963 size_t len = nvme_l2b(ns, nlb);
1964 int64_t offset = nvme_l2b(ns, slba);
1966 trace_pci_nvme_copy_source_range(slba, nlb);
1968 struct nvme_copy_in_ctx *in_ctx = g_new(struct nvme_copy_in_ctx, 1);
1969 in_ctx->req = req;
1971 qemu_iovec_init(&in_ctx->iov, 1);
1972 qemu_iovec_add(&in_ctx->iov, bouncep, len);
1974 ctx->copies++;
1976 blk_aio_preadv(ns->blkconf.blk, offset, &in_ctx->iov, 0,
1977 nvme_aio_copy_in_cb, in_ctx);
1979 bouncep += len;
1982 /* account for the 1-initialization */
1983 ctx->copies--;
1985 if (!ctx->copies) {
1986 nvme_copy_in_complete(req);
1989 return NVME_NO_COMPLETE;
1992 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
1994 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1995 NvmeNamespace *ns = req->ns;
1996 BlockBackend *blk = ns->blkconf.blk;
1997 uint64_t slba = le64_to_cpu(rw->slba);
1998 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
1999 size_t len = nvme_l2b(ns, nlb);
2000 int64_t offset = nvme_l2b(ns, slba);
2001 uint8_t *bounce = NULL;
2002 struct nvme_compare_ctx *ctx = NULL;
2003 uint16_t status;
2005 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2007 status = nvme_check_mdts(n, len);
2008 if (status) {
2009 return status;
2012 status = nvme_check_bounds(ns, slba, nlb);
2013 if (status) {
2014 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2015 return status;
2018 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2019 status = nvme_check_dulbe(ns, slba, nlb);
2020 if (status) {
2021 return status;
2025 bounce = g_malloc(len);
2027 ctx = g_new(struct nvme_compare_ctx, 1);
2028 ctx->bounce = bounce;
2030 req->opaque = ctx;
2032 qemu_iovec_init(&ctx->iov, 1);
2033 qemu_iovec_add(&ctx->iov, bounce, len);
2035 block_acct_start(blk_get_stats(blk), &req->acct, len, BLOCK_ACCT_READ);
2036 blk_aio_preadv(blk, offset, &ctx->iov, 0, nvme_compare_cb, req);
2038 return NVME_NO_COMPLETE;
2041 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
2043 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2044 uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
2045 uint16_t status;
2046 struct nvme_aio_flush_ctx *ctx;
2047 NvmeNamespace *ns;
2049 trace_pci_nvme_flush(nvme_cid(req), nsid);
2051 if (nsid != NVME_NSID_BROADCAST) {
2052 req->ns = nvme_ns(n, nsid);
2053 if (unlikely(!req->ns)) {
2054 return NVME_INVALID_FIELD | NVME_DNR;
2057 block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
2058 BLOCK_ACCT_FLUSH);
2059 req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
2060 return NVME_NO_COMPLETE;
2063 /* 1-initialize; see comment in nvme_dsm */
2064 *num_flushes = 1;
2066 for (int i = 1; i <= n->num_namespaces; i++) {
2067 ns = nvme_ns(n, i);
2068 if (!ns) {
2069 continue;
2072 ctx = g_new(struct nvme_aio_flush_ctx, 1);
2073 ctx->req = req;
2074 ctx->ns = ns;
2076 (*num_flushes)++;
2078 block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
2079 BLOCK_ACCT_FLUSH);
2080 blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
2083 /* account for the 1-initialization */
2084 (*num_flushes)--;
2086 if (*num_flushes) {
2087 status = NVME_NO_COMPLETE;
2088 } else {
2089 status = req->status;
2092 return status;
2095 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
2097 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2098 NvmeNamespace *ns = req->ns;
2099 uint64_t slba = le64_to_cpu(rw->slba);
2100 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2101 uint64_t data_size = nvme_l2b(ns, nlb);
2102 uint64_t data_offset;
2103 BlockBackend *blk = ns->blkconf.blk;
2104 uint16_t status;
2106 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, data_size, slba);
2108 status = nvme_check_mdts(n, data_size);
2109 if (status) {
2110 goto invalid;
2113 status = nvme_check_bounds(ns, slba, nlb);
2114 if (status) {
2115 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2116 goto invalid;
2119 if (ns->params.zoned) {
2120 status = nvme_check_zone_read(ns, slba, nlb);
2121 if (status) {
2122 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
2123 goto invalid;
2127 status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd);
2128 if (status) {
2129 goto invalid;
2132 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2133 status = nvme_check_dulbe(ns, slba, nlb);
2134 if (status) {
2135 goto invalid;
2139 data_offset = nvme_l2b(ns, slba);
2141 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
2142 BLOCK_ACCT_READ);
2143 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
2144 return NVME_NO_COMPLETE;
2146 invalid:
2147 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
2148 return status | NVME_DNR;
2151 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
2152 bool wrz)
2154 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2155 NvmeNamespace *ns = req->ns;
2156 uint64_t slba = le64_to_cpu(rw->slba);
2157 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2158 uint64_t data_size = nvme_l2b(ns, nlb);
2159 uint64_t data_offset;
2160 NvmeZone *zone;
2161 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
2162 BlockBackend *blk = ns->blkconf.blk;
2163 uint16_t status;
2165 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
2166 nvme_nsid(ns), nlb, data_size, slba);
2168 if (!wrz) {
2169 status = nvme_check_mdts(n, data_size);
2170 if (status) {
2171 goto invalid;
2175 status = nvme_check_bounds(ns, slba, nlb);
2176 if (status) {
2177 trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
2178 goto invalid;
2181 if (ns->params.zoned) {
2182 zone = nvme_get_zone_by_slba(ns, slba);
2184 if (append) {
2185 if (unlikely(slba != zone->d.zslba)) {
2186 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
2187 status = NVME_INVALID_FIELD;
2188 goto invalid;
2191 if (n->params.zasl && data_size > n->page_size << n->params.zasl) {
2192 trace_pci_nvme_err_zasl(data_size);
2193 return NVME_INVALID_FIELD | NVME_DNR;
2196 slba = zone->w_ptr;
2197 res->slba = cpu_to_le64(slba);
2200 status = nvme_check_zone_write(ns, zone, slba, nlb);
2201 if (status) {
2202 goto invalid;
2205 status = nvme_zrm_auto(ns, zone);
2206 if (status) {
2207 goto invalid;
2210 zone->w_ptr += nlb;
2213 data_offset = nvme_l2b(ns, slba);
2215 if (!wrz) {
2216 status = nvme_map_dptr(n, &req->sg, data_size, &req->cmd);
2217 if (status) {
2218 goto invalid;
2221 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
2222 BLOCK_ACCT_WRITE);
2223 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
2224 } else {
2225 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
2226 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
2227 req);
2229 return NVME_NO_COMPLETE;
2231 invalid:
2232 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
2233 return status | NVME_DNR;
2236 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
2238 return nvme_do_write(n, req, false, false);
2241 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
2243 return nvme_do_write(n, req, false, true);
2246 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
2248 return nvme_do_write(n, req, true, false);
2251 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
2252 uint64_t *slba, uint32_t *zone_idx)
2254 uint32_t dw10 = le32_to_cpu(c->cdw10);
2255 uint32_t dw11 = le32_to_cpu(c->cdw11);
2257 if (!ns->params.zoned) {
2258 trace_pci_nvme_err_invalid_opc(c->opcode);
2259 return NVME_INVALID_OPCODE | NVME_DNR;
2262 *slba = ((uint64_t)dw11) << 32 | dw10;
2263 if (unlikely(*slba >= ns->id_ns.nsze)) {
2264 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
2265 *slba = 0;
2266 return NVME_LBA_RANGE | NVME_DNR;
2269 *zone_idx = nvme_zone_idx(ns, *slba);
2270 assert(*zone_idx < ns->num_zones);
2272 return NVME_SUCCESS;
2275 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
2276 NvmeRequest *);
/*
 * Bit mask selecting which zones a bulk zone operation applies to. A value
 * of zero (NVME_PROC_CURRENT_ZONE) means only the zone addressed by the
 * command is processed.
 */
enum NvmeZoneProcessingMask {
    NVME_PROC_CURRENT_ZONE    = 0,
    NVME_PROC_OPENED_ZONES    = 1 << 0,   /* implicitly/explicitly open */
    NVME_PROC_CLOSED_ZONES    = 1 << 1,
    NVME_PROC_READ_ONLY_ZONES = 1 << 2,
    NVME_PROC_FULL_ZONES      = 1 << 3,
};
2286 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
2287 NvmeZoneState state, NvmeRequest *req)
2289 return nvme_zrm_open(ns, zone);
2292 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
2293 NvmeZoneState state, NvmeRequest *req)
2295 return nvme_zrm_close(ns, zone);
2298 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
2299 NvmeZoneState state, NvmeRequest *req)
2301 return nvme_zrm_finish(ns, zone);
2304 static uint16_t nvme_reset_zone(NvmeNamespace *ns, NvmeZone *zone,
2305 NvmeZoneState state, NvmeRequest *req)
2307 uintptr_t *resets = (uintptr_t *)&req->opaque;
2308 struct nvme_zone_reset_ctx *ctx;
2310 switch (state) {
2311 case NVME_ZONE_STATE_EMPTY:
2312 return NVME_SUCCESS;
2313 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2314 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2315 case NVME_ZONE_STATE_CLOSED:
2316 case NVME_ZONE_STATE_FULL:
2317 break;
2318 default:
2319 return NVME_ZONE_INVAL_TRANSITION;
2323 * The zone reset aio callback needs to know the zone that is being reset
2324 * in order to transition the zone on completion.
2326 ctx = g_new(struct nvme_zone_reset_ctx, 1);
2327 ctx->req = req;
2328 ctx->zone = zone;
2330 (*resets)++;
2332 blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_l2b(ns, zone->d.zslba),
2333 nvme_l2b(ns, ns->zone_size), BDRV_REQ_MAY_UNMAP,
2334 nvme_aio_zone_reset_cb, ctx);
2336 return NVME_NO_COMPLETE;
2339 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
2340 NvmeZoneState state, NvmeRequest *req)
2342 switch (state) {
2343 case NVME_ZONE_STATE_READ_ONLY:
2344 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
2345 /* fall through */
2346 case NVME_ZONE_STATE_OFFLINE:
2347 return NVME_SUCCESS;
2348 default:
2349 return NVME_ZONE_INVAL_TRANSITION;
2353 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
2355 uint16_t status;
2356 uint8_t state = nvme_get_zone_state(zone);
2358 if (state == NVME_ZONE_STATE_EMPTY) {
2359 status = nvme_aor_check(ns, 1, 0);
2360 if (status) {
2361 return status;
2363 nvme_aor_inc_active(ns);
2364 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
2365 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
2366 return NVME_SUCCESS;
2369 return NVME_ZONE_INVAL_TRANSITION;
2372 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
2373 enum NvmeZoneProcessingMask proc_mask,
2374 op_handler_t op_hndlr, NvmeRequest *req)
2376 uint16_t status = NVME_SUCCESS;
2377 NvmeZoneState zs = nvme_get_zone_state(zone);
2378 bool proc_zone;
2380 switch (zs) {
2381 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
2382 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
2383 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
2384 break;
2385 case NVME_ZONE_STATE_CLOSED:
2386 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
2387 break;
2388 case NVME_ZONE_STATE_READ_ONLY:
2389 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
2390 break;
2391 case NVME_ZONE_STATE_FULL:
2392 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
2393 break;
2394 default:
2395 proc_zone = false;
2398 if (proc_zone) {
2399 status = op_hndlr(ns, zone, zs, req);
2402 return status;
2405 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
2406 enum NvmeZoneProcessingMask proc_mask,
2407 op_handler_t op_hndlr, NvmeRequest *req)
2409 NvmeZone *next;
2410 uint16_t status = NVME_SUCCESS;
2411 int i;
2413 if (!proc_mask) {
2414 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
2415 } else {
2416 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
2417 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
2418 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
2419 req);
2420 if (status && status != NVME_NO_COMPLETE) {
2421 goto out;
2425 if (proc_mask & NVME_PROC_OPENED_ZONES) {
2426 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
2427 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
2428 req);
2429 if (status && status != NVME_NO_COMPLETE) {
2430 goto out;
2434 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
2435 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
2436 req);
2437 if (status && status != NVME_NO_COMPLETE) {
2438 goto out;
2442 if (proc_mask & NVME_PROC_FULL_ZONES) {
2443 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
2444 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
2445 req);
2446 if (status && status != NVME_NO_COMPLETE) {
2447 goto out;
2452 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
2453 for (i = 0; i < ns->num_zones; i++, zone++) {
2454 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
2455 req);
2456 if (status && status != NVME_NO_COMPLETE) {
2457 goto out;
2463 out:
2464 return status;
2467 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
2469 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
2470 NvmeNamespace *ns = req->ns;
2471 NvmeZone *zone;
2472 uintptr_t *resets;
2473 uint8_t *zd_ext;
2474 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
2475 uint64_t slba = 0;
2476 uint32_t zone_idx = 0;
2477 uint16_t status;
2478 uint8_t action;
2479 bool all;
2480 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
2482 action = dw13 & 0xff;
2483 all = dw13 & 0x100;
2485 req->status = NVME_SUCCESS;
2487 if (!all) {
2488 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
2489 if (status) {
2490 return status;
2494 zone = &ns->zone_array[zone_idx];
2495 if (slba != zone->d.zslba) {
2496 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
2497 return NVME_INVALID_FIELD | NVME_DNR;
2500 switch (action) {
2502 case NVME_ZONE_ACTION_OPEN:
2503 if (all) {
2504 proc_mask = NVME_PROC_CLOSED_ZONES;
2506 trace_pci_nvme_open_zone(slba, zone_idx, all);
2507 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
2508 break;
2510 case NVME_ZONE_ACTION_CLOSE:
2511 if (all) {
2512 proc_mask = NVME_PROC_OPENED_ZONES;
2514 trace_pci_nvme_close_zone(slba, zone_idx, all);
2515 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
2516 break;
2518 case NVME_ZONE_ACTION_FINISH:
2519 if (all) {
2520 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
2522 trace_pci_nvme_finish_zone(slba, zone_idx, all);
2523 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
2524 break;
2526 case NVME_ZONE_ACTION_RESET:
2527 resets = (uintptr_t *)&req->opaque;
2529 if (all) {
2530 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES |
2531 NVME_PROC_FULL_ZONES;
2533 trace_pci_nvme_reset_zone(slba, zone_idx, all);
2535 *resets = 1;
2537 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_reset_zone, req);
2539 (*resets)--;
2541 return *resets ? NVME_NO_COMPLETE : req->status;
2543 case NVME_ZONE_ACTION_OFFLINE:
2544 if (all) {
2545 proc_mask = NVME_PROC_READ_ONLY_ZONES;
2547 trace_pci_nvme_offline_zone(slba, zone_idx, all);
2548 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
2549 break;
2551 case NVME_ZONE_ACTION_SET_ZD_EXT:
2552 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
2553 if (all || !ns->params.zd_extension_size) {
2554 return NVME_INVALID_FIELD | NVME_DNR;
2556 zd_ext = nvme_get_zd_extension(ns, zone_idx);
2557 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
2558 if (status) {
2559 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
2560 return status;
2563 status = nvme_set_zd_ext(ns, zone);
2564 if (status == NVME_SUCCESS) {
2565 trace_pci_nvme_zd_extension_set(zone_idx);
2566 return status;
2568 break;
2570 default:
2571 trace_pci_nvme_err_invalid_mgmt_action(action);
2572 status = NVME_INVALID_FIELD;
2575 if (status == NVME_ZONE_INVAL_TRANSITION) {
2576 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
2577 zone->d.za);
2579 if (status) {
2580 status |= NVME_DNR;
2583 return status;
2586 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
2588 NvmeZoneState zs = nvme_get_zone_state(zl);
2590 switch (zafs) {
2591 case NVME_ZONE_REPORT_ALL:
2592 return true;
2593 case NVME_ZONE_REPORT_EMPTY:
2594 return zs == NVME_ZONE_STATE_EMPTY;
2595 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
2596 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
2597 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
2598 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
2599 case NVME_ZONE_REPORT_CLOSED:
2600 return zs == NVME_ZONE_STATE_CLOSED;
2601 case NVME_ZONE_REPORT_FULL:
2602 return zs == NVME_ZONE_STATE_FULL;
2603 case NVME_ZONE_REPORT_READ_ONLY:
2604 return zs == NVME_ZONE_STATE_READ_ONLY;
2605 case NVME_ZONE_REPORT_OFFLINE:
2606 return zs == NVME_ZONE_STATE_OFFLINE;
2607 default:
2608 return false;
2612 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
2614 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
2615 NvmeNamespace *ns = req->ns;
2616 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
2617 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
2618 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
2619 uint32_t zone_idx, zra, zrasf, partial;
2620 uint64_t max_zones, nr_zones = 0;
2621 uint16_t status;
2622 uint64_t slba, capacity = nvme_ns_nlbas(ns);
2623 NvmeZoneDescr *z;
2624 NvmeZone *zone;
2625 NvmeZoneReportHeader *header;
2626 void *buf, *buf_p;
2627 size_t zone_entry_sz;
2629 req->status = NVME_SUCCESS;
2631 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
2632 if (status) {
2633 return status;
2636 zra = dw13 & 0xff;
2637 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
2638 return NVME_INVALID_FIELD | NVME_DNR;
2640 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
2641 return NVME_INVALID_FIELD | NVME_DNR;
2644 zrasf = (dw13 >> 8) & 0xff;
2645 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
2646 return NVME_INVALID_FIELD | NVME_DNR;
2649 if (data_size < sizeof(NvmeZoneReportHeader)) {
2650 return NVME_INVALID_FIELD | NVME_DNR;
2653 status = nvme_check_mdts(n, data_size);
2654 if (status) {
2655 return status;
2658 partial = (dw13 >> 16) & 0x01;
2660 zone_entry_sz = sizeof(NvmeZoneDescr);
2661 if (zra == NVME_ZONE_REPORT_EXTENDED) {
2662 zone_entry_sz += ns->params.zd_extension_size;
2665 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
2666 buf = g_malloc0(data_size);
2668 zone = &ns->zone_array[zone_idx];
2669 for (; slba < capacity; slba += ns->zone_size) {
2670 if (partial && nr_zones >= max_zones) {
2671 break;
2673 if (nvme_zone_matches_filter(zrasf, zone++)) {
2674 nr_zones++;
2677 header = (NvmeZoneReportHeader *)buf;
2678 header->nr_zones = cpu_to_le64(nr_zones);
2680 buf_p = buf + sizeof(NvmeZoneReportHeader);
2681 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
2682 zone = &ns->zone_array[zone_idx];
2683 if (nvme_zone_matches_filter(zrasf, zone)) {
2684 z = (NvmeZoneDescr *)buf_p;
2685 buf_p += sizeof(NvmeZoneDescr);
2687 z->zt = zone->d.zt;
2688 z->zs = zone->d.zs;
2689 z->zcap = cpu_to_le64(zone->d.zcap);
2690 z->zslba = cpu_to_le64(zone->d.zslba);
2691 z->za = zone->d.za;
2693 if (nvme_wp_is_valid(zone)) {
2694 z->wp = cpu_to_le64(zone->d.wp);
2695 } else {
2696 z->wp = cpu_to_le64(~0ULL);
2699 if (zra == NVME_ZONE_REPORT_EXTENDED) {
2700 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
2701 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
2702 ns->params.zd_extension_size);
2704 buf_p += ns->params.zd_extension_size;
2707 max_zones--;
2711 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
2713 g_free(buf);
2715 return status;
2718 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
2720 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2722 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
2723 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
2725 if (!nvme_nsid_valid(n, nsid)) {
2726 return NVME_INVALID_NSID | NVME_DNR;
2730 * In the base NVM command set, Flush may apply to all namespaces
2731 * (indicated by NSID being set to 0xFFFFFFFF). But if that feature is used
2732 * along with TP 4056 (Namespace Types), it may be pretty screwed up.
2734 * If NSID is indeed set to 0xFFFFFFFF, we simply cannot associate the
2735 * opcode with a specific command since we cannot determine a unique I/O
2736 * command set. Opcode 0x0 could have any other meaning than something
2737 * equivalent to flushing and say it DOES have completely different
2738 * semantics in some other command set - does an NSID of 0xFFFFFFFF then
2739 * mean "for all namespaces, apply whatever command set specific command
2740 * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
2741 * whatever command that uses the 0x0 opcode if, and only if, it allows
2742 * NSID to be 0xFFFFFFFF"?
2744 * Anyway (and luckily), for now, we do not care about this since the
2745 * device only supports namespace types that includes the NVM Flush command
2746 * (NVM and Zoned), so always do an NVM Flush.
2748 if (req->cmd.opcode == NVME_CMD_FLUSH) {
2749 return nvme_flush(n, req);
2752 req->ns = nvme_ns(n, nsid);
2753 if (unlikely(!req->ns)) {
2754 return NVME_INVALID_FIELD | NVME_DNR;
2757 if (!(req->ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
2758 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
2759 return NVME_INVALID_OPCODE | NVME_DNR;
2762 switch (req->cmd.opcode) {
2763 case NVME_CMD_WRITE_ZEROES:
2764 return nvme_write_zeroes(n, req);
2765 case NVME_CMD_ZONE_APPEND:
2766 return nvme_zone_append(n, req);
2767 case NVME_CMD_WRITE:
2768 return nvme_write(n, req);
2769 case NVME_CMD_READ:
2770 return nvme_read(n, req);
2771 case NVME_CMD_COMPARE:
2772 return nvme_compare(n, req);
2773 case NVME_CMD_DSM:
2774 return nvme_dsm(n, req);
2775 case NVME_CMD_COPY:
2776 return nvme_copy(n, req);
2777 case NVME_CMD_ZONE_MGMT_SEND:
2778 return nvme_zone_mgmt_send(n, req);
2779 case NVME_CMD_ZONE_MGMT_RECV:
2780 return nvme_zone_mgmt_recv(n, req);
2781 default:
2782 assert(false);
2785 return NVME_INVALID_OPCODE | NVME_DNR;
2788 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
2790 n->sq[sq->sqid] = NULL;
2791 timer_free(sq->timer);
2792 g_free(sq->io_req);
2793 if (sq->sqid) {
2794 g_free(sq);
2798 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
2800 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
2801 NvmeRequest *r, *next;
2802 NvmeSQueue *sq;
2803 NvmeCQueue *cq;
2804 uint16_t qid = le16_to_cpu(c->qid);
2806 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
2807 trace_pci_nvme_err_invalid_del_sq(qid);
2808 return NVME_INVALID_QID | NVME_DNR;
2811 trace_pci_nvme_del_sq(qid);
2813 sq = n->sq[qid];
2814 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
2815 r = QTAILQ_FIRST(&sq->out_req_list);
2816 assert(r->aiocb);
2817 blk_aio_cancel(r->aiocb);
2819 if (!nvme_check_cqid(n, sq->cqid)) {
2820 cq = n->cq[sq->cqid];
2821 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
2823 nvme_post_cqes(cq);
2824 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
2825 if (r->sq == sq) {
2826 QTAILQ_REMOVE(&cq->req_list, r, entry);
2827 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
2832 nvme_free_sq(sq, n);
2833 return NVME_SUCCESS;
2836 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
2837 uint16_t sqid, uint16_t cqid, uint16_t size)
2839 int i;
2840 NvmeCQueue *cq;
2842 sq->ctrl = n;
2843 sq->dma_addr = dma_addr;
2844 sq->sqid = sqid;
2845 sq->size = size;
2846 sq->cqid = cqid;
2847 sq->head = sq->tail = 0;
2848 sq->io_req = g_new0(NvmeRequest, sq->size);
2850 QTAILQ_INIT(&sq->req_list);
2851 QTAILQ_INIT(&sq->out_req_list);
2852 for (i = 0; i < sq->size; i++) {
2853 sq->io_req[i].sq = sq;
2854 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
2856 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
2858 assert(n->cq[cqid]);
2859 cq = n->cq[cqid];
2860 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
2861 n->sq[sqid] = sq;
2864 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
2866 NvmeSQueue *sq;
2867 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
2869 uint16_t cqid = le16_to_cpu(c->cqid);
2870 uint16_t sqid = le16_to_cpu(c->sqid);
2871 uint16_t qsize = le16_to_cpu(c->qsize);
2872 uint16_t qflags = le16_to_cpu(c->sq_flags);
2873 uint64_t prp1 = le64_to_cpu(c->prp1);
2875 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
2877 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
2878 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
2879 return NVME_INVALID_CQID | NVME_DNR;
2881 if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
2882 n->sq[sqid] != NULL)) {
2883 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
2884 return NVME_INVALID_QID | NVME_DNR;
2886 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
2887 trace_pci_nvme_err_invalid_create_sq_size(qsize);
2888 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
2890 if (unlikely(prp1 & (n->page_size - 1))) {
2891 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
2892 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
2894 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
2895 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
2896 return NVME_INVALID_FIELD | NVME_DNR;
2898 sq = g_malloc0(sizeof(*sq));
2899 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
2900 return NVME_SUCCESS;
/*
 * Accumulated block-layer accounting used to fill in the SMART log page.
 * The unit counters are kept in units of BDRV_SECTOR_BITS-sized sectors
 * (see nvme_set_blk_stats()).
 */
struct nvme_stats {
    /* data volume */
    uint64_t units_read;
    uint64_t units_written;

    /* number of operations */
    uint64_t read_commands;
    uint64_t write_commands;
};
2910 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
2912 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
2914 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
2915 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
2916 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
2917 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
2920 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
2921 uint64_t off, NvmeRequest *req)
2923 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
2924 struct nvme_stats stats = { 0 };
2925 NvmeSmartLog smart = { 0 };
2926 uint32_t trans_len;
2927 NvmeNamespace *ns;
2928 time_t current_ms;
2930 if (off >= sizeof(smart)) {
2931 return NVME_INVALID_FIELD | NVME_DNR;
2934 if (nsid != 0xffffffff) {
2935 ns = nvme_ns(n, nsid);
2936 if (!ns) {
2937 return NVME_INVALID_NSID | NVME_DNR;
2939 nvme_set_blk_stats(ns, &stats);
2940 } else {
2941 int i;
2943 for (i = 1; i <= n->num_namespaces; i++) {
2944 ns = nvme_ns(n, i);
2945 if (!ns) {
2946 continue;
2948 nvme_set_blk_stats(ns, &stats);
2952 trans_len = MIN(sizeof(smart) - off, buf_len);
2953 smart.critical_warning = n->smart_critical_warning;
2955 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
2956 1000));
2957 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
2958 1000));
2959 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
2960 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
2962 smart.temperature = cpu_to_le16(n->temperature);
2964 if ((n->temperature >= n->features.temp_thresh_hi) ||
2965 (n->temperature <= n->features.temp_thresh_low)) {
2966 smart.critical_warning |= NVME_SMART_TEMPERATURE;
2969 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
2970 smart.power_on_hours[0] =
2971 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
2973 if (!rae) {
2974 nvme_clear_events(n, NVME_AER_TYPE_SMART);
2977 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
2980 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
2981 NvmeRequest *req)
2983 uint32_t trans_len;
2984 NvmeFwSlotInfoLog fw_log = {
2985 .afi = 0x1,
2988 if (off >= sizeof(fw_log)) {
2989 return NVME_INVALID_FIELD | NVME_DNR;
2992 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
2993 trans_len = MIN(sizeof(fw_log) - off, buf_len);
2995 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
2998 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
2999 uint64_t off, NvmeRequest *req)
3001 uint32_t trans_len;
3002 NvmeErrorLog errlog;
3004 if (off >= sizeof(errlog)) {
3005 return NVME_INVALID_FIELD | NVME_DNR;
3008 if (!rae) {
3009 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
3012 memset(&errlog, 0x0, sizeof(errlog));
3013 trans_len = MIN(sizeof(errlog) - off, buf_len);
3015 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
3018 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
3019 uint64_t off, NvmeRequest *req)
3021 uint32_t nslist[1024];
3022 uint32_t trans_len;
3023 int i = 0;
3024 uint32_t nsid;
3026 memset(nslist, 0x0, sizeof(nslist));
3027 trans_len = MIN(sizeof(nslist) - off, buf_len);
3029 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
3030 NVME_CHANGED_NSID_SIZE) {
3032 * If more than 1024 namespaces, the first entry in the log page should
3033 * be set to 0xffffffff and the others to 0 as spec.
3035 if (i == ARRAY_SIZE(nslist)) {
3036 memset(nslist, 0x0, sizeof(nslist));
3037 nslist[0] = 0xffffffff;
3038 break;
3041 nslist[i++] = nsid;
3042 clear_bit(nsid, n->changed_nsids);
3046 * Remove all the remaining list entries in case returns directly due to
3047 * more than 1024 namespaces.
3049 if (nslist[0] == 0xffffffff) {
3050 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
3053 if (!rae) {
3054 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
3057 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
3060 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
3061 uint64_t off, NvmeRequest *req)
3063 NvmeEffectsLog log = {};
3064 const uint32_t *src_iocs = NULL;
3065 uint32_t trans_len;
3067 if (off >= sizeof(log)) {
3068 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
3069 return NVME_INVALID_FIELD | NVME_DNR;
3072 switch (NVME_CC_CSS(n->bar.cc)) {
3073 case NVME_CC_CSS_NVM:
3074 src_iocs = nvme_cse_iocs_nvm;
3075 /* fall through */
3076 case NVME_CC_CSS_ADMIN_ONLY:
3077 break;
3078 case NVME_CC_CSS_CSI:
3079 switch (csi) {
3080 case NVME_CSI_NVM:
3081 src_iocs = nvme_cse_iocs_nvm;
3082 break;
3083 case NVME_CSI_ZONED:
3084 src_iocs = nvme_cse_iocs_zoned;
3085 break;
3089 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
3091 if (src_iocs) {
3092 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
3095 trans_len = MIN(sizeof(log) - off, buf_len);
3097 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
3100 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
3102 NvmeCmd *cmd = &req->cmd;
3104 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
3105 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
3106 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
3107 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3108 uint8_t lid = dw10 & 0xff;
3109 uint8_t lsp = (dw10 >> 8) & 0xf;
3110 uint8_t rae = (dw10 >> 15) & 0x1;
3111 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
3112 uint32_t numdl, numdu;
3113 uint64_t off, lpol, lpou;
3114 size_t len;
3115 uint16_t status;
3117 numdl = (dw10 >> 16);
3118 numdu = (dw11 & 0xffff);
3119 lpol = dw12;
3120 lpou = dw13;
3122 len = (((numdu << 16) | numdl) + 1) << 2;
3123 off = (lpou << 32ULL) | lpol;
3125 if (off & 0x3) {
3126 return NVME_INVALID_FIELD | NVME_DNR;
3129 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
3131 status = nvme_check_mdts(n, len);
3132 if (status) {
3133 return status;
3136 switch (lid) {
3137 case NVME_LOG_ERROR_INFO:
3138 return nvme_error_info(n, rae, len, off, req);
3139 case NVME_LOG_SMART_INFO:
3140 return nvme_smart_info(n, rae, len, off, req);
3141 case NVME_LOG_FW_SLOT_INFO:
3142 return nvme_fw_log_info(n, len, off, req);
3143 case NVME_LOG_CHANGED_NSLIST:
3144 return nvme_changed_nslist(n, rae, len, off, req);
3145 case NVME_LOG_CMD_EFFECTS:
3146 return nvme_cmd_effects(n, csi, len, off, req);
3147 default:
3148 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
3149 return NVME_INVALID_FIELD | NVME_DNR;
3153 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
3155 n->cq[cq->cqid] = NULL;
3156 timer_free(cq->timer);
3157 if (msix_enabled(&n->parent_obj)) {
3158 msix_vector_unuse(&n->parent_obj, cq->vector);
3160 if (cq->cqid) {
3161 g_free(cq);
3165 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
3167 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3168 NvmeCQueue *cq;
3169 uint16_t qid = le16_to_cpu(c->qid);
3171 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
3172 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
3173 return NVME_INVALID_CQID | NVME_DNR;
3176 cq = n->cq[qid];
3177 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
3178 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
3179 return NVME_INVALID_QUEUE_DEL;
3181 nvme_irq_deassert(n, cq);
3182 trace_pci_nvme_del_cq(qid);
3183 nvme_free_cq(cq, n);
3184 return NVME_SUCCESS;
3187 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
3188 uint16_t cqid, uint16_t vector, uint16_t size,
3189 uint16_t irq_enabled)
3191 int ret;
3193 if (msix_enabled(&n->parent_obj)) {
3194 ret = msix_vector_use(&n->parent_obj, vector);
3195 assert(ret == 0);
3197 cq->ctrl = n;
3198 cq->cqid = cqid;
3199 cq->size = size;
3200 cq->dma_addr = dma_addr;
3201 cq->phase = 1;
3202 cq->irq_enabled = irq_enabled;
3203 cq->vector = vector;
3204 cq->head = cq->tail = 0;
3205 QTAILQ_INIT(&cq->req_list);
3206 QTAILQ_INIT(&cq->sq_list);
3207 n->cq[cqid] = cq;
3208 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
3211 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
3213 NvmeCQueue *cq;
3214 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
3215 uint16_t cqid = le16_to_cpu(c->cqid);
3216 uint16_t vector = le16_to_cpu(c->irq_vector);
3217 uint16_t qsize = le16_to_cpu(c->qsize);
3218 uint16_t qflags = le16_to_cpu(c->cq_flags);
3219 uint64_t prp1 = le64_to_cpu(c->prp1);
3221 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
3222 NVME_CQ_FLAGS_IEN(qflags) != 0);
3224 if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
3225 n->cq[cqid] != NULL)) {
3226 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
3227 return NVME_INVALID_QID | NVME_DNR;
3229 if (unlikely(!qsize || qsize > NVME_CAP_MQES(n->bar.cap))) {
3230 trace_pci_nvme_err_invalid_create_cq_size(qsize);
3231 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
3233 if (unlikely(prp1 & (n->page_size - 1))) {
3234 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
3235 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
3237 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
3238 trace_pci_nvme_err_invalid_create_cq_vector(vector);
3239 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
3241 if (unlikely(vector >= n->params.msix_qsize)) {
3242 trace_pci_nvme_err_invalid_create_cq_vector(vector);
3243 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
3245 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
3246 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
3247 return NVME_INVALID_FIELD | NVME_DNR;
3250 cq = g_malloc0(sizeof(*cq));
3251 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
3252 NVME_CQ_FLAGS_IEN(qflags));
3255 * It is only required to set qs_created when creating a completion queue;
3256 * creating a submission queue without a matching completion queue will
3257 * fail.
3259 n->qs_created = true;
3260 return NVME_SUCCESS;
3263 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
3265 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
3267 return nvme_c2h(n, id, sizeof(id), req);
3270 static inline bool nvme_csi_has_nvm_support(NvmeNamespace *ns)
3272 switch (ns->csi) {
3273 case NVME_CSI_NVM:
3274 case NVME_CSI_ZONED:
3275 return true;
3277 return false;
3280 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
3282 trace_pci_nvme_identify_ctrl();
3284 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
3287 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
3289 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3290 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
3292 trace_pci_nvme_identify_ctrl_csi(c->csi);
3294 switch (c->csi) {
3295 case NVME_CSI_NVM:
3296 ((NvmeIdCtrlNvm *)&id)->dmrsl = cpu_to_le32(n->dmrsl);
3297 break;
3299 case NVME_CSI_ZONED:
3300 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
3301 break;
3303 default:
3304 return NVME_INVALID_FIELD | NVME_DNR;
3307 return nvme_c2h(n, id, sizeof(id), req);
3310 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
3312 NvmeNamespace *ns;
3313 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3314 uint32_t nsid = le32_to_cpu(c->nsid);
3316 trace_pci_nvme_identify_ns(nsid);
3318 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
3319 return NVME_INVALID_NSID | NVME_DNR;
3322 ns = nvme_ns(n, nsid);
3323 if (unlikely(!ns)) {
3324 if (!active) {
3325 ns = nvme_subsys_ns(n->subsys, nsid);
3326 if (!ns) {
3327 return nvme_rpt_empty_id_struct(n, req);
3329 } else {
3330 return nvme_rpt_empty_id_struct(n, req);
3334 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
3335 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
3338 return NVME_INVALID_CMD_SET | NVME_DNR;
3341 static uint16_t nvme_identify_ns_attached_list(NvmeCtrl *n, NvmeRequest *req)
3343 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3344 uint16_t min_id = le16_to_cpu(c->ctrlid);
3345 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
3346 uint16_t *ids = &list[1];
3347 NvmeNamespace *ns;
3348 NvmeCtrl *ctrl;
3349 int cntlid, nr_ids = 0;
3351 trace_pci_nvme_identify_ns_attached_list(min_id);
3353 if (c->nsid == NVME_NSID_BROADCAST) {
3354 return NVME_INVALID_FIELD | NVME_DNR;
3357 ns = nvme_subsys_ns(n->subsys, c->nsid);
3358 if (!ns) {
3359 return NVME_INVALID_FIELD | NVME_DNR;
3362 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
3363 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
3364 if (!ctrl) {
3365 continue;
3368 if (!nvme_ns_is_attached(ctrl, ns)) {
3369 continue;
3372 ids[nr_ids++] = cntlid;
3375 list[0] = nr_ids;
3377 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
3380 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
3381 bool active)
3383 NvmeNamespace *ns;
3384 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3385 uint32_t nsid = le32_to_cpu(c->nsid);
3387 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
3389 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
3390 return NVME_INVALID_NSID | NVME_DNR;
3393 ns = nvme_ns(n, nsid);
3394 if (unlikely(!ns)) {
3395 if (!active) {
3396 ns = nvme_subsys_ns(n->subsys, nsid);
3397 if (!ns) {
3398 return nvme_rpt_empty_id_struct(n, req);
3400 } else {
3401 return nvme_rpt_empty_id_struct(n, req);
3405 if (c->csi == NVME_CSI_NVM && nvme_csi_has_nvm_support(ns)) {
3406 return nvme_rpt_empty_id_struct(n, req);
3407 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
3408 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
3409 req);
3412 return NVME_INVALID_FIELD | NVME_DNR;
3415 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
3416 bool active)
3418 NvmeNamespace *ns;
3419 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3420 uint32_t min_nsid = le32_to_cpu(c->nsid);
3421 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
3422 static const int data_len = sizeof(list);
3423 uint32_t *list_ptr = (uint32_t *)list;
3424 int i, j = 0;
3426 trace_pci_nvme_identify_nslist(min_nsid);
3429 * Both 0xffffffff (NVME_NSID_BROADCAST) and 0xfffffffe are invalid values
3430 * since the Active Namespace ID List should return namespaces with ids
3431 * *higher* than the NSID specified in the command. This is also specified
3432 * in the spec (NVM Express v1.3d, Section 5.15.4).
3434 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
3435 return NVME_INVALID_NSID | NVME_DNR;
3438 for (i = 1; i <= n->num_namespaces; i++) {
3439 ns = nvme_ns(n, i);
3440 if (!ns) {
3441 if (!active) {
3442 ns = nvme_subsys_ns(n->subsys, i);
3443 if (!ns) {
3444 continue;
3446 } else {
3447 continue;
3450 if (ns->params.nsid <= min_nsid) {
3451 continue;
3453 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
3454 if (j == data_len / sizeof(uint32_t)) {
3455 break;
3459 return nvme_c2h(n, list, data_len, req);
3462 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
3463 bool active)
3465 NvmeNamespace *ns;
3466 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3467 uint32_t min_nsid = le32_to_cpu(c->nsid);
3468 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
3469 static const int data_len = sizeof(list);
3470 uint32_t *list_ptr = (uint32_t *)list;
3471 int i, j = 0;
3473 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
3476 * Same as in nvme_identify_nslist(), 0xffffffff/0xfffffffe are invalid.
3478 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
3479 return NVME_INVALID_NSID | NVME_DNR;
3482 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
3483 return NVME_INVALID_FIELD | NVME_DNR;
3486 for (i = 1; i <= n->num_namespaces; i++) {
3487 ns = nvme_ns(n, i);
3488 if (!ns) {
3489 if (!active) {
3490 ns = nvme_subsys_ns(n->subsys, i);
3491 if (!ns) {
3492 continue;
3494 } else {
3495 continue;
3498 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
3499 continue;
3501 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
3502 if (j == data_len / sizeof(uint32_t)) {
3503 break;
3507 return nvme_c2h(n, list, data_len, req);
3510 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
3512 NvmeNamespace *ns;
3513 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3514 uint32_t nsid = le32_to_cpu(c->nsid);
3515 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
3517 struct data {
3518 struct {
3519 NvmeIdNsDescr hdr;
3520 uint8_t v[NVME_NIDL_UUID];
3521 } uuid;
3522 struct {
3523 NvmeIdNsDescr hdr;
3524 uint8_t v;
3525 } csi;
3528 struct data *ns_descrs = (struct data *)list;
3530 trace_pci_nvme_identify_ns_descr_list(nsid);
3532 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
3533 return NVME_INVALID_NSID | NVME_DNR;
3536 ns = nvme_ns(n, nsid);
3537 if (unlikely(!ns)) {
3538 return NVME_INVALID_FIELD | NVME_DNR;
3542 * Because the NGUID and EUI64 fields are 0 in the Identify Namespace data
3543 * structure, a Namespace UUID (nidt = 0x3) must be reported in the
3544 * Namespace Identification Descriptor. Add the namespace UUID here.
3546 ns_descrs->uuid.hdr.nidt = NVME_NIDT_UUID;
3547 ns_descrs->uuid.hdr.nidl = NVME_NIDL_UUID;
3548 memcpy(&ns_descrs->uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
3550 ns_descrs->csi.hdr.nidt = NVME_NIDT_CSI;
3551 ns_descrs->csi.hdr.nidl = NVME_NIDL_CSI;
3552 ns_descrs->csi.v = ns->csi;
3554 return nvme_c2h(n, list, sizeof(list), req);
3557 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
3559 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
3560 static const int data_len = sizeof(list);
3562 trace_pci_nvme_identify_cmd_set();
3564 NVME_SET_CSI(*list, NVME_CSI_NVM);
3565 NVME_SET_CSI(*list, NVME_CSI_ZONED);
3567 return nvme_c2h(n, list, data_len, req);
3570 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
3572 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
3574 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
3575 c->csi);
3577 switch (c->cns) {
3578 case NVME_ID_CNS_NS:
3579 return nvme_identify_ns(n, req, true);
3580 case NVME_ID_CNS_NS_PRESENT:
3581 return nvme_identify_ns(n, req, false);
3582 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
3583 return nvme_identify_ns_attached_list(n, req);
3584 case NVME_ID_CNS_CS_NS:
3585 return nvme_identify_ns_csi(n, req, true);
3586 case NVME_ID_CNS_CS_NS_PRESENT:
3587 return nvme_identify_ns_csi(n, req, false);
3588 case NVME_ID_CNS_CTRL:
3589 return nvme_identify_ctrl(n, req);
3590 case NVME_ID_CNS_CS_CTRL:
3591 return nvme_identify_ctrl_csi(n, req);
3592 case NVME_ID_CNS_NS_ACTIVE_LIST:
3593 return nvme_identify_nslist(n, req, true);
3594 case NVME_ID_CNS_NS_PRESENT_LIST:
3595 return nvme_identify_nslist(n, req, false);
3596 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
3597 return nvme_identify_nslist_csi(n, req, true);
3598 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
3599 return nvme_identify_nslist_csi(n, req, false);
3600 case NVME_ID_CNS_NS_DESCR_LIST:
3601 return nvme_identify_ns_descr_list(n, req);
3602 case NVME_ID_CNS_IO_COMMAND_SET:
3603 return nvme_identify_cmd_set(n, req);
3604 default:
3605 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
3606 return NVME_INVALID_FIELD | NVME_DNR;
3610 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
3612 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
3614 req->cqe.result = 1;
3615 if (nvme_check_sqid(n, sqid)) {
3616 return NVME_INVALID_FIELD | NVME_DNR;
3619 return NVME_SUCCESS;
3622 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
3624 trace_pci_nvme_setfeat_timestamp(ts);
3626 n->host_timestamp = le64_to_cpu(ts);
3627 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
3630 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
3632 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
3633 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
3635 union nvme_timestamp {
3636 struct {
3637 uint64_t timestamp:48;
3638 uint64_t sync:1;
3639 uint64_t origin:3;
3640 uint64_t rsvd1:12;
3642 uint64_t all;
3645 union nvme_timestamp ts;
3646 ts.all = 0;
3647 ts.timestamp = n->host_timestamp + elapsed_time;
3649 /* If the host timestamp is non-zero, set the timestamp origin */
3650 ts.origin = n->host_timestamp ? 0x01 : 0x00;
3652 trace_pci_nvme_getfeat_timestamp(ts.all);
3654 return cpu_to_le64(ts.all);
3657 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
3659 uint64_t timestamp = nvme_get_timestamp(n);
3661 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
/*
 * Get Features admin command handler.
 *
 * Decodes the feature identifier (FID) and select field (SEL) from CDW10
 * and reports the requested value in the completion entry result. The
 * Timestamp feature is the exception: it is returned through a data
 * transfer by nvme_get_feature_timestamp().
 *
 * Returns NVME_SUCCESS, or an NVME_INVALID_FIELD / NVME_INVALID_NSID status
 * (with NVME_DNR) when validation fails.
 */
3664 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
3666 NvmeCmd *cmd = &req->cmd;
3667 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
3668 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
3669 uint32_t nsid = le32_to_cpu(cmd->nsid);
3670 uint32_t result;
3671 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
3672 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
3673 uint16_t iv;
3674 NvmeNamespace *ns;
3675 int i;
/* Per-FID default values used by the "defaults" switch further down. */
3677 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
3678 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
3681 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
/* Features flagged as unsupported are rejected before anything else. */
3683 if (!nvme_feature_support[fid]) {
3684 return NVME_INVALID_FIELD | NVME_DNR;
/*
 * Namespace-specific features need a valid, non-broadcast NSID that also
 * resolves to a namespace through nvme_ns().
 */
3687 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
3688 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
3690 * The Reservation Notification Mask and Reservation Persistence
3691 * features require a status code of Invalid Field in Command when
3692 * NSID is 0xFFFFFFFF. Since the device does not support those
3693 * features we can always return Invalid Namespace or Format as we
3694 * should do for all other features.
3696 return NVME_INVALID_NSID | NVME_DNR;
3699 if (!nvme_ns(n, nsid)) {
3700 return NVME_INVALID_FIELD | NVME_DNR;
/*
 * SEL dispatch: "current" continues to the first switch on fid, "saved"
 * and "default" jump to the defaults label, "capabilities" returns the
 * nvme_feature_cap[] entry directly.
 */
3704 switch (sel) {
3705 case NVME_GETFEAT_SELECT_CURRENT:
3706 break;
3707 case NVME_GETFEAT_SELECT_SAVED:
3708 /* no features are saveable by the controller; fallthrough */
3709 case NVME_GETFEAT_SELECT_DEFAULT:
3710 goto defaults;
3711 case NVME_GETFEAT_SELECT_CAP:
3712 result = nvme_feature_cap[fid];
3713 goto out;
/*
 * Current values. Features without a case here fall out of this switch
 * and are answered by the defaults switch below.
 */
3716 switch (fid) {
3717 case NVME_TEMPERATURE_THRESHOLD:
3718 result = 0;
3721 * The controller only implements the Composite Temperature sensor, so
3722 * return 0 for all other sensors.
3724 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
3725 goto out;
3728 switch (NVME_TEMP_THSEL(dw11)) {
3729 case NVME_TEMP_THSEL_OVER:
3730 result = n->features.temp_thresh_hi;
3731 goto out;
3732 case NVME_TEMP_THSEL_UNDER:
3733 result = n->features.temp_thresh_low;
3734 goto out;
/* Reached when THSEL selects neither the over nor the under threshold. */
3737 return NVME_INVALID_FIELD | NVME_DNR;
3738 case NVME_ERROR_RECOVERY:
3739 if (!nvme_nsid_valid(n, nsid)) {
3740 return NVME_INVALID_NSID | NVME_DNR;
3743 ns = nvme_ns(n, nsid);
3744 if (unlikely(!ns)) {
3745 return NVME_INVALID_FIELD | NVME_DNR;
3748 result = ns->features.err_rec;
3749 goto out;
3750 case NVME_VOLATILE_WRITE_CACHE:
3751 result = 0;
/*
 * Scan the namespaces and stop at the first one whose block backend has
 * the write cache enabled; result stays 0 when none has.
 */
3752 for (i = 1; i <= n->num_namespaces; i++) {
3753 ns = nvme_ns(n, i);
3754 if (!ns) {
3755 continue;
3758 result = blk_enable_write_cache(ns->blkconf.blk);
3759 if (result) {
3760 break;
3763 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
3764 goto out;
3765 case NVME_ASYNCHRONOUS_EVENT_CONF:
3766 result = n->features.async_config;
3767 goto out;
3768 case NVME_TIMESTAMP:
3769 return nvme_get_feature_timestamp(n, req);
3770 default:
3771 break;
/*
 * Default values; also the answer for "saved" selects and for "current"
 * selects of features not handled in the switch above.
 */
3774 defaults:
3775 switch (fid) {
3776 case NVME_TEMPERATURE_THRESHOLD:
3777 result = 0;
3779 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
3780 break;
3783 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
3784 result = NVME_TEMPERATURE_WARNING;
3787 break;
3788 case NVME_NUMBER_OF_QUEUES:
/* The same zero-based count (max_ioqpairs - 1) goes in both 16-bit halves. */
3789 result = (n->params.max_ioqpairs - 1) |
3790 ((n->params.max_ioqpairs - 1) << 16);
3791 trace_pci_nvme_getfeat_numq(result);
3792 break;
3793 case NVME_INTERRUPT_VECTOR_CONF:
3794 iv = dw11 & 0xffff;
3795 if (iv >= n->params.max_ioqpairs + 1) {
3796 return NVME_INVALID_FIELD | NVME_DNR;
3799 result = iv;
/* Only the admin completion queue vector gets the no-coalescing flag. */
3800 if (iv == n->admin_cq.vector) {
3801 result |= NVME_INTVC_NOCOALESCING;
3803 break;
3804 case NVME_COMMAND_SET_PROFILE:
3805 result = 0;
3806 break;
3807 default:
3808 result = nvme_feature_default[fid];
3809 break;
/* Common exit: the value is stored little-endian in the completion entry. */
3812 out:
3813 req->cqe.result = cpu_to_le32(result);
3814 return NVME_SUCCESS;
3817 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
3819 uint16_t ret;
3820 uint64_t timestamp;
3822 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
3823 if (ret) {
3824 return ret;
3827 nvme_set_timestamp(n, timestamp);
3829 return NVME_SUCCESS;
/*
 * Set Features admin command handler.
 *
 * Decodes the feature identifier and the save bit from CDW10, validates
 * them against the nvme_feature_support[] / nvme_feature_cap[] tables and
 * the NSID, then applies the new value taken from CDW11 (or, for the
 * Timestamp feature, from a host data transfer).
 *
 * Returns NVME_SUCCESS or one of the error statuses returned below; each
 * error status is combined with NVME_DNR.
 */
3832 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
3834 NvmeNamespace *ns = NULL;
3836 NvmeCmd *cmd = &req->cmd;
3837 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
3838 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
3839 uint32_t nsid = le32_to_cpu(cmd->nsid);
3840 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
3841 uint8_t save = NVME_SETFEAT_SAVE(dw10);
3842 int i;
3844 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
/* The save bit is only accepted for features flagged as saveable. */
3846 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
3847 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
3850 if (!nvme_feature_support[fid]) {
3851 return NVME_INVALID_FIELD | NVME_DNR;
/*
 * Namespace-specific features: a non-broadcast NSID must be valid and must
 * resolve to a namespace, which is kept in ns. With the broadcast NSID, ns
 * stays NULL here.
 * Other features: a non-zero, non-broadcast NSID is always an error.
 */
3854 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
3855 if (nsid != NVME_NSID_BROADCAST) {
3856 if (!nvme_nsid_valid(n, nsid)) {
3857 return NVME_INVALID_NSID | NVME_DNR;
3860 ns = nvme_ns(n, nsid);
3861 if (unlikely(!ns)) {
3862 return NVME_INVALID_FIELD | NVME_DNR;
3865 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
3866 if (!nvme_nsid_valid(n, nsid)) {
3867 return NVME_INVALID_NSID | NVME_DNR;
3870 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
3873 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
3874 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
3877 switch (fid) {
3878 case NVME_TEMPERATURE_THRESHOLD:
/* Thresholds for anything but the composite sensor are accepted and ignored. */
3879 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
3880 break;
3883 switch (NVME_TEMP_THSEL(dw11)) {
3884 case NVME_TEMP_THSEL_OVER:
3885 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
3886 break;
3887 case NVME_TEMP_THSEL_UNDER:
3888 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
3889 break;
3890 default:
3891 return NVME_INVALID_FIELD | NVME_DNR;
/* Raise a SMART event if the temperature is at or beyond either threshold. */
3894 if ((n->temperature >= n->features.temp_thresh_hi) ||
3895 (n->temperature <= n->features.temp_thresh_low)) {
3896 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
3899 break;
3900 case NVME_ERROR_RECOVERY:
/* Broadcast: update every namespace whose NSFEAT advertises DULBE. */
3901 if (nsid == NVME_NSID_BROADCAST) {
3902 for (i = 1; i <= n->num_namespaces; i++) {
3903 ns = nvme_ns(n, i);
3905 if (!ns) {
3906 continue;
3909 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
3910 ns->features.err_rec = dw11;
3914 break;
/* Non-broadcast: ns was resolved by the NSID validation above. */
3917 assert(ns);
3918 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
3919 ns->features.err_rec = dw11;
3921 break;
3922 case NVME_VOLATILE_WRITE_CACHE:
/*
 * Applied to every namespace; a backend whose cache is currently enabled
 * is flushed before the cache is turned off.
 */
3923 for (i = 1; i <= n->num_namespaces; i++) {
3924 ns = nvme_ns(n, i);
3925 if (!ns) {
3926 continue;
3929 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
3930 blk_flush(ns->blkconf.blk);
3933 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
3936 break;
3938 case NVME_NUMBER_OF_QUEUES:
/* Refused once n->qs_created is set. */
3939 if (n->qs_created) {
3940 return NVME_CMD_SEQ_ERROR | NVME_DNR;
3944 * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for NCQR
3945 * and NSQR.
3947 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
3948 return NVME_INVALID_FIELD | NVME_DNR;
3951 trace_pci_nvme_setfeat_numq((dw11 & 0xFFFF) + 1,
3952 ((dw11 >> 16) & 0xFFFF) + 1,
3953 n->params.max_ioqpairs,
3954 n->params.max_ioqpairs);
/* The reply always reports max_ioqpairs - 1, whatever was requested. */
3955 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
3956 ((n->params.max_ioqpairs - 1) << 16));
3957 break;
3958 case NVME_ASYNCHRONOUS_EVENT_CONF:
3959 n->features.async_config = dw11;
3960 break;
3961 case NVME_TIMESTAMP:
3962 return nvme_set_feature_timestamp(n, req);
3963 case NVME_COMMAND_SET_PROFILE:
/* Any non-zero value in the low 9 bits of CDW11 is rejected. */
3964 if (dw11 & 0x1ff) {
3965 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
3966 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
3968 break;
3969 default:
3970 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
3972 return NVME_SUCCESS;
3975 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
3977 trace_pci_nvme_aer(nvme_cid(req));
3979 if (n->outstanding_aers > n->params.aerl) {
3980 trace_pci_nvme_aer_aerl_exceeded();
3981 return NVME_AER_LIMIT_EXCEEDED;
3984 n->aer_reqs[n->outstanding_aers] = req;
3985 n->outstanding_aers++;
3987 if (!QTAILQ_EMPTY(&n->aer_queue)) {
3988 nvme_process_aers(n);
3991 return NVME_NO_COMPLETE;
3994 static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns);
3995 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
3997 NvmeNamespace *ns;
3998 NvmeCtrl *ctrl;
3999 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4000 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4001 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
4002 bool attach = !(dw10 & 0xf);
4003 uint16_t *nr_ids = &list[0];
4004 uint16_t *ids = &list[1];
4005 uint16_t ret;
4006 int i;
4008 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
4010 ns = nvme_subsys_ns(n->subsys, nsid);
4011 if (!ns) {
4012 return NVME_INVALID_FIELD | NVME_DNR;
4015 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
4016 if (ret) {
4017 return ret;
4020 if (!*nr_ids) {
4021 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4024 for (i = 0; i < *nr_ids; i++) {
4025 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
4026 if (!ctrl) {
4027 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
4030 if (attach) {
4031 if (nvme_ns_is_attached(ctrl, ns)) {
4032 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
4035 nvme_ns_attach(ctrl, ns);
4036 __nvme_select_ns_iocs(ctrl, ns);
4037 } else {
4038 if (!nvme_ns_is_attached(ctrl, ns)) {
4039 return NVME_NS_NOT_ATTACHED | NVME_DNR;
4042 nvme_ns_detach(ctrl, ns);
4046 * Add namespace id to the changed namespace id list for event clearing
4047 * via Get Log Page command.
4049 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
4050 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
4051 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
4052 NVME_LOG_CHANGED_NSLIST);
4056 return NVME_SUCCESS;
4059 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
4061 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
4062 nvme_adm_opc_str(req->cmd.opcode));
4064 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4065 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
4066 return NVME_INVALID_OPCODE | NVME_DNR;
4069 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
4070 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
4071 return NVME_INVALID_FIELD | NVME_DNR;
4074 switch (req->cmd.opcode) {
4075 case NVME_ADM_CMD_DELETE_SQ:
4076 return nvme_del_sq(n, req);
4077 case NVME_ADM_CMD_CREATE_SQ:
4078 return nvme_create_sq(n, req);
4079 case NVME_ADM_CMD_GET_LOG_PAGE:
4080 return nvme_get_log(n, req);
4081 case NVME_ADM_CMD_DELETE_CQ:
4082 return nvme_del_cq(n, req);
4083 case NVME_ADM_CMD_CREATE_CQ:
4084 return nvme_create_cq(n, req);
4085 case NVME_ADM_CMD_IDENTIFY:
4086 return nvme_identify(n, req);
4087 case NVME_ADM_CMD_ABORT:
4088 return nvme_abort(n, req);
4089 case NVME_ADM_CMD_SET_FEATURES:
4090 return nvme_set_feature(n, req);
4091 case NVME_ADM_CMD_GET_FEATURES:
4092 return nvme_get_feature(n, req);
4093 case NVME_ADM_CMD_ASYNC_EV_REQ:
4094 return nvme_aer(n, req);
4095 case NVME_ADM_CMD_NS_ATTACHMENT:
4096 return nvme_ns_attachment(n, req);
4097 default:
4098 assert(false);
4101 return NVME_INVALID_OPCODE | NVME_DNR;
4104 static void nvme_process_sq(void *opaque)
4106 NvmeSQueue *sq = opaque;
4107 NvmeCtrl *n = sq->ctrl;
4108 NvmeCQueue *cq = n->cq[sq->cqid];
4110 uint16_t status;
4111 hwaddr addr;
4112 NvmeCmd cmd;
4113 NvmeRequest *req;
4115 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
4116 addr = sq->dma_addr + sq->head * n->sqe_size;
4117 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
4118 trace_pci_nvme_err_addr_read(addr);
4119 trace_pci_nvme_err_cfs();
4120 n->bar.csts = NVME_CSTS_FAILED;
4121 break;
4123 nvme_inc_sq_head(sq);
4125 req = QTAILQ_FIRST(&sq->req_list);
4126 QTAILQ_REMOVE(&sq->req_list, req, entry);
4127 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
4128 nvme_req_clear(req);
4129 req->cqe.cid = cmd.cid;
4130 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
4132 status = sq->sqid ? nvme_io_cmd(n, req) :
4133 nvme_admin_cmd(n, req);
4134 if (status != NVME_NO_COMPLETE) {
4135 req->status = status;
4136 nvme_enqueue_req_completion(cq, req);
4141 static void nvme_ctrl_reset(NvmeCtrl *n)
4143 NvmeNamespace *ns;
4144 int i;
4146 for (i = 1; i <= n->num_namespaces; i++) {
4147 ns = nvme_ns(n, i);
4148 if (!ns) {
4149 continue;
4152 nvme_ns_drain(ns);
4155 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
4156 if (n->sq[i] != NULL) {
4157 nvme_free_sq(n->sq[i], n);
4160 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
4161 if (n->cq[i] != NULL) {
4162 nvme_free_cq(n->cq[i], n);
4166 while (!QTAILQ_EMPTY(&n->aer_queue)) {
4167 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
4168 QTAILQ_REMOVE(&n->aer_queue, event, entry);
4169 g_free(event);
4172 n->aer_queued = 0;
4173 n->outstanding_aers = 0;
4174 n->qs_created = false;
4176 n->bar.cc = 0;
4179 static void nvme_ctrl_shutdown(NvmeCtrl *n)
4181 NvmeNamespace *ns;
4182 int i;
4184 if (n->pmr.dev) {
4185 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
4188 for (i = 1; i <= n->num_namespaces; i++) {
4189 ns = nvme_ns(n, i);
4190 if (!ns) {
4191 continue;
4194 nvme_ns_shutdown(ns);
4198 static void __nvme_select_ns_iocs(NvmeCtrl *n, NvmeNamespace *ns)
4200 ns->iocs = nvme_cse_iocs_none;
4201 switch (ns->csi) {
4202 case NVME_CSI_NVM:
4203 if (NVME_CC_CSS(n->bar.cc) != NVME_CC_CSS_ADMIN_ONLY) {
4204 ns->iocs = nvme_cse_iocs_nvm;
4206 break;
4207 case NVME_CSI_ZONED:
4208 if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_CSI) {
4209 ns->iocs = nvme_cse_iocs_zoned;
4210 } else if (NVME_CC_CSS(n->bar.cc) == NVME_CC_CSS_NVM) {
4211 ns->iocs = nvme_cse_iocs_nvm;
4213 break;
4217 static void nvme_select_ns_iocs(NvmeCtrl *n)
4219 NvmeNamespace *ns;
4220 int i;
4222 for (i = 1; i <= n->num_namespaces; i++) {
4223 ns = nvme_ns(n, i);
4224 if (!ns) {
4225 continue;
4228 __nvme_select_ns_iocs(n, ns);
4232 static int nvme_start_ctrl(NvmeCtrl *n)
4234 uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
4235 uint32_t page_size = 1 << page_bits;
4237 if (unlikely(n->cq[0])) {
4238 trace_pci_nvme_err_startfail_cq();
4239 return -1;
4241 if (unlikely(n->sq[0])) {
4242 trace_pci_nvme_err_startfail_sq();
4243 return -1;
4245 if (unlikely(!n->bar.asq)) {
4246 trace_pci_nvme_err_startfail_nbarasq();
4247 return -1;
4249 if (unlikely(!n->bar.acq)) {
4250 trace_pci_nvme_err_startfail_nbaracq();
4251 return -1;
4253 if (unlikely(n->bar.asq & (page_size - 1))) {
4254 trace_pci_nvme_err_startfail_asq_misaligned(n->bar.asq);
4255 return -1;
4257 if (unlikely(n->bar.acq & (page_size - 1))) {
4258 trace_pci_nvme_err_startfail_acq_misaligned(n->bar.acq);
4259 return -1;
4261 if (unlikely(!(NVME_CAP_CSS(n->bar.cap) & (1 << NVME_CC_CSS(n->bar.cc))))) {
4262 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(n->bar.cc));
4263 return -1;
4265 if (unlikely(NVME_CC_MPS(n->bar.cc) <
4266 NVME_CAP_MPSMIN(n->bar.cap))) {
4267 trace_pci_nvme_err_startfail_page_too_small(
4268 NVME_CC_MPS(n->bar.cc),
4269 NVME_CAP_MPSMIN(n->bar.cap));
4270 return -1;
4272 if (unlikely(NVME_CC_MPS(n->bar.cc) >
4273 NVME_CAP_MPSMAX(n->bar.cap))) {
4274 trace_pci_nvme_err_startfail_page_too_large(
4275 NVME_CC_MPS(n->bar.cc),
4276 NVME_CAP_MPSMAX(n->bar.cap));
4277 return -1;
4279 if (unlikely(NVME_CC_IOCQES(n->bar.cc) <
4280 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
4281 trace_pci_nvme_err_startfail_cqent_too_small(
4282 NVME_CC_IOCQES(n->bar.cc),
4283 NVME_CTRL_CQES_MIN(n->bar.cap));
4284 return -1;
4286 if (unlikely(NVME_CC_IOCQES(n->bar.cc) >
4287 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
4288 trace_pci_nvme_err_startfail_cqent_too_large(
4289 NVME_CC_IOCQES(n->bar.cc),
4290 NVME_CTRL_CQES_MAX(n->bar.cap));
4291 return -1;
4293 if (unlikely(NVME_CC_IOSQES(n->bar.cc) <
4294 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
4295 trace_pci_nvme_err_startfail_sqent_too_small(
4296 NVME_CC_IOSQES(n->bar.cc),
4297 NVME_CTRL_SQES_MIN(n->bar.cap));
4298 return -1;
4300 if (unlikely(NVME_CC_IOSQES(n->bar.cc) >
4301 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
4302 trace_pci_nvme_err_startfail_sqent_too_large(
4303 NVME_CC_IOSQES(n->bar.cc),
4304 NVME_CTRL_SQES_MAX(n->bar.cap));
4305 return -1;
4307 if (unlikely(!NVME_AQA_ASQS(n->bar.aqa))) {
4308 trace_pci_nvme_err_startfail_asqent_sz_zero();
4309 return -1;
4311 if (unlikely(!NVME_AQA_ACQS(n->bar.aqa))) {
4312 trace_pci_nvme_err_startfail_acqent_sz_zero();
4313 return -1;
4316 n->page_bits = page_bits;
4317 n->page_size = page_size;
4318 n->max_prp_ents = n->page_size / sizeof(uint64_t);
4319 n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
4320 n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
4321 nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
4322 NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
4323 nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
4324 NVME_AQA_ASQS(n->bar.aqa) + 1);
4326 nvme_set_timestamp(n, 0ULL);
4328 QTAILQ_INIT(&n->aer_queue);
4330 nvme_select_ns_iocs(n);
4332 return 0;
4335 static void nvme_cmb_enable_regs(NvmeCtrl *n)
4337 NVME_CMBLOC_SET_CDPCILS(n->bar.cmbloc, 1);
4338 NVME_CMBLOC_SET_CDPMLS(n->bar.cmbloc, 1);
4339 NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
4341 NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
4342 NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
4343 NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
4344 NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
4345 NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
4346 NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
4347 NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
4350 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
4351 unsigned size)
4353 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
4354 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
4355 "MMIO write not 32-bit aligned,"
4356 " offset=0x%"PRIx64"", offset);
4357 /* should be ignored, fall through for now */
4360 if (unlikely(size < sizeof(uint32_t))) {
4361 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
4362 "MMIO write smaller than 32-bits,"
4363 " offset=0x%"PRIx64", size=%u",
4364 offset, size);
4365 /* should be ignored, fall through for now */
4368 switch (offset) {
4369 case 0xc: /* INTMS */
4370 if (unlikely(msix_enabled(&(n->parent_obj)))) {
4371 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
4372 "undefined access to interrupt mask set"
4373 " when MSI-X is enabled");
4374 /* should be ignored, fall through for now */
4376 n->bar.intms |= data & 0xffffffff;
4377 n->bar.intmc = n->bar.intms;
4378 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, n->bar.intmc);
4379 nvme_irq_check(n);
4380 break;
4381 case 0x10: /* INTMC */
4382 if (unlikely(msix_enabled(&(n->parent_obj)))) {
4383 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
4384 "undefined access to interrupt mask clr"
4385 " when MSI-X is enabled");
4386 /* should be ignored, fall through for now */
4388 n->bar.intms &= ~(data & 0xffffffff);
4389 n->bar.intmc = n->bar.intms;
4390 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, n->bar.intmc);
4391 nvme_irq_check(n);
4392 break;
4393 case 0x14: /* CC */
4394 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
4395 /* Windows first sends data, then sends enable bit */
4396 if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
4397 !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
4399 n->bar.cc = data;
4402 if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
4403 n->bar.cc = data;
4404 if (unlikely(nvme_start_ctrl(n))) {
4405 trace_pci_nvme_err_startfail();
4406 n->bar.csts = NVME_CSTS_FAILED;
4407 } else {
4408 trace_pci_nvme_mmio_start_success();
4409 n->bar.csts = NVME_CSTS_READY;
4411 } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
4412 trace_pci_nvme_mmio_stopped();
4413 nvme_ctrl_reset(n);
4414 n->bar.csts &= ~NVME_CSTS_READY;
4416 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
4417 trace_pci_nvme_mmio_shutdown_set();
4418 nvme_ctrl_shutdown(n);
4419 n->bar.cc = data;
4420 n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
4421 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
4422 trace_pci_nvme_mmio_shutdown_cleared();
4423 n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
4424 n->bar.cc = data;
4426 break;
4427 case 0x1C: /* CSTS */
4428 if (data & (1 << 4)) {
4429 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
4430 "attempted to W1C CSTS.NSSRO"
4431 " but CAP.NSSRS is zero (not supported)");
4432 } else if (data != 0) {
4433 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
4434 "attempted to set a read only bit"
4435 " of controller status");
4437 break;
4438 case 0x20: /* NSSR */
4439 if (data == 0x4E564D65) {
4440 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
4441 } else {
4442 /* The spec says that writes of other values have no effect */
4443 return;
4445 break;
4446 case 0x24: /* AQA */
4447 n->bar.aqa = data & 0xffffffff;
4448 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
4449 break;
4450 case 0x28: /* ASQ */
4451 n->bar.asq = size == 8 ? data :
4452 (n->bar.asq & ~0xffffffffULL) | (data & 0xffffffff);
4453 trace_pci_nvme_mmio_asqaddr(data);
4454 break;
4455 case 0x2c: /* ASQ hi */
4456 n->bar.asq = (n->bar.asq & 0xffffffff) | (data << 32);
4457 trace_pci_nvme_mmio_asqaddr_hi(data, n->bar.asq);
4458 break;
4459 case 0x30: /* ACQ */
4460 trace_pci_nvme_mmio_acqaddr(data);
4461 n->bar.acq = size == 8 ? data :
4462 (n->bar.acq & ~0xffffffffULL) | (data & 0xffffffff);
4463 break;
4464 case 0x34: /* ACQ hi */
4465 n->bar.acq = (n->bar.acq & 0xffffffff) | (data << 32);
4466 trace_pci_nvme_mmio_acqaddr_hi(data, n->bar.acq);
4467 break;
4468 case 0x38: /* CMBLOC */
4469 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
4470 "invalid write to reserved CMBLOC"
4471 " when CMBSZ is zero, ignored");
4472 return;
4473 case 0x3C: /* CMBSZ */
4474 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
4475 "invalid write to read only CMBSZ, ignored");
4476 return;
4477 case 0x50: /* CMBMSC */
4478 if (!NVME_CAP_CMBS(n->bar.cap)) {
4479 return;
4482 n->bar.cmbmsc = size == 8 ? data :
4483 (n->bar.cmbmsc & ~0xffffffff) | (data & 0xffffffff);
4484 n->cmb.cmse = false;
4486 if (NVME_CMBMSC_CRE(data)) {
4487 nvme_cmb_enable_regs(n);
4489 if (NVME_CMBMSC_CMSE(data)) {
4490 hwaddr cba = NVME_CMBMSC_CBA(data) << CMBMSC_CBA_SHIFT;
4491 if (cba + int128_get64(n->cmb.mem.size) < cba) {
4492 NVME_CMBSTS_SET_CBAI(n->bar.cmbsts, 1);
4493 return;
4496 n->cmb.cba = cba;
4497 n->cmb.cmse = true;
4499 } else {
4500 n->bar.cmbsz = 0;
4501 n->bar.cmbloc = 0;
4504 return;
4505 case 0x54: /* CMBMSC hi */
4506 n->bar.cmbmsc = (n->bar.cmbmsc & 0xffffffff) | (data << 32);
4507 return;
4509 case 0xE00: /* PMRCAP */
4510 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
4511 "invalid write to PMRCAP register, ignored");
4512 return;
4513 case 0xE04: /* PMRCTL */
4514 n->bar.pmrctl = data;
4515 if (NVME_PMRCTL_EN(data)) {
4516 memory_region_set_enabled(&n->pmr.dev->mr, true);
4517 n->bar.pmrsts = 0;
4518 } else {
4519 memory_region_set_enabled(&n->pmr.dev->mr, false);
4520 NVME_PMRSTS_SET_NRDY(n->bar.pmrsts, 1);
4521 n->pmr.cmse = false;
4523 return;
4524 case 0xE08: /* PMRSTS */
4525 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
4526 "invalid write to PMRSTS register, ignored");
4527 return;
4528 case 0xE0C: /* PMREBS */
4529 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
4530 "invalid write to PMREBS register, ignored");
4531 return;
4532 case 0xE10: /* PMRSWTP */
4533 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
4534 "invalid write to PMRSWTP register, ignored");
4535 return;
4536 case 0xE14: /* PMRMSCL */
4537 if (!NVME_CAP_PMRS(n->bar.cap)) {
4538 return;
4541 n->bar.pmrmsc = (n->bar.pmrmsc & ~0xffffffff) | (data & 0xffffffff);
4542 n->pmr.cmse = false;
4544 if (NVME_PMRMSC_CMSE(n->bar.pmrmsc)) {
4545 hwaddr cba = NVME_PMRMSC_CBA(n->bar.pmrmsc) << PMRMSC_CBA_SHIFT;
4546 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
4547 NVME_PMRSTS_SET_CBAI(n->bar.pmrsts, 1);
4548 return;
4551 n->pmr.cmse = true;
4552 n->pmr.cba = cba;
4555 return;
4556 case 0xE18: /* PMRMSCU */
4557 if (!NVME_CAP_PMRS(n->bar.cap)) {
4558 return;
4561 n->bar.pmrmsc = (n->bar.pmrmsc & 0xffffffff) | (data << 32);
4562 return;
4563 default:
4564 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
4565 "invalid MMIO write,"
4566 " offset=0x%"PRIx64", data=%"PRIx64"",
4567 offset, data);
4568 break;
4572 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
4574 NvmeCtrl *n = (NvmeCtrl *)opaque;
4575 uint8_t *ptr = (uint8_t *)&n->bar;
4576 uint64_t val = 0;
4578 trace_pci_nvme_mmio_read(addr, size);
4580 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
4581 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
4582 "MMIO read not 32-bit aligned,"
4583 " offset=0x%"PRIx64"", addr);
4584 /* should RAZ, fall through for now */
4585 } else if (unlikely(size < sizeof(uint32_t))) {
4586 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
4587 "MMIO read smaller than 32-bits,"
4588 " offset=0x%"PRIx64"", addr);
4589 /* should RAZ, fall through for now */
4592 if (addr < sizeof(n->bar)) {
4594 * When PMRWBM bit 1 is set then read from
4595 * from PMRSTS should ensure prior writes
4596 * made it to persistent media
4598 if (addr == 0xE08 &&
4599 (NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
4600 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
4602 memcpy(&val, ptr + addr, size);
4603 } else {
4604 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
4605 "MMIO read beyond last register,"
4606 " offset=0x%"PRIx64", returning 0", addr);
4609 return val;
/*
 * Handle a doorbell register write.
 *
 * @addr is the offset of the write inside the MMIO region and @val the
 * value written; only the low 16 bits of @val are used. Doorbells are
 * decoded relative to offset 0x1000: bit 2 of (addr - 0x1000) selects a
 * completion queue head doorbell (set) or a submission queue tail doorbell
 * (clear).
 *
 * Invalid queue ids or out-of-range values are reported as guest errors
 * and, if an Asynchronous Event Request is outstanding, also queue an
 * error event. Such writes are otherwise ignored.
 */
4612 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
4614 uint32_t qid;
/* Doorbell writes must be 32-bit aligned; misaligned ones are dropped. */
4616 if (unlikely(addr & ((1 << 2) - 1))) {
4617 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
4618 "doorbell write not 32-bit aligned,"
4619 " offset=0x%"PRIx64", ignoring", addr);
4620 return;
4623 if (((addr - 0x1000) >> 2) & 1) {
4624 /* Completion queue doorbell write */
4626 uint16_t new_head = val & 0xffff;
4627 int start_sqs;
4628 NvmeCQueue *cq;
/* Each queue pair takes 8 bytes; the CQ doorbell is the second 4 of them. */
4630 qid = (addr - (0x1000 + (1 << 2))) >> 3;
4631 if (unlikely(nvme_check_cqid(n, qid))) {
4632 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
4633 "completion queue doorbell write"
4634 " for nonexistent queue,"
4635 " sqid=%"PRIu32", ignoring", qid);
4638 * NVM Express v1.3d, Section 4.1 state: "If host software writes
4639 * an invalid value to the Submission Queue Tail Doorbell or
4640 * Completion Queue Head Doorbell regiter and an Asynchronous Event
4641 * Request command is outstanding, then an asynchronous event is
4642 * posted to the Admin Completion Queue with a status code of
4643 * Invalid Doorbell Write Value."
4645 * Also note that the spec includes the "Invalid Doorbell Register"
4646 * status code, but nowhere does it specify when to use it.
4647 * However, it seems reasonable to use it here in a similar
4648 * fashion.
4650 if (n->outstanding_aers) {
4651 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
4652 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
4653 NVME_LOG_ERROR_INFO);
4656 return;
4659 cq = n->cq[qid];
/* The new head must be a valid index into the queue. */
4660 if (unlikely(new_head >= cq->size)) {
4661 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
4662 "completion queue doorbell write value"
4663 " beyond queue size, sqid=%"PRIu32","
4664 " new_head=%"PRIu16", ignoring",
4665 qid, new_head);
4667 if (n->outstanding_aers) {
4668 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
4669 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
4670 NVME_LOG_ERROR_INFO);
4673 return;
4676 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
/*
 * Sample "queue was full" before moving the head: if it was, the timers
 * of the attached submission queues and of the completion queue itself
 * are armed 500 ns ahead on QEMU_CLOCK_VIRTUAL.
 */
4678 start_sqs = nvme_cq_full(cq) ? 1 : 0;
4679 cq->head = new_head;
4680 if (start_sqs) {
4681 NvmeSQueue *sq;
4682 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
4683 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
4685 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
/* With head equal to tail, the queue's interrupt is deasserted. */
4688 if (cq->tail == cq->head) {
4689 nvme_irq_deassert(n, cq);
4691 } else {
4692 /* Submission queue doorbell write */
4694 uint16_t new_tail = val & 0xffff;
4695 NvmeSQueue *sq;
4697 qid = (addr - 0x1000) >> 3;
4698 if (unlikely(nvme_check_sqid(n, qid))) {
4699 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
4700 "submission queue doorbell write"
4701 " for nonexistent queue,"
4702 " sqid=%"PRIu32", ignoring", qid);
4704 if (n->outstanding_aers) {
4705 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
4706 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
4707 NVME_LOG_ERROR_INFO);
4710 return;
4713 sq = n->sq[qid];
/* The new tail must be a valid index into the queue. */
4714 if (unlikely(new_tail >= sq->size)) {
4715 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
4716 "submission queue doorbell write value"
4717 " beyond queue size, sqid=%"PRIu32","
4718 " new_tail=%"PRIu16", ignoring",
4719 qid, new_tail);
4721 if (n->outstanding_aers) {
4722 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
4723 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
4724 NVME_LOG_ERROR_INFO);
4727 return;
4730 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
/* Record the new tail and arm the queue's timer 500 ns ahead. */
4732 sq->tail = new_tail;
4733 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
4737 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
4738 unsigned size)
4740 NvmeCtrl *n = (NvmeCtrl *)opaque;
4742 trace_pci_nvme_mmio_write(addr, data, size);
4744 if (addr < sizeof(n->bar)) {
4745 nvme_write_bar(n, addr, data, size);
4746 } else {
4747 nvme_process_db(n, addr, data);
4751 static const MemoryRegionOps nvme_mmio_ops = {
4752 .read = nvme_mmio_read,
4753 .write = nvme_mmio_write,
4754 .endianness = DEVICE_LITTLE_ENDIAN,
4755 .impl = {
4756 .min_access_size = 2,
4757 .max_access_size = 8,
4761 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
4762 unsigned size)
4764 NvmeCtrl *n = (NvmeCtrl *)opaque;
4765 stn_le_p(&n->cmb.buf[addr], size, data);
4768 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
4770 NvmeCtrl *n = (NvmeCtrl *)opaque;
4771 return ldn_le_p(&n->cmb.buf[addr], size);
4774 static const MemoryRegionOps nvme_cmb_ops = {
4775 .read = nvme_cmb_read,
4776 .write = nvme_cmb_write,
4777 .endianness = DEVICE_LITTLE_ENDIAN,
4778 .impl = {
4779 .min_access_size = 1,
4780 .max_access_size = 8,
4784 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
4786 NvmeParams *params = &n->params;
4788 if (params->num_queues) {
4789 warn_report("num_queues is deprecated; please use max_ioqpairs "
4790 "instead");
4792 params->max_ioqpairs = params->num_queues - 1;
4795 if (n->conf.blk) {
4796 warn_report("drive property is deprecated; "
4797 "please use an nvme-ns device instead");
4800 if (params->max_ioqpairs < 1 ||
4801 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
4802 error_setg(errp, "max_ioqpairs must be between 1 and %d",
4803 NVME_MAX_IOQPAIRS);
4804 return;
4807 if (params->msix_qsize < 1 ||
4808 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
4809 error_setg(errp, "msix_qsize must be between 1 and %d",
4810 PCI_MSIX_FLAGS_QSIZE + 1);
4811 return;
4814 if (!params->serial) {
4815 error_setg(errp, "serial property not set");
4816 return;
4819 if (n->pmr.dev) {
4820 if (host_memory_backend_is_mapped(n->pmr.dev)) {
4821 error_setg(errp, "can't use already busy memdev: %s",
4822 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
4823 return;
4826 if (!is_power_of_2(n->pmr.dev->size)) {
4827 error_setg(errp, "pmr backend size needs to be power of 2 in size");
4828 return;
4831 host_memory_backend_set_mapped(n->pmr.dev, true);
4834 if (n->params.zasl > n->params.mdts) {
4835 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
4836 "than or equal to mdts (Maximum Data Transfer Size)");
4837 return;
4841 static void nvme_init_state(NvmeCtrl *n)
4843 n->num_namespaces = NVME_MAX_NAMESPACES;
4844 /* add one to max_ioqpairs to account for the admin queue pair */
4845 n->reg_size = pow2ceil(sizeof(NvmeBar) +
4846 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
4847 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
4848 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
4849 n->temperature = NVME_TEMPERATURE;
4850 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
4851 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4852 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
4855 static int nvme_attach_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
4857 if (nvme_ns_is_attached(n, ns)) {
4858 error_setg(errp,
4859 "namespace %d is already attached to controller %d",
4860 nvme_nsid(ns), n->cntlid);
4861 return -1;
4864 nvme_ns_attach(n, ns);
4866 return 0;
4869 int nvme_register_namespace(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
4871 uint32_t nsid = nvme_nsid(ns);
4873 if (nsid > NVME_MAX_NAMESPACES) {
4874 error_setg(errp, "invalid namespace id (must be between 0 and %d)",
4875 NVME_MAX_NAMESPACES);
4876 return -1;
4879 if (!nsid) {
4880 for (int i = 1; i <= n->num_namespaces; i++) {
4881 if (!nvme_ns(n, i)) {
4882 nsid = ns->params.nsid = i;
4883 break;
4887 if (!nsid) {
4888 error_setg(errp, "no free namespace id");
4889 return -1;
4891 } else {
4892 if (n->namespaces[nsid - 1]) {
4893 error_setg(errp, "namespace id '%d' is already in use", nsid);
4894 return -1;
4898 trace_pci_nvme_register_namespace(nsid);
4901 * If subsys is not given, namespae is always attached to the controller
4902 * because there's no subsystem to manage namespace allocation.
4904 if (!n->subsys) {
4905 if (ns->params.detached) {
4906 error_setg(errp,
4907 "detached needs nvme-subsys specified nvme or nvme-ns");
4908 return -1;
4911 return nvme_attach_namespace(n, ns, errp);
4912 } else {
4913 if (!ns->params.detached) {
4914 return nvme_attach_namespace(n, ns, errp);
4918 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
4919 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
4921 return 0;
4924 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
4926 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
4928 n->cmb.buf = g_malloc0(cmb_size);
4929 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
4930 "nvme-cmb", cmb_size);
4931 pci_register_bar(pci_dev, NVME_CMB_BIR,
4932 PCI_BASE_ADDRESS_SPACE_MEMORY |
4933 PCI_BASE_ADDRESS_MEM_TYPE_64 |
4934 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
4936 NVME_CAP_SET_CMBS(n->bar.cap, 1);
4938 if (n->params.legacy_cmb) {
4939 nvme_cmb_enable_regs(n);
4940 n->cmb.cmse = true;
4944 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
4946 NVME_PMRCAP_SET_RDS(n->bar.pmrcap, 1);
4947 NVME_PMRCAP_SET_WDS(n->bar.pmrcap, 1);
4948 NVME_PMRCAP_SET_BIR(n->bar.pmrcap, NVME_PMR_BIR);
4949 /* Turn on bit 1 support */
4950 NVME_PMRCAP_SET_PMRWBM(n->bar.pmrcap, 0x02);
4951 NVME_PMRCAP_SET_CMSS(n->bar.pmrcap, 1);
4953 pci_register_bar(pci_dev, NVME_PMRCAP_BIR(n->bar.pmrcap),
4954 PCI_BASE_ADDRESS_SPACE_MEMORY |
4955 PCI_BASE_ADDRESS_MEM_TYPE_64 |
4956 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
4958 memory_region_set_enabled(&n->pmr.dev->mr, false);
4961 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
4963 uint8_t *pci_conf = pci_dev->config;
4964 uint64_t bar_size, msix_table_size, msix_pba_size;
4965 unsigned msix_table_offset, msix_pba_offset;
4966 int ret;
4968 Error *err = NULL;
4970 pci_conf[PCI_INTERRUPT_PIN] = 1;
4971 pci_config_set_prog_interface(pci_conf, 0x2);
4973 if (n->params.use_intel_id) {
4974 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
4975 pci_config_set_device_id(pci_conf, 0x5845);
4976 } else {
4977 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
4978 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
4981 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
4982 pcie_endpoint_cap_init(pci_dev, 0x80);
4984 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
4985 msix_table_offset = bar_size;
4986 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
4988 bar_size += msix_table_size;
4989 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
4990 msix_pba_offset = bar_size;
4991 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
4993 bar_size += msix_pba_size;
4994 bar_size = pow2ceil(bar_size);
4996 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
4997 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
4998 n->reg_size);
4999 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
5001 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
5002 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
5003 ret = msix_init(pci_dev, n->params.msix_qsize,
5004 &n->bar0, 0, msix_table_offset,
5005 &n->bar0, 0, msix_pba_offset, 0, &err);
5006 if (ret < 0) {
5007 if (ret == -ENOTSUP) {
5008 warn_report_err(err);
5009 } else {
5010 error_propagate(errp, err);
5011 return ret;
5015 if (n->params.cmb_size_mb) {
5016 nvme_init_cmb(n, pci_dev);
5019 if (n->pmr.dev) {
5020 nvme_init_pmr(n, pci_dev);
5023 return 0;
5026 static void nvme_init_subnqn(NvmeCtrl *n)
5028 NvmeSubsystem *subsys = n->subsys;
5029 NvmeIdCtrl *id = &n->id_ctrl;
5031 if (!subsys) {
5032 snprintf((char *)id->subnqn, sizeof(id->subnqn),
5033 "nqn.2019-08.org.qemu:%s", n->params.serial);
5034 } else {
5035 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
5039 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
5041 NvmeIdCtrl *id = &n->id_ctrl;
5042 uint8_t *pci_conf = pci_dev->config;
5044 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
5045 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
5046 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
5047 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
5048 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
5050 id->cntlid = cpu_to_le16(n->cntlid);
5052 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
5054 id->rab = 6;
5056 if (n->params.use_intel_id) {
5057 id->ieee[0] = 0xb3;
5058 id->ieee[1] = 0x02;
5059 id->ieee[2] = 0x00;
5060 } else {
5061 id->ieee[0] = 0x00;
5062 id->ieee[1] = 0x54;
5063 id->ieee[2] = 0x52;
5066 id->mdts = n->params.mdts;
5067 id->ver = cpu_to_le32(NVME_SPEC_VER);
5068 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT);
5069 id->cntrltype = 0x1;
5072 * Because the controller always completes the Abort command immediately,
5073 * there can never be more than one concurrently executing Abort command,
5074 * so this value is never used for anything. Note that there can easily be
5075 * many Abort commands in the queues, but they are not considered
5076 * "executing" until processed by nvme_abort.
5078 * The specification recommends a value of 3 for Abort Command Limit (four
5079 * concurrently outstanding Abort commands), so lets use that though it is
5080 * inconsequential.
5082 id->acl = 3;
5083 id->aerl = n->params.aerl;
5084 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
5085 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
5087 /* recommended default value (~70 C) */
5088 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
5089 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
5091 id->sqes = (0x6 << 4) | 0x6;
5092 id->cqes = (0x4 << 4) | 0x4;
5093 id->nn = cpu_to_le32(n->num_namespaces);
5094 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
5095 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
5096 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
5099 * NOTE: If this device ever supports a command set that does NOT use 0x0
5100 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
5101 * should probably be removed.
5103 * See comment in nvme_io_cmd.
5105 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
5107 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
5108 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
5109 NVME_CTRL_SGLS_BITBUCKET);
5111 nvme_init_subnqn(n);
5113 id->psd[0].mp = cpu_to_le16(0x9c4);
5114 id->psd[0].enlat = cpu_to_le32(0x10);
5115 id->psd[0].exlat = cpu_to_le32(0x4);
5117 if (n->subsys) {
5118 id->cmic |= NVME_CMIC_MULTI_CTRL;
5121 NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
5122 NVME_CAP_SET_CQR(n->bar.cap, 1);
5123 NVME_CAP_SET_TO(n->bar.cap, 0xf);
5124 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_NVM);
5125 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_CSI_SUPP);
5126 NVME_CAP_SET_CSS(n->bar.cap, NVME_CAP_CSS_ADMIN_ONLY);
5127 NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
5128 NVME_CAP_SET_CMBS(n->bar.cap, n->params.cmb_size_mb ? 1 : 0);
5129 NVME_CAP_SET_PMRS(n->bar.cap, n->pmr.dev ? 1 : 0);
5131 n->bar.vs = NVME_SPEC_VER;
5132 n->bar.intmc = n->bar.intms = 0;
5135 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
5137 int cntlid;
5139 if (!n->subsys) {
5140 return 0;
5143 cntlid = nvme_subsys_register_ctrl(n, errp);
5144 if (cntlid < 0) {
5145 return -1;
5148 n->cntlid = cntlid;
5150 return 0;
5153 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
5155 NvmeCtrl *n = NVME(pci_dev);
5156 NvmeNamespace *ns;
5157 Error *local_err = NULL;
5159 nvme_check_constraints(n, &local_err);
5160 if (local_err) {
5161 error_propagate(errp, local_err);
5162 return;
5165 qbus_create_inplace(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
5166 &pci_dev->qdev, n->parent_obj.qdev.id);
5168 nvme_init_state(n);
5169 if (nvme_init_pci(n, pci_dev, errp)) {
5170 return;
5173 if (nvme_init_subsys(n, errp)) {
5174 error_propagate(errp, local_err);
5175 return;
5177 nvme_init_ctrl(n, pci_dev);
5179 /* setup a namespace if the controller drive property was given */
5180 if (n->namespace.blkconf.blk) {
5181 ns = &n->namespace;
5182 ns->params.nsid = 1;
5184 if (nvme_ns_setup(ns, errp)) {
5185 return;
5188 if (nvme_register_namespace(n, ns, errp)) {
5189 return;
5194 static void nvme_exit(PCIDevice *pci_dev)
5196 NvmeCtrl *n = NVME(pci_dev);
5197 NvmeNamespace *ns;
5198 int i;
5200 nvme_ctrl_reset(n);
5202 for (i = 1; i <= n->num_namespaces; i++) {
5203 ns = nvme_ns(n, i);
5204 if (!ns) {
5205 continue;
5208 nvme_ns_cleanup(ns);
5211 g_free(n->cq);
5212 g_free(n->sq);
5213 g_free(n->aer_reqs);
5215 if (n->params.cmb_size_mb) {
5216 g_free(n->cmb.buf);
5219 if (n->pmr.dev) {
5220 host_memory_backend_set_mapped(n->pmr.dev, false);
5222 msix_uninit_exclusive_bar(pci_dev);
5225 static Property nvme_props[] = {
5226 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
5227 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
5228 HostMemoryBackend *),
5229 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
5230 NvmeSubsystem *),
5231 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
5232 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
5233 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
5234 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
5235 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
5236 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
5237 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
5238 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
5239 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
5240 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
5241 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
5242 DEFINE_PROP_END_OF_LIST(),
5245 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
5246 void *opaque, Error **errp)
5248 NvmeCtrl *n = NVME(obj);
5249 uint8_t value = n->smart_critical_warning;
5251 visit_type_uint8(v, name, &value, errp);
5254 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
5255 void *opaque, Error **errp)
5257 NvmeCtrl *n = NVME(obj);
5258 uint8_t value, old_value, cap = 0, index, event;
5260 if (!visit_type_uint8(v, name, &value, errp)) {
5261 return;
5264 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
5265 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
5266 if (NVME_CAP_PMRS(n->bar.cap)) {
5267 cap |= NVME_SMART_PMR_UNRELIABLE;
5270 if ((value & cap) != value) {
5271 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
5272 value & ~cap);
5273 return;
5276 old_value = n->smart_critical_warning;
5277 n->smart_critical_warning = value;
5279 /* only inject new bits of smart critical warning */
5280 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
5281 event = 1 << index;
5282 if (value & ~old_value & event)
5283 nvme_smart_event(n, event);
5287 static const VMStateDescription nvme_vmstate = {
5288 .name = "nvme",
5289 .unmigratable = 1,
5292 static void nvme_class_init(ObjectClass *oc, void *data)
5294 DeviceClass *dc = DEVICE_CLASS(oc);
5295 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
5297 pc->realize = nvme_realize;
5298 pc->exit = nvme_exit;
5299 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
5300 pc->revision = 2;
5302 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
5303 dc->desc = "Non-Volatile Memory Express";
5304 device_class_set_props(dc, nvme_props);
5305 dc->vmsd = &nvme_vmstate;
5308 static void nvme_instance_init(Object *obj)
5310 NvmeCtrl *n = NVME(obj);
5312 if (n->namespace.blkconf.blk) {
5313 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
5314 "bootindex", "/namespace@1,0",
5315 DEVICE(obj));
5318 object_property_add(obj, "smart_critical_warning", "uint8",
5319 nvme_get_smart_warning,
5320 nvme_set_smart_warning, NULL, NULL);
5323 static const TypeInfo nvme_info = {
5324 .name = TYPE_NVME,
5325 .parent = TYPE_PCI_DEVICE,
5326 .instance_size = sizeof(NvmeCtrl),
5327 .instance_init = nvme_instance_init,
5328 .class_init = nvme_class_init,
5329 .interfaces = (InterfaceInfo[]) {
5330 { INTERFACE_PCIE_DEVICE },
5335 static const TypeInfo nvme_bus_info = {
5336 .name = TYPE_NVME_BUS,
5337 .parent = TYPE_BUS,
5338 .instance_size = sizeof(NvmeBus),
5341 static void nvme_register_types(void)
5343 type_register_static(&nvme_info);
5344 type_register_static(&nvme_bus_info);
5347 type_init(nvme_register_types)