hw/nvme/ctrl.c (qemu/ar7.git)
1 /*
2 * QEMU NVM Express Controller
4 * Copyright (c) 2012, Intel Corporation
6 * Written by Keith Busch <keith.busch@intel.com>
8 * This code is licensed under the GNU GPL v2 or later.
9 */
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
14 * https://nvmexpress.org/developers/nvme-specification/
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
20 * NVMe subsystem uses the format from the NVMe specifications in the comments
21 * (i.e. 'h' suffix instead of '0x' prefix).
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * subsys=<subsys_id>
39 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
40 * zoned=<true|false[optional]>, \
41 * subsys=<subsys_id>,detached=<true|false[optional]>
43 * Note that cmb_size_mb denotes the size of the CMB in MB. The CMB is assumed to be at
44 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
45 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
46 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
48 * Enabling PMR emulation can be achieved by pointing to a memory-backend-file.
49 * For example:
50 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
51 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
53 * The PMR will use BAR 4/5 exclusively.
55 * To place controller(s) and namespace(s) in a subsystem, provide the
56 * nvme-subsys device as above.
58 * nvme subsystem device parameters
59 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
60 * - `nqn`
61 * This parameter provides the `<nqn_id>` part of the string
62 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
63 * of subsystem controllers. Note that `<nqn_id>` should be unique per
64 * subsystem, but this is not enforced by QEMU. If not specified, it will
65 * default to the value of the `id` parameter (`<subsys_id>`).
67 * nvme device parameters
68 * ~~~~~~~~~~~~~~~~~~~~~~
69 * - `subsys`
70 * Specifying this parameter attaches the controller to the subsystem and
71 * the SUBNQN field in the controller will report the NQN of the subsystem
72 * device. This also enables the multi-controller capability represented in
73 * the Identify Controller data structure by the CMIC field (Controller
74 * Multi-path I/O and Namespace Sharing Capabilities).
76 * - `aerl`
77 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
78 * of concurrently outstanding Asynchronous Event Request commands supported
79 * by the controller. This is a 0's based value.
81 * - `aer_max_queued`
82 * This is the maximum number of events that the device will enqueue for
83 * completion when there are no outstanding AERs. When the maximum number of
84 * enqueued events are reached, subsequent events will be dropped.
86 * - `mdts`
87 * Indicates the maximum data transfer size for a command that transfers data
88 * between host-accessible memory and the controller. The value is specified
89 * as a power of two (2^n) and is in units of the minimum memory page size
90 * (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
92 * - `vsl`
93 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
94 * this value is specified as a power of two (2^n) and is in units of the
95 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
96 * KiB).
98 * - `zoned.zasl`
99 * Indicates the maximum data transfer size for the Zone Append command. Like
100 * `mdts`, the value is specified as a power of two (2^n) and is in units of
101 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
102 * defaulting to the value of `mdts`).
104 * - `zoned.auto_transition`
105 * Indicates whether zones in the Implicitly Opened state can be automatically
106 * transitioned to the Closed state for resource management purposes.
107 * Defaults to 'on'.
109 * nvme namespace device parameters
110 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
111 * - `shared`
112 * When the parent nvme device (as defined explicitly by the 'bus' parameter
113 * or implicitly by the most recently defined NvmeBus) is linked to an
114 * nvme-subsys device, the namespace will be attached to all controllers in
115 * the subsystem. If set to 'off' (the default), the namespace will remain a
116 * private namespace and may only be attached to a single controller at a
117 * time.
119 * - `detached`
120 * This parameter is only valid together with the `subsys` parameter. If left
121 * at the default value (`false/off`), the namespace will be attached to all
122 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
123 * namespace will be available in the subsystem but not attached to any
124 * controllers.
126 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
127 * In this case, the following namespace properties are available to configure
128 * zoned operation:
129 * zoned.zone_size=<zone size in bytes, default: 128MiB>
130 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
132 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
133 * The value 0 (default) forces zone capacity to be the same as zone
134 * size. The value of this property may not exceed zone size.
136 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
137 * This value needs to be specified in 64B units. If it is zero,
138 * namespace(s) will not support zone descriptor extensions.
140 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
141 * The default value means there is no limit to the number of
142 * concurrently active zones.
144 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
145 * The default value means there is no limit to the number of
146 * concurrently open zones.
148 * zoned.cross_read=<enable RAZB, default: false>
149 * Setting this property to true enables Read Across Zone Boundaries.
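 *
 * As a rough illustration (the IDs, serial, image name and sizes below are
 * made up for the example and are not defaults), a subsystem with two
 * controllers sharing a single zoned namespace could be configured with:
 *
 *   -drive file=zns.img,if=none,id=nvm0
 *   -device nvme-subsys,id=subsys0,nqn=subsys0
 *   -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *   -device nvme,serial=deadbeef,id=nvme1,subsys=subsys0
 *   -device nvme-ns,drive=nvm0,nsid=1,zoned=true,shared=on, \
 *       zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32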
152 #include "qemu/osdep.h"
153 #include "qemu/cutils.h"
154 #include "qemu/error-report.h"
155 #include "qemu/log.h"
156 #include "qemu/units.h"
157 #include "qapi/error.h"
158 #include "qapi/visitor.h"
159 #include "sysemu/sysemu.h"
160 #include "sysemu/block-backend.h"
161 #include "sysemu/hostmem.h"
162 #include "hw/pci/msix.h"
163 #include "migration/vmstate.h"
165 #include "nvme.h"
166 #include "trace.h"
168 #define NVME_MAX_IOQPAIRS 0xffff
169 #define NVME_DB_SIZE 4
170 #define NVME_SPEC_VER 0x00010400
171 #define NVME_CMB_BIR 2
172 #define NVME_PMR_BIR 4
173 #define NVME_TEMPERATURE 0x143
174 #define NVME_TEMPERATURE_WARNING 0x157
175 #define NVME_TEMPERATURE_CRITICAL 0x175
176 #define NVME_NUM_FW_SLOTS 1
177 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
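/*
 * Descriptive note: NVME_GUEST_ERR fires the given trace event and also
 * logs the same message as a guest error via qemu_log_mask(), so guest
 * misbehaviour shows up both in tracing and in the error log.
 */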
179 #define NVME_GUEST_ERR(trace, fmt, ...) \
180 do { \
181 (trace_##trace)(__VA_ARGS__); \
182 qemu_log_mask(LOG_GUEST_ERROR, #trace \
183 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
184 } while (0)
186 static const bool nvme_feature_support[NVME_FID_MAX] = {
187 [NVME_ARBITRATION] = true,
188 [NVME_POWER_MANAGEMENT] = true,
189 [NVME_TEMPERATURE_THRESHOLD] = true,
190 [NVME_ERROR_RECOVERY] = true,
191 [NVME_VOLATILE_WRITE_CACHE] = true,
192 [NVME_NUMBER_OF_QUEUES] = true,
193 [NVME_INTERRUPT_COALESCING] = true,
194 [NVME_INTERRUPT_VECTOR_CONF] = true,
195 [NVME_WRITE_ATOMICITY] = true,
196 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
197 [NVME_TIMESTAMP] = true,
198 [NVME_COMMAND_SET_PROFILE] = true,
201 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
202 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
203 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
204 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
205 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
206 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
207 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
208 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
211 static const uint32_t nvme_cse_acs[256] = {
212 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
213 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
214 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
215 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
216 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
217 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
218 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
219 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
220 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
221 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
222 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
223 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
226 static const uint32_t nvme_cse_iocs_none[256];
228 static const uint32_t nvme_cse_iocs_nvm[256] = {
229 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
230 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
231 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
232 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
233 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
234 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
235 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
236 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
239 static const uint32_t nvme_cse_iocs_zoned[256] = {
240 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
241 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
242 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
243 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
244 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
245 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
246 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
247 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
248 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
249 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
250 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
253 static void nvme_process_sq(void *opaque);
255 static uint16_t nvme_sqid(NvmeRequest *req)
257 return le16_to_cpu(req->sq->sqid);
260 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
261 NvmeZoneState state)
263 if (QTAILQ_IN_USE(zone, entry)) {
264 switch (nvme_get_zone_state(zone)) {
265 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
266 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
267 break;
268 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
269 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
270 break;
271 case NVME_ZONE_STATE_CLOSED:
272 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
273 break;
274 case NVME_ZONE_STATE_FULL:
275 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
276 default:
281 nvme_set_zone_state(zone, state);
283 switch (state) {
284 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
285 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
286 break;
287 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
288 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
289 break;
290 case NVME_ZONE_STATE_CLOSED:
291 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
292 break;
293 case NVME_ZONE_STATE_FULL:
294 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
295 case NVME_ZONE_STATE_READ_ONLY:
296 break;
297 default:
298 zone->d.za = 0;
303 * Check if we can open a zone without exceeding open/active limits.
304 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
306 static int nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
308 if (ns->params.max_active_zones != 0 &&
309 ns->nr_active_zones + act > ns->params.max_active_zones) {
310 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
311 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
313 if (ns->params.max_open_zones != 0 &&
314 ns->nr_open_zones + opn > ns->params.max_open_zones) {
315 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
316 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
319 return NVME_SUCCESS;
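/*
 * Illustrative example for nvme_aor_check() above (numbers are arbitrary):
 * with zoned.max_open=16 and 16 zones already open, a request to open one
 * more zone fails with Too Many Open Zones; a limit of 0 disables the check.
 */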
322 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
324 hwaddr hi, lo;
326 if (!n->cmb.cmse) {
327 return false;
330 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
331 hi = lo + int128_get64(n->cmb.mem.size);
333 return addr >= lo && addr < hi;
336 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
338 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
339 return &n->cmb.buf[addr - base];
342 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
344 hwaddr hi;
346 if (!n->pmr.cmse) {
347 return false;
350 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
352 return addr >= n->pmr.cba && addr < hi;
355 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
357 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
360 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
362 hwaddr hi = addr + size - 1;
363 if (hi < addr) {
364 return 1;
367 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
368 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
369 return 0;
372 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
373 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
374 return 0;
377 return pci_dma_read(&n->parent_obj, addr, buf, size);
380 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, void *buf, int size)
382 hwaddr hi = addr + size - 1;
383 if (hi < addr) {
384 return 1;
387 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
388 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
389 return 0;
392 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
393 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
394 return 0;
397 return pci_dma_write(&n->parent_obj, addr, buf, size);
400 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
402 return nsid &&
403 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
406 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
408 return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
411 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
413 return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
416 static void nvme_inc_cq_tail(NvmeCQueue *cq)
418 cq->tail++;
419 if (cq->tail >= cq->size) {
420 cq->tail = 0;
421 cq->phase = !cq->phase;
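/*
 * The inversion above implements the completion queue phase tag: it is
 * flipped every time the tail wraps, which is how the host tells newly
 * posted completion entries apart from stale ones it already consumed.
 */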
425 static void nvme_inc_sq_head(NvmeSQueue *sq)
427 sq->head = (sq->head + 1) % sq->size;
430 static uint8_t nvme_cq_full(NvmeCQueue *cq)
432 return (cq->tail + 1) % cq->size == cq->head;
435 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
437 return sq->head == sq->tail;
440 static void nvme_irq_check(NvmeCtrl *n)
442 uint32_t intms = ldl_le_p(&n->bar.intms);
444 if (msix_enabled(&(n->parent_obj))) {
445 return;
447 if (~intms & n->irq_status) {
448 pci_irq_assert(&n->parent_obj);
449 } else {
450 pci_irq_deassert(&n->parent_obj);
454 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
456 if (cq->irq_enabled) {
457 if (msix_enabled(&(n->parent_obj))) {
458 trace_pci_nvme_irq_msix(cq->vector);
459 msix_notify(&(n->parent_obj), cq->vector);
460 } else {
461 trace_pci_nvme_irq_pin();
462 assert(cq->vector < 32);
463 n->irq_status |= 1 << cq->vector;
464 nvme_irq_check(n);
466 } else {
467 trace_pci_nvme_irq_masked();
471 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
473 if (cq->irq_enabled) {
474 if (msix_enabled(&(n->parent_obj))) {
475 return;
476 } else {
477 assert(cq->vector < 32);
478 if (!n->cq_pending) {
479 n->irq_status &= ~(1 << cq->vector);
481 nvme_irq_check(n);
486 static void nvme_req_clear(NvmeRequest *req)
488 req->ns = NULL;
489 req->opaque = NULL;
490 req->aiocb = NULL;
491 memset(&req->cqe, 0x0, sizeof(req->cqe));
492 req->status = NVME_SUCCESS;
495 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
497 if (dma) {
498 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
499 sg->flags = NVME_SG_DMA;
500 } else {
501 qemu_iovec_init(&sg->iov, 0);
504 sg->flags |= NVME_SG_ALLOC;
507 static inline void nvme_sg_unmap(NvmeSg *sg)
509 if (!(sg->flags & NVME_SG_ALLOC)) {
510 return;
513 if (sg->flags & NVME_SG_DMA) {
514 qemu_sglist_destroy(&sg->qsg);
515 } else {
516 qemu_iovec_destroy(&sg->iov);
519 memset(sg, 0x0, sizeof(*sg));
523 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
524 * holds both data and metadata. This function splits the data and metadata
525 * into two separate QSG/IOVs.
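 *
 * As a concrete illustration (sizes chosen only for the example): with a
 * 512 byte logical block size (lbasz) and 8 bytes of metadata per block
 * (lbaf.ms), each 520 byte extended LBA in `sg` contributes a 512 byte run
 * to `data` followed by an 8 byte run to `mdata`.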
527 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
528 NvmeSg *mdata)
530 NvmeSg *dst = data;
531 uint32_t trans_len, count = ns->lbasz;
532 uint64_t offset = 0;
533 bool dma = sg->flags & NVME_SG_DMA;
534 size_t sge_len;
535 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
536 int sg_idx = 0;
538 assert(sg->flags & NVME_SG_ALLOC);
540 while (sg_len) {
541 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
543 trans_len = MIN(sg_len, count);
544 trans_len = MIN(trans_len, sge_len - offset);
546 if (dst) {
547 if (dma) {
548 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
549 trans_len);
550 } else {
551 qemu_iovec_add(&dst->iov,
552 sg->iov.iov[sg_idx].iov_base + offset,
553 trans_len);
557 sg_len -= trans_len;
558 count -= trans_len;
559 offset += trans_len;
561 if (count == 0) {
562 dst = (dst == data) ? mdata : data;
563 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
566 if (sge_len == offset) {
567 offset = 0;
568 sg_idx++;
573 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
574 size_t len)
576 if (!len) {
577 return NVME_SUCCESS;
580 trace_pci_nvme_map_addr_cmb(addr, len);
582 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
583 return NVME_DATA_TRAS_ERROR;
586 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
588 return NVME_SUCCESS;
591 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
592 size_t len)
594 if (!len) {
595 return NVME_SUCCESS;
598 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
599 return NVME_DATA_TRAS_ERROR;
602 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
604 return NVME_SUCCESS;
607 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
609 bool cmb = false, pmr = false;
611 if (!len) {
612 return NVME_SUCCESS;
615 trace_pci_nvme_map_addr(addr, len);
617 if (nvme_addr_is_cmb(n, addr)) {
618 cmb = true;
619 } else if (nvme_addr_is_pmr(n, addr)) {
620 pmr = true;
623 if (cmb || pmr) {
624 if (sg->flags & NVME_SG_DMA) {
625 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
628 if (sg->iov.niov + 1 > IOV_MAX) {
629 goto max_mappings_exceeded;
632 if (cmb) {
633 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
634 } else {
635 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
639 if (!(sg->flags & NVME_SG_DMA)) {
640 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
643 if (sg->qsg.nsg + 1 > IOV_MAX) {
644 goto max_mappings_exceeded;
647 qemu_sglist_add(&sg->qsg, addr, len);
649 return NVME_SUCCESS;
651 max_mappings_exceeded:
652 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
653 "number of mappings exceed 1024");
654 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
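/*
 * Note on nvme_map_addr() above: addresses inside the CMB or PMR are mapped
 * as host pointers into an iovec, while ordinary guest addresses go through
 * the DMA scatter/gather list; a single command may not mix the two, hence
 * the Invalid Use of CMB errors.
 */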
657 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
659 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
662 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
663 uint64_t prp2, uint32_t len)
665 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
666 trans_len = MIN(len, trans_len);
667 int num_prps = (len >> n->page_bits) + 1;
668 uint16_t status;
669 int ret;
671 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
673 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
675 status = nvme_map_addr(n, sg, prp1, trans_len);
676 if (status) {
677 goto unmap;
680 len -= trans_len;
681 if (len) {
682 if (len > n->page_size) {
683 uint64_t prp_list[n->max_prp_ents];
684 uint32_t nents, prp_trans;
685 int i = 0;
688 * The first PRP list entry, pointed to by PRP2, may contain an offset.
689 * Hence, we need to calculate the number of entries based on
690 * that offset.
692 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
693 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
694 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
695 if (ret) {
696 trace_pci_nvme_err_addr_read(prp2);
697 status = NVME_DATA_TRAS_ERROR;
698 goto unmap;
700 while (len != 0) {
701 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
703 if (i == nents - 1 && len > n->page_size) {
704 if (unlikely(prp_ent & (n->page_size - 1))) {
705 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
706 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
707 goto unmap;
710 i = 0;
711 nents = (len + n->page_size - 1) >> n->page_bits;
712 nents = MIN(nents, n->max_prp_ents);
713 prp_trans = nents * sizeof(uint64_t);
714 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
715 prp_trans);
716 if (ret) {
717 trace_pci_nvme_err_addr_read(prp_ent);
718 status = NVME_DATA_TRAS_ERROR;
719 goto unmap;
721 prp_ent = le64_to_cpu(prp_list[i]);
724 if (unlikely(prp_ent & (n->page_size - 1))) {
725 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
726 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
727 goto unmap;
730 trans_len = MIN(len, n->page_size);
731 status = nvme_map_addr(n, sg, prp_ent, trans_len);
732 if (status) {
733 goto unmap;
736 len -= trans_len;
737 i++;
739 } else {
740 if (unlikely(prp2 & (n->page_size - 1))) {
741 trace_pci_nvme_err_invalid_prp2_align(prp2);
742 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
743 goto unmap;
745 status = nvme_map_addr(n, sg, prp2, len);
746 if (status) {
747 goto unmap;
752 return NVME_SUCCESS;
754 unmap:
755 nvme_sg_unmap(sg);
756 return status;
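/*
 * Informal PRP recap for nvme_map_prp() above: PRP1 addresses the first,
 * possibly offset, page of the transfer; a remainder of at most one page is
 * addressed directly by PRP2, while larger remainders make PRP2 point to a
 * PRP list of little-endian 64-bit entries whose last entry may chain to a
 * further list page.
 */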
760 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
761 * number of bytes mapped from *len.
763 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
764 NvmeSglDescriptor *segment, uint64_t nsgld,
765 size_t *len, NvmeCmd *cmd)
767 dma_addr_t addr, trans_len;
768 uint32_t dlen;
769 uint16_t status;
771 for (int i = 0; i < nsgld; i++) {
772 uint8_t type = NVME_SGL_TYPE(segment[i].type);
774 switch (type) {
775 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
776 if (cmd->opcode == NVME_CMD_WRITE) {
777 continue;
779 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
780 break;
781 case NVME_SGL_DESCR_TYPE_SEGMENT:
782 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
783 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
784 default:
785 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
788 dlen = le32_to_cpu(segment[i].len);
790 if (!dlen) {
791 continue;
794 if (*len == 0) {
796 * All data has been mapped, but the SGL contains additional
797 * segments and/or descriptors. The controller might accept
798 * ignoring the rest of the SGL.
800 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
801 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
802 break;
805 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
806 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
809 trans_len = MIN(*len, dlen);
811 if (type == NVME_SGL_DESCR_TYPE_BIT_BUCKET) {
812 goto next;
815 addr = le64_to_cpu(segment[i].addr);
817 if (UINT64_MAX - addr < dlen) {
818 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
821 status = nvme_map_addr(n, sg, addr, trans_len);
822 if (status) {
823 return status;
826 next:
827 *len -= trans_len;
830 return NVME_SUCCESS;
833 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
834 size_t len, NvmeCmd *cmd)
837 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
838 * dynamically allocating a potentially huge SGL. The spec allows the SGL
839 * to be larger (as in number of bytes required to describe the SGL
840 * descriptors and segment chain) than the command transfer size, so it is
841 * not bounded by MDTS.
843 const int SEG_CHUNK_SIZE = 256;
845 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
846 uint64_t nsgld;
847 uint32_t seg_len;
848 uint16_t status;
849 hwaddr addr;
850 int ret;
852 sgld = &sgl;
853 addr = le64_to_cpu(sgl.addr);
855 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
857 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
860 * If the entire transfer can be described with a single data block it can
861 * be mapped directly.
863 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
864 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
865 if (status) {
866 goto unmap;
869 goto out;
872 for (;;) {
873 switch (NVME_SGL_TYPE(sgld->type)) {
874 case NVME_SGL_DESCR_TYPE_SEGMENT:
875 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
876 break;
877 default:
878 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
881 seg_len = le32_to_cpu(sgld->len);
883 /* check the length of the (Last) Segment descriptor */
884 if ((!seg_len || seg_len & 0xf) &&
885 (NVME_SGL_TYPE(sgld->type) != NVME_SGL_DESCR_TYPE_BIT_BUCKET)) {
886 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
889 if (UINT64_MAX - addr < seg_len) {
890 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
893 nsgld = seg_len / sizeof(NvmeSglDescriptor);
895 while (nsgld > SEG_CHUNK_SIZE) {
896 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
897 trace_pci_nvme_err_addr_read(addr);
898 status = NVME_DATA_TRAS_ERROR;
899 goto unmap;
902 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
903 &len, cmd);
904 if (status) {
905 goto unmap;
908 nsgld -= SEG_CHUNK_SIZE;
909 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
912 ret = nvme_addr_read(n, addr, segment, nsgld *
913 sizeof(NvmeSglDescriptor));
914 if (ret) {
915 trace_pci_nvme_err_addr_read(addr);
916 status = NVME_DATA_TRAS_ERROR;
917 goto unmap;
920 last_sgld = &segment[nsgld - 1];
923 * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
924 * then we are done.
926 switch (NVME_SGL_TYPE(last_sgld->type)) {
927 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
928 case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
929 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
930 if (status) {
931 goto unmap;
934 goto out;
936 default:
937 break;
941 * If the last descriptor was not a Data Block or Bit Bucket, then the
942 * current segment must not be a Last Segment.
944 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
945 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
946 goto unmap;
949 sgld = last_sgld;
950 addr = le64_to_cpu(sgld->addr);
953 * Do not map the last descriptor; it will be a Segment or Last Segment
954 * descriptor and is handled by the next iteration.
956 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
957 if (status) {
958 goto unmap;
962 out:
963 /* if there is any residual left in len, the SGL was too short */
964 if (len) {
965 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
966 goto unmap;
969 return NVME_SUCCESS;
971 unmap:
972 nvme_sg_unmap(sg);
973 return status;
976 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
977 NvmeCmd *cmd)
979 uint64_t prp1, prp2;
981 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
982 case NVME_PSDT_PRP:
983 prp1 = le64_to_cpu(cmd->dptr.prp1);
984 prp2 = le64_to_cpu(cmd->dptr.prp2);
986 return nvme_map_prp(n, sg, prp1, prp2, len);
987 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
988 case NVME_PSDT_SGL_MPTR_SGL:
989 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
990 default:
991 return NVME_INVALID_FIELD;
995 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
996 NvmeCmd *cmd)
998 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
999 hwaddr mptr = le64_to_cpu(cmd->mptr);
1000 uint16_t status;
1002 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1003 NvmeSglDescriptor sgl;
1005 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1006 return NVME_DATA_TRAS_ERROR;
1009 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1010 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1011 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1014 return status;
1017 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1018 status = nvme_map_addr(n, sg, mptr, len);
1019 if (status) {
1020 nvme_sg_unmap(sg);
1023 return status;
1026 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1028 NvmeNamespace *ns = req->ns;
1029 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1030 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1031 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1032 size_t len = nvme_l2b(ns, nlb);
1033 uint16_t status;
1035 if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1036 NvmeSg sg;
1038 len += nvme_m2b(ns, nlb);
1040 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1041 if (status) {
1042 return status;
1045 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1046 nvme_sg_split(&sg, ns, &req->sg, NULL);
1047 nvme_sg_unmap(&sg);
1049 return NVME_SUCCESS;
1052 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1055 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1057 NvmeNamespace *ns = req->ns;
1058 size_t len = nvme_m2b(ns, nlb);
1059 uint16_t status;
1061 if (nvme_ns_ext(ns)) {
1062 NvmeSg sg;
1064 len += nvme_l2b(ns, nlb);
1066 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1067 if (status) {
1068 return status;
1071 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1072 nvme_sg_split(&sg, ns, NULL, &req->sg);
1073 nvme_sg_unmap(&sg);
1075 return NVME_SUCCESS;
1078 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
1081 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1082 uint32_t len, uint32_t bytes,
1083 int32_t skip_bytes, int64_t offset,
1084 NvmeTxDirection dir)
1086 hwaddr addr;
1087 uint32_t trans_len, count = bytes;
1088 bool dma = sg->flags & NVME_SG_DMA;
1089 int64_t sge_len;
1090 int sg_idx = 0;
1091 int ret;
1093 assert(sg->flags & NVME_SG_ALLOC);
1095 while (len) {
1096 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1098 if (sge_len - offset < 0) {
1099 offset -= sge_len;
1100 sg_idx++;
1101 continue;
1104 if (sge_len == offset) {
1105 offset = 0;
1106 sg_idx++;
1107 continue;
1110 trans_len = MIN(len, count);
1111 trans_len = MIN(trans_len, sge_len - offset);
1113 if (dma) {
1114 addr = sg->qsg.sg[sg_idx].base + offset;
1115 } else {
1116 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1119 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1120 ret = nvme_addr_read(n, addr, ptr, trans_len);
1121 } else {
1122 ret = nvme_addr_write(n, addr, ptr, trans_len);
1125 if (ret) {
1126 return NVME_DATA_TRAS_ERROR;
1129 ptr += trans_len;
1130 len -= trans_len;
1131 count -= trans_len;
1132 offset += trans_len;
1134 if (count == 0) {
1135 count = bytes;
1136 offset += skip_bytes;
1140 return NVME_SUCCESS;
1143 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr, uint32_t len,
1144 NvmeTxDirection dir)
1146 assert(sg->flags & NVME_SG_ALLOC);
1148 if (sg->flags & NVME_SG_DMA) {
1149 uint64_t residual;
1151 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1152 residual = dma_buf_write(ptr, len, &sg->qsg);
1153 } else {
1154 residual = dma_buf_read(ptr, len, &sg->qsg);
1157 if (unlikely(residual)) {
1158 trace_pci_nvme_err_invalid_dma();
1159 return NVME_INVALID_FIELD | NVME_DNR;
1161 } else {
1162 size_t bytes;
1164 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1165 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1166 } else {
1167 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1170 if (unlikely(bytes != len)) {
1171 trace_pci_nvme_err_invalid_dma();
1172 return NVME_INVALID_FIELD | NVME_DNR;
1176 return NVME_SUCCESS;
1179 static inline uint16_t nvme_c2h(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1180 NvmeRequest *req)
1182 uint16_t status;
1184 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1185 if (status) {
1186 return status;
1189 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1192 static inline uint16_t nvme_h2c(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1193 NvmeRequest *req)
1195 uint16_t status;
1197 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1198 if (status) {
1199 return status;
1202 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1205 uint16_t nvme_bounce_data(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1206 NvmeTxDirection dir, NvmeRequest *req)
1208 NvmeNamespace *ns = req->ns;
1209 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1210 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1211 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1213 if (nvme_ns_ext(ns) && !(pi && pract && ns->lbaf.ms == 8)) {
1214 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1215 ns->lbaf.ms, 0, dir);
1218 return nvme_tx(n, &req->sg, ptr, len, dir);
1221 uint16_t nvme_bounce_mdata(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
1222 NvmeTxDirection dir, NvmeRequest *req)
1224 NvmeNamespace *ns = req->ns;
1225 uint16_t status;
1227 if (nvme_ns_ext(ns)) {
1228 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1229 ns->lbasz, ns->lbasz, dir);
1232 nvme_sg_unmap(&req->sg);
1234 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1235 if (status) {
1236 return status;
1239 return nvme_tx(n, &req->sg, ptr, len, dir);
1242 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1243 BlockCompletionFunc *cb, NvmeRequest *req)
1245 assert(req->sg.flags & NVME_SG_ALLOC);
1247 if (req->sg.flags & NVME_SG_DMA) {
1248 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1249 cb, req);
1250 } else {
1251 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1255 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1256 BlockCompletionFunc *cb, NvmeRequest *req)
1258 assert(req->sg.flags & NVME_SG_ALLOC);
1260 if (req->sg.flags & NVME_SG_DMA) {
1261 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1262 cb, req);
1263 } else {
1264 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
1268 static void nvme_post_cqes(void *opaque)
1270 NvmeCQueue *cq = opaque;
1271 NvmeCtrl *n = cq->ctrl;
1272 NvmeRequest *req, *next;
1273 bool pending = cq->head != cq->tail;
1274 int ret;
1276 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1277 NvmeSQueue *sq;
1278 hwaddr addr;
1280 if (nvme_cq_full(cq)) {
1281 break;
1284 sq = req->sq;
1285 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1286 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1287 req->cqe.sq_head = cpu_to_le16(sq->head);
1288 addr = cq->dma_addr + cq->tail * n->cqe_size;
1289 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1290 sizeof(req->cqe));
1291 if (ret) {
1292 trace_pci_nvme_err_addr_write(addr);
1293 trace_pci_nvme_err_cfs();
1294 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1295 break;
1297 QTAILQ_REMOVE(&cq->req_list, req, entry);
1298 nvme_inc_cq_tail(cq);
1299 nvme_sg_unmap(&req->sg);
1300 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1302 if (cq->tail != cq->head) {
1303 if (cq->irq_enabled && !pending) {
1304 n->cq_pending++;
1307 nvme_irq_assert(n, cq);
1311 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1313 assert(cq->cqid == req->sq->cqid);
1314 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1315 le32_to_cpu(req->cqe.result),
1316 le32_to_cpu(req->cqe.dw1),
1317 req->status);
1319 if (req->status) {
1320 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1321 req->status, req->cmd.opcode);
1324 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1325 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1326 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1329 static void nvme_process_aers(void *opaque)
1331 NvmeCtrl *n = opaque;
1332 NvmeAsyncEvent *event, *next;
1334 trace_pci_nvme_process_aers(n->aer_queued);
1336 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1337 NvmeRequest *req;
1338 NvmeAerResult *result;
1340 /* can't post cqe if there is nothing to complete */
1341 if (!n->outstanding_aers) {
1342 trace_pci_nvme_no_outstanding_aers();
1343 break;
1346 /* ignore if masked (cqe posted, but event not cleared) */
1347 if (n->aer_mask & (1 << event->result.event_type)) {
1348 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1349 continue;
1352 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1353 n->aer_queued--;
1355 n->aer_mask |= 1 << event->result.event_type;
1356 n->outstanding_aers--;
1358 req = n->aer_reqs[n->outstanding_aers];
1360 result = (NvmeAerResult *) &req->cqe.result;
1361 result->event_type = event->result.event_type;
1362 result->event_info = event->result.event_info;
1363 result->log_page = event->result.log_page;
1364 g_free(event);
1366 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1367 result->log_page);
1369 nvme_enqueue_req_completion(&n->admin_cq, req);
1373 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1374 uint8_t event_info, uint8_t log_page)
1376 NvmeAsyncEvent *event;
1378 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1380 if (n->aer_queued == n->params.aer_max_queued) {
1381 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1382 return;
1385 event = g_new(NvmeAsyncEvent, 1);
1386 event->result = (NvmeAerResult) {
1387 .event_type = event_type,
1388 .event_info = event_info,
1389 .log_page = log_page,
1392 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1393 n->aer_queued++;
1395 nvme_process_aers(n);
1398 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1400 uint8_t aer_info;
1402 /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
1403 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1404 return;
1407 switch (event) {
1408 case NVME_SMART_SPARE:
1409 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1410 break;
1411 case NVME_SMART_TEMPERATURE:
1412 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1413 break;
1414 case NVME_SMART_RELIABILITY:
1415 case NVME_SMART_MEDIA_READ_ONLY:
1416 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1417 case NVME_SMART_PMR_UNRELIABLE:
1418 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1419 break;
1420 default:
1421 return;
1424 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1427 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1429 n->aer_mask &= ~(1 << event_type);
1430 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1431 nvme_process_aers(n);
1435 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1437 uint8_t mdts = n->params.mdts;
1439 if (mdts && len > n->page_size << mdts) {
1440 trace_pci_nvme_err_mdts(len);
1441 return NVME_INVALID_FIELD | NVME_DNR;
1444 return NVME_SUCCESS;
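/*
 * Worked example for nvme_check_mdts() above, assuming the default 4 KiB
 * controller page size: with mdts=7 the limit is 4096 << 7 = 512 KiB, and
 * longer transfers are rejected with Invalid Field.
 */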
1447 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1448 uint32_t nlb)
1450 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1452 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1453 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1454 return NVME_LBA_RANGE | NVME_DNR;
1457 return NVME_SUCCESS;
1460 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1461 uint32_t nlb, int flags)
1463 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1465 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1466 int64_t offset = nvme_l2b(ns, slba);
1467 int ret;
1470 * `pnum` holds the number of bytes after offset that share the same
1471 * allocation status as the byte at offset. If `pnum` is different from
1472 * `bytes`, we should check the allocation status of the next range and
1473 * continue this until all bytes have been checked.
1475 do {
1476 bytes -= pnum;
1478 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1479 if (ret < 0) {
1480 return ret;
1484 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1485 !!(ret & BDRV_BLOCK_ZERO));
1487 if (!(ret & flags)) {
1488 return 1;
1491 offset += pnum;
1492 } while (pnum != bytes);
1494 return 0;
1497 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1498 uint32_t nlb)
1500 int ret;
1501 Error *err = NULL;
1503 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1504 if (ret) {
1505 if (ret < 0) {
1506 error_setg_errno(&err, -ret, "unable to get block status");
1507 error_report_err(err);
1509 return NVME_INTERNAL_DEV_ERROR;
1512 return NVME_DULB;
1515 return NVME_SUCCESS;
1518 static void nvme_aio_err(NvmeRequest *req, int ret)
1520 uint16_t status = NVME_SUCCESS;
1521 Error *local_err = NULL;
1523 switch (req->cmd.opcode) {
1524 case NVME_CMD_READ:
1525 status = NVME_UNRECOVERED_READ;
1526 break;
1527 case NVME_CMD_FLUSH:
1528 case NVME_CMD_WRITE:
1529 case NVME_CMD_WRITE_ZEROES:
1530 case NVME_CMD_ZONE_APPEND:
1531 status = NVME_WRITE_FAULT;
1532 break;
1533 default:
1534 status = NVME_INTERNAL_DEV_ERROR;
1535 break;
1538 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1540 error_setg_errno(&local_err, -ret, "aio failed");
1541 error_report_err(local_err);
1544 * Set the command status code to the first encountered error but allow a
1545 * subsequent Internal Device Error to trump it.
1547 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1548 return;
1551 req->status = status;
1554 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1556 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1557 slba / ns->zone_size;
1560 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1562 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1564 if (zone_idx >= ns->num_zones) {
1565 return NULL;
1568 return &ns->zone_array[zone_idx];
1571 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1573 uint64_t zslba = zone->d.zslba;
1575 switch (nvme_get_zone_state(zone)) {
1576 case NVME_ZONE_STATE_EMPTY:
1577 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1578 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1579 case NVME_ZONE_STATE_CLOSED:
1580 return NVME_SUCCESS;
1581 case NVME_ZONE_STATE_FULL:
1582 trace_pci_nvme_err_zone_is_full(zslba);
1583 return NVME_ZONE_FULL;
1584 case NVME_ZONE_STATE_OFFLINE:
1585 trace_pci_nvme_err_zone_is_offline(zslba);
1586 return NVME_ZONE_OFFLINE;
1587 case NVME_ZONE_STATE_READ_ONLY:
1588 trace_pci_nvme_err_zone_is_read_only(zslba);
1589 return NVME_ZONE_READ_ONLY;
1590 default:
1591 assert(false);
1594 return NVME_INTERNAL_DEV_ERROR;
1597 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1598 uint64_t slba, uint32_t nlb)
1600 uint64_t zcap = nvme_zone_wr_boundary(zone);
1601 uint16_t status;
1603 status = nvme_check_zone_state_for_write(zone);
1604 if (status) {
1605 return status;
1608 if (unlikely(slba != zone->w_ptr)) {
1609 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba, zone->w_ptr);
1610 return NVME_ZONE_INVALID_WRITE;
1613 if (unlikely((slba + nlb) > zcap)) {
1614 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1615 return NVME_ZONE_BOUNDARY_ERROR;
1618 return NVME_SUCCESS;
1621 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1623 switch (nvme_get_zone_state(zone)) {
1624 case NVME_ZONE_STATE_EMPTY:
1625 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1626 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1627 case NVME_ZONE_STATE_FULL:
1628 case NVME_ZONE_STATE_CLOSED:
1629 case NVME_ZONE_STATE_READ_ONLY:
1630 return NVME_SUCCESS;
1631 case NVME_ZONE_STATE_OFFLINE:
1632 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1633 return NVME_ZONE_OFFLINE;
1634 default:
1635 assert(false);
1638 return NVME_INTERNAL_DEV_ERROR;
1641 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1642 uint32_t nlb)
1644 NvmeZone *zone;
1645 uint64_t bndry, end;
1646 uint16_t status;
1648 zone = nvme_get_zone_by_slba(ns, slba);
1649 assert(zone);
1651 bndry = nvme_zone_rd_boundary(ns, zone);
1652 end = slba + nlb;
1654 status = nvme_check_zone_state_for_read(zone);
1655 if (status) {
1657 } else if (unlikely(end > bndry)) {
1658 if (!ns->params.cross_zone_read) {
1659 status = NVME_ZONE_BOUNDARY_ERROR;
1660 } else {
1662 * Read across zone boundary - check that all subsequent
1663 * zones that are being read have an appropriate state.
1665 do {
1666 zone++;
1667 status = nvme_check_zone_state_for_read(zone);
1668 if (status) {
1669 break;
1671 } while (end > nvme_zone_rd_boundary(ns, zone));
1675 return status;
1678 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1680 switch (nvme_get_zone_state(zone)) {
1681 case NVME_ZONE_STATE_FULL:
1682 return NVME_SUCCESS;
1684 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1685 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1686 nvme_aor_dec_open(ns);
1687 /* fallthrough */
1688 case NVME_ZONE_STATE_CLOSED:
1689 nvme_aor_dec_active(ns);
1690 /* fallthrough */
1691 case NVME_ZONE_STATE_EMPTY:
1692 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1693 return NVME_SUCCESS;
1695 default:
1696 return NVME_ZONE_INVAL_TRANSITION;
1700 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1702 switch (nvme_get_zone_state(zone)) {
1703 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1704 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1705 nvme_aor_dec_open(ns);
1706 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1707 /* fall through */
1708 case NVME_ZONE_STATE_CLOSED:
1709 return NVME_SUCCESS;
1711 default:
1712 return NVME_ZONE_INVAL_TRANSITION;
1716 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1718 switch (nvme_get_zone_state(zone)) {
1719 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1720 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1721 nvme_aor_dec_open(ns);
1722 /* fallthrough */
1723 case NVME_ZONE_STATE_CLOSED:
1724 nvme_aor_dec_active(ns);
1725 /* fallthrough */
1726 case NVME_ZONE_STATE_FULL:
1727 zone->w_ptr = zone->d.zslba;
1728 zone->d.wp = zone->w_ptr;
1729 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1730 /* fallthrough */
1731 case NVME_ZONE_STATE_EMPTY:
1732 return NVME_SUCCESS;
1734 default:
1735 return NVME_ZONE_INVAL_TRANSITION;
1739 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1741 NvmeZone *zone;
1743 if (ns->params.max_open_zones &&
1744 ns->nr_open_zones == ns->params.max_open_zones) {
1745 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1746 if (zone) {
1748 * Automatically close this implicitly open zone.
1750 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1751 nvme_zrm_close(ns, zone);
1756 enum {
1757 NVME_ZRM_AUTO = 1 << 0,
1760 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1761 NvmeZone *zone, int flags)
1763 int act = 0;
1764 uint16_t status;
1766 switch (nvme_get_zone_state(zone)) {
1767 case NVME_ZONE_STATE_EMPTY:
1768 act = 1;
1770 /* fallthrough */
1772 case NVME_ZONE_STATE_CLOSED:
1773 if (n->params.auto_transition_zones) {
1774 nvme_zrm_auto_transition_zone(ns);
1776 status = nvme_aor_check(ns, act, 1);
1777 if (status) {
1778 return status;
1781 if (act) {
1782 nvme_aor_inc_active(ns);
1785 nvme_aor_inc_open(ns);
1787 if (flags & NVME_ZRM_AUTO) {
1788 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1789 return NVME_SUCCESS;
1792 /* fallthrough */
1794 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1795 if (flags & NVME_ZRM_AUTO) {
1796 return NVME_SUCCESS;
1799 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1801 /* fallthrough */
1803 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1804 return NVME_SUCCESS;
1806 default:
1807 return NVME_ZONE_INVAL_TRANSITION;
1811 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1812 NvmeZone *zone)
1814 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1817 static inline uint16_t nvme_zrm_open(NvmeCtrl *n, NvmeNamespace *ns,
1818 NvmeZone *zone)
1820 return nvme_zrm_open_flags(n, ns, zone, 0);
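/*
 * Summary of the zone resource management (zrm) helpers above, as
 * implemented here: opening takes Empty/Closed zones to Implicitly or
 * Explicitly Opened (charging the active/open budgets via nvme_aor_check),
 * nvme_zrm_close returns an open zone to Closed, nvme_zrm_finish takes any
 * zone that is not offline or read-only to Full, and nvme_zrm_reset rewinds
 * the write pointer and returns the zone to Empty.
 */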
1823 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1824 uint32_t nlb)
1826 zone->d.wp += nlb;
1828 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1829 nvme_zrm_finish(ns, zone);
1833 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1835 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1836 NvmeZone *zone;
1837 uint64_t slba;
1838 uint32_t nlb;
1840 slba = le64_to_cpu(rw->slba);
1841 nlb = le16_to_cpu(rw->nlb) + 1;
1842 zone = nvme_get_zone_by_slba(ns, slba);
1843 assert(zone);
1845 nvme_advance_zone_wp(ns, zone, nlb);
1848 static inline bool nvme_is_write(NvmeRequest *req)
1850 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1852 return rw->opcode == NVME_CMD_WRITE ||
1853 rw->opcode == NVME_CMD_ZONE_APPEND ||
1854 rw->opcode == NVME_CMD_WRITE_ZEROES;
1857 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1859 return qemu_get_aio_context();
1862 static void nvme_misc_cb(void *opaque, int ret)
1864 NvmeRequest *req = opaque;
1866 trace_pci_nvme_misc_cb(nvme_cid(req));
1868 if (ret) {
1869 nvme_aio_err(req, ret);
1872 nvme_enqueue_req_completion(nvme_cq(req), req);
1875 void nvme_rw_complete_cb(void *opaque, int ret)
1877 NvmeRequest *req = opaque;
1878 NvmeNamespace *ns = req->ns;
1879 BlockBackend *blk = ns->blkconf.blk;
1880 BlockAcctCookie *acct = &req->acct;
1881 BlockAcctStats *stats = blk_get_stats(blk);
1883 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
1885 if (ret) {
1886 block_acct_failed(stats, acct);
1887 nvme_aio_err(req, ret);
1888 } else {
1889 block_acct_done(stats, acct);
1892 if (ns->params.zoned && nvme_is_write(req)) {
1893 nvme_finalize_zoned_write(ns, req);
1896 nvme_enqueue_req_completion(nvme_cq(req), req);
1899 static void nvme_rw_cb(void *opaque, int ret)
1901 NvmeRequest *req = opaque;
1902 NvmeNamespace *ns = req->ns;
1904 BlockBackend *blk = ns->blkconf.blk;
1906 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
1908 if (ret) {
1909 goto out;
1912 if (ns->lbaf.ms) {
1913 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1914 uint64_t slba = le64_to_cpu(rw->slba);
1915 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
1916 uint64_t offset = nvme_moff(ns, slba);
1918 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
1919 size_t mlen = nvme_m2b(ns, nlb);
1921 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
1922 BDRV_REQ_MAY_UNMAP,
1923 nvme_rw_complete_cb, req);
1924 return;
1927 if (nvme_ns_ext(ns) || req->cmd.mptr) {
1928 uint16_t status;
1930 nvme_sg_unmap(&req->sg);
1931 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
1932 if (status) {
1933 ret = -EFAULT;
1934 goto out;
1937 if (req->cmd.opcode == NVME_CMD_READ) {
1938 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
1941 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
1945 out:
1946 nvme_rw_complete_cb(req, ret);
1949 static void nvme_verify_cb(void *opaque, int ret)
1951 NvmeBounceContext *ctx = opaque;
1952 NvmeRequest *req = ctx->req;
1953 NvmeNamespace *ns = req->ns;
1954 BlockBackend *blk = ns->blkconf.blk;
1955 BlockAcctCookie *acct = &req->acct;
1956 BlockAcctStats *stats = blk_get_stats(blk);
1957 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1958 uint64_t slba = le64_to_cpu(rw->slba);
1959 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
1960 uint16_t apptag = le16_to_cpu(rw->apptag);
1961 uint16_t appmask = le16_to_cpu(rw->appmask);
1962 uint32_t reftag = le32_to_cpu(rw->reftag);
1963 uint16_t status;
1965 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
1967 if (ret) {
1968 block_acct_failed(stats, acct);
1969 nvme_aio_err(req, ret);
1970 goto out;
1973 block_acct_done(stats, acct);
1975 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
1976 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
1977 ctx->mdata.iov.size, slba);
1978 if (status) {
1979 req->status = status;
1980 goto out;
1983 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
1984 ctx->mdata.bounce, ctx->mdata.iov.size,
1985 prinfo, slba, apptag, appmask, &reftag);
1988 out:
1989 qemu_iovec_destroy(&ctx->data.iov);
1990 g_free(ctx->data.bounce);
1992 qemu_iovec_destroy(&ctx->mdata.iov);
1993 g_free(ctx->mdata.bounce);
1995 g_free(ctx);
1997 nvme_enqueue_req_completion(nvme_cq(req), req);
2001 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2003 NvmeBounceContext *ctx = opaque;
2004 NvmeRequest *req = ctx->req;
2005 NvmeNamespace *ns = req->ns;
2006 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2007 uint64_t slba = le64_to_cpu(rw->slba);
2008 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2009 size_t mlen = nvme_m2b(ns, nlb);
2010 uint64_t offset = nvme_moff(ns, slba);
2011 BlockBackend *blk = ns->blkconf.blk;
2013 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2015 if (ret) {
2016 goto out;
2019 ctx->mdata.bounce = g_malloc(mlen);
2021 qemu_iovec_reset(&ctx->mdata.iov);
2022 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2024 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2025 nvme_verify_cb, ctx);
2026 return;
2028 out:
2029 nvme_verify_cb(ctx, ret);
2032 struct nvme_compare_ctx {
2033 struct {
2034 QEMUIOVector iov;
2035 uint8_t *bounce;
2036 } data;
2038 struct {
2039 QEMUIOVector iov;
2040 uint8_t *bounce;
2041 } mdata;
2044 static void nvme_compare_mdata_cb(void *opaque, int ret)
2046 NvmeRequest *req = opaque;
2047 NvmeNamespace *ns = req->ns;
2048 NvmeCtrl *n = nvme_ctrl(req);
2049 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2050 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2051 uint16_t apptag = le16_to_cpu(rw->apptag);
2052 uint16_t appmask = le16_to_cpu(rw->appmask);
2053 uint32_t reftag = le32_to_cpu(rw->reftag);
2054 struct nvme_compare_ctx *ctx = req->opaque;
2055 g_autofree uint8_t *buf = NULL;
2056 BlockBackend *blk = ns->blkconf.blk;
2057 BlockAcctCookie *acct = &req->acct;
2058 BlockAcctStats *stats = blk_get_stats(blk);
2059 uint16_t status = NVME_SUCCESS;
2061 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2063 if (ret) {
2064 block_acct_failed(stats, acct);
2065 nvme_aio_err(req, ret);
2066 goto out;
2069 buf = g_malloc(ctx->mdata.iov.size);
2071 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2072 NVME_TX_DIRECTION_TO_DEVICE, req);
2073 if (status) {
2074 req->status = status;
2075 goto out;
2078 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2079 uint64_t slba = le64_to_cpu(rw->slba);
2080 uint8_t *bufp;
2081 uint8_t *mbufp = ctx->mdata.bounce;
2082 uint8_t *end = mbufp + ctx->mdata.iov.size;
2083 int16_t pil = 0;
2085 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2086 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2087 slba, apptag, appmask, &reftag);
2088 if (status) {
2089 req->status = status;
2090 goto out;
2094 * When formatted with protection information, do not compare the DIF
2095 * tuple.
2097 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2098 pil = ns->lbaf.ms - sizeof(NvmeDifTuple);
2101 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2102 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2103 req->status = NVME_CMP_FAILURE;
2104 goto out;
2108 goto out;
2111 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2112 req->status = NVME_CMP_FAILURE;
2113 goto out;
2116 block_acct_done(stats, acct);
2118 out:
2119 qemu_iovec_destroy(&ctx->data.iov);
2120 g_free(ctx->data.bounce);
2122 qemu_iovec_destroy(&ctx->mdata.iov);
2123 g_free(ctx->mdata.bounce);
2125 g_free(ctx);
2127 nvme_enqueue_req_completion(nvme_cq(req), req);
2130 static void nvme_compare_data_cb(void *opaque, int ret)
2132 NvmeRequest *req = opaque;
2133 NvmeCtrl *n = nvme_ctrl(req);
2134 NvmeNamespace *ns = req->ns;
2135 BlockBackend *blk = ns->blkconf.blk;
2136 BlockAcctCookie *acct = &req->acct;
2137 BlockAcctStats *stats = blk_get_stats(blk);
2139 struct nvme_compare_ctx *ctx = req->opaque;
2140 g_autofree uint8_t *buf = NULL;
2141 uint16_t status;
2143 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2145 if (ret) {
2146 block_acct_failed(stats, acct);
2147 nvme_aio_err(req, ret);
2148 goto out;
2151 buf = g_malloc(ctx->data.iov.size);
2153 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2154 NVME_TX_DIRECTION_TO_DEVICE, req);
2155 if (status) {
2156 req->status = status;
2157 goto out;
2160 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2161 req->status = NVME_CMP_FAILURE;
2162 goto out;
2165 if (ns->lbaf.ms) {
2166 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2167 uint64_t slba = le64_to_cpu(rw->slba);
2168 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2169 size_t mlen = nvme_m2b(ns, nlb);
2170 uint64_t offset = nvme_moff(ns, slba);
2172 ctx->mdata.bounce = g_malloc(mlen);
2174 qemu_iovec_init(&ctx->mdata.iov, 1);
2175 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2177 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2178 nvme_compare_mdata_cb, req);
2179 return;
2182 block_acct_done(stats, acct);
2184 out:
2185 qemu_iovec_destroy(&ctx->data.iov);
2186 g_free(ctx->data.bounce);
2187 g_free(ctx);
2189 nvme_enqueue_req_completion(nvme_cq(req), req);
2192 typedef struct NvmeDSMAIOCB {
2193 BlockAIOCB common;
2194 BlockAIOCB *aiocb;
2195 NvmeRequest *req;
2196 QEMUBH *bh;
2197 int ret;
2199 NvmeDsmRange *range;
2200 unsigned int nr;
2201 unsigned int idx;
2202 } NvmeDSMAIOCB;
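/*
 * Dataset Management (deallocate) runs as a callback-driven loop:
 * nvme_dsm_cb() issues one discard per range, nvme_dsm_md_cb() zeroes the
 * metadata of a range whose blocks actually read back as zeroes, and the
 * final completion is deferred to the nvme_dsm_bh() bottom half.
 */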
2204 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2206 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2208 /* break nvme_dsm_cb loop */
2209 iocb->idx = iocb->nr;
2210 iocb->ret = -ECANCELED;
2212 if (iocb->aiocb) {
2213 blk_aio_cancel_async(iocb->aiocb);
2214 iocb->aiocb = NULL;
2215 } else {
2217 * We only reach this if nvme_dsm_cancel() has already been called or
2218 * the command ran to completion and nvme_dsm_bh is scheduled to run.
2220 assert(iocb->idx == iocb->nr);
2224 static const AIOCBInfo nvme_dsm_aiocb_info = {
2225 .aiocb_size = sizeof(NvmeDSMAIOCB),
2226 .cancel_async = nvme_dsm_cancel,
2229 static void nvme_dsm_bh(void *opaque)
2231 NvmeDSMAIOCB *iocb = opaque;
2233 iocb->common.cb(iocb->common.opaque, iocb->ret);
2235 qemu_bh_delete(iocb->bh);
2236 iocb->bh = NULL;
2237 qemu_aio_unref(iocb);
2240 static void nvme_dsm_cb(void *opaque, int ret);
2242 static void nvme_dsm_md_cb(void *opaque, int ret)
2244 NvmeDSMAIOCB *iocb = opaque;
2245 NvmeRequest *req = iocb->req;
2246 NvmeNamespace *ns = req->ns;
2247 NvmeDsmRange *range;
2248 uint64_t slba;
2249 uint32_t nlb;
2251 if (ret < 0) {
2252 iocb->ret = ret;
2253 goto done;
2256 if (!ns->lbaf.ms) {
2257 nvme_dsm_cb(iocb, 0);
2258 return;
2261 range = &iocb->range[iocb->idx - 1];
2262 slba = le64_to_cpu(range->slba);
2263 nlb = le32_to_cpu(range->nlb);
2266 * Check that all blocks were discarded (zeroed); otherwise we do not zero
2267 * the metadata.
2270 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2271 if (ret) {
2272 if (ret < 0) {
2273 iocb->ret = ret;
2274 goto done;
2277 nvme_dsm_cb(iocb, 0);
2280 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2281 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2282 nvme_dsm_cb, iocb);
2283 return;
2285 done:
2286 iocb->aiocb = NULL;
2287 qemu_bh_schedule(iocb->bh);
2290 static void nvme_dsm_cb(void *opaque, int ret)
2292 NvmeDSMAIOCB *iocb = opaque;
2293 NvmeRequest *req = iocb->req;
2294 NvmeCtrl *n = nvme_ctrl(req);
2295 NvmeNamespace *ns = req->ns;
2296 NvmeDsmRange *range;
2297 uint64_t slba;
2298 uint32_t nlb;
2300 if (ret < 0) {
2301 iocb->ret = ret;
2302 goto done;
2305 next:
2306 if (iocb->idx == iocb->nr) {
2307 goto done;
2310 range = &iocb->range[iocb->idx++];
2311 slba = le64_to_cpu(range->slba);
2312 nlb = le32_to_cpu(range->nlb);
2314 trace_pci_nvme_dsm_deallocate(slba, nlb);
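/*
 * Ranges that exceed the advertised single-range limit (dmrsl) or that
 * fall outside the namespace are silently skipped rather than failed.
 */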
2316 if (nlb > n->dmrsl) {
2317 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2318 goto next;
2321 if (nvme_check_bounds(ns, slba, nlb)) {
2322 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2323 ns->id_ns.nsze);
2324 goto next;
2327 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2328 nvme_l2b(ns, nlb),
2329 nvme_dsm_md_cb, iocb);
2330 return;
2332 done:
2333 iocb->aiocb = NULL;
2334 qemu_bh_schedule(iocb->bh);
2337 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2339 NvmeNamespace *ns = req->ns;
2340 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2341 uint32_t attr = le32_to_cpu(dsm->attributes);
2342 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2343 uint16_t status = NVME_SUCCESS;
2345 trace_pci_nvme_dsm(nr, attr);
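/*
 * Only the Deallocate (AD) attribute bit triggers any I/O; other attributes
 * are accepted but ignored and the command simply completes with success.
 */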
2347 if (attr & NVME_DSMGMT_AD) {
2348 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2349 nvme_misc_cb, req);
2351 iocb->req = req;
2352 iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2353 iocb->ret = 0;
2354 iocb->range = g_new(NvmeDsmRange, nr);
2355 iocb->nr = nr;
2356 iocb->idx = 0;
2358 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2359 req);
2360 if (status) {
2361 return status;
2364 req->aiocb = &iocb->common;
2365 nvme_dsm_cb(iocb, 0);
2367 return NVME_NO_COMPLETE;
2370 return status;
2373 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2375 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2376 NvmeNamespace *ns = req->ns;
2377 BlockBackend *blk = ns->blkconf.blk;
2378 uint64_t slba = le64_to_cpu(rw->slba);
2379 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2380 size_t len = nvme_l2b(ns, nlb);
2381 int64_t offset = nvme_l2b(ns, slba);
2382 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2383 uint32_t reftag = le32_to_cpu(rw->reftag);
2384 NvmeBounceContext *ctx = NULL;
2385 uint16_t status;
2387 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
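/*
 * Verify is emulated by reading the data (and, further down the callback
 * chain, the metadata) into bounce buffers and running the end-to-end
 * protection checks on them; nothing is written back. The transfer size is
 * bounded by the vsl parameter (page_size << vsl) rather than MDTS.
 */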
2389 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2390 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2391 if (status) {
2392 return status;
2395 if (prinfo & NVME_PRINFO_PRACT) {
2396 return NVME_INVALID_PROT_INFO | NVME_DNR;
2400 if (len > n->page_size << n->params.vsl) {
2401 return NVME_INVALID_FIELD | NVME_DNR;
2404 status = nvme_check_bounds(ns, slba, nlb);
2405 if (status) {
2406 return status;
2409 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2410 status = nvme_check_dulbe(ns, slba, nlb);
2411 if (status) {
2412 return status;
2416 ctx = g_new0(NvmeBounceContext, 1);
2417 ctx->req = req;
2419 ctx->data.bounce = g_malloc(len);
2421 qemu_iovec_init(&ctx->data.iov, 1);
2422 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2424 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2425 BLOCK_ACCT_READ);
2427 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2428 nvme_verify_mdata_in_cb, ctx);
2429 return NVME_NO_COMPLETE;
2432 typedef struct NvmeCopyAIOCB {
2433 BlockAIOCB common;
2434 BlockAIOCB *aiocb;
2435 NvmeRequest *req;
2436 QEMUBH *bh;
2437 int ret;
2439 NvmeCopySourceRange *ranges;
2440 int nr;
2441 int idx;
2443 uint8_t *bounce;
2444 QEMUIOVector iov;
2445 struct {
2446 BlockAcctCookie read;
2447 BlockAcctCookie write;
2448 } acct;
2450 uint32_t reftag;
2451 uint64_t slba;
2453 NvmeZone *zone;
2454 } NvmeCopyAIOCB;
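/*
 * Copy handles one source range at a time: nvme_copy_cb() reads the source
 * data into the bounce buffer, nvme_copy_in_cb() reads the source metadata,
 * nvme_copy_in_completed_cb() checks/generates protection information and
 * writes the data to the destination, nvme_copy_out_cb() writes the metadata
 * and nvme_copy_out_completed_cb() advances slba/idx and loops back.
 * nvme_copy_bh() completes the command and, if processing stopped early,
 * reports the index of the failing range in the CQE result field.
 */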
2456 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2458 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2460 iocb->ret = -ECANCELED;
2462 if (iocb->aiocb) {
2463 blk_aio_cancel_async(iocb->aiocb);
2464 iocb->aiocb = NULL;
2468 static const AIOCBInfo nvme_copy_aiocb_info = {
2469 .aiocb_size = sizeof(NvmeCopyAIOCB),
2470 .cancel_async = nvme_copy_cancel,
2473 static void nvme_copy_bh(void *opaque)
2475 NvmeCopyAIOCB *iocb = opaque;
2476 NvmeRequest *req = iocb->req;
2477 NvmeNamespace *ns = req->ns;
2478 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2480 if (iocb->idx != iocb->nr) {
2481 req->cqe.result = cpu_to_le32(iocb->idx);
2484 qemu_iovec_destroy(&iocb->iov);
2485 g_free(iocb->bounce);
2487 qemu_bh_delete(iocb->bh);
2488 iocb->bh = NULL;
2490 if (iocb->ret < 0) {
2491 block_acct_failed(stats, &iocb->acct.read);
2492 block_acct_failed(stats, &iocb->acct.write);
2493 } else {
2494 block_acct_done(stats, &iocb->acct.read);
2495 block_acct_done(stats, &iocb->acct.write);
2498 iocb->common.cb(iocb->common.opaque, iocb->ret);
2499 qemu_aio_unref(iocb);
2502 static void nvme_copy_cb(void *opaque, int ret);
2504 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2506 NvmeCopyAIOCB *iocb = opaque;
2507 NvmeRequest *req = iocb->req;
2508 NvmeNamespace *ns = req->ns;
2509 NvmeCopySourceRange *range = &iocb->ranges[iocb->idx];
2510 uint32_t nlb = le32_to_cpu(range->nlb) + 1;
2512 if (ret < 0) {
2513 iocb->ret = ret;
2514 goto out;
2515 } else if (iocb->ret < 0) {
2516 goto out;
2519 if (ns->params.zoned) {
2520 nvme_advance_zone_wp(ns, iocb->zone, nlb);
2523 iocb->idx++;
2524 iocb->slba += nlb;
2525 out:
2526 nvme_copy_cb(iocb, iocb->ret);
2529 static void nvme_copy_out_cb(void *opaque, int ret)
2531 NvmeCopyAIOCB *iocb = opaque;
2532 NvmeRequest *req = iocb->req;
2533 NvmeNamespace *ns = req->ns;
2534 NvmeCopySourceRange *range;
2535 uint32_t nlb;
2536 size_t mlen;
2537 uint8_t *mbounce;
2539 if (ret < 0) {
2540 iocb->ret = ret;
2541 goto out;
2542 } else if (iocb->ret < 0) {
2543 goto out;
2546 if (!ns->lbaf.ms) {
2547 nvme_copy_out_completed_cb(iocb, 0);
2548 return;
2551 range = &iocb->ranges[iocb->idx];
2552 nlb = le32_to_cpu(range->nlb) + 1;
2554 mlen = nvme_m2b(ns, nlb);
2555 mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2557 qemu_iovec_reset(&iocb->iov);
2558 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2560 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2561 &iocb->iov, 0, nvme_copy_out_completed_cb,
2562 iocb);
2564 return;
2566 out:
2567 nvme_copy_cb(iocb, ret);
2570 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2572 NvmeCopyAIOCB *iocb = opaque;
2573 NvmeRequest *req = iocb->req;
2574 NvmeNamespace *ns = req->ns;
2575 NvmeCopySourceRange *range;
2576 uint32_t nlb;
2577 size_t len;
2578 uint16_t status;
2580 if (ret < 0) {
2581 iocb->ret = ret;
2582 goto out;
2583 } else if (iocb->ret < 0) {
2584 goto out;
2587 range = &iocb->ranges[iocb->idx];
2588 nlb = le32_to_cpu(range->nlb) + 1;
2589 len = nvme_l2b(ns, nlb);
2591 trace_pci_nvme_copy_out(iocb->slba, nlb);
2593 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2594 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2596 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2597 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2599 uint16_t apptag = le16_to_cpu(range->apptag);
2600 uint16_t appmask = le16_to_cpu(range->appmask);
2601 uint32_t reftag = le32_to_cpu(range->reftag);
2603 uint64_t slba = le64_to_cpu(range->slba);
2604 size_t mlen = nvme_m2b(ns, nlb);
2605 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2607 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2608 slba, apptag, appmask, &reftag);
2609 if (status) {
2610 goto invalid;
2613 apptag = le16_to_cpu(copy->apptag);
2614 appmask = le16_to_cpu(copy->appmask);
2616 if (prinfow & NVME_PRINFO_PRACT) {
2617 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2618 if (status) {
2619 goto invalid;
2622 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2623 apptag, &iocb->reftag);
2624 } else {
2625 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2626 prinfow, iocb->slba, apptag, appmask,
2627 &iocb->reftag);
2628 if (status) {
2629 goto invalid;
2634 status = nvme_check_bounds(ns, iocb->slba, nlb);
2635 if (status) {
2636 goto invalid;
2639 if (ns->params.zoned) {
2640 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2641 if (status) {
2642 goto invalid;
2645 iocb->zone->w_ptr += nlb;
2648 qemu_iovec_reset(&iocb->iov);
2649 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2651 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2652 &iocb->iov, 0, nvme_copy_out_cb, iocb);
2654 return;
2656 invalid:
2657 req->status = status;
2658 iocb->aiocb = NULL;
2659 if (iocb->bh) {
2660 qemu_bh_schedule(iocb->bh);
2663 return;
2665 out:
2666 nvme_copy_cb(iocb, ret);
2669 static void nvme_copy_in_cb(void *opaque, int ret)
2671 NvmeCopyAIOCB *iocb = opaque;
2672 NvmeRequest *req = iocb->req;
2673 NvmeNamespace *ns = req->ns;
2674 NvmeCopySourceRange *range;
2675 uint64_t slba;
2676 uint32_t nlb;
2678 if (ret < 0) {
2679 iocb->ret = ret;
2680 goto out;
2681 } else if (iocb->ret < 0) {
2682 goto out;
2685 if (!ns->lbaf.ms) {
2686 nvme_copy_in_completed_cb(iocb, 0);
2687 return;
2690 range = &iocb->ranges[iocb->idx];
2691 slba = le64_to_cpu(range->slba);
2692 nlb = le32_to_cpu(range->nlb) + 1;
2694 qemu_iovec_reset(&iocb->iov);
2695 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2696 nvme_m2b(ns, nlb));
2698 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2699 &iocb->iov, 0, nvme_copy_in_completed_cb,
2700 iocb);
2701 return;
2703 out:
2704 nvme_copy_cb(iocb, iocb->ret);
2707 static void nvme_copy_cb(void *opaque, int ret)
2709 NvmeCopyAIOCB *iocb = opaque;
2710 NvmeRequest *req = iocb->req;
2711 NvmeNamespace *ns = req->ns;
2712 NvmeCopySourceRange *range;
2713 uint64_t slba;
2714 uint32_t nlb;
2715 size_t len;
2716 uint16_t status;
2718 if (ret < 0) {
2719 iocb->ret = ret;
2720 goto done;
2721 } else if (iocb->ret < 0) {
2722 goto done;
2725 if (iocb->idx == iocb->nr) {
2726 goto done;
2729 range = &iocb->ranges[iocb->idx];
2730 slba = le64_to_cpu(range->slba);
2731 nlb = le32_to_cpu(range->nlb) + 1;
2732 len = nvme_l2b(ns, nlb);
2734 trace_pci_nvme_copy_source_range(slba, nlb);
2736 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2737 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2738 goto invalid;
2741 status = nvme_check_bounds(ns, slba, nlb);
2742 if (status) {
2743 goto invalid;
2746 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2747 status = nvme_check_dulbe(ns, slba, nlb);
2748 if (status) {
2749 goto invalid;
2753 if (ns->params.zoned) {
2754 status = nvme_check_zone_read(ns, slba, nlb);
2755 if (status) {
2756 goto invalid;
2760 qemu_iovec_reset(&iocb->iov);
2761 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2763 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2764 &iocb->iov, 0, nvme_copy_in_cb, iocb);
2765 return;
2767 invalid:
2768 req->status = status;
2769 done:
2770 iocb->aiocb = NULL;
2771 if (iocb->bh) {
2772 qemu_bh_schedule(iocb->bh);
2777 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
2779 NvmeNamespace *ns = req->ns;
2780 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2781 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
2782 nvme_misc_cb, req);
2783 uint16_t nr = copy->nr + 1;
2784 uint8_t format = copy->control[0] & 0xf;
2785 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2786 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2788 uint16_t status;
2790 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
2792 iocb->ranges = NULL;
2793 iocb->zone = NULL;
2795 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
2796 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
2797 status = NVME_INVALID_FIELD | NVME_DNR;
2798 goto invalid;
2801 if (!(n->id_ctrl.ocfs & (1 << format))) {
2802 trace_pci_nvme_err_copy_invalid_format(format);
2803 status = NVME_INVALID_FIELD | NVME_DNR;
2804 goto invalid;
2807 if (nr > ns->id_ns.msrc + 1) {
2808 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2809 goto invalid;
2812 iocb->ranges = g_new(NvmeCopySourceRange, nr);
2814 status = nvme_h2c(n, (uint8_t *)iocb->ranges,
2815 sizeof(NvmeCopySourceRange) * nr, req);
2816 if (status) {
2817 goto invalid;
2820 iocb->slba = le64_to_cpu(copy->sdlba);
2822 if (ns->params.zoned) {
2823 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
2824 if (!iocb->zone) {
2825 status = NVME_LBA_RANGE | NVME_DNR;
2826 goto invalid;
2829 status = nvme_zrm_auto(n, ns, iocb->zone);
2830 if (status) {
2831 goto invalid;
2835 iocb->req = req;
2836 iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
2837 iocb->ret = 0;
2838 iocb->nr = nr;
2839 iocb->idx = 0;
2840 iocb->reftag = le32_to_cpu(copy->reftag);
2841 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
2842 ns->lbasz + ns->lbaf.ms);
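/*
 * The bounce buffer is sized for the worst-case single source range: MSSRL
 * logical blocks of data plus their metadata. The data occupies the first
 * nvme_l2b(ns, nlb) bytes of the buffer and the metadata follows it.
 */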
2844 qemu_iovec_init(&iocb->iov, 1);
2846 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
2847 BLOCK_ACCT_READ);
2848 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
2849 BLOCK_ACCT_WRITE);
2851 req->aiocb = &iocb->common;
2852 nvme_copy_cb(iocb, 0);
2854 return NVME_NO_COMPLETE;
2856 invalid:
2857 g_free(iocb->ranges);
2858 qemu_aio_unref(iocb);
2859 return status;
2862 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
2864 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2865 NvmeNamespace *ns = req->ns;
2866 BlockBackend *blk = ns->blkconf.blk;
2867 uint64_t slba = le64_to_cpu(rw->slba);
2868 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2869 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2870 size_t data_len = nvme_l2b(ns, nlb);
2871 size_t len = data_len;
2872 int64_t offset = nvme_l2b(ns, slba);
2873 struct nvme_compare_ctx *ctx = NULL;
2874 uint16_t status;
2876 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2878 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
2879 return NVME_INVALID_PROT_INFO | NVME_DNR;
2882 if (nvme_ns_ext(ns)) {
2883 len += nvme_m2b(ns, nlb);
2886 status = nvme_check_mdts(n, len);
2887 if (status) {
2888 return status;
2891 status = nvme_check_bounds(ns, slba, nlb);
2892 if (status) {
2893 return status;
2896 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2897 status = nvme_check_dulbe(ns, slba, nlb);
2898 if (status) {
2899 return status;
2903 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
2904 if (status) {
2905 return status;
2908 ctx = g_new(struct nvme_compare_ctx, 1);
2909 ctx->data.bounce = g_malloc(data_len);
2911 req->opaque = ctx;
2913 qemu_iovec_init(&ctx->data.iov, 1);
2914 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
2916 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
2917 BLOCK_ACCT_READ);
2918 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
2919 nvme_compare_data_cb, req);
2921 return NVME_NO_COMPLETE;
2924 typedef struct NvmeFlushAIOCB {
2925 BlockAIOCB common;
2926 BlockAIOCB *aiocb;
2927 NvmeRequest *req;
2928 QEMUBH *bh;
2929 int ret;
2931 NvmeNamespace *ns;
2932 uint32_t nsid;
2933 bool broadcast;
2934 } NvmeFlushAIOCB;
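/*
 * A Flush with NSID FFFFFFFFh is broadcast to all attached namespaces:
 * nvme_flush_bh() selects the next namespace and nvme_flush_ns_cb() issues
 * blk_aio_flush() for it, alternating until no namespaces remain.
 */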
2936 static void nvme_flush_cancel(BlockAIOCB *acb)
2938 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
2940 iocb->ret = -ECANCELED;
2942 if (iocb->aiocb) {
2943 blk_aio_cancel_async(iocb->aiocb);
2947 static const AIOCBInfo nvme_flush_aiocb_info = {
2948 .aiocb_size = sizeof(NvmeFlushAIOCB),
2949 .cancel_async = nvme_flush_cancel,
2950 .get_aio_context = nvme_get_aio_context,
2953 static void nvme_flush_ns_cb(void *opaque, int ret)
2955 NvmeFlushAIOCB *iocb = opaque;
2956 NvmeNamespace *ns = iocb->ns;
2958 if (ret < 0) {
2959 iocb->ret = ret;
2960 goto out;
2961 } else if (iocb->ret < 0) {
2962 goto out;
2965 if (ns) {
2966 trace_pci_nvme_flush_ns(iocb->nsid);
2968 iocb->ns = NULL;
2969 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
2970 return;
2973 out:
2974 iocb->aiocb = NULL;
2975 qemu_bh_schedule(iocb->bh);
2978 static void nvme_flush_bh(void *opaque)
2980 NvmeFlushAIOCB *iocb = opaque;
2981 NvmeRequest *req = iocb->req;
2982 NvmeCtrl *n = nvme_ctrl(req);
2983 int i;
2985 if (iocb->ret < 0) {
2986 goto done;
2989 if (iocb->broadcast) {
2990 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
2991 iocb->ns = nvme_ns(n, i);
2992 if (iocb->ns) {
2993 iocb->nsid = i;
2994 break;
2999 if (!iocb->ns) {
3000 goto done;
3003 nvme_flush_ns_cb(iocb, 0);
3004 return;
3006 done:
3007 qemu_bh_delete(iocb->bh);
3008 iocb->bh = NULL;
3010 iocb->common.cb(iocb->common.opaque, iocb->ret);
3012 qemu_aio_unref(iocb);
3014 return;
3017 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3019 NvmeFlushAIOCB *iocb;
3020 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3021 uint16_t status;
3023 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3025 iocb->req = req;
3026 iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3027 iocb->ret = 0;
3028 iocb->ns = NULL;
3029 iocb->nsid = 0;
3030 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3032 if (!iocb->broadcast) {
3033 if (!nvme_nsid_valid(n, nsid)) {
3034 status = NVME_INVALID_NSID | NVME_DNR;
3035 goto out;
3038 iocb->ns = nvme_ns(n, nsid);
3039 if (!iocb->ns) {
3040 status = NVME_INVALID_FIELD | NVME_DNR;
3041 goto out;
3044 iocb->nsid = nsid;
3047 req->aiocb = &iocb->common;
3048 qemu_bh_schedule(iocb->bh);
3050 return NVME_NO_COMPLETE;
3052 out:
3053 qemu_bh_delete(iocb->bh);
3054 iocb->bh = NULL;
3055 qemu_aio_unref(iocb);
3057 return status;
3060 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3062 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3063 NvmeNamespace *ns = req->ns;
3064 uint64_t slba = le64_to_cpu(rw->slba);
3065 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3066 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3067 uint64_t data_size = nvme_l2b(ns, nlb);
3068 uint64_t mapped_size = data_size;
3069 uint64_t data_offset;
3070 BlockBackend *blk = ns->blkconf.blk;
3071 uint16_t status;
3073 if (nvme_ns_ext(ns)) {
3074 mapped_size += nvme_m2b(ns, nlb);
3076 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3077 bool pract = prinfo & NVME_PRINFO_PRACT;
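/*
 * When PRACT is set and the metadata consists solely of the 8-byte
 * protection information tuple, the PI is inserted/stripped by the
 * controller and never transferred to or from the host, so it does not
 * count towards the mapped size.
 */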
3079 if (pract && ns->lbaf.ms == 8) {
3080 mapped_size = data_size;
3085 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3087 status = nvme_check_mdts(n, mapped_size);
3088 if (status) {
3089 goto invalid;
3092 status = nvme_check_bounds(ns, slba, nlb);
3093 if (status) {
3094 goto invalid;
3097 if (ns->params.zoned) {
3098 status = nvme_check_zone_read(ns, slba, nlb);
3099 if (status) {
3100 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3101 goto invalid;
3105 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3106 status = nvme_check_dulbe(ns, slba, nlb);
3107 if (status) {
3108 goto invalid;
3112 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3113 return nvme_dif_rw(n, req);
3116 status = nvme_map_data(n, nlb, req);
3117 if (status) {
3118 goto invalid;
3121 data_offset = nvme_l2b(ns, slba);
3123 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3124 BLOCK_ACCT_READ);
3125 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3126 return NVME_NO_COMPLETE;
3128 invalid:
3129 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3130 return status | NVME_DNR;
3133 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3134 bool wrz)
3136 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3137 NvmeNamespace *ns = req->ns;
3138 uint64_t slba = le64_to_cpu(rw->slba);
3139 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3140 uint16_t ctrl = le16_to_cpu(rw->control);
3141 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3142 uint64_t data_size = nvme_l2b(ns, nlb);
3143 uint64_t mapped_size = data_size;
3144 uint64_t data_offset;
3145 NvmeZone *zone;
3146 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3147 BlockBackend *blk = ns->blkconf.blk;
3148 uint16_t status;
3150 if (nvme_ns_ext(ns)) {
3151 mapped_size += nvme_m2b(ns, nlb);
3153 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3154 bool pract = prinfo & NVME_PRINFO_PRACT;
3156 if (pract && ns->lbaf.ms == 8) {
3157 mapped_size -= nvme_m2b(ns, nlb);
3162 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3163 nvme_nsid(ns), nlb, mapped_size, slba);
3165 if (!wrz) {
3166 status = nvme_check_mdts(n, mapped_size);
3167 if (status) {
3168 goto invalid;
3172 status = nvme_check_bounds(ns, slba, nlb);
3173 if (status) {
3174 goto invalid;
3177 if (ns->params.zoned) {
3178 zone = nvme_get_zone_by_slba(ns, slba);
3179 assert(zone);
3181 if (append) {
3182 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3184 if (unlikely(slba != zone->d.zslba)) {
3185 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3186 status = NVME_INVALID_FIELD;
3187 goto invalid;
3190 if (n->params.zasl &&
3191 data_size > (uint64_t)n->page_size << n->params.zasl) {
3192 trace_pci_nvme_err_zasl(data_size);
3193 return NVME_INVALID_FIELD | NVME_DNR;
3196 slba = zone->w_ptr;
3197 rw->slba = cpu_to_le64(slba);
3198 res->slba = cpu_to_le64(slba);
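/*
 * Zone Append writes at the zone's write pointer (assigned to slba above)
 * and returns the LBA actually written in the completion entry. For
 * protection types 1 and 2, the switch below uses the PIREMAP bit to decide
 * whether the initial reference tag is adjusted by the offset of the write
 * pointer from the zone start.
 */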
3200 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3201 case NVME_ID_NS_DPS_TYPE_1:
3202 if (!piremap) {
3203 return NVME_INVALID_PROT_INFO | NVME_DNR;
3206 /* fallthrough */
3208 case NVME_ID_NS_DPS_TYPE_2:
3209 if (piremap) {
3210 uint32_t reftag = le32_to_cpu(rw->reftag);
3211 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3214 break;
3216 case NVME_ID_NS_DPS_TYPE_3:
3217 if (piremap) {
3218 return NVME_INVALID_PROT_INFO | NVME_DNR;
3221 break;
3225 status = nvme_check_zone_write(ns, zone, slba, nlb);
3226 if (status) {
3227 goto invalid;
3230 status = nvme_zrm_auto(n, ns, zone);
3231 if (status) {
3232 goto invalid;
3235 zone->w_ptr += nlb;
3238 data_offset = nvme_l2b(ns, slba);
3240 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3241 return nvme_dif_rw(n, req);
3244 if (!wrz) {
3245 status = nvme_map_data(n, nlb, req);
3246 if (status) {
3247 goto invalid;
3250 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3251 BLOCK_ACCT_WRITE);
3252 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3253 } else {
3254 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3255 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3256 req);
3259 return NVME_NO_COMPLETE;
3261 invalid:
3262 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3263 return status | NVME_DNR;
3266 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3268 return nvme_do_write(n, req, false, false);
3271 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3273 return nvme_do_write(n, req, false, true);
3276 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3278 return nvme_do_write(n, req, true, false);
3281 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3282 uint64_t *slba, uint32_t *zone_idx)
3284 uint32_t dw10 = le32_to_cpu(c->cdw10);
3285 uint32_t dw11 = le32_to_cpu(c->cdw11);
3287 if (!ns->params.zoned) {
3288 trace_pci_nvme_err_invalid_opc(c->opcode);
3289 return NVME_INVALID_OPCODE | NVME_DNR;
3292 *slba = ((uint64_t)dw11) << 32 | dw10;
3293 if (unlikely(*slba >= ns->id_ns.nsze)) {
3294 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3295 *slba = 0;
3296 return NVME_LBA_RANGE | NVME_DNR;
3299 *zone_idx = nvme_zone_idx(ns, *slba);
3300 assert(*zone_idx < ns->num_zones);
3302 return NVME_SUCCESS;
3305 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3306 NvmeRequest *);
3308 enum NvmeZoneProcessingMask {
3309 NVME_PROC_CURRENT_ZONE = 0,
3310 NVME_PROC_OPENED_ZONES = 1 << 0,
3311 NVME_PROC_CLOSED_ZONES = 1 << 1,
3312 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3313 NVME_PROC_FULL_ZONES = 1 << 3,
3316 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3317 NvmeZoneState state, NvmeRequest *req)
3319 return nvme_zrm_open(nvme_ctrl(req), ns, zone);
3322 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3323 NvmeZoneState state, NvmeRequest *req)
3325 return nvme_zrm_close(ns, zone);
3328 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3329 NvmeZoneState state, NvmeRequest *req)
3331 return nvme_zrm_finish(ns, zone);
3334 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3335 NvmeZoneState state, NvmeRequest *req)
3337 switch (state) {
3338 case NVME_ZONE_STATE_READ_ONLY:
3339 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3340 /* fall through */
3341 case NVME_ZONE_STATE_OFFLINE:
3342 return NVME_SUCCESS;
3343 default:
3344 return NVME_ZONE_INVAL_TRANSITION;
3348 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3350 uint16_t status;
3351 uint8_t state = nvme_get_zone_state(zone);
3353 if (state == NVME_ZONE_STATE_EMPTY) {
3354 status = nvme_aor_check(ns, 1, 0);
3355 if (status) {
3356 return status;
3358 nvme_aor_inc_active(ns);
3359 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3360 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3361 return NVME_SUCCESS;
3364 return NVME_ZONE_INVAL_TRANSITION;
3367 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3368 enum NvmeZoneProcessingMask proc_mask,
3369 op_handler_t op_hndlr, NvmeRequest *req)
3371 uint16_t status = NVME_SUCCESS;
3372 NvmeZoneState zs = nvme_get_zone_state(zone);
3373 bool proc_zone;
3375 switch (zs) {
3376 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3377 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3378 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3379 break;
3380 case NVME_ZONE_STATE_CLOSED:
3381 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3382 break;
3383 case NVME_ZONE_STATE_READ_ONLY:
3384 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3385 break;
3386 case NVME_ZONE_STATE_FULL:
3387 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3388 break;
3389 default:
3390 proc_zone = false;
3393 if (proc_zone) {
3394 status = op_hndlr(ns, zone, zs, req);
3397 return status;
3400 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3401 enum NvmeZoneProcessingMask proc_mask,
3402 op_handler_t op_hndlr, NvmeRequest *req)
3404 NvmeZone *next;
3405 uint16_t status = NVME_SUCCESS;
3406 int i;
3408 if (!proc_mask) {
3409 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3410 } else {
3411 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3412 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3413 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3414 req);
3415 if (status && status != NVME_NO_COMPLETE) {
3416 goto out;
3420 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3421 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3422 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3423 req);
3424 if (status && status != NVME_NO_COMPLETE) {
3425 goto out;
3429 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3430 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3431 req);
3432 if (status && status != NVME_NO_COMPLETE) {
3433 goto out;
3437 if (proc_mask & NVME_PROC_FULL_ZONES) {
3438 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3439 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3440 req);
3441 if (status && status != NVME_NO_COMPLETE) {
3442 goto out;
3447 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3448 for (i = 0; i < ns->num_zones; i++, zone++) {
3449 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3450 req);
3451 if (status && status != NVME_NO_COMPLETE) {
3452 goto out;
3458 out:
3459 return status;
3462 typedef struct NvmeZoneResetAIOCB {
3463 BlockAIOCB common;
3464 BlockAIOCB *aiocb;
3465 NvmeRequest *req;
3466 QEMUBH *bh;
3467 int ret;
3469 bool all;
3470 int idx;
3471 NvmeZone *zone;
3472 } NvmeZoneResetAIOCB;
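/*
 * Zone Reset (optionally with Select All) is handled asynchronously:
 * nvme_zone_reset_cb() walks the zone array, resets the zone state and
 * zeroes its data (with BDRV_REQ_MAY_UNMAP), and
 * nvme_zone_reset_epilogue_cb() then zeroes the zone's metadata before
 * moving on to the next zone.
 */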
3474 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3476 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3477 NvmeRequest *req = iocb->req;
3478 NvmeNamespace *ns = req->ns;
3480 iocb->idx = ns->num_zones;
3482 iocb->ret = -ECANCELED;
3484 if (iocb->aiocb) {
3485 blk_aio_cancel_async(iocb->aiocb);
3486 iocb->aiocb = NULL;
3490 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3491 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3492 .cancel_async = nvme_zone_reset_cancel,
3495 static void nvme_zone_reset_bh(void *opaque)
3497 NvmeZoneResetAIOCB *iocb = opaque;
3499 iocb->common.cb(iocb->common.opaque, iocb->ret);
3501 qemu_bh_delete(iocb->bh);
3502 iocb->bh = NULL;
3503 qemu_aio_unref(iocb);
3506 static void nvme_zone_reset_cb(void *opaque, int ret);
3508 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3510 NvmeZoneResetAIOCB *iocb = opaque;
3511 NvmeRequest *req = iocb->req;
3512 NvmeNamespace *ns = req->ns;
3513 int64_t moff;
3514 int count;
3516 if (ret < 0) {
3517 nvme_zone_reset_cb(iocb, ret);
3518 return;
3521 if (!ns->lbaf.ms) {
3522 nvme_zone_reset_cb(iocb, 0);
3523 return;
3526 moff = nvme_moff(ns, iocb->zone->d.zslba);
3527 count = nvme_m2b(ns, ns->zone_size);
3529 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3530 BDRV_REQ_MAY_UNMAP,
3531 nvme_zone_reset_cb, iocb);
3532 return;
3535 static void nvme_zone_reset_cb(void *opaque, int ret)
3537 NvmeZoneResetAIOCB *iocb = opaque;
3538 NvmeRequest *req = iocb->req;
3539 NvmeNamespace *ns = req->ns;
3541 if (ret < 0) {
3542 iocb->ret = ret;
3543 goto done;
3546 if (iocb->zone) {
3547 nvme_zrm_reset(ns, iocb->zone);
3549 if (!iocb->all) {
3550 goto done;
3554 while (iocb->idx < ns->num_zones) {
3555 NvmeZone *zone = &ns->zone_array[iocb->idx++];
3557 switch (nvme_get_zone_state(zone)) {
3558 case NVME_ZONE_STATE_EMPTY:
3559 if (!iocb->all) {
3560 goto done;
3563 continue;
3565 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3566 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3567 case NVME_ZONE_STATE_CLOSED:
3568 case NVME_ZONE_STATE_FULL:
3569 iocb->zone = zone;
3570 break;
3572 default:
3573 continue;
3576 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3578 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3579 nvme_l2b(ns, zone->d.zslba),
3580 nvme_l2b(ns, ns->zone_size),
3581 BDRV_REQ_MAY_UNMAP,
3582 nvme_zone_reset_epilogue_cb,
3583 iocb);
3584 return;
3587 done:
3588 iocb->aiocb = NULL;
3589 if (iocb->bh) {
3590 qemu_bh_schedule(iocb->bh);
3594 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3596 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3597 NvmeNamespace *ns = req->ns;
3598 NvmeZone *zone;
3599 NvmeZoneResetAIOCB *iocb;
3600 uint8_t *zd_ext;
3601 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3602 uint64_t slba = 0;
3603 uint32_t zone_idx = 0;
3604 uint16_t status;
3605 uint8_t action;
3606 bool all;
3607 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3609 action = dw13 & 0xff;
3610 all = !!(dw13 & 0x100);
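/*
 * Bits 7:0 of CDW13 hold the Zone Send Action and bit 8 the Select All
 * flag. With Select All, the action applies to every zone in an eligible
 * state (see the proc_mask assignments below).
 */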
3612 req->status = NVME_SUCCESS;
3614 if (!all) {
3615 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3616 if (status) {
3617 return status;
3621 zone = &ns->zone_array[zone_idx];
3622 if (slba != zone->d.zslba) {
3623 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3624 return NVME_INVALID_FIELD | NVME_DNR;
3627 switch (action) {
3629 case NVME_ZONE_ACTION_OPEN:
3630 if (all) {
3631 proc_mask = NVME_PROC_CLOSED_ZONES;
3633 trace_pci_nvme_open_zone(slba, zone_idx, all);
3634 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3635 break;
3637 case NVME_ZONE_ACTION_CLOSE:
3638 if (all) {
3639 proc_mask = NVME_PROC_OPENED_ZONES;
3641 trace_pci_nvme_close_zone(slba, zone_idx, all);
3642 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3643 break;
3645 case NVME_ZONE_ACTION_FINISH:
3646 if (all) {
3647 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3649 trace_pci_nvme_finish_zone(slba, zone_idx, all);
3650 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3651 break;
3653 case NVME_ZONE_ACTION_RESET:
3654 trace_pci_nvme_reset_zone(slba, zone_idx, all);
3656 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3657 nvme_misc_cb, req);
3659 iocb->req = req;
3660 iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3661 iocb->ret = 0;
3662 iocb->all = all;
3663 iocb->idx = zone_idx;
3664 iocb->zone = NULL;
3666 req->aiocb = &iocb->common;
3667 nvme_zone_reset_cb(iocb, 0);
3669 return NVME_NO_COMPLETE;
3671 case NVME_ZONE_ACTION_OFFLINE:
3672 if (all) {
3673 proc_mask = NVME_PROC_READ_ONLY_ZONES;
3675 trace_pci_nvme_offline_zone(slba, zone_idx, all);
3676 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3677 break;
3679 case NVME_ZONE_ACTION_SET_ZD_EXT:
3680 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3681 if (all || !ns->params.zd_extension_size) {
3682 return NVME_INVALID_FIELD | NVME_DNR;
3684 zd_ext = nvme_get_zd_extension(ns, zone_idx);
3685 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3686 if (status) {
3687 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3688 return status;
3691 status = nvme_set_zd_ext(ns, zone);
3692 if (status == NVME_SUCCESS) {
3693 trace_pci_nvme_zd_extension_set(zone_idx);
3694 return status;
3696 break;
3698 default:
3699 trace_pci_nvme_err_invalid_mgmt_action(action);
3700 status = NVME_INVALID_FIELD;
3703 if (status == NVME_ZONE_INVAL_TRANSITION) {
3704 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
3705 zone->d.za);
3707 if (status) {
3708 status |= NVME_DNR;
3711 return status;
3714 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
3716 NvmeZoneState zs = nvme_get_zone_state(zl);
3718 switch (zafs) {
3719 case NVME_ZONE_REPORT_ALL:
3720 return true;
3721 case NVME_ZONE_REPORT_EMPTY:
3722 return zs == NVME_ZONE_STATE_EMPTY;
3723 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
3724 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
3725 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
3726 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
3727 case NVME_ZONE_REPORT_CLOSED:
3728 return zs == NVME_ZONE_STATE_CLOSED;
3729 case NVME_ZONE_REPORT_FULL:
3730 return zs == NVME_ZONE_STATE_FULL;
3731 case NVME_ZONE_REPORT_READ_ONLY:
3732 return zs == NVME_ZONE_STATE_READ_ONLY;
3733 case NVME_ZONE_REPORT_OFFLINE:
3734 return zs == NVME_ZONE_STATE_OFFLINE;
3735 default:
3736 return false;
3740 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
3742 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
3743 NvmeNamespace *ns = req->ns;
3744 /* cdw12 is the zero-based number of dwords to return. Convert to bytes */
3745 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
3746 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
3747 uint32_t zone_idx, zra, zrasf, partial;
3748 uint64_t max_zones, nr_zones = 0;
3749 uint16_t status;
3750 uint64_t slba;
3751 NvmeZoneDescr *z;
3752 NvmeZone *zone;
3753 NvmeZoneReportHeader *header;
3754 void *buf, *buf_p;
3755 size_t zone_entry_sz;
3756 int i;
3758 req->status = NVME_SUCCESS;
3760 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
3761 if (status) {
3762 return status;
3765 zra = dw13 & 0xff;
3766 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
3767 return NVME_INVALID_FIELD | NVME_DNR;
3769 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
3770 return NVME_INVALID_FIELD | NVME_DNR;
3773 zrasf = (dw13 >> 8) & 0xff;
3774 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
3775 return NVME_INVALID_FIELD | NVME_DNR;
3778 if (data_size < sizeof(NvmeZoneReportHeader)) {
3779 return NVME_INVALID_FIELD | NVME_DNR;
3782 status = nvme_check_mdts(n, data_size);
3783 if (status) {
3784 return status;
3787 partial = (dw13 >> 16) & 0x01;
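/*
 * With the Partial Report bit set, the Number of Zones field in the report
 * header only counts descriptors that fit in the buffer; otherwise it
 * counts every matching zone from the start zone onwards, even though no
 * more descriptors than fit are actually returned.
 */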
3789 zone_entry_sz = sizeof(NvmeZoneDescr);
3790 if (zra == NVME_ZONE_REPORT_EXTENDED) {
3791 zone_entry_sz += ns->params.zd_extension_size;
3794 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
3795 buf = g_malloc0(data_size);
3797 zone = &ns->zone_array[zone_idx];
3798 for (i = zone_idx; i < ns->num_zones; i++) {
3799 if (partial && nr_zones >= max_zones) {
3800 break;
3802 if (nvme_zone_matches_filter(zrasf, zone++)) {
3803 nr_zones++;
3806 header = (NvmeZoneReportHeader *)buf;
3807 header->nr_zones = cpu_to_le64(nr_zones);
3809 buf_p = buf + sizeof(NvmeZoneReportHeader);
3810 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
3811 zone = &ns->zone_array[zone_idx];
3812 if (nvme_zone_matches_filter(zrasf, zone)) {
3813 z = (NvmeZoneDescr *)buf_p;
3814 buf_p += sizeof(NvmeZoneDescr);
3816 z->zt = zone->d.zt;
3817 z->zs = zone->d.zs;
3818 z->zcap = cpu_to_le64(zone->d.zcap);
3819 z->zslba = cpu_to_le64(zone->d.zslba);
3820 z->za = zone->d.za;
3822 if (nvme_wp_is_valid(zone)) {
3823 z->wp = cpu_to_le64(zone->d.wp);
3824 } else {
3825 z->wp = cpu_to_le64(~0ULL);
3828 if (zra == NVME_ZONE_REPORT_EXTENDED) {
3829 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
3830 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
3831 ns->params.zd_extension_size);
3833 buf_p += ns->params.zd_extension_size;
3836 max_zones--;
3840 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
3842 g_free(buf);
3844 return status;
3847 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
3849 NvmeNamespace *ns;
3850 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3852 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
3853 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
3855 if (!nvme_nsid_valid(n, nsid)) {
3856 return NVME_INVALID_NSID | NVME_DNR;
3860 * In the base NVM command set, Flush may apply to all namespaces
3861 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
3862 * along with TP 4056 (Namespace Types), the semantics become rather muddled.
3864 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
3865 * opcode with a specific command since we cannot determine a unique I/O
3866 * command set. Opcode 0h could mean something entirely different from
3867 * flushing; suppose it DOES have completely different
3868 * semantics in some other command set - does an NSID of FFFFFFFFh then
3869 * mean "for all namespaces, apply whatever command set specific command
3870 * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
3871 * whatever command that uses the 0h opcode if, and only if, it allows NSID
3872 * to be FFFFFFFFh"?
3874 * Anyway (and luckily), for now, we do not care about this since the
3875 * device only supports namespace types that include the NVM Flush command
3876 * (NVM and Zoned), so always do an NVM Flush.
3878 if (req->cmd.opcode == NVME_CMD_FLUSH) {
3879 return nvme_flush(n, req);
3882 ns = nvme_ns(n, nsid);
3883 if (unlikely(!ns)) {
3884 return NVME_INVALID_FIELD | NVME_DNR;
3887 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
3888 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
3889 return NVME_INVALID_OPCODE | NVME_DNR;
3892 if (ns->status) {
3893 return ns->status;
3896 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
3897 return NVME_INVALID_FIELD;
3900 req->ns = ns;
3902 switch (req->cmd.opcode) {
3903 case NVME_CMD_WRITE_ZEROES:
3904 return nvme_write_zeroes(n, req);
3905 case NVME_CMD_ZONE_APPEND:
3906 return nvme_zone_append(n, req);
3907 case NVME_CMD_WRITE:
3908 return nvme_write(n, req);
3909 case NVME_CMD_READ:
3910 return nvme_read(n, req);
3911 case NVME_CMD_COMPARE:
3912 return nvme_compare(n, req);
3913 case NVME_CMD_DSM:
3914 return nvme_dsm(n, req);
3915 case NVME_CMD_VERIFY:
3916 return nvme_verify(n, req);
3917 case NVME_CMD_COPY:
3918 return nvme_copy(n, req);
3919 case NVME_CMD_ZONE_MGMT_SEND:
3920 return nvme_zone_mgmt_send(n, req);
3921 case NVME_CMD_ZONE_MGMT_RECV:
3922 return nvme_zone_mgmt_recv(n, req);
3923 default:
3924 assert(false);
3927 return NVME_INVALID_OPCODE | NVME_DNR;
3930 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
3932 n->sq[sq->sqid] = NULL;
3933 timer_free(sq->timer);
3934 g_free(sq->io_req);
3935 if (sq->sqid) {
3936 g_free(sq);
3940 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
3942 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
3943 NvmeRequest *r, *next;
3944 NvmeSQueue *sq;
3945 NvmeCQueue *cq;
3946 uint16_t qid = le16_to_cpu(c->qid);
3948 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
3949 trace_pci_nvme_err_invalid_del_sq(qid);
3950 return NVME_INVALID_QID | NVME_DNR;
3953 trace_pci_nvme_del_sq(qid);
3955 sq = n->sq[qid];
3956 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
3957 r = QTAILQ_FIRST(&sq->out_req_list);
3958 assert(r->aiocb);
3959 blk_aio_cancel(r->aiocb);
3962 assert(QTAILQ_EMPTY(&sq->out_req_list));
3964 if (!nvme_check_cqid(n, sq->cqid)) {
3965 cq = n->cq[sq->cqid];
3966 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
3968 nvme_post_cqes(cq);
3969 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
3970 if (r->sq == sq) {
3971 QTAILQ_REMOVE(&cq->req_list, r, entry);
3972 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
3977 nvme_free_sq(sq, n);
3978 return NVME_SUCCESS;
3981 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
3982 uint16_t sqid, uint16_t cqid, uint16_t size)
3984 int i;
3985 NvmeCQueue *cq;
3987 sq->ctrl = n;
3988 sq->dma_addr = dma_addr;
3989 sq->sqid = sqid;
3990 sq->size = size;
3991 sq->cqid = cqid;
3992 sq->head = sq->tail = 0;
3993 sq->io_req = g_new0(NvmeRequest, sq->size);
3995 QTAILQ_INIT(&sq->req_list);
3996 QTAILQ_INIT(&sq->out_req_list);
3997 for (i = 0; i < sq->size; i++) {
3998 sq->io_req[i].sq = sq;
3999 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4001 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
4003 assert(n->cq[cqid]);
4004 cq = n->cq[cqid];
4005 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4006 n->sq[sqid] = sq;
4009 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4011 NvmeSQueue *sq;
4012 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4014 uint16_t cqid = le16_to_cpu(c->cqid);
4015 uint16_t sqid = le16_to_cpu(c->sqid);
4016 uint16_t qsize = le16_to_cpu(c->qsize);
4017 uint16_t qflags = le16_to_cpu(c->sq_flags);
4018 uint64_t prp1 = le64_to_cpu(c->prp1);
4020 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4022 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4023 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4024 return NVME_INVALID_CQID | NVME_DNR;
4026 if (unlikely(!sqid || sqid > n->params.max_ioqpairs ||
4027 n->sq[sqid] != NULL)) {
4028 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4029 return NVME_INVALID_QID | NVME_DNR;
4031 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4032 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4033 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4035 if (unlikely(prp1 & (n->page_size - 1))) {
4036 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4037 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4039 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4040 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4041 return NVME_INVALID_FIELD | NVME_DNR;
4043 sq = g_malloc0(sizeof(*sq));
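/* QSIZE is a 0's based value, so the queue holds qsize + 1 entries. */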
4044 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4045 return NVME_SUCCESS;
4048 struct nvme_stats {
4049 uint64_t units_read;
4050 uint64_t units_written;
4051 uint64_t read_commands;
4052 uint64_t write_commands;
4055 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4057 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4059 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4060 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4061 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4062 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4065 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4066 uint64_t off, NvmeRequest *req)
4068 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4069 struct nvme_stats stats = { 0 };
4070 NvmeSmartLog smart = { 0 };
4071 uint32_t trans_len;
4072 NvmeNamespace *ns;
4073 time_t current_ms;
4075 if (off >= sizeof(smart)) {
4076 return NVME_INVALID_FIELD | NVME_DNR;
4079 if (nsid != 0xffffffff) {
4080 ns = nvme_ns(n, nsid);
4081 if (!ns) {
4082 return NVME_INVALID_NSID | NVME_DNR;
4084 nvme_set_blk_stats(ns, &stats);
4085 } else {
4086 int i;
4088 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4089 ns = nvme_ns(n, i);
4090 if (!ns) {
4091 continue;
4093 nvme_set_blk_stats(ns, &stats);
4097 trans_len = MIN(sizeof(smart) - off, buf_len);
4098 smart.critical_warning = n->smart_critical_warning;
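/*
 * Data Units Read/Written are reported in units of 1000 512-byte blocks,
 * rounded up; e.g. reading 1 GiB (2097152 512-byte blocks) accounts for
 * 2098 data units.
 */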
4100 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4101 1000));
4102 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4103 1000));
4104 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4105 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4107 smart.temperature = cpu_to_le16(n->temperature);
4109 if ((n->temperature >= n->features.temp_thresh_hi) ||
4110 (n->temperature <= n->features.temp_thresh_low)) {
4111 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4114 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4115 smart.power_on_hours[0] =
4116 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4118 if (!rae) {
4119 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4122 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4125 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4126 NvmeRequest *req)
4128 uint32_t trans_len;
4129 NvmeFwSlotInfoLog fw_log = {
4130 .afi = 0x1,
4133 if (off >= sizeof(fw_log)) {
4134 return NVME_INVALID_FIELD | NVME_DNR;
4137 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4138 trans_len = MIN(sizeof(fw_log) - off, buf_len);
4140 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4143 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4144 uint64_t off, NvmeRequest *req)
4146 uint32_t trans_len;
4147 NvmeErrorLog errlog;
4149 if (off >= sizeof(errlog)) {
4150 return NVME_INVALID_FIELD | NVME_DNR;
4153 if (!rae) {
4154 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4157 memset(&errlog, 0x0, sizeof(errlog));
4158 trans_len = MIN(sizeof(errlog) - off, buf_len);
4160 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4163 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4164 uint64_t off, NvmeRequest *req)
4166 uint32_t nslist[1024];
4167 uint32_t trans_len;
4168 int i = 0;
4169 uint32_t nsid;
4171 memset(nslist, 0x0, sizeof(nslist));
4172 trans_len = MIN(sizeof(nslist) - off, buf_len);
4174 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4175 NVME_CHANGED_NSID_SIZE) {
4177 * If there are more than 1024 namespaces, the spec requires the first entry
4178 * in the log page to be set to FFFFFFFFh and the others to 0.
4180 if (i == ARRAY_SIZE(nslist)) {
4181 memset(nslist, 0x0, sizeof(nslist));
4182 nslist[0] = 0xffffffff;
4183 break;
4186 nslist[i++] = nsid;
4187 clear_bit(nsid, n->changed_nsids);
4191 * Clear all the remaining bitmap entries in case we broke out of the loop
4192 * early because there are more than 1024 changed namespaces.
4194 if (nslist[0] == 0xffffffff) {
4195 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4198 if (!rae) {
4199 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4202 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4205 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4206 uint64_t off, NvmeRequest *req)
4208 NvmeEffectsLog log = {};
4209 const uint32_t *src_iocs = NULL;
4210 uint32_t trans_len;
4212 if (off >= sizeof(log)) {
4213 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4214 return NVME_INVALID_FIELD | NVME_DNR;
4217 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4218 case NVME_CC_CSS_NVM:
4219 src_iocs = nvme_cse_iocs_nvm;
4220 /* fall through */
4221 case NVME_CC_CSS_ADMIN_ONLY:
4222 break;
4223 case NVME_CC_CSS_CSI:
4224 switch (csi) {
4225 case NVME_CSI_NVM:
4226 src_iocs = nvme_cse_iocs_nvm;
4227 break;
4228 case NVME_CSI_ZONED:
4229 src_iocs = nvme_cse_iocs_zoned;
4230 break;
4234 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4236 if (src_iocs) {
4237 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4240 trans_len = MIN(sizeof(log) - off, buf_len);
4242 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4245 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4247 NvmeCmd *cmd = &req->cmd;
4249 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4250 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4251 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4252 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4253 uint8_t lid = dw10 & 0xff;
4254 uint8_t lsp = (dw10 >> 8) & 0xf;
4255 uint8_t rae = (dw10 >> 15) & 0x1;
4256 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
4257 uint32_t numdl, numdu;
4258 uint64_t off, lpol, lpou;
4259 size_t len;
4260 uint16_t status;
4262 numdl = (dw10 >> 16);
4263 numdu = (dw11 & 0xffff);
4264 lpol = dw12;
4265 lpou = dw13;
4267 len = (((numdu << 16) | numdl) + 1) << 2;
4268 off = (lpou << 32ULL) | lpol;
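/*
 * NUMDL/NUMDU form a 0's based dword count and LPOL/LPOU a 64-bit byte
 * offset into the log page, i.e. len = ((NUMDU << 16 | NUMDL) + 1) * 4.
 * For example, NUMDL = 0x3ff with NUMDU = 0 requests 1024 dwords (4 KiB).
 */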
4270 if (off & 0x3) {
4271 return NVME_INVALID_FIELD | NVME_DNR;
4274 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4276 status = nvme_check_mdts(n, len);
4277 if (status) {
4278 return status;
4281 switch (lid) {
4282 case NVME_LOG_ERROR_INFO:
4283 return nvme_error_info(n, rae, len, off, req);
4284 case NVME_LOG_SMART_INFO:
4285 return nvme_smart_info(n, rae, len, off, req);
4286 case NVME_LOG_FW_SLOT_INFO:
4287 return nvme_fw_log_info(n, len, off, req);
4288 case NVME_LOG_CHANGED_NSLIST:
4289 return nvme_changed_nslist(n, rae, len, off, req);
4290 case NVME_LOG_CMD_EFFECTS:
4291 return nvme_cmd_effects(n, csi, len, off, req);
4292 default:
4293 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4294 return NVME_INVALID_FIELD | NVME_DNR;
4298 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4300 n->cq[cq->cqid] = NULL;
4301 timer_free(cq->timer);
4302 if (msix_enabled(&n->parent_obj)) {
4303 msix_vector_unuse(&n->parent_obj, cq->vector);
4305 if (cq->cqid) {
4306 g_free(cq);
4310 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4312 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4313 NvmeCQueue *cq;
4314 uint16_t qid = le16_to_cpu(c->qid);
4316 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4317 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4318 return NVME_INVALID_CQID | NVME_DNR;
4321 cq = n->cq[qid];
4322 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4323 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4324 return NVME_INVALID_QUEUE_DEL;
4327 if (cq->irq_enabled && cq->tail != cq->head) {
4328 n->cq_pending--;
4331 nvme_irq_deassert(n, cq);
4332 trace_pci_nvme_del_cq(qid);
4333 nvme_free_cq(cq, n);
4334 return NVME_SUCCESS;
4337 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4338 uint16_t cqid, uint16_t vector, uint16_t size,
4339 uint16_t irq_enabled)
4341 int ret;
4343 if (msix_enabled(&n->parent_obj)) {
4344 ret = msix_vector_use(&n->parent_obj, vector);
4345 assert(ret == 0);
4347 cq->ctrl = n;
4348 cq->cqid = cqid;
4349 cq->size = size;
4350 cq->dma_addr = dma_addr;
4351 cq->phase = 1;
4352 cq->irq_enabled = irq_enabled;
4353 cq->vector = vector;
4354 cq->head = cq->tail = 0;
4355 QTAILQ_INIT(&cq->req_list);
4356 QTAILQ_INIT(&cq->sq_list);
4357 n->cq[cqid] = cq;
4358 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4361 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4363 NvmeCQueue *cq;
4364 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4365 uint16_t cqid = le16_to_cpu(c->cqid);
4366 uint16_t vector = le16_to_cpu(c->irq_vector);
4367 uint16_t qsize = le16_to_cpu(c->qsize);
4368 uint16_t qflags = le16_to_cpu(c->cq_flags);
4369 uint64_t prp1 = le64_to_cpu(c->prp1);
4371 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4372 NVME_CQ_FLAGS_IEN(qflags) != 0);
4374 if (unlikely(!cqid || cqid > n->params.max_ioqpairs ||
4375 n->cq[cqid] != NULL)) {
4376 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4377 return NVME_INVALID_QID | NVME_DNR;
4379 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4380 trace_pci_nvme_err_invalid_create_cq_size(qsize);
4381 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4383 if (unlikely(prp1 & (n->page_size - 1))) {
4384 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4385 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4387 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4388 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4389 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4391 if (unlikely(vector >= n->params.msix_qsize)) {
4392 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4393 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4395 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4396 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4397 return NVME_INVALID_FIELD | NVME_DNR;
4400 cq = g_malloc0(sizeof(*cq));
4401 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4402 NVME_CQ_FLAGS_IEN(qflags));
4405 * It is only required to set qs_created when creating a completion queue;
4406 * creating a submission queue without a matching completion queue will
4407 * fail.
4409 n->qs_created = true;
4410 return NVME_SUCCESS;
4413 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4415 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4417 return nvme_c2h(n, id, sizeof(id), req);
4420 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4422 trace_pci_nvme_identify_ctrl();
4424 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4427 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4429 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4430 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4431 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4433 trace_pci_nvme_identify_ctrl_csi(c->csi);
4435 switch (c->csi) {
4436 case NVME_CSI_NVM:
4437 id_nvm->vsl = n->params.vsl;
4438 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4439 break;
4441 case NVME_CSI_ZONED:
4442 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4443 break;
4445 default:
4446 return NVME_INVALID_FIELD | NVME_DNR;
4449 return nvme_c2h(n, id, sizeof(id), req);
4452 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4454 NvmeNamespace *ns;
4455 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4456 uint32_t nsid = le32_to_cpu(c->nsid);
4458 trace_pci_nvme_identify_ns(nsid);
4460 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4461 return NVME_INVALID_NSID | NVME_DNR;
4464 ns = nvme_ns(n, nsid);
4465 if (unlikely(!ns)) {
4466 if (!active) {
4467 ns = nvme_subsys_ns(n->subsys, nsid);
4468 if (!ns) {
4469 return nvme_rpt_empty_id_struct(n, req);
4471 } else {
4472 return nvme_rpt_empty_id_struct(n, req);
4476 if (active || ns->csi == NVME_CSI_NVM) {
4477 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4480 return NVME_INVALID_CMD_SET | NVME_DNR;
4483 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4484 bool attached)
4486 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4487 uint32_t nsid = le32_to_cpu(c->nsid);
4488 uint16_t min_id = le16_to_cpu(c->ctrlid);
4489 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4490 uint16_t *ids = &list[1];
4491 NvmeNamespace *ns;
4492 NvmeCtrl *ctrl;
4493 int cntlid, nr_ids = 0;
4495 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4497 if (!n->subsys) {
4498 return NVME_INVALID_FIELD | NVME_DNR;
4501 if (attached) {
4502 if (nsid == NVME_NSID_BROADCAST) {
4503 return NVME_INVALID_FIELD | NVME_DNR;
4506 ns = nvme_subsys_ns(n->subsys, nsid);
4507 if (!ns) {
4508 return NVME_INVALID_FIELD | NVME_DNR;
4512 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4513 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4514 if (!ctrl) {
4515 continue;
4518 if (attached && !nvme_ns(ctrl, nsid)) {
4519 continue;
4522 ids[nr_ids++] = cntlid;
4525 list[0] = nr_ids;
4527 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4530 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4531 bool active)
4533 NvmeNamespace *ns;
4534 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4535 uint32_t nsid = le32_to_cpu(c->nsid);
4537 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4539 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4540 return NVME_INVALID_NSID | NVME_DNR;
4543 ns = nvme_ns(n, nsid);
4544 if (unlikely(!ns)) {
4545 if (!active) {
4546 ns = nvme_subsys_ns(n->subsys, nsid);
4547 if (!ns) {
4548 return nvme_rpt_empty_id_struct(n, req);
4550 } else {
4551 return nvme_rpt_empty_id_struct(n, req);
4555 if (c->csi == NVME_CSI_NVM) {
4556 return nvme_rpt_empty_id_struct(n, req);
4557 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
4558 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
4559 req);
4562 return NVME_INVALID_FIELD | NVME_DNR;
4565 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
4566 bool active)
4568 NvmeNamespace *ns;
4569 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4570 uint32_t min_nsid = le32_to_cpu(c->nsid);
4571 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4572 static const int data_len = sizeof(list);
4573 uint32_t *list_ptr = (uint32_t *)list;
4574 int i, j = 0;
4576 trace_pci_nvme_identify_nslist(min_nsid);
4579      * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
4580 * since the Active Namespace ID List should return namespaces with ids
4581 * *higher* than the NSID specified in the command. This is also specified
4582 * in the spec (NVM Express v1.3d, Section 5.15.4).
4584 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4585 return NVME_INVALID_NSID | NVME_DNR;
4588 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4589 ns = nvme_ns(n, i);
4590 if (!ns) {
4591 if (!active) {
4592 ns = nvme_subsys_ns(n->subsys, i);
4593 if (!ns) {
4594 continue;
4596 } else {
4597 continue;
4600 if (ns->params.nsid <= min_nsid) {
4601 continue;
4603 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4604 if (j == data_len / sizeof(uint32_t)) {
4605 break;
4609 return nvme_c2h(n, list, data_len, req);
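/*
 * Illustrative sketch, not part of the device model: the page-building rule
 * used by nvme_identify_nslist() above, written as a standalone helper. Only
 * NSIDs strictly greater than the NSID given in the command are reported,
 * and at most 1024 of them fit in the 4096-byte Identify buffer, so a host
 * pages through the list by passing the last NSID it received.
 */
static int build_active_nslist_page_example(const uint32_t *active_nsids,
                                            int nr_active, uint32_t min_nsid,
                                            uint32_t *out /* 1024 entries */)
{
    int n = 0;

    for (int i = 0; i < nr_active && n < 1024; i++) {
        if (active_nsids[i] > min_nsid) {
            out[n++] = active_nsids[i];   /* stored little-endian on the wire */
        }
    }

    return n;
}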
4612 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
4613 bool active)
4615 NvmeNamespace *ns;
4616 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4617 uint32_t min_nsid = le32_to_cpu(c->nsid);
4618 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4619 static const int data_len = sizeof(list);
4620 uint32_t *list_ptr = (uint32_t *)list;
4621 int i, j = 0;
4623 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
4626      * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFEh are invalid.
4628 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
4629 return NVME_INVALID_NSID | NVME_DNR;
4632 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
4633 return NVME_INVALID_FIELD | NVME_DNR;
4636 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4637 ns = nvme_ns(n, i);
4638 if (!ns) {
4639 if (!active) {
4640 ns = nvme_subsys_ns(n->subsys, i);
4641 if (!ns) {
4642 continue;
4644 } else {
4645 continue;
4648 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
4649 continue;
4651 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
4652 if (j == data_len / sizeof(uint32_t)) {
4653 break;
4657 return nvme_c2h(n, list, data_len, req);
4660 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
4662 NvmeNamespace *ns;
4663 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4664 uint32_t nsid = le32_to_cpu(c->nsid);
4665 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4666 uint8_t *pos = list;
4667 struct {
4668 NvmeIdNsDescr hdr;
4669 uint8_t v[NVME_NIDL_UUID];
4670 } QEMU_PACKED uuid = {};
4671 struct {
4672 NvmeIdNsDescr hdr;
4673 uint64_t v;
4674 } QEMU_PACKED eui64 = {};
4675 struct {
4676 NvmeIdNsDescr hdr;
4677 uint8_t v;
4678 } QEMU_PACKED csi = {};
4680 trace_pci_nvme_identify_ns_descr_list(nsid);
4682 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4683 return NVME_INVALID_NSID | NVME_DNR;
4686 ns = nvme_ns(n, nsid);
4687 if (unlikely(!ns)) {
4688 return NVME_INVALID_FIELD | NVME_DNR;
4692 * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4693 * provide a valid Namespace UUID in the Namespace Identification Descriptor
4694 * data structure. QEMU does not yet support setting NGUID.
4696 uuid.hdr.nidt = NVME_NIDT_UUID;
4697 uuid.hdr.nidl = NVME_NIDL_UUID;
4698 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
4699 memcpy(pos, &uuid, sizeof(uuid));
4700 pos += sizeof(uuid);
4702 if (ns->params.eui64) {
4703 eui64.hdr.nidt = NVME_NIDT_EUI64;
4704 eui64.hdr.nidl = NVME_NIDL_EUI64;
4705 eui64.v = cpu_to_be64(ns->params.eui64);
4706 memcpy(pos, &eui64, sizeof(eui64));
4707 pos += sizeof(eui64);
4710 csi.hdr.nidt = NVME_NIDT_CSI;
4711 csi.hdr.nidl = NVME_NIDL_CSI;
4712 csi.v = ns->csi;
4713 memcpy(pos, &csi, sizeof(csi));
4714 pos += sizeof(csi);
4716 return nvme_c2h(n, list, sizeof(list), req);
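/*
 * Illustrative sketch, not part of the device model: walking the Namespace
 * Identification Descriptor list built above from the host side. Each entry
 * is a 4-byte header (type, length, two reserved bytes) followed by `length`
 * bytes of identifier; a zeroed header terminates the list.
 */
static void parse_ns_descr_list_example(const uint8_t *list, size_t len)
{
    size_t off = 0;

    while (off + 4 <= len) {
        uint8_t nidt = list[off];       /* identifier type: UUID, EUI-64 or CSI above */
        uint8_t nidl = list[off + 1];

        if (!nidt || off + 4 + nidl > len) {
            break;
        }

        const uint8_t *nid = &list[off + 4];
        (void)nid;                      /* identifier bytes start here */

        off += 4 + nidl;
    }
}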
4719 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
4721 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
4722 static const int data_len = sizeof(list);
4724 trace_pci_nvme_identify_cmd_set();
4726 NVME_SET_CSI(*list, NVME_CSI_NVM);
4727 NVME_SET_CSI(*list, NVME_CSI_ZONED);
4729 return nvme_c2h(n, list, data_len, req);
4732 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
4734 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4736 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
4737 c->csi);
4739 switch (c->cns) {
4740 case NVME_ID_CNS_NS:
4741 return nvme_identify_ns(n, req, true);
4742 case NVME_ID_CNS_NS_PRESENT:
4743 return nvme_identify_ns(n, req, false);
4744 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
4745 return nvme_identify_ctrl_list(n, req, true);
4746 case NVME_ID_CNS_CTRL_LIST:
4747 return nvme_identify_ctrl_list(n, req, false);
4748 case NVME_ID_CNS_CS_NS:
4749 return nvme_identify_ns_csi(n, req, true);
4750 case NVME_ID_CNS_CS_NS_PRESENT:
4751 return nvme_identify_ns_csi(n, req, false);
4752 case NVME_ID_CNS_CTRL:
4753 return nvme_identify_ctrl(n, req);
4754 case NVME_ID_CNS_CS_CTRL:
4755 return nvme_identify_ctrl_csi(n, req);
4756 case NVME_ID_CNS_NS_ACTIVE_LIST:
4757 return nvme_identify_nslist(n, req, true);
4758 case NVME_ID_CNS_NS_PRESENT_LIST:
4759 return nvme_identify_nslist(n, req, false);
4760 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
4761 return nvme_identify_nslist_csi(n, req, true);
4762 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
4763 return nvme_identify_nslist_csi(n, req, false);
4764 case NVME_ID_CNS_NS_DESCR_LIST:
4765 return nvme_identify_ns_descr_list(n, req);
4766 case NVME_ID_CNS_IO_COMMAND_SET:
4767 return nvme_identify_cmd_set(n, req);
4768 default:
4769 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
4770 return NVME_INVALID_FIELD | NVME_DNR;
4774 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
4776 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
4778 req->cqe.result = 1;
4779 if (nvme_check_sqid(n, sqid)) {
4780 return NVME_INVALID_FIELD | NVME_DNR;
4783 return NVME_SUCCESS;
4786 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
4788 trace_pci_nvme_setfeat_timestamp(ts);
4790 n->host_timestamp = le64_to_cpu(ts);
4791 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4794 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
4796 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4797 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
4799 union nvme_timestamp {
4800 struct {
4801 uint64_t timestamp:48;
4802 uint64_t sync:1;
4803 uint64_t origin:3;
4804 uint64_t rsvd1:12;
4806 uint64_t all;
4809 union nvme_timestamp ts;
4810 ts.all = 0;
4811 ts.timestamp = n->host_timestamp + elapsed_time;
4813 /* If the host timestamp is non-zero, set the timestamp origin */
4814 ts.origin = n->host_timestamp ? 0x01 : 0x00;
4816 trace_pci_nvme_getfeat_timestamp(ts.all);
4818 return cpu_to_le64(ts.all);
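/*
 * Illustrative sketch, not part of the device model: the same Timestamp
 * feature layout as the bit-field union above, written with explicit shifts
 * (assuming the usual little-endian bit-field allocation the union relies
 * on). Bits [47:0] hold the millisecond timestamp, bit 48 the synch flag
 * and bits [51:49] the origin.
 */
static inline uint64_t pack_timestamp_example(uint64_t ms, bool synch,
                                              uint8_t origin)
{
    return (ms & 0xffffffffffffULL) |
           ((uint64_t)(synch ? 1 : 0) << 48) |
           ((uint64_t)(origin & 0x7) << 49);
}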
4821 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4823 uint64_t timestamp = nvme_get_timestamp(n);
4825 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4828 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
4830 NvmeCmd *cmd = &req->cmd;
4831 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4832 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4833 uint32_t nsid = le32_to_cpu(cmd->nsid);
4834 uint32_t result;
4835 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
4836 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
4837 uint16_t iv;
4838 NvmeNamespace *ns;
4839 int i;
4841 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
4842 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
4845 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
4847 if (!nvme_feature_support[fid]) {
4848 return NVME_INVALID_FIELD | NVME_DNR;
4851 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
4852 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4854 * The Reservation Notification Mask and Reservation Persistence
4855 * features require a status code of Invalid Field in Command when
4856 * NSID is FFFFFFFFh. Since the device does not support those
4857 * features we can always return Invalid Namespace or Format as we
4858 * should do for all other features.
4860 return NVME_INVALID_NSID | NVME_DNR;
4863 if (!nvme_ns(n, nsid)) {
4864 return NVME_INVALID_FIELD | NVME_DNR;
4868 switch (sel) {
4869 case NVME_GETFEAT_SELECT_CURRENT:
4870 break;
4871 case NVME_GETFEAT_SELECT_SAVED:
4872 /* no features are saveable by the controller; fallthrough */
4873 case NVME_GETFEAT_SELECT_DEFAULT:
4874 goto defaults;
4875 case NVME_GETFEAT_SELECT_CAP:
4876 result = nvme_feature_cap[fid];
4877 goto out;
4880 switch (fid) {
4881 case NVME_TEMPERATURE_THRESHOLD:
4882 result = 0;
4885 * The controller only implements the Composite Temperature sensor, so
4886 * return 0 for all other sensors.
4888 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4889 goto out;
4892 switch (NVME_TEMP_THSEL(dw11)) {
4893 case NVME_TEMP_THSEL_OVER:
4894 result = n->features.temp_thresh_hi;
4895 goto out;
4896 case NVME_TEMP_THSEL_UNDER:
4897 result = n->features.temp_thresh_low;
4898 goto out;
4901 return NVME_INVALID_FIELD | NVME_DNR;
4902 case NVME_ERROR_RECOVERY:
4903 if (!nvme_nsid_valid(n, nsid)) {
4904 return NVME_INVALID_NSID | NVME_DNR;
4907 ns = nvme_ns(n, nsid);
4908 if (unlikely(!ns)) {
4909 return NVME_INVALID_FIELD | NVME_DNR;
4912 result = ns->features.err_rec;
4913 goto out;
4914 case NVME_VOLATILE_WRITE_CACHE:
4915 result = 0;
4916 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4917 ns = nvme_ns(n, i);
4918 if (!ns) {
4919 continue;
4922 result = blk_enable_write_cache(ns->blkconf.blk);
4923 if (result) {
4924 break;
4927 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
4928 goto out;
4929 case NVME_ASYNCHRONOUS_EVENT_CONF:
4930 result = n->features.async_config;
4931 goto out;
4932 case NVME_TIMESTAMP:
4933 return nvme_get_feature_timestamp(n, req);
4934 default:
4935 break;
4938 defaults:
4939 switch (fid) {
4940 case NVME_TEMPERATURE_THRESHOLD:
4941 result = 0;
4943 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
4944 break;
4947 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
4948 result = NVME_TEMPERATURE_WARNING;
4951 break;
4952 case NVME_NUMBER_OF_QUEUES:
4953 result = (n->params.max_ioqpairs - 1) |
4954 ((n->params.max_ioqpairs - 1) << 16);
4955 trace_pci_nvme_getfeat_numq(result);
4956 break;
4957 case NVME_INTERRUPT_VECTOR_CONF:
4958 iv = dw11 & 0xffff;
4959 if (iv >= n->params.max_ioqpairs + 1) {
4960 return NVME_INVALID_FIELD | NVME_DNR;
4963 result = iv;
4964 if (iv == n->admin_cq.vector) {
4965 result |= NVME_INTVC_NOCOALESCING;
4967 break;
4968 default:
4969 result = nvme_feature_default[fid];
4970 break;
4973 out:
4974 req->cqe.result = cpu_to_le32(result);
4975 return NVME_SUCCESS;
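/*
 * Illustrative sketch, not part of the device model: the Number of Queues
 * result returned above packs two 0's based counts into one dword,
 * submission queues in bits [15:0] and completion queues in bits [31:16].
 * With the default max_ioqpairs of 64 this evaluates to 0x003f003f.
 */
static inline uint32_t encode_num_queues_example(uint16_t ioqpairs)
{
    uint32_t zeroes_based = ioqpairs - 1;

    return zeroes_based | (zeroes_based << 16);
}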
4978 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
4980 uint16_t ret;
4981 uint64_t timestamp;
4983 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
4984 if (ret) {
4985 return ret;
4988 nvme_set_timestamp(n, timestamp);
4990 return NVME_SUCCESS;
4993 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
4995 NvmeNamespace *ns = NULL;
4997 NvmeCmd *cmd = &req->cmd;
4998 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4999 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5000 uint32_t nsid = le32_to_cpu(cmd->nsid);
5001 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5002 uint8_t save = NVME_SETFEAT_SAVE(dw10);
5003 int i;
5005 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5007 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5008 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5011 if (!nvme_feature_support[fid]) {
5012 return NVME_INVALID_FIELD | NVME_DNR;
5015 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5016 if (nsid != NVME_NSID_BROADCAST) {
5017 if (!nvme_nsid_valid(n, nsid)) {
5018 return NVME_INVALID_NSID | NVME_DNR;
5021 ns = nvme_ns(n, nsid);
5022 if (unlikely(!ns)) {
5023 return NVME_INVALID_FIELD | NVME_DNR;
5026 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5027 if (!nvme_nsid_valid(n, nsid)) {
5028 return NVME_INVALID_NSID | NVME_DNR;
5031 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5034 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5035 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5038 switch (fid) {
5039 case NVME_TEMPERATURE_THRESHOLD:
5040 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5041 break;
5044 switch (NVME_TEMP_THSEL(dw11)) {
5045 case NVME_TEMP_THSEL_OVER:
5046 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5047 break;
5048 case NVME_TEMP_THSEL_UNDER:
5049 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5050 break;
5051 default:
5052 return NVME_INVALID_FIELD | NVME_DNR;
5055 if ((n->temperature >= n->features.temp_thresh_hi) ||
5056 (n->temperature <= n->features.temp_thresh_low)) {
5057 nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
5060 break;
5061 case NVME_ERROR_RECOVERY:
5062 if (nsid == NVME_NSID_BROADCAST) {
5063 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5064 ns = nvme_ns(n, i);
5066 if (!ns) {
5067 continue;
5070 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5071 ns->features.err_rec = dw11;
5075 break;
5078 assert(ns);
5079 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5080 ns->features.err_rec = dw11;
5082 break;
5083 case NVME_VOLATILE_WRITE_CACHE:
5084 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5085 ns = nvme_ns(n, i);
5086 if (!ns) {
5087 continue;
5090 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5091 blk_flush(ns->blkconf.blk);
5094 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5097 break;
5099 case NVME_NUMBER_OF_QUEUES:
5100 if (n->qs_created) {
5101 return NVME_CMD_SEQ_ERROR | NVME_DNR;
5105 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
5106 * and NSQR.
5108 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5109 return NVME_INVALID_FIELD | NVME_DNR;
5112 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5113 ((dw11 >> 16) & 0xffff) + 1,
5114 n->params.max_ioqpairs,
5115 n->params.max_ioqpairs);
5116 req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
5117 ((n->params.max_ioqpairs - 1) << 16));
5118 break;
5119 case NVME_ASYNCHRONOUS_EVENT_CONF:
5120 n->features.async_config = dw11;
5121 break;
5122 case NVME_TIMESTAMP:
5123 return nvme_set_feature_timestamp(n, req);
5124 case NVME_COMMAND_SET_PROFILE:
5125 if (dw11 & 0x1ff) {
5126 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5127 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5129 break;
5130 default:
5131 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5133 return NVME_SUCCESS;
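/*
 * Illustrative sketch, not part of the device model: the Temperature
 * Threshold layout of dword 11 consumed by nvme_set_feature() above, as
 * understood from the specification: TMPTH in bits [15:0] (kelvins),
 * TMPSEL in bits [19:16] (sensor select, 0 is Composite) and THSEL in
 * bits [21:20] (0 selects the over threshold, 1 the under threshold).
 */
struct temp_thresh_example {
    uint16_t tmpth;
    uint8_t  tmpsel;
    uint8_t  thsel;
};

static inline struct temp_thresh_example decode_temp_thresh_example(uint32_t dw11)
{
    return (struct temp_thresh_example) {
        .tmpth  = dw11 & 0xffff,
        .tmpsel = (dw11 >> 16) & 0xf,
        .thsel  = (dw11 >> 20) & 0x3,
    };
}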
5136 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5138 trace_pci_nvme_aer(nvme_cid(req));
5140 if (n->outstanding_aers > n->params.aerl) {
5141 trace_pci_nvme_aer_aerl_exceeded();
5142 return NVME_AER_LIMIT_EXCEEDED;
5145 n->aer_reqs[n->outstanding_aers] = req;
5146 n->outstanding_aers++;
5148 if (!QTAILQ_EMPTY(&n->aer_queue)) {
5149 nvme_process_aers(n);
5152 return NVME_NO_COMPLETE;
5155 static void nvme_update_dmrsl(NvmeCtrl *n)
5157 int nsid;
5159 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5160 NvmeNamespace *ns = nvme_ns(n, nsid);
5161 if (!ns) {
5162 continue;
5165 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5166 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5170 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5172 uint32_t cc = ldl_le_p(&n->bar.cc);
5174 ns->iocs = nvme_cse_iocs_none;
5175 switch (ns->csi) {
5176 case NVME_CSI_NVM:
5177 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5178 ns->iocs = nvme_cse_iocs_nvm;
5180 break;
5181 case NVME_CSI_ZONED:
5182 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5183 ns->iocs = nvme_cse_iocs_zoned;
5184 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5185 ns->iocs = nvme_cse_iocs_nvm;
5187 break;
5191 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5193 NvmeNamespace *ns;
5194 NvmeCtrl *ctrl;
5195 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5196 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5197 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5198 uint8_t sel = dw10 & 0xf;
5199 uint16_t *nr_ids = &list[0];
5200 uint16_t *ids = &list[1];
5201 uint16_t ret;
5202 int i;
5204 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5206 if (!nvme_nsid_valid(n, nsid)) {
5207 return NVME_INVALID_NSID | NVME_DNR;
5210 ns = nvme_subsys_ns(n->subsys, nsid);
5211 if (!ns) {
5212 return NVME_INVALID_FIELD | NVME_DNR;
5215 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5216 if (ret) {
5217 return ret;
5220 if (!*nr_ids) {
5221 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5224 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5225 for (i = 0; i < *nr_ids; i++) {
5226 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5227 if (!ctrl) {
5228 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5231 switch (sel) {
5232 case NVME_NS_ATTACHMENT_ATTACH:
5233 if (nvme_ns(ctrl, nsid)) {
5234 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5237 if (ns->attached && !ns->params.shared) {
5238 return NVME_NS_PRIVATE | NVME_DNR;
5241 nvme_attach_ns(ctrl, ns);
5242 nvme_select_iocs_ns(ctrl, ns);
5244 break;
5246 case NVME_NS_ATTACHMENT_DETACH:
5247 if (!nvme_ns(ctrl, nsid)) {
5248 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5251 ctrl->namespaces[nsid] = NULL;
5252 ns->attached--;
5254 nvme_update_dmrsl(ctrl);
5256 break;
5258 default:
5259 return NVME_INVALID_FIELD | NVME_DNR;
5263 * Add namespace id to the changed namespace id list for event clearing
5264 * via Get Log Page command.
5266 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5267 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5268 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5269 NVME_LOG_CHANGED_NSLIST);
5273 return NVME_SUCCESS;
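/*
 * Illustrative sketch, not part of the device model: the controller list
 * consumed by nvme_ns_attachment() above. Entry 0 carries the number of
 * identifiers that follow and every identifier is a 16-bit CNTLID; the
 * whole buffer is transferred as a single 4096-byte page (2048 entries).
 */
static void build_ctrl_list_example(uint16_t *list /* 2048 entries */,
                                    const uint16_t *cntlids, uint16_t count)
{
    list[0] = count;

    for (uint16_t i = 0; i < count && i < 2047; i++) {
        list[1 + i] = cntlids[i];       /* little-endian on the wire */
    }
}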
5276 typedef struct NvmeFormatAIOCB {
5277 BlockAIOCB common;
5278 BlockAIOCB *aiocb;
5279 QEMUBH *bh;
5280 NvmeRequest *req;
5281 int ret;
5283 NvmeNamespace *ns;
5284 uint32_t nsid;
5285 bool broadcast;
5286 int64_t offset;
5287 } NvmeFormatAIOCB;
5289 static void nvme_format_bh(void *opaque);
5291 static void nvme_format_cancel(BlockAIOCB *aiocb)
5293 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5295 if (iocb->aiocb) {
5296 blk_aio_cancel_async(iocb->aiocb);
5300 static const AIOCBInfo nvme_format_aiocb_info = {
5301 .aiocb_size = sizeof(NvmeFormatAIOCB),
5302 .cancel_async = nvme_format_cancel,
5303 .get_aio_context = nvme_get_aio_context,
5306 static void nvme_format_set(NvmeNamespace *ns, NvmeCmd *cmd)
5308 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5309 uint8_t lbaf = dw10 & 0xf;
5310 uint8_t pi = (dw10 >> 5) & 0x7;
5311 uint8_t mset = (dw10 >> 4) & 0x1;
5312 uint8_t pil = (dw10 >> 8) & 0x1;
5314 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5316 ns->id_ns.dps = (pil << 3) | pi;
5317 ns->id_ns.flbas = lbaf | (mset << 4);
5319 nvme_ns_init_format(ns);
5322 static void nvme_format_ns_cb(void *opaque, int ret)
5324 NvmeFormatAIOCB *iocb = opaque;
5325 NvmeRequest *req = iocb->req;
5326 NvmeNamespace *ns = iocb->ns;
5327 int bytes;
5329 if (ret < 0) {
5330 iocb->ret = ret;
5331 goto done;
5334 assert(ns);
5336 if (iocb->offset < ns->size) {
5337 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5339 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5340 bytes, BDRV_REQ_MAY_UNMAP,
5341 nvme_format_ns_cb, iocb);
5343 iocb->offset += bytes;
5344 return;
5347 nvme_format_set(ns, &req->cmd);
5348 ns->status = 0x0;
5349 iocb->ns = NULL;
5350 iocb->offset = 0;
5352 done:
5353 iocb->aiocb = NULL;
5354 qemu_bh_schedule(iocb->bh);
5357 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5359 if (ns->params.zoned) {
5360 return NVME_INVALID_FORMAT | NVME_DNR;
5363 if (lbaf > ns->id_ns.nlbaf) {
5364 return NVME_INVALID_FORMAT | NVME_DNR;
5367 if (pi && (ns->id_ns.lbaf[lbaf].ms < sizeof(NvmeDifTuple))) {
5368 return NVME_INVALID_FORMAT | NVME_DNR;
5371 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5372 return NVME_INVALID_FIELD | NVME_DNR;
5375 return NVME_SUCCESS;
5378 static void nvme_format_bh(void *opaque)
5380 NvmeFormatAIOCB *iocb = opaque;
5381 NvmeRequest *req = iocb->req;
5382 NvmeCtrl *n = nvme_ctrl(req);
5383 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5384 uint8_t lbaf = dw10 & 0xf;
5385 uint8_t pi = (dw10 >> 5) & 0x7;
5386 uint16_t status;
5387 int i;
5389 if (iocb->ret < 0) {
5390 goto done;
5393 if (iocb->broadcast) {
5394 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5395 iocb->ns = nvme_ns(n, i);
5396 if (iocb->ns) {
5397 iocb->nsid = i;
5398 break;
5403 if (!iocb->ns) {
5404 goto done;
5407 status = nvme_format_check(iocb->ns, lbaf, pi);
5408 if (status) {
5409 req->status = status;
5410 goto done;
5413 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5414 nvme_format_ns_cb(iocb, 0);
5415 return;
5417 done:
5418 qemu_bh_delete(iocb->bh);
5419 iocb->bh = NULL;
5421 iocb->common.cb(iocb->common.opaque, iocb->ret);
5423 qemu_aio_unref(iocb);
5426 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5428 NvmeFormatAIOCB *iocb;
5429 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5430 uint16_t status;
5432 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5434 iocb->req = req;
5435 iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5436 iocb->ret = 0;
5437 iocb->ns = NULL;
5438 iocb->nsid = 0;
5439 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5440 iocb->offset = 0;
5442 if (!iocb->broadcast) {
5443 if (!nvme_nsid_valid(n, nsid)) {
5444 status = NVME_INVALID_NSID | NVME_DNR;
5445 goto out;
5448 iocb->ns = nvme_ns(n, nsid);
5449 if (!iocb->ns) {
5450 status = NVME_INVALID_FIELD | NVME_DNR;
5451 goto out;
5455 req->aiocb = &iocb->common;
5456 qemu_bh_schedule(iocb->bh);
5458 return NVME_NO_COMPLETE;
5460 out:
5461 qemu_bh_delete(iocb->bh);
5462 iocb->bh = NULL;
5463 qemu_aio_unref(iocb);
5464 return status;
5467 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
5469 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
5470 nvme_adm_opc_str(req->cmd.opcode));
5472 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
5473 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
5474 return NVME_INVALID_OPCODE | NVME_DNR;
5477 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
5478 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
5479 return NVME_INVALID_FIELD | NVME_DNR;
5482 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
5483 return NVME_INVALID_FIELD;
5486 switch (req->cmd.opcode) {
5487 case NVME_ADM_CMD_DELETE_SQ:
5488 return nvme_del_sq(n, req);
5489 case NVME_ADM_CMD_CREATE_SQ:
5490 return nvme_create_sq(n, req);
5491 case NVME_ADM_CMD_GET_LOG_PAGE:
5492 return nvme_get_log(n, req);
5493 case NVME_ADM_CMD_DELETE_CQ:
5494 return nvme_del_cq(n, req);
5495 case NVME_ADM_CMD_CREATE_CQ:
5496 return nvme_create_cq(n, req);
5497 case NVME_ADM_CMD_IDENTIFY:
5498 return nvme_identify(n, req);
5499 case NVME_ADM_CMD_ABORT:
5500 return nvme_abort(n, req);
5501 case NVME_ADM_CMD_SET_FEATURES:
5502 return nvme_set_feature(n, req);
5503 case NVME_ADM_CMD_GET_FEATURES:
5504 return nvme_get_feature(n, req);
5505 case NVME_ADM_CMD_ASYNC_EV_REQ:
5506 return nvme_aer(n, req);
5507 case NVME_ADM_CMD_NS_ATTACHMENT:
5508 return nvme_ns_attachment(n, req);
5509 case NVME_ADM_CMD_FORMAT_NVM:
5510 return nvme_format(n, req);
5511 default:
5512 assert(false);
5515 return NVME_INVALID_OPCODE | NVME_DNR;
5518 static void nvme_process_sq(void *opaque)
5520 NvmeSQueue *sq = opaque;
5521 NvmeCtrl *n = sq->ctrl;
5522 NvmeCQueue *cq = n->cq[sq->cqid];
5524 uint16_t status;
5525 hwaddr addr;
5526 NvmeCmd cmd;
5527 NvmeRequest *req;
5529 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
5530 addr = sq->dma_addr + sq->head * n->sqe_size;
5531 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
5532 trace_pci_nvme_err_addr_read(addr);
5533 trace_pci_nvme_err_cfs();
5534 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
5535 break;
5537 nvme_inc_sq_head(sq);
5539 req = QTAILQ_FIRST(&sq->req_list);
5540 QTAILQ_REMOVE(&sq->req_list, req, entry);
5541 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
5542 nvme_req_clear(req);
5543 req->cqe.cid = cmd.cid;
5544 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
5546 status = sq->sqid ? nvme_io_cmd(n, req) :
5547 nvme_admin_cmd(n, req);
5548 if (status != NVME_NO_COMPLETE) {
5549 req->status = status;
5550 nvme_enqueue_req_completion(cq, req);
5555 static void nvme_ctrl_reset(NvmeCtrl *n)
5557 NvmeNamespace *ns;
5558 int i;
5560 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5561 ns = nvme_ns(n, i);
5562 if (!ns) {
5563 continue;
5566 nvme_ns_drain(ns);
5569 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5570 if (n->sq[i] != NULL) {
5571 nvme_free_sq(n->sq[i], n);
5574 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
5575 if (n->cq[i] != NULL) {
5576 nvme_free_cq(n->cq[i], n);
5580 while (!QTAILQ_EMPTY(&n->aer_queue)) {
5581 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
5582 QTAILQ_REMOVE(&n->aer_queue, event, entry);
5583 g_free(event);
5586 n->aer_queued = 0;
5587 n->outstanding_aers = 0;
5588 n->qs_created = false;
5591 static void nvme_ctrl_shutdown(NvmeCtrl *n)
5593 NvmeNamespace *ns;
5594 int i;
5596 if (n->pmr.dev) {
5597 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
5600 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5601 ns = nvme_ns(n, i);
5602 if (!ns) {
5603 continue;
5606 nvme_ns_shutdown(ns);
5610 static void nvme_select_iocs(NvmeCtrl *n)
5612 NvmeNamespace *ns;
5613 int i;
5615 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5616 ns = nvme_ns(n, i);
5617 if (!ns) {
5618 continue;
5621 nvme_select_iocs_ns(n, ns);
5625 static int nvme_start_ctrl(NvmeCtrl *n)
5627 uint64_t cap = ldq_le_p(&n->bar.cap);
5628 uint32_t cc = ldl_le_p(&n->bar.cc);
5629 uint32_t aqa = ldl_le_p(&n->bar.aqa);
5630 uint64_t asq = ldq_le_p(&n->bar.asq);
5631 uint64_t acq = ldq_le_p(&n->bar.acq);
5632 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
5633 uint32_t page_size = 1 << page_bits;
5635 if (unlikely(n->cq[0])) {
5636 trace_pci_nvme_err_startfail_cq();
5637 return -1;
5639 if (unlikely(n->sq[0])) {
5640 trace_pci_nvme_err_startfail_sq();
5641 return -1;
5643 if (unlikely(asq & (page_size - 1))) {
5644 trace_pci_nvme_err_startfail_asq_misaligned(asq);
5645 return -1;
5647 if (unlikely(acq & (page_size - 1))) {
5648 trace_pci_nvme_err_startfail_acq_misaligned(acq);
5649 return -1;
5651 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
5652 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
5653 return -1;
5655 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
5656 trace_pci_nvme_err_startfail_page_too_small(
5657 NVME_CC_MPS(cc),
5658 NVME_CAP_MPSMIN(cap));
5659 return -1;
5661 if (unlikely(NVME_CC_MPS(cc) >
5662 NVME_CAP_MPSMAX(cap))) {
5663 trace_pci_nvme_err_startfail_page_too_large(
5664 NVME_CC_MPS(cc),
5665 NVME_CAP_MPSMAX(cap));
5666 return -1;
5668 if (unlikely(NVME_CC_IOCQES(cc) <
5669 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
5670 trace_pci_nvme_err_startfail_cqent_too_small(
5671 NVME_CC_IOCQES(cc),
5672 NVME_CTRL_CQES_MIN(cap));
5673 return -1;
5675 if (unlikely(NVME_CC_IOCQES(cc) >
5676 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
5677 trace_pci_nvme_err_startfail_cqent_too_large(
5678 NVME_CC_IOCQES(cc),
5679 NVME_CTRL_CQES_MAX(cap));
5680 return -1;
5682 if (unlikely(NVME_CC_IOSQES(cc) <
5683 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
5684 trace_pci_nvme_err_startfail_sqent_too_small(
5685 NVME_CC_IOSQES(cc),
5686 NVME_CTRL_SQES_MIN(cap));
5687 return -1;
5689 if (unlikely(NVME_CC_IOSQES(cc) >
5690 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
5691 trace_pci_nvme_err_startfail_sqent_too_large(
5692 NVME_CC_IOSQES(cc),
5693 NVME_CTRL_SQES_MAX(cap));
5694 return -1;
5696 if (unlikely(!NVME_AQA_ASQS(aqa))) {
5697 trace_pci_nvme_err_startfail_asqent_sz_zero();
5698 return -1;
5700 if (unlikely(!NVME_AQA_ACQS(aqa))) {
5701 trace_pci_nvme_err_startfail_acqent_sz_zero();
5702 return -1;
5705 n->page_bits = page_bits;
5706 n->page_size = page_size;
5707 n->max_prp_ents = n->page_size / sizeof(uint64_t);
5708 n->cqe_size = 1 << NVME_CC_IOCQES(cc);
5709 n->sqe_size = 1 << NVME_CC_IOSQES(cc);
5710 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
5711 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
5713 nvme_set_timestamp(n, 0ULL);
5715 QTAILQ_INIT(&n->aer_queue);
5717 nvme_select_iocs(n);
5719 return 0;
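/*
 * Illustrative sketch, not part of the device model: the power-of-two
 * encodings applied in nvme_start_ctrl() above. CC.MPS is an exponent
 * offset from 4 KiB and CC.IOSQES/IOCQES are log2 entry sizes, so MPS=0
 * selects 4096-byte pages while IOSQES=6 and IOCQES=4 select 64-byte
 * submission and 16-byte completion entries.
 */
static inline uint32_t mps_to_page_size_example(uint8_t mps)
{
    return 1u << (12 + mps);
}

static inline uint32_t qes_to_entry_size_example(uint8_t qes)
{
    return 1u << qes;
}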
5722 static void nvme_cmb_enable_regs(NvmeCtrl *n)
5724 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
5725 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
5727 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
5728 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
5729 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
5730 stl_le_p(&n->bar.cmbloc, cmbloc);
5732 NVME_CMBSZ_SET_SQS(cmbsz, 1);
5733 NVME_CMBSZ_SET_CQS(cmbsz, 0);
5734 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
5735 NVME_CMBSZ_SET_RDS(cmbsz, 1);
5736 NVME_CMBSZ_SET_WDS(cmbsz, 1);
5737 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
5738 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
5739 stl_le_p(&n->bar.cmbsz, cmbsz);
5742 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
5743 unsigned size)
5745 uint64_t cap = ldq_le_p(&n->bar.cap);
5746 uint32_t cc = ldl_le_p(&n->bar.cc);
5747 uint32_t intms = ldl_le_p(&n->bar.intms);
5748 uint32_t csts = ldl_le_p(&n->bar.csts);
5749 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
5751 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
5752 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
5753 "MMIO write not 32-bit aligned,"
5754 " offset=0x%"PRIx64"", offset);
5755 /* should be ignored, fall through for now */
5758 if (unlikely(size < sizeof(uint32_t))) {
5759 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
5760 "MMIO write smaller than 32-bits,"
5761 " offset=0x%"PRIx64", size=%u",
5762 offset, size);
5763 /* should be ignored, fall through for now */
5766 switch (offset) {
5767 case NVME_REG_INTMS:
5768 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5769 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5770 "undefined access to interrupt mask set"
5771 " when MSI-X is enabled");
5772 /* should be ignored, fall through for now */
5774 intms |= data;
5775 stl_le_p(&n->bar.intms, intms);
5776 n->bar.intmc = n->bar.intms;
5777 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
5778 nvme_irq_check(n);
5779 break;
5780 case NVME_REG_INTMC:
5781 if (unlikely(msix_enabled(&(n->parent_obj)))) {
5782 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
5783 "undefined access to interrupt mask clr"
5784 " when MSI-X is enabled");
5785 /* should be ignored, fall through for now */
5787 intms &= ~data;
5788 stl_le_p(&n->bar.intms, intms);
5789 n->bar.intmc = n->bar.intms;
5790 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
5791 nvme_irq_check(n);
5792 break;
5793 case NVME_REG_CC:
5794 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
5796 /* Windows first sends data, then sends enable bit */
5797 if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
5798 !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))
5800 cc = data;
5803 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
5804 cc = data;
5806 /* flush CC since nvme_start_ctrl() needs the value */
5807 stl_le_p(&n->bar.cc, cc);
5808 if (unlikely(nvme_start_ctrl(n))) {
5809 trace_pci_nvme_err_startfail();
5810 csts = NVME_CSTS_FAILED;
5811 } else {
5812 trace_pci_nvme_mmio_start_success();
5813 csts = NVME_CSTS_READY;
5815 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
5816 trace_pci_nvme_mmio_stopped();
5817 nvme_ctrl_reset(n);
5818 cc = 0;
5819 csts &= ~NVME_CSTS_READY;
5822 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
5823 trace_pci_nvme_mmio_shutdown_set();
5824 nvme_ctrl_shutdown(n);
5825 cc = data;
5826 csts |= NVME_CSTS_SHST_COMPLETE;
5827 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
5828 trace_pci_nvme_mmio_shutdown_cleared();
5829 csts &= ~NVME_CSTS_SHST_COMPLETE;
5830 cc = data;
5833 stl_le_p(&n->bar.cc, cc);
5834 stl_le_p(&n->bar.csts, csts);
5836 break;
5837 case NVME_REG_CSTS:
5838 if (data & (1 << 4)) {
5839 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
5840 "attempted to W1C CSTS.NSSRO"
5841 " but CAP.NSSRS is zero (not supported)");
5842 } else if (data != 0) {
5843 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
5844 "attempted to set a read only bit"
5845 " of controller status");
5847 break;
5848 case NVME_REG_NSSR:
5849 if (data == 0x4e564d65) {
5850 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
5851 } else {
5852 /* The spec says that writes of other values have no effect */
5853 return;
5855 break;
5856 case NVME_REG_AQA:
5857 stl_le_p(&n->bar.aqa, data);
5858 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
5859 break;
5860 case NVME_REG_ASQ:
5861 stn_le_p(&n->bar.asq, size, data);
5862 trace_pci_nvme_mmio_asqaddr(data);
5863 break;
5864 case NVME_REG_ASQ + 4:
5865 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
5866 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
5867 break;
5868 case NVME_REG_ACQ:
5869 trace_pci_nvme_mmio_acqaddr(data);
5870 stn_le_p(&n->bar.acq, size, data);
5871 break;
5872 case NVME_REG_ACQ + 4:
5873 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
5874 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
5875 break;
5876 case NVME_REG_CMBLOC:
5877 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
5878 "invalid write to reserved CMBLOC"
5879 " when CMBSZ is zero, ignored");
5880 return;
5881 case NVME_REG_CMBSZ:
5882 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
5883 "invalid write to read only CMBSZ, ignored");
5884 return;
5885 case NVME_REG_CMBMSC:
5886 if (!NVME_CAP_CMBS(cap)) {
5887 return;
5890 stn_le_p(&n->bar.cmbmsc, size, data);
5891 n->cmb.cmse = false;
5893 if (NVME_CMBMSC_CRE(data)) {
5894 nvme_cmb_enable_regs(n);
5896 if (NVME_CMBMSC_CMSE(data)) {
5897 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
5898 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
5899 if (cba + int128_get64(n->cmb.mem.size) < cba) {
5900 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
5901 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
5902 stl_le_p(&n->bar.cmbsts, cmbsts);
5903 return;
5906 n->cmb.cba = cba;
5907 n->cmb.cmse = true;
5909 } else {
5910 n->bar.cmbsz = 0;
5911 n->bar.cmbloc = 0;
5914 return;
5915 case NVME_REG_CMBMSC + 4:
5916 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
5917 return;
5919 case NVME_REG_PMRCAP:
5920 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
5921 "invalid write to PMRCAP register, ignored");
5922 return;
5923 case NVME_REG_PMRCTL:
5924 if (!NVME_CAP_PMRS(cap)) {
5925 return;
5928 stl_le_p(&n->bar.pmrctl, data);
5929 if (NVME_PMRCTL_EN(data)) {
5930 memory_region_set_enabled(&n->pmr.dev->mr, true);
5931 pmrsts = 0;
5932 } else {
5933 memory_region_set_enabled(&n->pmr.dev->mr, false);
5934 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
5935 n->pmr.cmse = false;
5937 stl_le_p(&n->bar.pmrsts, pmrsts);
5938 return;
5939 case NVME_REG_PMRSTS:
5940 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
5941 "invalid write to PMRSTS register, ignored");
5942 return;
5943 case NVME_REG_PMREBS:
5944 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
5945 "invalid write to PMREBS register, ignored");
5946 return;
5947 case NVME_REG_PMRSWTP:
5948 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
5949 "invalid write to PMRSWTP register, ignored");
5950 return;
5951 case NVME_REG_PMRMSCL:
5952 if (!NVME_CAP_PMRS(cap)) {
5953 return;
5956 stl_le_p(&n->bar.pmrmscl, data);
5957 n->pmr.cmse = false;
5959 if (NVME_PMRMSCL_CMSE(data)) {
5960 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
5961 hwaddr cba = pmrmscu << 32 |
5962 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
5963 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
5964 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
5965 stl_le_p(&n->bar.pmrsts, pmrsts);
5966 return;
5969 n->pmr.cmse = true;
5970 n->pmr.cba = cba;
5973 return;
5974 case NVME_REG_PMRMSCU:
5975 if (!NVME_CAP_PMRS(cap)) {
5976 return;
5979 stl_le_p(&n->bar.pmrmscu, data);
5980 return;
5981 default:
5982 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
5983 "invalid MMIO write,"
5984 " offset=0x%"PRIx64", data=%"PRIx64"",
5985 offset, data);
5986 break;
5990 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
5992 NvmeCtrl *n = (NvmeCtrl *)opaque;
5993 uint8_t *ptr = (uint8_t *)&n->bar;
5995 trace_pci_nvme_mmio_read(addr, size);
5997 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
5998 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
5999 "MMIO read not 32-bit aligned,"
6000 " offset=0x%"PRIx64"", addr);
6001 /* should RAZ, fall through for now */
6002 } else if (unlikely(size < sizeof(uint32_t))) {
6003 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6004 "MMIO read smaller than 32-bits,"
6005 " offset=0x%"PRIx64"", addr);
6006 /* should RAZ, fall through for now */
6009 if (addr > sizeof(n->bar) - size) {
6010 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6011 "MMIO read beyond last register,"
6012 " offset=0x%"PRIx64", returning 0", addr);
6014 return 0;
6018      * When PMRWBM bit 1 is set, a read from
6019      * PMRSTS should ensure that prior writes
6020      * have made it to persistent media
6022 if (addr == NVME_REG_PMRSTS &&
6023 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6024 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6027 return ldn_le_p(ptr + addr, size);
6030 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6032 uint32_t qid;
6034 if (unlikely(addr & ((1 << 2) - 1))) {
6035 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6036 "doorbell write not 32-bit aligned,"
6037 " offset=0x%"PRIx64", ignoring", addr);
6038 return;
6041 if (((addr - 0x1000) >> 2) & 1) {
6042 /* Completion queue doorbell write */
6044 uint16_t new_head = val & 0xffff;
6045 int start_sqs;
6046 NvmeCQueue *cq;
6048 qid = (addr - (0x1000 + (1 << 2))) >> 3;
6049 if (unlikely(nvme_check_cqid(n, qid))) {
6050 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6051 "completion queue doorbell write"
6052 " for nonexistent queue,"
6053 " sqid=%"PRIu32", ignoring", qid);
6056              * NVM Express v1.3d, Section 4.1 states: "If host software writes
6057              * an invalid value to the Submission Queue Tail Doorbell or
6058              * Completion Queue Head Doorbell register and an Asynchronous Event
6059 * Request command is outstanding, then an asynchronous event is
6060 * posted to the Admin Completion Queue with a status code of
6061 * Invalid Doorbell Write Value."
6063 * Also note that the spec includes the "Invalid Doorbell Register"
6064 * status code, but nowhere does it specify when to use it.
6065 * However, it seems reasonable to use it here in a similar
6066 * fashion.
6068 if (n->outstanding_aers) {
6069 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6070 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6071 NVME_LOG_ERROR_INFO);
6074 return;
6077 cq = n->cq[qid];
6078 if (unlikely(new_head >= cq->size)) {
6079 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6080 "completion queue doorbell write value"
6081 " beyond queue size, sqid=%"PRIu32","
6082 " new_head=%"PRIu16", ignoring",
6083 qid, new_head);
6085 if (n->outstanding_aers) {
6086 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6087 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6088 NVME_LOG_ERROR_INFO);
6091 return;
6094 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6096 start_sqs = nvme_cq_full(cq) ? 1 : 0;
6097 cq->head = new_head;
6098 if (start_sqs) {
6099 NvmeSQueue *sq;
6100 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6101 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6103 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6106 if (cq->tail == cq->head) {
6107 if (cq->irq_enabled) {
6108 n->cq_pending--;
6111 nvme_irq_deassert(n, cq);
6113 } else {
6114 /* Submission queue doorbell write */
6116 uint16_t new_tail = val & 0xffff;
6117 NvmeSQueue *sq;
6119 qid = (addr - 0x1000) >> 3;
6120 if (unlikely(nvme_check_sqid(n, qid))) {
6121 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6122 "submission queue doorbell write"
6123 " for nonexistent queue,"
6124 " sqid=%"PRIu32", ignoring", qid);
6126 if (n->outstanding_aers) {
6127 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6128 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6129 NVME_LOG_ERROR_INFO);
6132 return;
6135 sq = n->sq[qid];
6136 if (unlikely(new_tail >= sq->size)) {
6137 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6138 "submission queue doorbell write value"
6139 " beyond queue size, sqid=%"PRIu32","
6140 " new_tail=%"PRIu16", ignoring",
6141 qid, new_tail);
6143 if (n->outstanding_aers) {
6144 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6145 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6146 NVME_LOG_ERROR_INFO);
6149 return;
6152 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6154 sq->tail = new_tail;
6155 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
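/*
 * Illustrative sketch, not part of the device model: the doorbell decode
 * performed by nvme_process_db() above, with the 4-byte stride this
 * controller uses (CAP.DSTRD=0). Submission and completion doorbells
 * alternate every dword from offset 0x1000: 0x1000 is the SQ0 tail,
 * 0x1004 the CQ0 head, 0x1008 the SQ1 tail, and so on.
 */
static inline void decode_doorbell_example(hwaddr addr, bool *is_cq,
                                           uint32_t *qid)
{
    hwaddr off = addr - 0x1000;

    *is_cq = (off >> 2) & 1;            /* odd dword -> completion queue */
    *qid = off >> 3;                    /* one SQ/CQ doorbell pair per 8 bytes */
}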
6159 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6160 unsigned size)
6162 NvmeCtrl *n = (NvmeCtrl *)opaque;
6164 trace_pci_nvme_mmio_write(addr, data, size);
6166 if (addr < sizeof(n->bar)) {
6167 nvme_write_bar(n, addr, data, size);
6168 } else {
6169 nvme_process_db(n, addr, data);
6173 static const MemoryRegionOps nvme_mmio_ops = {
6174 .read = nvme_mmio_read,
6175 .write = nvme_mmio_write,
6176 .endianness = DEVICE_LITTLE_ENDIAN,
6177 .impl = {
6178 .min_access_size = 2,
6179 .max_access_size = 8,
6183 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
6184 unsigned size)
6186 NvmeCtrl *n = (NvmeCtrl *)opaque;
6187 stn_le_p(&n->cmb.buf[addr], size, data);
6190 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
6192 NvmeCtrl *n = (NvmeCtrl *)opaque;
6193 return ldn_le_p(&n->cmb.buf[addr], size);
6196 static const MemoryRegionOps nvme_cmb_ops = {
6197 .read = nvme_cmb_read,
6198 .write = nvme_cmb_write,
6199 .endianness = DEVICE_LITTLE_ENDIAN,
6200 .impl = {
6201 .min_access_size = 1,
6202 .max_access_size = 8,
6206 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
6208 NvmeParams *params = &n->params;
6210 if (params->num_queues) {
6211 warn_report("num_queues is deprecated; please use max_ioqpairs "
6212 "instead");
6214 params->max_ioqpairs = params->num_queues - 1;
6217 if (n->namespace.blkconf.blk && n->subsys) {
6218 error_setg(errp, "subsystem support is unavailable with legacy "
6219 "namespace ('drive' property)");
6220 return;
6223 if (params->max_ioqpairs < 1 ||
6224 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
6225 error_setg(errp, "max_ioqpairs must be between 1 and %d",
6226 NVME_MAX_IOQPAIRS);
6227 return;
6230 if (params->msix_qsize < 1 ||
6231 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
6232 error_setg(errp, "msix_qsize must be between 1 and %d",
6233 PCI_MSIX_FLAGS_QSIZE + 1);
6234 return;
6237 if (!params->serial) {
6238 error_setg(errp, "serial property not set");
6239 return;
6242 if (n->pmr.dev) {
6243 if (host_memory_backend_is_mapped(n->pmr.dev)) {
6244 error_setg(errp, "can't use already busy memdev: %s",
6245 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
6246 return;
6249 if (!is_power_of_2(n->pmr.dev->size)) {
6250 error_setg(errp, "pmr backend size needs to be power of 2 in size");
6251 return;
6254 host_memory_backend_set_mapped(n->pmr.dev, true);
6257 if (n->params.zasl > n->params.mdts) {
6258 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
6259 "than or equal to mdts (Maximum Data Transfer Size)");
6260 return;
6263 if (!n->params.vsl) {
6264 error_setg(errp, "vsl must be non-zero");
6265 return;
6269 static void nvme_init_state(NvmeCtrl *n)
6271 /* add one to max_ioqpairs to account for the admin queue pair */
6272 n->reg_size = pow2ceil(sizeof(NvmeBar) +
6273 2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
6274 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
6275 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
6276 n->temperature = NVME_TEMPERATURE;
6277 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
6278 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
6279 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
6282 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
6284 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
6285 uint64_t cap = ldq_le_p(&n->bar.cap);
6287 n->cmb.buf = g_malloc0(cmb_size);
6288 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
6289 "nvme-cmb", cmb_size);
6290 pci_register_bar(pci_dev, NVME_CMB_BIR,
6291 PCI_BASE_ADDRESS_SPACE_MEMORY |
6292 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6293 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
6295 NVME_CAP_SET_CMBS(cap, 1);
6296 stq_le_p(&n->bar.cap, cap);
6298 if (n->params.legacy_cmb) {
6299 nvme_cmb_enable_regs(n);
6300 n->cmb.cmse = true;
6304 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
6306 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
6308 NVME_PMRCAP_SET_RDS(pmrcap, 1);
6309 NVME_PMRCAP_SET_WDS(pmrcap, 1);
6310 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
6311 /* Turn on bit 1 support */
6312 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
6313 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
6314 stl_le_p(&n->bar.pmrcap, pmrcap);
6316 pci_register_bar(pci_dev, NVME_PMR_BIR,
6317 PCI_BASE_ADDRESS_SPACE_MEMORY |
6318 PCI_BASE_ADDRESS_MEM_TYPE_64 |
6319 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
6321 memory_region_set_enabled(&n->pmr.dev->mr, false);
6324 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
6326 uint8_t *pci_conf = pci_dev->config;
6327 uint64_t bar_size, msix_table_size, msix_pba_size;
6328 unsigned msix_table_offset, msix_pba_offset;
6329 int ret;
6331 Error *err = NULL;
6333 pci_conf[PCI_INTERRUPT_PIN] = 1;
6334 pci_config_set_prog_interface(pci_conf, 0x2);
6336 if (n->params.use_intel_id) {
6337 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
6338 pci_config_set_device_id(pci_conf, 0x5845);
6339 } else {
6340 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
6341 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
6344 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
6345 pcie_endpoint_cap_init(pci_dev, 0x80);
6347 bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
6348 msix_table_offset = bar_size;
6349 msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;
6351 bar_size += msix_table_size;
6352 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
6353 msix_pba_offset = bar_size;
6354 msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;
6356 bar_size += msix_pba_size;
6357 bar_size = pow2ceil(bar_size);
6359 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
6360 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
6361 n->reg_size);
6362 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
6364 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
6365 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
6366 ret = msix_init(pci_dev, n->params.msix_qsize,
6367 &n->bar0, 0, msix_table_offset,
6368 &n->bar0, 0, msix_pba_offset, 0, &err);
6369 if (ret < 0) {
6370 if (ret == -ENOTSUP) {
6371 warn_report_err(err);
6372 } else {
6373 error_propagate(errp, err);
6374 return ret;
6378 if (n->params.cmb_size_mb) {
6379 nvme_init_cmb(n, pci_dev);
6382 if (n->pmr.dev) {
6383 nvme_init_pmr(n, pci_dev);
6386 return 0;
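/*
 * Illustrative sketch, not part of the device model: the BAR0 layout
 * computed in nvme_init_pci() above. The NVMe register block comes first,
 * the MSI-X table follows at the next 4 KiB boundary, then the pending bit
 * array, and the final size is rounded up to a power of two as required
 * for a PCI BAR.
 */
static uint64_t bar0_size_example(uint64_t reg_size, uint32_t msix_qsize)
{
    uint64_t size = QEMU_ALIGN_UP(reg_size, 4 * KiB);   /* MSI-X table offset */

    size += PCI_MSIX_ENTRY_SIZE * msix_qsize;           /* MSI-X table */
    size = QEMU_ALIGN_UP(size, 4 * KiB);                /* PBA offset */
    size += QEMU_ALIGN_UP(msix_qsize, 64) / 8;          /* pending bit array */

    return pow2ceil(size);
}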
6389 static void nvme_init_subnqn(NvmeCtrl *n)
6391 NvmeSubsystem *subsys = n->subsys;
6392 NvmeIdCtrl *id = &n->id_ctrl;
6394 if (!subsys) {
6395 snprintf((char *)id->subnqn, sizeof(id->subnqn),
6396 "nqn.2019-08.org.qemu:%s", n->params.serial);
6397 } else {
6398 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
6402 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
6404 NvmeIdCtrl *id = &n->id_ctrl;
6405 uint8_t *pci_conf = pci_dev->config;
6406 uint64_t cap = ldq_le_p(&n->bar.cap);
6408 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
6409 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
6410 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
6411 strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
6412 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
6414 id->cntlid = cpu_to_le16(n->cntlid);
6416 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
6418 id->rab = 6;
6420 if (n->params.use_intel_id) {
6421 id->ieee[0] = 0xb3;
6422 id->ieee[1] = 0x02;
6423 id->ieee[2] = 0x00;
6424 } else {
6425 id->ieee[0] = 0x00;
6426 id->ieee[1] = 0x54;
6427 id->ieee[2] = 0x52;
6430 id->mdts = n->params.mdts;
6431 id->ver = cpu_to_le32(NVME_SPEC_VER);
6432 id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
6433 id->cntrltype = 0x1;
6436 * Because the controller always completes the Abort command immediately,
6437 * there can never be more than one concurrently executing Abort command,
6438 * so this value is never used for anything. Note that there can easily be
6439 * many Abort commands in the queues, but they are not considered
6440 * "executing" until processed by nvme_abort.
6442 * The specification recommends a value of 3 for Abort Command Limit (four
6443      * concurrently outstanding Abort commands), so let's use that, though it is
6444 * inconsequential.
6446 id->acl = 3;
6447 id->aerl = n->params.aerl;
6448 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
6449 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
6451 /* recommended default value (~70 C) */
6452 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
6453 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
6455 id->sqes = (0x6 << 4) | 0x6;
6456 id->cqes = (0x4 << 4) | 0x4;
6457 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
6458 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
6459 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
6460 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
6463 * NOTE: If this device ever supports a command set that does NOT use 0x0
6464 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
6465 * should probably be removed.
6467 * See comment in nvme_io_cmd.
6469 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
6471 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0);
6472 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
6473 NVME_CTRL_SGLS_BITBUCKET);
6475 nvme_init_subnqn(n);
6477 id->psd[0].mp = cpu_to_le16(0x9c4);
6478 id->psd[0].enlat = cpu_to_le32(0x10);
6479 id->psd[0].exlat = cpu_to_le32(0x4);
6481 if (n->subsys) {
6482 id->cmic |= NVME_CMIC_MULTI_CTRL;
6485 NVME_CAP_SET_MQES(cap, 0x7ff);
6486 NVME_CAP_SET_CQR(cap, 1);
6487 NVME_CAP_SET_TO(cap, 0xf);
6488 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
6489 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
6490 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
6491 NVME_CAP_SET_MPSMAX(cap, 4);
6492 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
6493 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
6494 stq_le_p(&n->bar.cap, cap);
6496 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
6497 n->bar.intmc = n->bar.intms = 0;
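/*
 * Illustrative sketch, not part of the device model: the SQES/CQES bytes
 * set above pack a maximum entry size in the high nibble and a required
 * (minimum) entry size in the low nibble, both as log2 of the byte size.
 * (6, 6) therefore advertises 64-byte submission entries only, and (4, 4)
 * 16-byte completion entries only.
 */
static inline uint8_t pack_qes_example(uint8_t log2_max, uint8_t log2_min)
{
    return (uint8_t)((log2_max << 4) | (log2_min & 0xf));
}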
6500 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
6502 int cntlid;
6504 if (!n->subsys) {
6505 return 0;
6508 cntlid = nvme_subsys_register_ctrl(n, errp);
6509 if (cntlid < 0) {
6510 return -1;
6513 n->cntlid = cntlid;
6515 return 0;
6518 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
6520 uint32_t nsid = ns->params.nsid;
6521 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
6523 n->namespaces[nsid] = ns;
6524 ns->attached++;
6526 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
6527 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
6530 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
6532 NvmeCtrl *n = NVME(pci_dev);
6533 NvmeNamespace *ns;
6534 Error *local_err = NULL;
6536 nvme_check_constraints(n, &local_err);
6537 if (local_err) {
6538 error_propagate(errp, local_err);
6539 return;
6542 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
6543 &pci_dev->qdev, n->parent_obj.qdev.id);
6545 nvme_init_state(n);
6546 if (nvme_init_pci(n, pci_dev, errp)) {
6547 return;
6550 if (nvme_init_subsys(n, errp)) {
6551 error_propagate(errp, local_err);
6552 return;
6554 nvme_init_ctrl(n, pci_dev);
6556     /* set up a namespace if the controller drive property was given */
6557 if (n->namespace.blkconf.blk) {
6558 ns = &n->namespace;
6559 ns->params.nsid = 1;
6561 if (nvme_ns_setup(ns, errp)) {
6562 return;
6565 nvme_attach_ns(n, ns);
6569 static void nvme_exit(PCIDevice *pci_dev)
6571 NvmeCtrl *n = NVME(pci_dev);
6572 NvmeNamespace *ns;
6573 int i;
6575 nvme_ctrl_reset(n);
6577 if (n->subsys) {
6578 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6579 ns = nvme_ns(n, i);
6580 if (ns) {
6581 ns->attached--;
6585 nvme_subsys_unregister_ctrl(n->subsys, n);
6588 g_free(n->cq);
6589 g_free(n->sq);
6590 g_free(n->aer_reqs);
6592 if (n->params.cmb_size_mb) {
6593 g_free(n->cmb.buf);
6596 if (n->pmr.dev) {
6597 host_memory_backend_set_mapped(n->pmr.dev, false);
6599 msix_uninit(pci_dev, &n->bar0, &n->bar0);
6600 memory_region_del_subregion(&n->bar0, &n->iomem);
6603 static Property nvme_props[] = {
6604 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
6605 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
6606 HostMemoryBackend *),
6607 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
6608 NvmeSubsystem *),
6609 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
6610 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
6611 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
6612 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
6613 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
6614 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
6615 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
6616 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
6617 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
6618 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
6619 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
6620 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
6621 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
6622 params.auto_transition_zones, true),
6623 DEFINE_PROP_END_OF_LIST(),
6626 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
6627 void *opaque, Error **errp)
6629 NvmeCtrl *n = NVME(obj);
6630 uint8_t value = n->smart_critical_warning;
6632 visit_type_uint8(v, name, &value, errp);
6635 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
6636 void *opaque, Error **errp)
6638 NvmeCtrl *n = NVME(obj);
6639 uint8_t value, old_value, cap = 0, index, event;
6641 if (!visit_type_uint8(v, name, &value, errp)) {
6642 return;
6645 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
6646 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
6647 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
6648 cap |= NVME_SMART_PMR_UNRELIABLE;
6651 if ((value & cap) != value) {
6652 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
6653 value & ~cap);
6654 return;
6657 old_value = n->smart_critical_warning;
6658 n->smart_critical_warning = value;
6660 /* only inject new bits of smart critical warning */
6661 for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
6662 event = 1 << index;
6663 if (value & ~old_value & event)
6664 nvme_smart_event(n, event);
6668 static const VMStateDescription nvme_vmstate = {
6669 .name = "nvme",
6670 .unmigratable = 1,
6673 static void nvme_class_init(ObjectClass *oc, void *data)
6675 DeviceClass *dc = DEVICE_CLASS(oc);
6676 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
6678 pc->realize = nvme_realize;
6679 pc->exit = nvme_exit;
6680 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
6681 pc->revision = 2;
6683 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
6684 dc->desc = "Non-Volatile Memory Express";
6685 device_class_set_props(dc, nvme_props);
6686 dc->vmsd = &nvme_vmstate;
6689 static void nvme_instance_init(Object *obj)
6691 NvmeCtrl *n = NVME(obj);
6693 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
6694 "bootindex", "/namespace@1,0",
6695 DEVICE(obj));
6697 object_property_add(obj, "smart_critical_warning", "uint8",
6698 nvme_get_smart_warning,
6699 nvme_set_smart_warning, NULL, NULL);
6702 static const TypeInfo nvme_info = {
6703 .name = TYPE_NVME,
6704 .parent = TYPE_PCI_DEVICE,
6705 .instance_size = sizeof(NvmeCtrl),
6706 .instance_init = nvme_instance_init,
6707 .class_init = nvme_class_init,
6708 .interfaces = (InterfaceInfo[]) {
6709 { INTERFACE_PCIE_DEVICE },
6714 static const TypeInfo nvme_bus_info = {
6715 .name = TYPE_NVME_BUS,
6716 .parent = TYPE_BUS,
6717 .instance_size = sizeof(NvmeBus),
6720 static void nvme_register_types(void)
6722 type_register_static(&nvme_info);
6723 type_register_static(&nvme_bus_info);
6726 type_init(nvme_register_types)