hw/nvme/ctrl.c
1 /*
2 * QEMU NVM Express Controller
4 * Copyright (c) 2012, Intel Corporation
6 * Written by Keith Busch <keith.busch@intel.com>
8 * This code is licensed under the GNU GPL v2 or later.
9 */
11 /**
12 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
14 * https://nvmexpress.org/developers/nvme-specification/
17 * Notes on coding style
18 * ---------------------
19 * While QEMU coding style prefers lowercase hexadecimals in constants, the
20 * NVMe subsystem uses the format from the NVMe specifications in the comments
21 * (i.e. 'h' suffix instead of '0x' prefix).
23 * Usage
24 * -----
25 * See docs/system/nvme.rst for extensive documentation.
27 * Add options:
28 * -drive file=<file>,if=none,id=<drive_id>
29 * -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
30 * -device nvme,serial=<serial>,id=<bus_name>, \
31 * cmb_size_mb=<cmb_size_mb[optional]>, \
32 * [pmrdev=<mem_backend_file_id>,] \
33 * max_ioqpairs=<N[optional]>, \
34 * aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
35 * mdts=<N[optional]>,vsl=<N[optional]>, \
36 * zoned.zasl=<N[optional]>, \
37 * zoned.auto_transition=<on|off[optional]>, \
38 * sriov_max_vfs=<N[optional]> \
39 * sriov_vq_flexible=<N[optional]> \
40 * sriov_vi_flexible=<N[optional]> \
41 * sriov_max_vi_per_vf=<N[optional]> \
42 * sriov_max_vq_per_vf=<N[optional]> \
43 * subsys=<subsys_id>
44 * -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>,\
45 * zoned=<true|false[optional]>, \
46 * subsys=<subsys_id>,detached=<true|false[optional]>
48 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
49 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
50 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
51 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
53 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
54 * For example:
55 * -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
56 * size=<size> .... -device nvme,...,pmrdev=<mem_id>
58 * The PMR will use BAR 4/5 exclusively.
60 * To place controller(s) and namespace(s) in a subsystem, provide the
61 * nvme-subsys device as shown above.
63 * nvme subsystem device parameters
64 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
65 * - `nqn`
66 * This parameter provides the `<nqn_id>` part of the string
67 * `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
68 * of subsystem controllers. Note that `<nqn_id>` should be unique per
69 * subsystem, but this is not enforced by QEMU. If not specified, it will
70 * default to the value of the `id` parameter (`<subsys_id>`).
72 * nvme device parameters
73 * ~~~~~~~~~~~~~~~~~~~~~~
74 * - `subsys`
75 * Specifying this parameter attaches the controller to the subsystem and
76 * the SUBNQN field in the controller will report the NQN of the subsystem
77 * device. This also enables multi controller capability represented in
78 * Identify Controller data structure in CMIC (Controller Multi-path I/O and
79 * Namespace Sharing Capabilities).
81 * - `aerl`
82 * The Asynchronous Event Request Limit (AERL). Indicates the maximum number
83 * of concurrently outstanding Asynchronous Event Request commands supported
84 * by the controller. This is a 0's based value.
86 * - `aer_max_queued`
87 * This is the maximum number of events that the device will enqueue for
88 * completion when there are no outstanding AERs. When the maximum number of
89 * enqueued events are reached, subsequent events will be dropped.
91 * - `mdts`
92 * Indicates the maximum data transfer size for a command that transfers data
93 * between host-accessible memory and the controller. The value is specified
94 * as a power of two (2^n) and is in units of the minimum memory page size
95 * (CAP.MPSMIN). The default value is 7 (i.e. 2^7 minimum-sized pages; 512 KiB
 * with a 4 KiB MPSMIN).
97 * - `vsl`
98 * Indicates the maximum data size limit for the Verify command. Like `mdts`,
99 * this value is specified as a power of two (2^n) and is in units of the
100 * minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e. 512
101 * KiB).
103 * - `zoned.zasl`
104 * Indicates the maximum data transfer size for the Zone Append command. Like
105 * `mdts`, the value is specified as a power of two (2^n) and is in units of
106 * the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
107 * defaulting to the value of `mdts`).
109 * - `zoned.auto_transition`
110 * Indicates if zones in zone state implicitly opened can be automatically
111 * transitioned to zone state closed for resource management purposes.
112 * Defaults to 'on'.
114 * - `sriov_max_vfs`
115 * Indicates the maximum number of PCIe virtual functions supported
116 * by the controller. The default value is 0. Specifying a non-zero value
117 * enables reporting of both SR-IOV and ARI capabilities by the NVMe device.
118 * Virtual function controllers will not report SR-IOV capability.
120 * NOTE: Single Root I/O Virtualization support is experimental.
121 * All the related parameters may be subject to change.
123 * - `sriov_vq_flexible`
124 * Indicates the total number of flexible queue resources assignable to all
125 * the secondary controllers. Implicitly sets the number of primary
126 * controller's private resources to `(max_ioqpairs - sriov_vq_flexible)`.
128 * - `sriov_vi_flexible`
129 * Indicates the total number of flexible interrupt resources assignable to
130 * all the secondary controllers. Implicitly sets the number of primary
131 * controller's private resources to `(msix_qsize - sriov_vi_flexible)`.
133 * - `sriov_max_vi_per_vf`
134 * Indicates the maximum number of virtual interrupt resources assignable
135 * to a secondary controller. The default 0 resolves to
136 * `(sriov_vi_flexible / sriov_max_vfs)`.
138 * - `sriov_max_vq_per_vf`
139 * Indicates the maximum number of virtual queue resources assignable to
140 * a secondary controller. The default 0 resolves to
141 * `(sriov_vq_flexible / sriov_max_vfs)`.
143 * nvme namespace device parameters
144 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
145 * - `shared`
146 * When the parent nvme device (as defined explicitly by the 'bus' parameter
147 * or implicitly by the most recently defined NvmeBus) is linked to an
148 * nvme-subsys device, the namespace will be attached to all controllers in
149 * the subsystem. If set to 'off' (the default), the namespace will remain a
150 * private namespace and may only be attached to a single controller at a
151 * time.
153 * - `detached`
154 * This parameter is only valid together with the `subsys` parameter. If left
155 * at the default value (`false/off`), the namespace will be attached to all
156 * controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
157 * namespace will be available in the subsystem but not attached to any
158 * controllers.
160 * Setting `zoned` to true selects the Zoned Command Set for the namespace.
161 * In this case, the following namespace properties are available to configure
162 * zoned operation (see the example invocation sketched after this list):
163 * zoned.zone_size=<zone size in bytes, default: 128MiB>
164 * The number may be followed by K, M, G as in kilo-, mega- or giga-.
166 * zoned.zone_capacity=<zone capacity in bytes, default: zone size>
167 * The value 0 (default) forces zone capacity to be the same as zone
168 * size. The value of this property may not exceed zone size.
170 * zoned.descr_ext_size=<zone descriptor extension size, default 0>
171 * This value needs to be specified in 64B units. If it is zero,
172 * namespace(s) will not support zone descriptor extensions.
174 * zoned.max_active=<Maximum Active Resources (zones), default: 0>
175 * The default value means there is no limit to the number of
176 * concurrently active zones.
178 * zoned.max_open=<Maximum Open Resources (zones), default: 0>
179 * The default value means there is no limit to the number of
180 * concurrently open zones.
182 * zoned.cross_read=<enable RAZB, default: false>
183 * Setting this property to true enables Read Across Zone Boundaries.
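 *
 * As a rough illustration of how the pieces above fit together (all IDs, the
 * backing file and the sizes below are placeholders, not defaults), a zoned
 * namespace in a subsystem could be set up along these lines:
 *
 * -drive file=zns.img,if=none,id=nvm0
 * -device nvme-subsys,id=subsys0,nqn=subsys0
 * -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 * -device nvme-ns,drive=nvm0,bus=nvme0,nsid=1,zoned=true, \
 *         zoned.zone_size=64M,zoned.zone_capacity=48M, \
 *         zoned.max_open=16,zoned.max_active=32,zoned.cross_read=true
 *
 * See docs/system/nvme.rst (referenced above) for the authoritative
 * documentation of these options.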
186 #include "qemu/osdep.h"
187 #include "qemu/cutils.h"
188 #include "qemu/error-report.h"
189 #include "qemu/log.h"
190 #include "qemu/units.h"
191 #include "qemu/range.h"
192 #include "qapi/error.h"
193 #include "qapi/visitor.h"
194 #include "sysemu/sysemu.h"
195 #include "sysemu/block-backend.h"
196 #include "sysemu/hostmem.h"
197 #include "hw/pci/msix.h"
198 #include "hw/pci/pcie_sriov.h"
199 #include "migration/vmstate.h"
201 #include "nvme.h"
202 #include "dif.h"
203 #include "trace.h"
205 #define NVME_MAX_IOQPAIRS 0xffff
206 #define NVME_DB_SIZE 4
207 #define NVME_SPEC_VER 0x00010400
208 #define NVME_CMB_BIR 2
209 #define NVME_PMR_BIR 4
210 #define NVME_TEMPERATURE 0x143
211 #define NVME_TEMPERATURE_WARNING 0x157
212 #define NVME_TEMPERATURE_CRITICAL 0x175
213 #define NVME_NUM_FW_SLOTS 1
214 #define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
215 #define NVME_MAX_VFS 127
216 #define NVME_VF_RES_GRANULARITY 1
217 #define NVME_VF_OFFSET 0x1
218 #define NVME_VF_STRIDE 1
220 #define NVME_GUEST_ERR(trace, fmt, ...) \
221 do { \
222 (trace_##trace)(__VA_ARGS__); \
223 qemu_log_mask(LOG_GUEST_ERROR, #trace \
224 " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
225 } while (0)
227 static const bool nvme_feature_support[NVME_FID_MAX] = {
228 [NVME_ARBITRATION] = true,
229 [NVME_POWER_MANAGEMENT] = true,
230 [NVME_TEMPERATURE_THRESHOLD] = true,
231 [NVME_ERROR_RECOVERY] = true,
232 [NVME_VOLATILE_WRITE_CACHE] = true,
233 [NVME_NUMBER_OF_QUEUES] = true,
234 [NVME_INTERRUPT_COALESCING] = true,
235 [NVME_INTERRUPT_VECTOR_CONF] = true,
236 [NVME_WRITE_ATOMICITY] = true,
237 [NVME_ASYNCHRONOUS_EVENT_CONF] = true,
238 [NVME_TIMESTAMP] = true,
239 [NVME_HOST_BEHAVIOR_SUPPORT] = true,
240 [NVME_COMMAND_SET_PROFILE] = true,
243 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
244 [NVME_TEMPERATURE_THRESHOLD] = NVME_FEAT_CAP_CHANGE,
245 [NVME_ERROR_RECOVERY] = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
246 [NVME_VOLATILE_WRITE_CACHE] = NVME_FEAT_CAP_CHANGE,
247 [NVME_NUMBER_OF_QUEUES] = NVME_FEAT_CAP_CHANGE,
248 [NVME_ASYNCHRONOUS_EVENT_CONF] = NVME_FEAT_CAP_CHANGE,
249 [NVME_TIMESTAMP] = NVME_FEAT_CAP_CHANGE,
250 [NVME_HOST_BEHAVIOR_SUPPORT] = NVME_FEAT_CAP_CHANGE,
251 [NVME_COMMAND_SET_PROFILE] = NVME_FEAT_CAP_CHANGE,
254 static const uint32_t nvme_cse_acs[256] = {
255 [NVME_ADM_CMD_DELETE_SQ] = NVME_CMD_EFF_CSUPP,
256 [NVME_ADM_CMD_CREATE_SQ] = NVME_CMD_EFF_CSUPP,
257 [NVME_ADM_CMD_GET_LOG_PAGE] = NVME_CMD_EFF_CSUPP,
258 [NVME_ADM_CMD_DELETE_CQ] = NVME_CMD_EFF_CSUPP,
259 [NVME_ADM_CMD_CREATE_CQ] = NVME_CMD_EFF_CSUPP,
260 [NVME_ADM_CMD_IDENTIFY] = NVME_CMD_EFF_CSUPP,
261 [NVME_ADM_CMD_ABORT] = NVME_CMD_EFF_CSUPP,
262 [NVME_ADM_CMD_SET_FEATURES] = NVME_CMD_EFF_CSUPP,
263 [NVME_ADM_CMD_GET_FEATURES] = NVME_CMD_EFF_CSUPP,
264 [NVME_ADM_CMD_ASYNC_EV_REQ] = NVME_CMD_EFF_CSUPP,
265 [NVME_ADM_CMD_NS_ATTACHMENT] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
266 [NVME_ADM_CMD_VIRT_MNGMT] = NVME_CMD_EFF_CSUPP,
267 [NVME_ADM_CMD_DBBUF_CONFIG] = NVME_CMD_EFF_CSUPP,
268 [NVME_ADM_CMD_FORMAT_NVM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
271 static const uint32_t nvme_cse_iocs_none[256];
273 static const uint32_t nvme_cse_iocs_nvm[256] = {
274 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
275 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
276 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
277 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
278 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
279 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
280 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
281 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
284 static const uint32_t nvme_cse_iocs_zoned[256] = {
285 [NVME_CMD_FLUSH] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
286 [NVME_CMD_WRITE_ZEROES] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
287 [NVME_CMD_WRITE] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
288 [NVME_CMD_READ] = NVME_CMD_EFF_CSUPP,
289 [NVME_CMD_DSM] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
290 [NVME_CMD_VERIFY] = NVME_CMD_EFF_CSUPP,
291 [NVME_CMD_COPY] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
292 [NVME_CMD_COMPARE] = NVME_CMD_EFF_CSUPP,
293 [NVME_CMD_ZONE_APPEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
294 [NVME_CMD_ZONE_MGMT_SEND] = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
295 [NVME_CMD_ZONE_MGMT_RECV] = NVME_CMD_EFF_CSUPP,
298 static void nvme_process_sq(void *opaque);
299 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
301 static uint16_t nvme_sqid(NvmeRequest *req)
303 return le16_to_cpu(req->sq->sqid);
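/*
 * Bookkeeping for a zone state change: take the zone off the list for its
 * current state (if it is queued on one), record the new state and, for
 * states that are tracked on a list (open/closed/full), queue the zone there.
 */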
306 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
307 NvmeZoneState state)
309 if (QTAILQ_IN_USE(zone, entry)) {
310 switch (nvme_get_zone_state(zone)) {
311 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
312 QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
313 break;
314 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
315 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
316 break;
317 case NVME_ZONE_STATE_CLOSED:
318 QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
319 break;
320 case NVME_ZONE_STATE_FULL:
321 QTAILQ_REMOVE(&ns->full_zones, zone, entry);
322 default:
327 nvme_set_zone_state(zone, state);
329 switch (state) {
330 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
331 QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
332 break;
333 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
334 QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
335 break;
336 case NVME_ZONE_STATE_CLOSED:
337 QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
338 break;
339 case NVME_ZONE_STATE_FULL:
340 QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
341 case NVME_ZONE_STATE_READ_ONLY:
342 break;
343 default:
344 zone->d.za = 0;
348 static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
349 uint32_t opn, uint32_t zrwa)
351 if (ns->params.max_active_zones != 0 &&
352 ns->nr_active_zones + act > ns->params.max_active_zones) {
353 trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
354 return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
357 if (ns->params.max_open_zones != 0 &&
358 ns->nr_open_zones + opn > ns->params.max_open_zones) {
359 trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
360 return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
363 if (zrwa > ns->zns.numzrwa) {
364 return NVME_NOZRWA | NVME_DNR;
367 return NVME_SUCCESS;
371 * Check if we can open a zone without exceeding open/active limits.
372 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
374 static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
376 return nvme_zns_check_resources(ns, act, opn, 0);
379 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
381 hwaddr hi, lo;
383 if (!n->cmb.cmse) {
384 return false;
387 lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
388 hi = lo + int128_get64(n->cmb.mem.size);
390 return addr >= lo && addr < hi;
393 static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
395 hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
396 return &n->cmb.buf[addr - base];
399 static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
401 hwaddr hi;
403 if (!n->pmr.cmse) {
404 return false;
407 hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);
409 return addr >= n->pmr.cba && addr < hi;
412 static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
414 return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
417 static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
419 hwaddr hi, lo;
422 * The purpose of this check is to guard against invalid "local" access to
423 * the iomem (i.e. controller registers). Thus, we check against the range
424 * covered by the 'bar0' MemoryRegion since that is currently composed of
425 * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
426 * that if the device model is ever changed to allow the CMB to be located
427 * in BAR0 as well, then this must be changed.
429 lo = n->bar0.addr;
430 hi = lo + int128_get64(n->bar0.size);
432 return addr >= lo && addr < hi;
435 static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
437 hwaddr hi = addr + size - 1;
438 if (hi < addr) {
439 return 1;
442 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
443 memcpy(buf, nvme_addr_to_cmb(n, addr), size);
444 return 0;
447 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
448 memcpy(buf, nvme_addr_to_pmr(n, addr), size);
449 return 0;
452 return pci_dma_read(&n->parent_obj, addr, buf, size);
455 static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
457 hwaddr hi = addr + size - 1;
458 if (hi < addr) {
459 return 1;
462 if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
463 memcpy(nvme_addr_to_cmb(n, addr), buf, size);
464 return 0;
467 if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
468 memcpy(nvme_addr_to_pmr(n, addr), buf, size);
469 return 0;
472 return pci_dma_write(&n->parent_obj, addr, buf, size);
475 static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
477 return nsid &&
478 (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
481 static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
483 return sqid < n->conf_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
486 static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
488 return cqid < n->conf_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
491 static void nvme_inc_cq_tail(NvmeCQueue *cq)
493 cq->tail++;
494 if (cq->tail >= cq->size) {
495 cq->tail = 0;
496 cq->phase = !cq->phase;
500 static void nvme_inc_sq_head(NvmeSQueue *sq)
502 sq->head = (sq->head + 1) % sq->size;
505 static uint8_t nvme_cq_full(NvmeCQueue *cq)
507 return (cq->tail + 1) % cq->size == cq->head;
510 static uint8_t nvme_sq_empty(NvmeSQueue *sq)
512 return sq->head == sq->tail;
515 static void nvme_irq_check(NvmeCtrl *n)
517 uint32_t intms = ldl_le_p(&n->bar.intms);
519 if (msix_enabled(&(n->parent_obj))) {
520 return;
522 if (~intms & n->irq_status) {
523 pci_irq_assert(&n->parent_obj);
524 } else {
525 pci_irq_deassert(&n->parent_obj);
529 static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
531 if (cq->irq_enabled) {
532 if (msix_enabled(&(n->parent_obj))) {
533 trace_pci_nvme_irq_msix(cq->vector);
534 msix_notify(&(n->parent_obj), cq->vector);
535 } else {
536 trace_pci_nvme_irq_pin();
537 assert(cq->vector < 32);
538 n->irq_status |= 1 << cq->vector;
539 nvme_irq_check(n);
541 } else {
542 trace_pci_nvme_irq_masked();
546 static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
548 if (cq->irq_enabled) {
549 if (msix_enabled(&(n->parent_obj))) {
550 return;
551 } else {
552 assert(cq->vector < 32);
553 if (!n->cq_pending) {
554 n->irq_status &= ~(1 << cq->vector);
556 nvme_irq_check(n);
561 static void nvme_req_clear(NvmeRequest *req)
563 req->ns = NULL;
564 req->opaque = NULL;
565 req->aiocb = NULL;
566 memset(&req->cqe, 0x0, sizeof(req->cqe));
567 req->status = NVME_SUCCESS;
570 static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
572 if (dma) {
573 pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
574 sg->flags = NVME_SG_DMA;
575 } else {
576 qemu_iovec_init(&sg->iov, 0);
579 sg->flags |= NVME_SG_ALLOC;
582 static inline void nvme_sg_unmap(NvmeSg *sg)
584 if (!(sg->flags & NVME_SG_ALLOC)) {
585 return;
588 if (sg->flags & NVME_SG_DMA) {
589 qemu_sglist_destroy(&sg->qsg);
590 } else {
591 qemu_iovec_destroy(&sg->iov);
594 memset(sg, 0x0, sizeof(*sg));
598 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
599 * holds both data and metadata. This function splits the data and metadata
600 * into two separate QSG/IOVs.
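 *
 * As an illustration (LBA format assumed for the example), with 512 byte
 * logical blocks and 8 bytes of metadata per block, the extended LBA buffer
 * alternates 512 bytes of data and 8 bytes of metadata; the data pieces are
 * gathered into `data` and the metadata pieces into `mdata`.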
602 static void nvme_sg_split(NvmeSg *sg, NvmeNamespace *ns, NvmeSg *data,
603 NvmeSg *mdata)
605 NvmeSg *dst = data;
606 uint32_t trans_len, count = ns->lbasz;
607 uint64_t offset = 0;
608 bool dma = sg->flags & NVME_SG_DMA;
609 size_t sge_len;
610 size_t sg_len = dma ? sg->qsg.size : sg->iov.size;
611 int sg_idx = 0;
613 assert(sg->flags & NVME_SG_ALLOC);
615 while (sg_len) {
616 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
618 trans_len = MIN(sg_len, count);
619 trans_len = MIN(trans_len, sge_len - offset);
621 if (dst) {
622 if (dma) {
623 qemu_sglist_add(&dst->qsg, sg->qsg.sg[sg_idx].base + offset,
624 trans_len);
625 } else {
626 qemu_iovec_add(&dst->iov,
627 sg->iov.iov[sg_idx].iov_base + offset,
628 trans_len);
632 sg_len -= trans_len;
633 count -= trans_len;
634 offset += trans_len;
636 if (count == 0) {
637 dst = (dst == data) ? mdata : data;
638 count = (dst == data) ? ns->lbasz : ns->lbaf.ms;
641 if (sge_len == offset) {
642 offset = 0;
643 sg_idx++;
648 static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
649 size_t len)
651 if (!len) {
652 return NVME_SUCCESS;
655 trace_pci_nvme_map_addr_cmb(addr, len);
657 if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len - 1)) {
658 return NVME_DATA_TRAS_ERROR;
661 qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
663 return NVME_SUCCESS;
666 static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
667 size_t len)
669 if (!len) {
670 return NVME_SUCCESS;
673 if (!nvme_addr_is_pmr(n, addr) || !nvme_addr_is_pmr(n, addr + len - 1)) {
674 return NVME_DATA_TRAS_ERROR;
677 qemu_iovec_add(iov, nvme_addr_to_pmr(n, addr), len);
679 return NVME_SUCCESS;
682 static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
684 bool cmb = false, pmr = false;
686 if (!len) {
687 return NVME_SUCCESS;
690 trace_pci_nvme_map_addr(addr, len);
692 if (nvme_addr_is_iomem(n, addr)) {
693 return NVME_DATA_TRAS_ERROR;
696 if (nvme_addr_is_cmb(n, addr)) {
697 cmb = true;
698 } else if (nvme_addr_is_pmr(n, addr)) {
699 pmr = true;
702 if (cmb || pmr) {
703 if (sg->flags & NVME_SG_DMA) {
704 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
707 if (sg->iov.niov + 1 > IOV_MAX) {
708 goto max_mappings_exceeded;
711 if (cmb) {
712 return nvme_map_addr_cmb(n, &sg->iov, addr, len);
713 } else {
714 return nvme_map_addr_pmr(n, &sg->iov, addr, len);
718 if (!(sg->flags & NVME_SG_DMA)) {
719 return NVME_INVALID_USE_OF_CMB | NVME_DNR;
722 if (sg->qsg.nsg + 1 > IOV_MAX) {
723 goto max_mappings_exceeded;
726 qemu_sglist_add(&sg->qsg, addr, len);
728 return NVME_SUCCESS;
730 max_mappings_exceeded:
731 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings,
732 "number of mappings exceed 1024");
733 return NVME_INTERNAL_DEV_ERROR | NVME_DNR;
736 static inline bool nvme_addr_is_dma(NvmeCtrl *n, hwaddr addr)
738 return !(nvme_addr_is_cmb(n, addr) || nvme_addr_is_pmr(n, addr));
741 static uint16_t nvme_map_prp(NvmeCtrl *n, NvmeSg *sg, uint64_t prp1,
742 uint64_t prp2, uint32_t len)
744 hwaddr trans_len = n->page_size - (prp1 % n->page_size);
745 trans_len = MIN(len, trans_len);
746 int num_prps = (len >> n->page_bits) + 1;
747 uint16_t status;
748 int ret;
750 trace_pci_nvme_map_prp(trans_len, len, prp1, prp2, num_prps);
752 nvme_sg_init(n, sg, nvme_addr_is_dma(n, prp1));
754 status = nvme_map_addr(n, sg, prp1, trans_len);
755 if (status) {
756 goto unmap;
759 len -= trans_len;
760 if (len) {
761 if (len > n->page_size) {
762 uint64_t prp_list[n->max_prp_ents];
763 uint32_t nents, prp_trans;
764 int i = 0;
767 * The first PRP list entry, pointed to by PRP2, may contain an offset.
768 * Hence, we need to calculate the number of entries that fit in the page
769 * based on that offset.
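 *
 * For example, with a 4096 byte page size and PRP2 pointing 16 bytes into
 * a page, (4096 - 16) >> 3 = 510 list entries remain in that page.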
771 nents = (n->page_size - (prp2 & (n->page_size - 1))) >> 3;
772 prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
773 ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
774 if (ret) {
775 trace_pci_nvme_err_addr_read(prp2);
776 status = NVME_DATA_TRAS_ERROR;
777 goto unmap;
779 while (len != 0) {
780 uint64_t prp_ent = le64_to_cpu(prp_list[i]);
782 if (i == nents - 1 && len > n->page_size) {
783 if (unlikely(prp_ent & (n->page_size - 1))) {
784 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
785 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
786 goto unmap;
789 i = 0;
790 nents = (len + n->page_size - 1) >> n->page_bits;
791 nents = MIN(nents, n->max_prp_ents);
792 prp_trans = nents * sizeof(uint64_t);
793 ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
794 prp_trans);
795 if (ret) {
796 trace_pci_nvme_err_addr_read(prp_ent);
797 status = NVME_DATA_TRAS_ERROR;
798 goto unmap;
800 prp_ent = le64_to_cpu(prp_list[i]);
803 if (unlikely(prp_ent & (n->page_size - 1))) {
804 trace_pci_nvme_err_invalid_prplist_ent(prp_ent);
805 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
806 goto unmap;
809 trans_len = MIN(len, n->page_size);
810 status = nvme_map_addr(n, sg, prp_ent, trans_len);
811 if (status) {
812 goto unmap;
815 len -= trans_len;
816 i++;
818 } else {
819 if (unlikely(prp2 & (n->page_size - 1))) {
820 trace_pci_nvme_err_invalid_prp2_align(prp2);
821 status = NVME_INVALID_PRP_OFFSET | NVME_DNR;
822 goto unmap;
824 status = nvme_map_addr(n, sg, prp2, len);
825 if (status) {
826 goto unmap;
831 return NVME_SUCCESS;
833 unmap:
834 nvme_sg_unmap(sg);
835 return status;
839 * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
840 * number of bytes mapped from *len.
842 static uint16_t nvme_map_sgl_data(NvmeCtrl *n, NvmeSg *sg,
843 NvmeSglDescriptor *segment, uint64_t nsgld,
844 size_t *len, NvmeCmd *cmd)
846 dma_addr_t addr, trans_len;
847 uint32_t dlen;
848 uint16_t status;
850 for (int i = 0; i < nsgld; i++) {
851 uint8_t type = NVME_SGL_TYPE(segment[i].type);
853 switch (type) {
854 case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
855 break;
856 case NVME_SGL_DESCR_TYPE_SEGMENT:
857 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
858 return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
859 default:
860 return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
863 dlen = le32_to_cpu(segment[i].len);
865 if (!dlen) {
866 continue;
869 if (*len == 0) {
871 * All data has been mapped, but the SGL contains additional
872 * segments and/or descriptors. The controller might accept
873 * ignoring the rest of the SGL.
875 uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
876 if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
877 break;
880 trace_pci_nvme_err_invalid_sgl_excess_length(dlen);
881 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
884 trans_len = MIN(*len, dlen);
886 addr = le64_to_cpu(segment[i].addr);
888 if (UINT64_MAX - addr < dlen) {
889 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
892 status = nvme_map_addr(n, sg, addr, trans_len);
893 if (status) {
894 return status;
897 *len -= trans_len;
900 return NVME_SUCCESS;
903 static uint16_t nvme_map_sgl(NvmeCtrl *n, NvmeSg *sg, NvmeSglDescriptor sgl,
904 size_t len, NvmeCmd *cmd)
907 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
908 * dynamically allocating a potentially huge SGL. The spec allows the SGL
909 * to be larger (as in number of bytes required to describe the SGL
910 * descriptors and segment chain) than the command transfer size, so it is
911 * not bounded by MDTS.
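 *
 * Each descriptor is 16 bytes, so a chunk of 256 descriptors corresponds to
 * exactly one 4 KiB page read from guest memory at a time.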
913 const int SEG_CHUNK_SIZE = 256;
915 NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
916 uint64_t nsgld;
917 uint32_t seg_len;
918 uint16_t status;
919 hwaddr addr;
920 int ret;
922 sgld = &sgl;
923 addr = le64_to_cpu(sgl.addr);
925 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl.type), len);
927 nvme_sg_init(n, sg, nvme_addr_is_dma(n, addr));
930 * If the entire transfer can be described with a single data block it can
931 * be mapped directly.
933 if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
934 status = nvme_map_sgl_data(n, sg, sgld, 1, &len, cmd);
935 if (status) {
936 goto unmap;
939 goto out;
942 for (;;) {
943 switch (NVME_SGL_TYPE(sgld->type)) {
944 case NVME_SGL_DESCR_TYPE_SEGMENT:
945 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
946 break;
947 default:
948 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
951 seg_len = le32_to_cpu(sgld->len);
953 /* check the length of the (Last) Segment descriptor */
954 if (!seg_len || seg_len & 0xf) {
955 return NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
958 if (UINT64_MAX - addr < seg_len) {
959 return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
962 nsgld = seg_len / sizeof(NvmeSglDescriptor);
964 while (nsgld > SEG_CHUNK_SIZE) {
965 if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
966 trace_pci_nvme_err_addr_read(addr);
967 status = NVME_DATA_TRAS_ERROR;
968 goto unmap;
971 status = nvme_map_sgl_data(n, sg, segment, SEG_CHUNK_SIZE,
972 &len, cmd);
973 if (status) {
974 goto unmap;
977 nsgld -= SEG_CHUNK_SIZE;
978 addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
981 ret = nvme_addr_read(n, addr, segment, nsgld *
982 sizeof(NvmeSglDescriptor));
983 if (ret) {
984 trace_pci_nvme_err_addr_read(addr);
985 status = NVME_DATA_TRAS_ERROR;
986 goto unmap;
989 last_sgld = &segment[nsgld - 1];
992 * If the segment ends with a Data Block, then we are done.
994 if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
995 status = nvme_map_sgl_data(n, sg, segment, nsgld, &len, cmd);
996 if (status) {
997 goto unmap;
1000 goto out;
1004 * If the last descriptor was not a Data Block, then the current
1005 * segment must not be a Last Segment.
1007 if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
1008 status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
1009 goto unmap;
1012 sgld = last_sgld;
1013 addr = le64_to_cpu(sgld->addr);
1016 * Do not map the last descriptor; it will be a Segment or Last Segment
1017 * descriptor and is handled by the next iteration.
1019 status = nvme_map_sgl_data(n, sg, segment, nsgld - 1, &len, cmd);
1020 if (status) {
1021 goto unmap;
1025 out:
1026 /* if there is any residual left in len, the SGL was too short */
1027 if (len) {
1028 status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
1029 goto unmap;
1032 return NVME_SUCCESS;
1034 unmap:
1035 nvme_sg_unmap(sg);
1036 return status;
1039 uint16_t nvme_map_dptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1040 NvmeCmd *cmd)
1042 uint64_t prp1, prp2;
1044 switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
1045 case NVME_PSDT_PRP:
1046 prp1 = le64_to_cpu(cmd->dptr.prp1);
1047 prp2 = le64_to_cpu(cmd->dptr.prp2);
1049 return nvme_map_prp(n, sg, prp1, prp2, len);
1050 case NVME_PSDT_SGL_MPTR_CONTIGUOUS:
1051 case NVME_PSDT_SGL_MPTR_SGL:
1052 return nvme_map_sgl(n, sg, cmd->dptr.sgl, len, cmd);
1053 default:
1054 return NVME_INVALID_FIELD;
1058 static uint16_t nvme_map_mptr(NvmeCtrl *n, NvmeSg *sg, size_t len,
1059 NvmeCmd *cmd)
1061 int psdt = NVME_CMD_FLAGS_PSDT(cmd->flags);
1062 hwaddr mptr = le64_to_cpu(cmd->mptr);
1063 uint16_t status;
1065 if (psdt == NVME_PSDT_SGL_MPTR_SGL) {
1066 NvmeSglDescriptor sgl;
1068 if (nvme_addr_read(n, mptr, &sgl, sizeof(sgl))) {
1069 return NVME_DATA_TRAS_ERROR;
1072 status = nvme_map_sgl(n, sg, sgl, len, cmd);
1073 if (status && (status & 0x7ff) == NVME_DATA_SGL_LEN_INVALID) {
1074 status = NVME_MD_SGL_LEN_INVALID | NVME_DNR;
1077 return status;
1080 nvme_sg_init(n, sg, nvme_addr_is_dma(n, mptr));
1081 status = nvme_map_addr(n, sg, mptr, len);
1082 if (status) {
1083 nvme_sg_unmap(sg);
1086 return status;
1089 static uint16_t nvme_map_data(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1091 NvmeNamespace *ns = req->ns;
1092 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1093 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1094 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1095 size_t len = nvme_l2b(ns, nlb);
1096 uint16_t status;
1098 if (nvme_ns_ext(ns) &&
1099 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1100 NvmeSg sg;
1102 len += nvme_m2b(ns, nlb);
1104 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1105 if (status) {
1106 return status;
1109 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1110 nvme_sg_split(&sg, ns, &req->sg, NULL);
1111 nvme_sg_unmap(&sg);
1113 return NVME_SUCCESS;
1116 return nvme_map_dptr(n, &req->sg, len, &req->cmd);
1119 static uint16_t nvme_map_mdata(NvmeCtrl *n, uint32_t nlb, NvmeRequest *req)
1121 NvmeNamespace *ns = req->ns;
1122 size_t len = nvme_m2b(ns, nlb);
1123 uint16_t status;
1125 if (nvme_ns_ext(ns)) {
1126 NvmeSg sg;
1128 len += nvme_l2b(ns, nlb);
1130 status = nvme_map_dptr(n, &sg, len, &req->cmd);
1131 if (status) {
1132 return status;
1135 nvme_sg_init(n, &req->sg, sg.flags & NVME_SG_DMA);
1136 nvme_sg_split(&sg, ns, NULL, &req->sg);
1137 nvme_sg_unmap(&sg);
1139 return NVME_SUCCESS;
1142 return nvme_map_mptr(n, &req->sg, len, &req->cmd);
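/*
 * Transfer between 'ptr' and an interleaved (extended LBA) scatter/gather
 * mapping: copy 'bytes' bytes of the mapping at a time, skipping 'skip_bytes'
 * of the mapping after each chunk, starting 'offset' bytes in. This is how
 * only the data (or only the metadata) portion of an extended LBA buffer is
 * extracted or filled.
 */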
1145 static uint16_t nvme_tx_interleaved(NvmeCtrl *n, NvmeSg *sg, uint8_t *ptr,
1146 uint32_t len, uint32_t bytes,
1147 int32_t skip_bytes, int64_t offset,
1148 NvmeTxDirection dir)
1150 hwaddr addr;
1151 uint32_t trans_len, count = bytes;
1152 bool dma = sg->flags & NVME_SG_DMA;
1153 int64_t sge_len;
1154 int sg_idx = 0;
1155 int ret;
1157 assert(sg->flags & NVME_SG_ALLOC);
1159 while (len) {
1160 sge_len = dma ? sg->qsg.sg[sg_idx].len : sg->iov.iov[sg_idx].iov_len;
1162 if (sge_len - offset < 0) {
1163 offset -= sge_len;
1164 sg_idx++;
1165 continue;
1168 if (sge_len == offset) {
1169 offset = 0;
1170 sg_idx++;
1171 continue;
1174 trans_len = MIN(len, count);
1175 trans_len = MIN(trans_len, sge_len - offset);
1177 if (dma) {
1178 addr = sg->qsg.sg[sg_idx].base + offset;
1179 } else {
1180 addr = (hwaddr)(uintptr_t)sg->iov.iov[sg_idx].iov_base + offset;
1183 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1184 ret = nvme_addr_read(n, addr, ptr, trans_len);
1185 } else {
1186 ret = nvme_addr_write(n, addr, ptr, trans_len);
1189 if (ret) {
1190 return NVME_DATA_TRAS_ERROR;
1193 ptr += trans_len;
1194 len -= trans_len;
1195 count -= trans_len;
1196 offset += trans_len;
1198 if (count == 0) {
1199 count = bytes;
1200 offset += skip_bytes;
1204 return NVME_SUCCESS;
1207 static uint16_t nvme_tx(NvmeCtrl *n, NvmeSg *sg, void *ptr, uint32_t len,
1208 NvmeTxDirection dir)
1210 assert(sg->flags & NVME_SG_ALLOC);
1212 if (sg->flags & NVME_SG_DMA) {
1213 const MemTxAttrs attrs = MEMTXATTRS_UNSPECIFIED;
1214 dma_addr_t residual;
1216 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1217 dma_buf_write(ptr, len, &residual, &sg->qsg, attrs);
1218 } else {
1219 dma_buf_read(ptr, len, &residual, &sg->qsg, attrs);
1222 if (unlikely(residual)) {
1223 trace_pci_nvme_err_invalid_dma();
1224 return NVME_INVALID_FIELD | NVME_DNR;
1226 } else {
1227 size_t bytes;
1229 if (dir == NVME_TX_DIRECTION_TO_DEVICE) {
1230 bytes = qemu_iovec_to_buf(&sg->iov, 0, ptr, len);
1231 } else {
1232 bytes = qemu_iovec_from_buf(&sg->iov, 0, ptr, len);
1235 if (unlikely(bytes != len)) {
1236 trace_pci_nvme_err_invalid_dma();
1237 return NVME_INVALID_FIELD | NVME_DNR;
1241 return NVME_SUCCESS;
1244 static inline uint16_t nvme_c2h(NvmeCtrl *n, void *ptr, uint32_t len,
1245 NvmeRequest *req)
1247 uint16_t status;
1249 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1250 if (status) {
1251 return status;
1254 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_FROM_DEVICE);
1257 static inline uint16_t nvme_h2c(NvmeCtrl *n, void *ptr, uint32_t len,
1258 NvmeRequest *req)
1260 uint16_t status;
1262 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
1263 if (status) {
1264 return status;
1267 return nvme_tx(n, &req->sg, ptr, len, NVME_TX_DIRECTION_TO_DEVICE);
1270 uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
1271 NvmeTxDirection dir, NvmeRequest *req)
1273 NvmeNamespace *ns = req->ns;
1274 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1275 bool pi = !!NVME_ID_NS_DPS_TYPE(ns->id_ns.dps);
1276 bool pract = !!(le16_to_cpu(rw->control) & NVME_RW_PRINFO_PRACT);
1278 if (nvme_ns_ext(ns) &&
1279 !(pi && pract && ns->lbaf.ms == nvme_pi_tuple_size(ns))) {
1280 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbasz,
1281 ns->lbaf.ms, 0, dir);
1284 return nvme_tx(n, &req->sg, ptr, len, dir);
1287 uint16_t nvme_bounce_mdata(NvmeCtrl *n, void *ptr, uint32_t len,
1288 NvmeTxDirection dir, NvmeRequest *req)
1290 NvmeNamespace *ns = req->ns;
1291 uint16_t status;
1293 if (nvme_ns_ext(ns)) {
1294 return nvme_tx_interleaved(n, &req->sg, ptr, len, ns->lbaf.ms,
1295 ns->lbasz, ns->lbasz, dir);
1298 nvme_sg_unmap(&req->sg);
1300 status = nvme_map_mptr(n, &req->sg, len, &req->cmd);
1301 if (status) {
1302 return status;
1305 return nvme_tx(n, &req->sg, ptr, len, dir);
1308 static inline void nvme_blk_read(BlockBackend *blk, int64_t offset,
1309 BlockCompletionFunc *cb, NvmeRequest *req)
1311 assert(req->sg.flags & NVME_SG_ALLOC);
1313 if (req->sg.flags & NVME_SG_DMA) {
1314 req->aiocb = dma_blk_read(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1315 cb, req);
1316 } else {
1317 req->aiocb = blk_aio_preadv(blk, offset, &req->sg.iov, 0, cb, req);
1321 static inline void nvme_blk_write(BlockBackend *blk, int64_t offset,
1322 BlockCompletionFunc *cb, NvmeRequest *req)
1324 assert(req->sg.flags & NVME_SG_ALLOC);
1326 if (req->sg.flags & NVME_SG_DMA) {
1327 req->aiocb = dma_blk_write(blk, &req->sg.qsg, offset, BDRV_SECTOR_SIZE,
1328 cb, req);
1329 } else {
1330 req->aiocb = blk_aio_pwritev(blk, offset, &req->sg.iov, 0, cb, req);
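/* Refresh the completion queue head from the guest-provided shadow doorbell buffer. */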
1334 static void nvme_update_cq_head(NvmeCQueue *cq)
1336 pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr, &cq->head,
1337 sizeof(cq->head));
1338 trace_pci_nvme_shadow_doorbell_cq(cq->cqid, cq->head);
1341 static void nvme_post_cqes(void *opaque)
1343 NvmeCQueue *cq = opaque;
1344 NvmeCtrl *n = cq->ctrl;
1345 NvmeRequest *req, *next;
1346 bool pending = cq->head != cq->tail;
1347 int ret;
1349 QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
1350 NvmeSQueue *sq;
1351 hwaddr addr;
1353 if (n->dbbuf_enabled) {
1354 nvme_update_cq_head(cq);
1357 if (nvme_cq_full(cq)) {
1358 break;
1361 sq = req->sq;
1362 req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
1363 req->cqe.sq_id = cpu_to_le16(sq->sqid);
1364 req->cqe.sq_head = cpu_to_le16(sq->head);
1365 addr = cq->dma_addr + cq->tail * n->cqe_size;
1366 ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
1367 sizeof(req->cqe));
1368 if (ret) {
1369 trace_pci_nvme_err_addr_write(addr);
1370 trace_pci_nvme_err_cfs();
1371 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
1372 break;
1374 QTAILQ_REMOVE(&cq->req_list, req, entry);
1375 nvme_inc_cq_tail(cq);
1376 nvme_sg_unmap(&req->sg);
1377 QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
1379 if (cq->tail != cq->head) {
1380 if (cq->irq_enabled && !pending) {
1381 n->cq_pending++;
1384 nvme_irq_assert(n, cq);
1388 static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
1390 assert(cq->cqid == req->sq->cqid);
1391 trace_pci_nvme_enqueue_req_completion(nvme_cid(req), cq->cqid,
1392 le32_to_cpu(req->cqe.result),
1393 le32_to_cpu(req->cqe.dw1),
1394 req->status);
1396 if (req->status) {
1397 trace_pci_nvme_err_req_status(nvme_cid(req), nvme_nsid(req->ns),
1398 req->status, req->cmd.opcode);
1401 QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
1402 QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
1404 if (req->sq->ioeventfd_enabled) {
1405 /* Post CQE directly since we are in main loop thread */
1406 nvme_post_cqes(cq);
1407 } else {
1408 /* Schedule the timer to post CQE later since we are in vcpu thread */
1409 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
1413 static void nvme_process_aers(void *opaque)
1415 NvmeCtrl *n = opaque;
1416 NvmeAsyncEvent *event, *next;
1418 trace_pci_nvme_process_aers(n->aer_queued);
1420 QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
1421 NvmeRequest *req;
1422 NvmeAerResult *result;
1424 /* can't post cqe if there is nothing to complete */
1425 if (!n->outstanding_aers) {
1426 trace_pci_nvme_no_outstanding_aers();
1427 break;
1430 /* ignore if masked (cqe posted, but event not cleared) */
1431 if (n->aer_mask & (1 << event->result.event_type)) {
1432 trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
1433 continue;
1436 QTAILQ_REMOVE(&n->aer_queue, event, entry);
1437 n->aer_queued--;
1439 n->aer_mask |= 1 << event->result.event_type;
1440 n->outstanding_aers--;
1442 req = n->aer_reqs[n->outstanding_aers];
1444 result = (NvmeAerResult *) &req->cqe.result;
1445 result->event_type = event->result.event_type;
1446 result->event_info = event->result.event_info;
1447 result->log_page = event->result.log_page;
1448 g_free(event);
1450 trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
1451 result->log_page);
1453 nvme_enqueue_req_completion(&n->admin_cq, req);
1457 static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
1458 uint8_t event_info, uint8_t log_page)
1460 NvmeAsyncEvent *event;
1462 trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
1464 if (n->aer_queued == n->params.aer_max_queued) {
1465 trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
1466 return;
1469 event = g_new(NvmeAsyncEvent, 1);
1470 event->result = (NvmeAerResult) {
1471 .event_type = event_type,
1472 .event_info = event_info,
1473 .log_page = log_page,
1476 QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
1477 n->aer_queued++;
1479 nvme_process_aers(n);
1482 static void nvme_smart_event(NvmeCtrl *n, uint8_t event)
1484 uint8_t aer_info;
1486 /* Ref SPEC <Asynchronous Event Information - SMART / Health Status> */
1487 if (!(NVME_AEC_SMART(n->features.async_config) & event)) {
1488 return;
1491 switch (event) {
1492 case NVME_SMART_SPARE:
1493 aer_info = NVME_AER_INFO_SMART_SPARE_THRESH;
1494 break;
1495 case NVME_SMART_TEMPERATURE:
1496 aer_info = NVME_AER_INFO_SMART_TEMP_THRESH;
1497 break;
1498 case NVME_SMART_RELIABILITY:
1499 case NVME_SMART_MEDIA_READ_ONLY:
1500 case NVME_SMART_FAILED_VOLATILE_MEDIA:
1501 case NVME_SMART_PMR_UNRELIABLE:
1502 aer_info = NVME_AER_INFO_SMART_RELIABILITY;
1503 break;
1504 default:
1505 return;
1508 nvme_enqueue_event(n, NVME_AER_TYPE_SMART, aer_info, NVME_LOG_SMART_INFO);
1511 static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
1513 n->aer_mask &= ~(1 << event_type);
1514 if (!QTAILQ_EMPTY(&n->aer_queue)) {
1515 nvme_process_aers(n);
1519 static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len)
1521 uint8_t mdts = n->params.mdts;
1523 if (mdts && len > n->page_size << mdts) {
1524 trace_pci_nvme_err_mdts(len);
1525 return NVME_INVALID_FIELD | NVME_DNR;
1528 return NVME_SUCCESS;
1531 static inline uint16_t nvme_check_bounds(NvmeNamespace *ns, uint64_t slba,
1532 uint32_t nlb)
1534 uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
1536 if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
1537 trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
1538 return NVME_LBA_RANGE | NVME_DNR;
1541 return NVME_SUCCESS;
1544 static int nvme_block_status_all(NvmeNamespace *ns, uint64_t slba,
1545 uint32_t nlb, int flags)
1547 BlockDriverState *bs = blk_bs(ns->blkconf.blk);
1549 int64_t pnum = 0, bytes = nvme_l2b(ns, nlb);
1550 int64_t offset = nvme_l2b(ns, slba);
1551 int ret;
1554 * `pnum` holds the number of bytes after offset that share the same
1555 * allocation status as the byte at offset. If `pnum` is different from
1556 * `bytes`, we should check the allocation status of the next range and
1557 * continue this until all bytes have been checked.
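 *
 * For example (sizes picked for illustration), if only the first 8 KiB of a
 * 128 KiB range share the allocation status of the first byte, the first
 * call reports pnum = 8 KiB and the loop continues from offset + 8 KiB.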
1559 do {
1560 bytes -= pnum;
1562 ret = bdrv_block_status(bs, offset, bytes, &pnum, NULL, NULL);
1563 if (ret < 0) {
1564 return ret;
1568 trace_pci_nvme_block_status(offset, bytes, pnum, ret,
1569 !!(ret & BDRV_BLOCK_ZERO));
1571 if (!(ret & flags)) {
1572 return 1;
1575 offset += pnum;
1576 } while (pnum != bytes);
1578 return 0;
1581 static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
1582 uint32_t nlb)
1584 int ret;
1585 Error *err = NULL;
1587 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_DATA);
1588 if (ret) {
1589 if (ret < 0) {
1590 error_setg_errno(&err, -ret, "unable to get block status");
1591 error_report_err(err);
1593 return NVME_INTERNAL_DEV_ERROR;
1596 return NVME_DULB;
1599 return NVME_SUCCESS;
1602 static void nvme_aio_err(NvmeRequest *req, int ret)
1604 uint16_t status = NVME_SUCCESS;
1605 Error *local_err = NULL;
1607 switch (req->cmd.opcode) {
1608 case NVME_CMD_READ:
1609 status = NVME_UNRECOVERED_READ;
1610 break;
1611 case NVME_CMD_FLUSH:
1612 case NVME_CMD_WRITE:
1613 case NVME_CMD_WRITE_ZEROES:
1614 case NVME_CMD_ZONE_APPEND:
1615 status = NVME_WRITE_FAULT;
1616 break;
1617 default:
1618 status = NVME_INTERNAL_DEV_ERROR;
1619 break;
1622 trace_pci_nvme_err_aio(nvme_cid(req), strerror(-ret), status);
1624 error_setg_errno(&local_err, -ret, "aio failed");
1625 error_report_err(local_err);
1628 * Set the command status code to the first encountered error but allow a
1629 * subsequent Internal Device Error to trump it.
1631 if (req->status && status != NVME_INTERNAL_DEV_ERROR) {
1632 return;
1635 req->status = status;
1638 static inline uint32_t nvme_zone_idx(NvmeNamespace *ns, uint64_t slba)
1640 return ns->zone_size_log2 > 0 ? slba >> ns->zone_size_log2 :
1641 slba / ns->zone_size;
1644 static inline NvmeZone *nvme_get_zone_by_slba(NvmeNamespace *ns, uint64_t slba)
1646 uint32_t zone_idx = nvme_zone_idx(ns, slba);
1648 if (zone_idx >= ns->num_zones) {
1649 return NULL;
1652 return &ns->zone_array[zone_idx];
1655 static uint16_t nvme_check_zone_state_for_write(NvmeZone *zone)
1657 uint64_t zslba = zone->d.zslba;
1659 switch (nvme_get_zone_state(zone)) {
1660 case NVME_ZONE_STATE_EMPTY:
1661 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1662 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1663 case NVME_ZONE_STATE_CLOSED:
1664 return NVME_SUCCESS;
1665 case NVME_ZONE_STATE_FULL:
1666 trace_pci_nvme_err_zone_is_full(zslba);
1667 return NVME_ZONE_FULL;
1668 case NVME_ZONE_STATE_OFFLINE:
1669 trace_pci_nvme_err_zone_is_offline(zslba);
1670 return NVME_ZONE_OFFLINE;
1671 case NVME_ZONE_STATE_READ_ONLY:
1672 trace_pci_nvme_err_zone_is_read_only(zslba);
1673 return NVME_ZONE_READ_ONLY;
1674 default:
1675 assert(false);
1678 return NVME_INTERNAL_DEV_ERROR;
1681 static uint16_t nvme_check_zone_write(NvmeNamespace *ns, NvmeZone *zone,
1682 uint64_t slba, uint32_t nlb)
1684 uint64_t zcap = nvme_zone_wr_boundary(zone);
1685 uint16_t status;
1687 status = nvme_check_zone_state_for_write(zone);
1688 if (status) {
1689 return status;
1692 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1693 uint64_t ezrwa = zone->w_ptr + 2 * ns->zns.zrwas;
1695 if (slba < zone->w_ptr || slba + nlb > ezrwa) {
1696 trace_pci_nvme_err_zone_invalid_write(slba, zone->w_ptr);
1697 return NVME_ZONE_INVALID_WRITE;
1699 } else {
1700 if (unlikely(slba != zone->w_ptr)) {
1701 trace_pci_nvme_err_write_not_at_wp(slba, zone->d.zslba,
1702 zone->w_ptr);
1703 return NVME_ZONE_INVALID_WRITE;
1707 if (unlikely((slba + nlb) > zcap)) {
1708 trace_pci_nvme_err_zone_boundary(slba, nlb, zcap);
1709 return NVME_ZONE_BOUNDARY_ERROR;
1712 return NVME_SUCCESS;
1715 static uint16_t nvme_check_zone_state_for_read(NvmeZone *zone)
1717 switch (nvme_get_zone_state(zone)) {
1718 case NVME_ZONE_STATE_EMPTY:
1719 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1720 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1721 case NVME_ZONE_STATE_FULL:
1722 case NVME_ZONE_STATE_CLOSED:
1723 case NVME_ZONE_STATE_READ_ONLY:
1724 return NVME_SUCCESS;
1725 case NVME_ZONE_STATE_OFFLINE:
1726 trace_pci_nvme_err_zone_is_offline(zone->d.zslba);
1727 return NVME_ZONE_OFFLINE;
1728 default:
1729 assert(false);
1732 return NVME_INTERNAL_DEV_ERROR;
1735 static uint16_t nvme_check_zone_read(NvmeNamespace *ns, uint64_t slba,
1736 uint32_t nlb)
1738 NvmeZone *zone;
1739 uint64_t bndry, end;
1740 uint16_t status;
1742 zone = nvme_get_zone_by_slba(ns, slba);
1743 assert(zone);
1745 bndry = nvme_zone_rd_boundary(ns, zone);
1746 end = slba + nlb;
1748 status = nvme_check_zone_state_for_read(zone);
1749 if (status) {
1751 } else if (unlikely(end > bndry)) {
1752 if (!ns->params.cross_zone_read) {
1753 status = NVME_ZONE_BOUNDARY_ERROR;
1754 } else {
1756 * Read across zone boundary - check that all subsequent
1757 * zones that are being read have an appropriate state.
1759 do {
1760 zone++;
1761 status = nvme_check_zone_state_for_read(zone);
1762 if (status) {
1763 break;
1765 } while (end > nvme_zone_rd_boundary(ns, zone));
1769 return status;
1772 static uint16_t nvme_zrm_finish(NvmeNamespace *ns, NvmeZone *zone)
1774 switch (nvme_get_zone_state(zone)) {
1775 case NVME_ZONE_STATE_FULL:
1776 return NVME_SUCCESS;
1778 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1779 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1780 nvme_aor_dec_open(ns);
1781 /* fallthrough */
1782 case NVME_ZONE_STATE_CLOSED:
1783 nvme_aor_dec_active(ns);
1785 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1786 zone->d.za &= ~NVME_ZA_ZRWA_VALID;
1787 if (ns->params.numzrwa) {
1788 ns->zns.numzrwa++;
1792 /* fallthrough */
1793 case NVME_ZONE_STATE_EMPTY:
1794 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_FULL);
1795 return NVME_SUCCESS;
1797 default:
1798 return NVME_ZONE_INVAL_TRANSITION;
1802 static uint16_t nvme_zrm_close(NvmeNamespace *ns, NvmeZone *zone)
1804 switch (nvme_get_zone_state(zone)) {
1805 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1806 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1807 nvme_aor_dec_open(ns);
1808 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
1809 /* fall through */
1810 case NVME_ZONE_STATE_CLOSED:
1811 return NVME_SUCCESS;
1813 default:
1814 return NVME_ZONE_INVAL_TRANSITION;
1818 static uint16_t nvme_zrm_reset(NvmeNamespace *ns, NvmeZone *zone)
1820 switch (nvme_get_zone_state(zone)) {
1821 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1822 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1823 nvme_aor_dec_open(ns);
1824 /* fallthrough */
1825 case NVME_ZONE_STATE_CLOSED:
1826 nvme_aor_dec_active(ns);
1828 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1829 if (ns->params.numzrwa) {
1830 ns->zns.numzrwa++;
1834 /* fallthrough */
1835 case NVME_ZONE_STATE_FULL:
1836 zone->w_ptr = zone->d.zslba;
1837 zone->d.wp = zone->w_ptr;
1838 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EMPTY);
1839 /* fallthrough */
1840 case NVME_ZONE_STATE_EMPTY:
1841 return NVME_SUCCESS;
1843 default:
1844 return NVME_ZONE_INVAL_TRANSITION;
1848 static void nvme_zrm_auto_transition_zone(NvmeNamespace *ns)
1850 NvmeZone *zone;
1852 if (ns->params.max_open_zones &&
1853 ns->nr_open_zones == ns->params.max_open_zones) {
1854 zone = QTAILQ_FIRST(&ns->imp_open_zones);
1855 if (zone) {
1857 * Automatically close this implicitly open zone.
1859 QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
1860 nvme_zrm_close(ns, zone);
1865 enum {
1866 NVME_ZRM_AUTO = 1 << 0,
1867 NVME_ZRM_ZRWA = 1 << 1,
1870 static uint16_t nvme_zrm_open_flags(NvmeCtrl *n, NvmeNamespace *ns,
1871 NvmeZone *zone, int flags)
1873 int act = 0;
1874 uint16_t status;
1876 switch (nvme_get_zone_state(zone)) {
1877 case NVME_ZONE_STATE_EMPTY:
1878 act = 1;
1880 /* fallthrough */
1882 case NVME_ZONE_STATE_CLOSED:
1883 if (n->params.auto_transition_zones) {
1884 nvme_zrm_auto_transition_zone(ns);
1886 status = nvme_zns_check_resources(ns, act, 1,
1887 (flags & NVME_ZRM_ZRWA) ? 1 : 0);
1888 if (status) {
1889 return status;
1892 if (act) {
1893 nvme_aor_inc_active(ns);
1896 nvme_aor_inc_open(ns);
1898 if (flags & NVME_ZRM_AUTO) {
1899 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_IMPLICITLY_OPEN);
1900 return NVME_SUCCESS;
1903 /* fallthrough */
1905 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
1906 if (flags & NVME_ZRM_AUTO) {
1907 return NVME_SUCCESS;
1910 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_EXPLICITLY_OPEN);
1912 /* fallthrough */
1914 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
1915 if (flags & NVME_ZRM_ZRWA) {
1916 ns->zns.numzrwa--;
1918 zone->d.za |= NVME_ZA_ZRWA_VALID;
1921 return NVME_SUCCESS;
1923 default:
1924 return NVME_ZONE_INVAL_TRANSITION;
1928 static inline uint16_t nvme_zrm_auto(NvmeCtrl *n, NvmeNamespace *ns,
1929 NvmeZone *zone)
1931 return nvme_zrm_open_flags(n, ns, zone, NVME_ZRM_AUTO);
1934 static void nvme_advance_zone_wp(NvmeNamespace *ns, NvmeZone *zone,
1935 uint32_t nlb)
1937 zone->d.wp += nlb;
1939 if (zone->d.wp == nvme_zone_wr_boundary(zone)) {
1940 nvme_zrm_finish(ns, zone);
1944 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace *ns, NvmeZone *zone,
1945 uint32_t nlbc)
1947 uint16_t nzrwafgs = DIV_ROUND_UP(nlbc, ns->zns.zrwafg);
1949 nlbc = nzrwafgs * ns->zns.zrwafg;
1951 trace_pci_nvme_zoned_zrwa_implicit_flush(zone->d.zslba, nlbc);
1953 zone->w_ptr += nlbc;
1955 nvme_advance_zone_wp(ns, zone, nlbc);
1958 static void nvme_finalize_zoned_write(NvmeNamespace *ns, NvmeRequest *req)
1960 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1961 NvmeZone *zone;
1962 uint64_t slba;
1963 uint32_t nlb;
1965 slba = le64_to_cpu(rw->slba);
1966 nlb = le16_to_cpu(rw->nlb) + 1;
1967 zone = nvme_get_zone_by_slba(ns, slba);
1968 assert(zone);
1970 if (zone->d.za & NVME_ZA_ZRWA_VALID) {
1971 uint64_t ezrwa = zone->w_ptr + ns->zns.zrwas - 1;
1972 uint64_t elba = slba + nlb - 1;
1974 if (elba > ezrwa) {
1975 nvme_zoned_zrwa_implicit_flush(ns, zone, elba - ezrwa);
1978 return;
1981 nvme_advance_zone_wp(ns, zone, nlb);
1984 static inline bool nvme_is_write(NvmeRequest *req)
1986 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
1988 return rw->opcode == NVME_CMD_WRITE ||
1989 rw->opcode == NVME_CMD_ZONE_APPEND ||
1990 rw->opcode == NVME_CMD_WRITE_ZEROES;
1993 static AioContext *nvme_get_aio_context(BlockAIOCB *acb)
1995 return qemu_get_aio_context();
1998 static void nvme_misc_cb(void *opaque, int ret)
2000 NvmeRequest *req = opaque;
2002 trace_pci_nvme_misc_cb(nvme_cid(req));
2004 if (ret) {
2005 nvme_aio_err(req, ret);
2008 nvme_enqueue_req_completion(nvme_cq(req), req);
2011 void nvme_rw_complete_cb(void *opaque, int ret)
2013 NvmeRequest *req = opaque;
2014 NvmeNamespace *ns = req->ns;
2015 BlockBackend *blk = ns->blkconf.blk;
2016 BlockAcctCookie *acct = &req->acct;
2017 BlockAcctStats *stats = blk_get_stats(blk);
2019 trace_pci_nvme_rw_complete_cb(nvme_cid(req), blk_name(blk));
2021 if (ret) {
2022 block_acct_failed(stats, acct);
2023 nvme_aio_err(req, ret);
2024 } else {
2025 block_acct_done(stats, acct);
2028 if (ns->params.zoned && nvme_is_write(req)) {
2029 nvme_finalize_zoned_write(ns, req);
2032 nvme_enqueue_req_completion(nvme_cq(req), req);
2035 static void nvme_rw_cb(void *opaque, int ret)
2037 NvmeRequest *req = opaque;
2038 NvmeNamespace *ns = req->ns;
2040 BlockBackend *blk = ns->blkconf.blk;
2042 trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
2044 if (ret) {
2045 goto out;
2048 if (ns->lbaf.ms) {
2049 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2050 uint64_t slba = le64_to_cpu(rw->slba);
2051 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
2052 uint64_t offset = nvme_moff(ns, slba);
2054 if (req->cmd.opcode == NVME_CMD_WRITE_ZEROES) {
2055 size_t mlen = nvme_m2b(ns, nlb);
2057 req->aiocb = blk_aio_pwrite_zeroes(blk, offset, mlen,
2058 BDRV_REQ_MAY_UNMAP,
2059 nvme_rw_complete_cb, req);
2060 return;
2063 if (nvme_ns_ext(ns) || req->cmd.mptr) {
2064 uint16_t status;
2066 nvme_sg_unmap(&req->sg);
2067 status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
2068 if (status) {
2069 ret = -EFAULT;
2070 goto out;
2073 if (req->cmd.opcode == NVME_CMD_READ) {
2074 return nvme_blk_read(blk, offset, nvme_rw_complete_cb, req);
2077 return nvme_blk_write(blk, offset, nvme_rw_complete_cb, req);
2081 out:
2082 nvme_rw_complete_cb(req, ret);
2085 static void nvme_verify_cb(void *opaque, int ret)
2087 NvmeBounceContext *ctx = opaque;
2088 NvmeRequest *req = ctx->req;
2089 NvmeNamespace *ns = req->ns;
2090 BlockBackend *blk = ns->blkconf.blk;
2091 BlockAcctCookie *acct = &req->acct;
2092 BlockAcctStats *stats = blk_get_stats(blk);
2093 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2094 uint64_t slba = le64_to_cpu(rw->slba);
2095 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2096 uint16_t apptag = le16_to_cpu(rw->apptag);
2097 uint16_t appmask = le16_to_cpu(rw->appmask);
2098 uint64_t reftag = le32_to_cpu(rw->reftag);
2099 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2100 uint16_t status;
2102 reftag |= cdw3 << 32;
2104 trace_pci_nvme_verify_cb(nvme_cid(req), prinfo, apptag, appmask, reftag);
2106 if (ret) {
2107 block_acct_failed(stats, acct);
2108 nvme_aio_err(req, ret);
2109 goto out;
2112 block_acct_done(stats, acct);
2114 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2115 status = nvme_dif_mangle_mdata(ns, ctx->mdata.bounce,
2116 ctx->mdata.iov.size, slba);
2117 if (status) {
2118 req->status = status;
2119 goto out;
2122 req->status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2123 ctx->mdata.bounce, ctx->mdata.iov.size,
2124 prinfo, slba, apptag, appmask, &reftag);
2127 out:
2128 qemu_iovec_destroy(&ctx->data.iov);
2129 g_free(ctx->data.bounce);
2131 qemu_iovec_destroy(&ctx->mdata.iov);
2132 g_free(ctx->mdata.bounce);
2134 g_free(ctx);
2136 nvme_enqueue_req_completion(nvme_cq(req), req);
2140 static void nvme_verify_mdata_in_cb(void *opaque, int ret)
2142 NvmeBounceContext *ctx = opaque;
2143 NvmeRequest *req = ctx->req;
2144 NvmeNamespace *ns = req->ns;
2145 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2146 uint64_t slba = le64_to_cpu(rw->slba);
2147 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2148 size_t mlen = nvme_m2b(ns, nlb);
2149 uint64_t offset = nvme_moff(ns, slba);
2150 BlockBackend *blk = ns->blkconf.blk;
2152 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req), blk_name(blk));
2154 if (ret) {
2155 goto out;
2158 ctx->mdata.bounce = g_malloc(mlen);
2160 qemu_iovec_reset(&ctx->mdata.iov);
2161 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2163 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2164 nvme_verify_cb, ctx);
2165 return;
2167 out:
2168 nvme_verify_cb(ctx, ret);
2171 struct nvme_compare_ctx {
2172 struct {
2173 QEMUIOVector iov;
2174 uint8_t *bounce;
2175 } data;
2177 struct {
2178 QEMUIOVector iov;
2179 uint8_t *bounce;
2180 } mdata;
2183 static void nvme_compare_mdata_cb(void *opaque, int ret)
2185 NvmeRequest *req = opaque;
2186 NvmeNamespace *ns = req->ns;
2187 NvmeCtrl *n = nvme_ctrl(req);
2188 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2189 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2190 uint16_t apptag = le16_to_cpu(rw->apptag);
2191 uint16_t appmask = le16_to_cpu(rw->appmask);
2192 uint64_t reftag = le32_to_cpu(rw->reftag);
2193 uint64_t cdw3 = le32_to_cpu(rw->cdw3);
2194 struct nvme_compare_ctx *ctx = req->opaque;
2195 g_autofree uint8_t *buf = NULL;
2196 BlockBackend *blk = ns->blkconf.blk;
2197 BlockAcctCookie *acct = &req->acct;
2198 BlockAcctStats *stats = blk_get_stats(blk);
2199 uint16_t status = NVME_SUCCESS;
2201 reftag |= cdw3 << 32;
2203 trace_pci_nvme_compare_mdata_cb(nvme_cid(req));
2205 if (ret) {
2206 block_acct_failed(stats, acct);
2207 nvme_aio_err(req, ret);
2208 goto out;
2211 buf = g_malloc(ctx->mdata.iov.size);
2213 status = nvme_bounce_mdata(n, buf, ctx->mdata.iov.size,
2214 NVME_TX_DIRECTION_TO_DEVICE, req);
2215 if (status) {
2216 req->status = status;
2217 goto out;
2220 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2221 uint64_t slba = le64_to_cpu(rw->slba);
2222 uint8_t *bufp;
2223 uint8_t *mbufp = ctx->mdata.bounce;
2224 uint8_t *end = mbufp + ctx->mdata.iov.size;
2225 int16_t pil = 0;
2227 status = nvme_dif_check(ns, ctx->data.bounce, ctx->data.iov.size,
2228 ctx->mdata.bounce, ctx->mdata.iov.size, prinfo,
2229 slba, apptag, appmask, &reftag);
2230 if (status) {
2231 req->status = status;
2232 goto out;
2236 * When formatted with protection information, do not compare the DIF
2237 * tuple.
2239 if (!(ns->id_ns.dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
2240 pil = ns->lbaf.ms - nvme_pi_tuple_size(ns);
2243 for (bufp = buf; mbufp < end; bufp += ns->lbaf.ms, mbufp += ns->lbaf.ms) {
2244 if (memcmp(bufp + pil, mbufp + pil, ns->lbaf.ms - pil)) {
2245 req->status = NVME_CMP_FAILURE;
2246 goto out;
2250 goto out;
2253 if (memcmp(buf, ctx->mdata.bounce, ctx->mdata.iov.size)) {
2254 req->status = NVME_CMP_FAILURE;
2255 goto out;
2258 block_acct_done(stats, acct);
2260 out:
2261 qemu_iovec_destroy(&ctx->data.iov);
2262 g_free(ctx->data.bounce);
2264 qemu_iovec_destroy(&ctx->mdata.iov);
2265 g_free(ctx->mdata.bounce);
2267 g_free(ctx);
2269 nvme_enqueue_req_completion(nvme_cq(req), req);
2272 static void nvme_compare_data_cb(void *opaque, int ret)
2274 NvmeRequest *req = opaque;
2275 NvmeCtrl *n = nvme_ctrl(req);
2276 NvmeNamespace *ns = req->ns;
2277 BlockBackend *blk = ns->blkconf.blk;
2278 BlockAcctCookie *acct = &req->acct;
2279 BlockAcctStats *stats = blk_get_stats(blk);
2281 struct nvme_compare_ctx *ctx = req->opaque;
2282 g_autofree uint8_t *buf = NULL;
2283 uint16_t status;
2285 trace_pci_nvme_compare_data_cb(nvme_cid(req));
2287 if (ret) {
2288 block_acct_failed(stats, acct);
2289 nvme_aio_err(req, ret);
2290 goto out;
2293 buf = g_malloc(ctx->data.iov.size);
2295 status = nvme_bounce_data(n, buf, ctx->data.iov.size,
2296 NVME_TX_DIRECTION_TO_DEVICE, req);
2297 if (status) {
2298 req->status = status;
2299 goto out;
2302 if (memcmp(buf, ctx->data.bounce, ctx->data.iov.size)) {
2303 req->status = NVME_CMP_FAILURE;
2304 goto out;
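    /*
     * The data blocks matched. If the namespace has per-block metadata, read
     * it back as well and let nvme_compare_mdata_cb do the final comparison
     * before completing the command.
     */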
2307 if (ns->lbaf.ms) {
2308 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2309 uint64_t slba = le64_to_cpu(rw->slba);
2310 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2311 size_t mlen = nvme_m2b(ns, nlb);
2312 uint64_t offset = nvme_moff(ns, slba);
2314 ctx->mdata.bounce = g_malloc(mlen);
2316 qemu_iovec_init(&ctx->mdata.iov, 1);
2317 qemu_iovec_add(&ctx->mdata.iov, ctx->mdata.bounce, mlen);
2319 req->aiocb = blk_aio_preadv(blk, offset, &ctx->mdata.iov, 0,
2320 nvme_compare_mdata_cb, req);
2321 return;
2324 block_acct_done(stats, acct);
2326 out:
2327 qemu_iovec_destroy(&ctx->data.iov);
2328 g_free(ctx->data.bounce);
2329 g_free(ctx);
2331 nvme_enqueue_req_completion(nvme_cq(req), req);
2334 typedef struct NvmeDSMAIOCB {
2335 BlockAIOCB common;
2336 BlockAIOCB *aiocb;
2337 NvmeRequest *req;
2338 QEMUBH *bh;
2339 int ret;
2341 NvmeDsmRange *range;
2342 unsigned int nr;
2343 unsigned int idx;
2344 } NvmeDSMAIOCB;
2346 static void nvme_dsm_cancel(BlockAIOCB *aiocb)
2348 NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
2350 /* break nvme_dsm_cb loop */
2351 iocb->idx = iocb->nr;
2352 iocb->ret = -ECANCELED;
2354 if (iocb->aiocb) {
2355 blk_aio_cancel_async(iocb->aiocb);
2356 iocb->aiocb = NULL;
2357 } else {
2359 * We only reach this if nvme_dsm_cancel() has already been called or
2360 * the command ran to completion and nvme_dsm_bh is scheduled to run.
2362 assert(iocb->idx == iocb->nr);
2366 static const AIOCBInfo nvme_dsm_aiocb_info = {
2367 .aiocb_size = sizeof(NvmeDSMAIOCB),
2368 .cancel_async = nvme_dsm_cancel,
2371 static void nvme_dsm_bh(void *opaque)
2373 NvmeDSMAIOCB *iocb = opaque;
2375 iocb->common.cb(iocb->common.opaque, iocb->ret);
2377 qemu_bh_delete(iocb->bh);
2378 iocb->bh = NULL;
2379 qemu_aio_unref(iocb);
2382 static void nvme_dsm_cb(void *opaque, int ret);
2384 static void nvme_dsm_md_cb(void *opaque, int ret)
2386 NvmeDSMAIOCB *iocb = opaque;
2387 NvmeRequest *req = iocb->req;
2388 NvmeNamespace *ns = req->ns;
2389 NvmeDsmRange *range;
2390 uint64_t slba;
2391 uint32_t nlb;
2393 if (ret < 0) {
2394 iocb->ret = ret;
2395 goto done;
2398 if (!ns->lbaf.ms) {
2399 nvme_dsm_cb(iocb, 0);
2400 return;
2403 range = &iocb->range[iocb->idx - 1];
2404 slba = le64_to_cpu(range->slba);
2405 nlb = le32_to_cpu(range->nlb);
2408      * Check that all blocks were discarded (zeroed); otherwise we do not zero
2409 * the metadata.
2412 ret = nvme_block_status_all(ns, slba, nlb, BDRV_BLOCK_ZERO);
2413 if (ret) {
2414 if (ret < 0) {
2415 iocb->ret = ret;
2416 goto done;
2419 nvme_dsm_cb(iocb, 0);
2420 return;
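    /*
     * All data blocks in the range read back as zeroes, so zero the
     * corresponding metadata region as well before moving on to the next
     * range.
     */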
2423 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, nvme_moff(ns, slba),
2424 nvme_m2b(ns, nlb), BDRV_REQ_MAY_UNMAP,
2425 nvme_dsm_cb, iocb);
2426 return;
2428 done:
2429 iocb->aiocb = NULL;
2430 qemu_bh_schedule(iocb->bh);
2433 static void nvme_dsm_cb(void *opaque, int ret)
2435 NvmeDSMAIOCB *iocb = opaque;
2436 NvmeRequest *req = iocb->req;
2437 NvmeCtrl *n = nvme_ctrl(req);
2438 NvmeNamespace *ns = req->ns;
2439 NvmeDsmRange *range;
2440 uint64_t slba;
2441 uint32_t nlb;
2443 if (ret < 0) {
2444 iocb->ret = ret;
2445 goto done;
2448 next:
2449 if (iocb->idx == iocb->nr) {
2450 goto done;
2453 range = &iocb->range[iocb->idx++];
2454 slba = le64_to_cpu(range->slba);
2455 nlb = le32_to_cpu(range->nlb);
2457 trace_pci_nvme_dsm_deallocate(slba, nlb);
2459 if (nlb > n->dmrsl) {
2460 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
2461 goto next;
2464 if (nvme_check_bounds(ns, slba, nlb)) {
2465 trace_pci_nvme_err_invalid_lba_range(slba, nlb,
2466 ns->id_ns.nsze);
2467 goto next;
2470 iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, nvme_l2b(ns, slba),
2471 nvme_l2b(ns, nlb),
2472 nvme_dsm_md_cb, iocb);
2473 return;
2475 done:
2476 iocb->aiocb = NULL;
2477 qemu_bh_schedule(iocb->bh);
2480 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
2482 NvmeNamespace *ns = req->ns;
2483 NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
2484 uint32_t attr = le32_to_cpu(dsm->attributes);
2485 uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
2486 uint16_t status = NVME_SUCCESS;
2488 trace_pci_nvme_dsm(nr, attr);
2490 if (attr & NVME_DSMGMT_AD) {
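        /*
         * Only the Attribute - Deallocate (AD) bit triggers any work: the
         * ranges are copied in from the host and then discarded one by one
         * by the nvme_dsm_cb/nvme_dsm_md_cb callback chain.
         */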
2491 NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
2492 nvme_misc_cb, req);
2494 iocb->req = req;
2495 iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
2496 iocb->ret = 0;
2497 iocb->range = g_new(NvmeDsmRange, nr);
2498 iocb->nr = nr;
2499 iocb->idx = 0;
2501 status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
2502 req);
2503 if (status) {
2504 return status;
2507 req->aiocb = &iocb->common;
2508 nvme_dsm_cb(iocb, 0);
2510 return NVME_NO_COMPLETE;
2513 return status;
2516 static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
2518 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
2519 NvmeNamespace *ns = req->ns;
2520 BlockBackend *blk = ns->blkconf.blk;
2521 uint64_t slba = le64_to_cpu(rw->slba);
2522 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
2523 size_t len = nvme_l2b(ns, nlb);
2524 int64_t offset = nvme_l2b(ns, slba);
2525 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
2526 uint32_t reftag = le32_to_cpu(rw->reftag);
2527 NvmeBounceContext *ctx = NULL;
2528 uint16_t status;
2530 trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);
2532 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2533 status = nvme_check_prinfo(ns, prinfo, slba, reftag);
2534 if (status) {
2535 return status;
2538 if (prinfo & NVME_PRINFO_PRACT) {
2539 return NVME_INVALID_PROT_INFO | NVME_DNR;
2543 if (len > n->page_size << n->params.vsl) {
2544 return NVME_INVALID_FIELD | NVME_DNR;
2547 status = nvme_check_bounds(ns, slba, nlb);
2548 if (status) {
2549 return status;
2552 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2553 status = nvme_check_dulbe(ns, slba, nlb);
2554 if (status) {
2555 return status;
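    /*
     * Verify is emulated by reading the data (and, in
     * nvme_verify_mdata_in_cb, the metadata) into bounce buffers and running
     * the protection information checks in nvme_verify_cb; nothing is
     * transferred to the host.
     */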
2559 ctx = g_new0(NvmeBounceContext, 1);
2560 ctx->req = req;
2562 ctx->data.bounce = g_malloc(len);
2564 qemu_iovec_init(&ctx->data.iov, 1);
2565 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);
2567 block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
2568 BLOCK_ACCT_READ);
2570 req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
2571 nvme_verify_mdata_in_cb, ctx);
2572 return NVME_NO_COMPLETE;
2575 typedef struct NvmeCopyAIOCB {
2576 BlockAIOCB common;
2577 BlockAIOCB *aiocb;
2578 NvmeRequest *req;
2579 QEMUBH *bh;
2580 int ret;
2582 void *ranges;
2583 unsigned int format;
2584 int nr;
2585 int idx;
2587 uint8_t *bounce;
2588 QEMUIOVector iov;
2589 struct {
2590 BlockAcctCookie read;
2591 BlockAcctCookie write;
2592 } acct;
2594 uint64_t reftag;
2595 uint64_t slba;
2597 NvmeZone *zone;
2598 } NvmeCopyAIOCB;
2600 static void nvme_copy_cancel(BlockAIOCB *aiocb)
2602 NvmeCopyAIOCB *iocb = container_of(aiocb, NvmeCopyAIOCB, common);
2604 iocb->ret = -ECANCELED;
2606 if (iocb->aiocb) {
2607 blk_aio_cancel_async(iocb->aiocb);
2608 iocb->aiocb = NULL;
2612 static const AIOCBInfo nvme_copy_aiocb_info = {
2613 .aiocb_size = sizeof(NvmeCopyAIOCB),
2614 .cancel_async = nvme_copy_cancel,
2617 static void nvme_copy_bh(void *opaque)
2619 NvmeCopyAIOCB *iocb = opaque;
2620 NvmeRequest *req = iocb->req;
2621 NvmeNamespace *ns = req->ns;
2622 BlockAcctStats *stats = blk_get_stats(ns->blkconf.blk);
2624 if (iocb->idx != iocb->nr) {
2625 req->cqe.result = cpu_to_le32(iocb->idx);
2628 qemu_iovec_destroy(&iocb->iov);
2629 g_free(iocb->bounce);
2631 qemu_bh_delete(iocb->bh);
2632 iocb->bh = NULL;
2634 if (iocb->ret < 0) {
2635 block_acct_failed(stats, &iocb->acct.read);
2636 block_acct_failed(stats, &iocb->acct.write);
2637 } else {
2638 block_acct_done(stats, &iocb->acct.read);
2639 block_acct_done(stats, &iocb->acct.write);
2642 iocb->common.cb(iocb->common.opaque, iocb->ret);
2643 qemu_aio_unref(iocb);
2646 static void nvme_copy_cb(void *opaque, int ret);
2648 static void nvme_copy_source_range_parse_format0(void *ranges, int idx,
2649 uint64_t *slba, uint32_t *nlb,
2650 uint16_t *apptag,
2651 uint16_t *appmask,
2652 uint64_t *reftag)
2654 NvmeCopySourceRangeFormat0 *_ranges = ranges;
2656 if (slba) {
2657 *slba = le64_to_cpu(_ranges[idx].slba);
2660 if (nlb) {
2661 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2664 if (apptag) {
2665 *apptag = le16_to_cpu(_ranges[idx].apptag);
2668 if (appmask) {
2669 *appmask = le16_to_cpu(_ranges[idx].appmask);
2672 if (reftag) {
2673 *reftag = le32_to_cpu(_ranges[idx].reftag);
2677 static void nvme_copy_source_range_parse_format1(void *ranges, int idx,
2678 uint64_t *slba, uint32_t *nlb,
2679 uint16_t *apptag,
2680 uint16_t *appmask,
2681 uint64_t *reftag)
2683 NvmeCopySourceRangeFormat1 *_ranges = ranges;
2685 if (slba) {
2686 *slba = le64_to_cpu(_ranges[idx].slba);
2689 if (nlb) {
2690 *nlb = le16_to_cpu(_ranges[idx].nlb) + 1;
2693 if (apptag) {
2694 *apptag = le16_to_cpu(_ranges[idx].apptag);
2697 if (appmask) {
2698 *appmask = le16_to_cpu(_ranges[idx].appmask);
2701 if (reftag) {
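        /*
         * assemble the 48-bit reference tag from bytes 4 through 9 of the
         * sr field (stored in big-endian byte order)
         */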
2702 *reftag = 0;
2704 *reftag |= (uint64_t)_ranges[idx].sr[4] << 40;
2705 *reftag |= (uint64_t)_ranges[idx].sr[5] << 32;
2706 *reftag |= (uint64_t)_ranges[idx].sr[6] << 24;
2707 *reftag |= (uint64_t)_ranges[idx].sr[7] << 16;
2708 *reftag |= (uint64_t)_ranges[idx].sr[8] << 8;
2709 *reftag |= (uint64_t)_ranges[idx].sr[9];
2713 static void nvme_copy_source_range_parse(void *ranges, int idx, uint8_t format,
2714 uint64_t *slba, uint32_t *nlb,
2715 uint16_t *apptag, uint16_t *appmask,
2716 uint64_t *reftag)
2718 switch (format) {
2719 case NVME_COPY_FORMAT_0:
2720 nvme_copy_source_range_parse_format0(ranges, idx, slba, nlb, apptag,
2721 appmask, reftag);
2722 break;
2724 case NVME_COPY_FORMAT_1:
2725 nvme_copy_source_range_parse_format1(ranges, idx, slba, nlb, apptag,
2726 appmask, reftag);
2727 break;
2729 default:
2730 abort();
2734 static void nvme_copy_out_completed_cb(void *opaque, int ret)
2736 NvmeCopyAIOCB *iocb = opaque;
2737 NvmeRequest *req = iocb->req;
2738 NvmeNamespace *ns = req->ns;
2739 uint32_t nlb;
2741 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2742 &nlb, NULL, NULL, NULL);
2744 if (ret < 0) {
2745 iocb->ret = ret;
2746 goto out;
2747 } else if (iocb->ret < 0) {
2748 goto out;
2751 if (ns->params.zoned) {
2752 nvme_advance_zone_wp(ns, iocb->zone, nlb);
2755 iocb->idx++;
2756 iocb->slba += nlb;
2757 out:
2758 nvme_copy_cb(iocb, iocb->ret);
2761 static void nvme_copy_out_cb(void *opaque, int ret)
2763 NvmeCopyAIOCB *iocb = opaque;
2764 NvmeRequest *req = iocb->req;
2765 NvmeNamespace *ns = req->ns;
2766 uint32_t nlb;
2767 size_t mlen;
2768 uint8_t *mbounce;
2770 if (ret < 0) {
2771 iocb->ret = ret;
2772 goto out;
2773 } else if (iocb->ret < 0) {
2774 goto out;
2777 if (!ns->lbaf.ms) {
2778 nvme_copy_out_completed_cb(iocb, 0);
2779 return;
2782 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, NULL,
2783 &nlb, NULL, NULL, NULL);
2785 mlen = nvme_m2b(ns, nlb);
2786 mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2788 qemu_iovec_reset(&iocb->iov);
2789 qemu_iovec_add(&iocb->iov, mbounce, mlen);
2791 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_moff(ns, iocb->slba),
2792 &iocb->iov, 0, nvme_copy_out_completed_cb,
2793 iocb);
2795 return;
2797 out:
2798 nvme_copy_cb(iocb, ret);
2801 static void nvme_copy_in_completed_cb(void *opaque, int ret)
2803 NvmeCopyAIOCB *iocb = opaque;
2804 NvmeRequest *req = iocb->req;
2805 NvmeNamespace *ns = req->ns;
2806 uint32_t nlb;
2807 uint64_t slba;
2808 uint16_t apptag, appmask;
2809 uint64_t reftag;
2810 size_t len;
2811 uint16_t status;
2813 if (ret < 0) {
2814 iocb->ret = ret;
2815 goto out;
2816 } else if (iocb->ret < 0) {
2817 goto out;
2820 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2821 &nlb, &apptag, &appmask, &reftag);
2822 len = nvme_l2b(ns, nlb);
2824 trace_pci_nvme_copy_out(iocb->slba, nlb);
2826 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
2827 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
2829 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
2830 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
2832 size_t mlen = nvme_m2b(ns, nlb);
2833 uint8_t *mbounce = iocb->bounce + nvme_l2b(ns, nlb);
2835 status = nvme_dif_mangle_mdata(ns, mbounce, mlen, slba);
2836 if (status) {
2837 goto invalid;
2839 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen, prinfor,
2840 slba, apptag, appmask, &reftag);
2841 if (status) {
2842 goto invalid;
2845 apptag = le16_to_cpu(copy->apptag);
2846 appmask = le16_to_cpu(copy->appmask);
2848 if (prinfow & NVME_PRINFO_PRACT) {
2849 status = nvme_check_prinfo(ns, prinfow, iocb->slba, iocb->reftag);
2850 if (status) {
2851 goto invalid;
2854 nvme_dif_pract_generate_dif(ns, iocb->bounce, len, mbounce, mlen,
2855 apptag, &iocb->reftag);
2856 } else {
2857 status = nvme_dif_check(ns, iocb->bounce, len, mbounce, mlen,
2858 prinfow, iocb->slba, apptag, appmask,
2859 &iocb->reftag);
2860 if (status) {
2861 goto invalid;
2866 status = nvme_check_bounds(ns, iocb->slba, nlb);
2867 if (status) {
2868 goto invalid;
2871 if (ns->params.zoned) {
2872 status = nvme_check_zone_write(ns, iocb->zone, iocb->slba, nlb);
2873 if (status) {
2874 goto invalid;
2877 if (!(iocb->zone->d.za & NVME_ZA_ZRWA_VALID)) {
2878 iocb->zone->w_ptr += nlb;
2882 qemu_iovec_reset(&iocb->iov);
2883 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2885 iocb->aiocb = blk_aio_pwritev(ns->blkconf.blk, nvme_l2b(ns, iocb->slba),
2886 &iocb->iov, 0, nvme_copy_out_cb, iocb);
2888 return;
2890 invalid:
2891 req->status = status;
2892 iocb->aiocb = NULL;
2893 if (iocb->bh) {
2894 qemu_bh_schedule(iocb->bh);
2897 return;
2899 out:
2900 nvme_copy_cb(iocb, ret);
2903 static void nvme_copy_in_cb(void *opaque, int ret)
2905 NvmeCopyAIOCB *iocb = opaque;
2906 NvmeRequest *req = iocb->req;
2907 NvmeNamespace *ns = req->ns;
2908 uint64_t slba;
2909 uint32_t nlb;
2911 if (ret < 0) {
2912 iocb->ret = ret;
2913 goto out;
2914 } else if (iocb->ret < 0) {
2915 goto out;
2918 if (!ns->lbaf.ms) {
2919 nvme_copy_in_completed_cb(iocb, 0);
2920 return;
2923 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2924 &nlb, NULL, NULL, NULL);
2926 qemu_iovec_reset(&iocb->iov);
2927 qemu_iovec_add(&iocb->iov, iocb->bounce + nvme_l2b(ns, nlb),
2928 nvme_m2b(ns, nlb));
2930 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_moff(ns, slba),
2931 &iocb->iov, 0, nvme_copy_in_completed_cb,
2932 iocb);
2933 return;
2935 out:
2936 nvme_copy_cb(iocb, iocb->ret);
2939 static void nvme_copy_cb(void *opaque, int ret)
2941 NvmeCopyAIOCB *iocb = opaque;
2942 NvmeRequest *req = iocb->req;
2943 NvmeNamespace *ns = req->ns;
2944 uint64_t slba;
2945 uint32_t nlb;
2946 size_t len;
2947 uint16_t status;
2949 if (ret < 0) {
2950 iocb->ret = ret;
2951 goto done;
2952 } else if (iocb->ret < 0) {
2953 goto done;
2956 if (iocb->idx == iocb->nr) {
2957 goto done;
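    /*
     * Validate the next source range and read its data into the bounce
     * buffer; nvme_copy_in_cb and nvme_copy_out_cb continue the chain by
     * reading the metadata and then writing data and metadata to the
     * destination.
     */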
2960 nvme_copy_source_range_parse(iocb->ranges, iocb->idx, iocb->format, &slba,
2961 &nlb, NULL, NULL, NULL);
2962 len = nvme_l2b(ns, nlb);
2964 trace_pci_nvme_copy_source_range(slba, nlb);
2966 if (nlb > le16_to_cpu(ns->id_ns.mssrl)) {
2967 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
2968 goto invalid;
2971 status = nvme_check_bounds(ns, slba, nlb);
2972 if (status) {
2973 goto invalid;
2976 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
2977 status = nvme_check_dulbe(ns, slba, nlb);
2978 if (status) {
2979 goto invalid;
2983 if (ns->params.zoned) {
2984 status = nvme_check_zone_read(ns, slba, nlb);
2985 if (status) {
2986 goto invalid;
2990 qemu_iovec_reset(&iocb->iov);
2991 qemu_iovec_add(&iocb->iov, iocb->bounce, len);
2993 iocb->aiocb = blk_aio_preadv(ns->blkconf.blk, nvme_l2b(ns, slba),
2994 &iocb->iov, 0, nvme_copy_in_cb, iocb);
2995 return;
2997 invalid:
2998 req->status = status;
2999 done:
3000 iocb->aiocb = NULL;
3001 if (iocb->bh) {
3002 qemu_bh_schedule(iocb->bh);
3007 static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)
3009 NvmeNamespace *ns = req->ns;
3010 NvmeCopyCmd *copy = (NvmeCopyCmd *)&req->cmd;
3011 NvmeCopyAIOCB *iocb = blk_aio_get(&nvme_copy_aiocb_info, ns->blkconf.blk,
3012 nvme_misc_cb, req);
3013 uint16_t nr = copy->nr + 1;
3014 uint8_t format = copy->control[0] & 0xf;
3015 uint16_t prinfor = ((copy->control[0] >> 4) & 0xf);
3016 uint16_t prinfow = ((copy->control[2] >> 2) & 0xf);
3017 size_t len = sizeof(NvmeCopySourceRangeFormat0);
3019 uint16_t status;
3021 trace_pci_nvme_copy(nvme_cid(req), nvme_nsid(ns), nr, format);
3023 iocb->ranges = NULL;
3024 iocb->zone = NULL;
3026 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) &&
3027 ((prinfor & NVME_PRINFO_PRACT) != (prinfow & NVME_PRINFO_PRACT))) {
3028 status = NVME_INVALID_FIELD | NVME_DNR;
3029 goto invalid;
3032 if (!(n->id_ctrl.ocfs & (1 << format))) {
3033 trace_pci_nvme_err_copy_invalid_format(format);
3034 status = NVME_INVALID_FIELD | NVME_DNR;
3035 goto invalid;
3038 if (nr > ns->id_ns.msrc + 1) {
3039 status = NVME_CMD_SIZE_LIMIT | NVME_DNR;
3040 goto invalid;
3043 if (ns->pif && format != 0x1) {
3044 status = NVME_INVALID_FORMAT | NVME_DNR;
3045 goto invalid;
3048 if (ns->pif) {
3049 len = sizeof(NvmeCopySourceRangeFormat1);
3052 iocb->format = format;
3053 iocb->ranges = g_malloc_n(nr, len);
3054 status = nvme_h2c(n, (uint8_t *)iocb->ranges, len * nr, req);
3055 if (status) {
3056 goto invalid;
3059 iocb->slba = le64_to_cpu(copy->sdlba);
3061 if (ns->params.zoned) {
3062 iocb->zone = nvme_get_zone_by_slba(ns, iocb->slba);
3063 if (!iocb->zone) {
3064 status = NVME_LBA_RANGE | NVME_DNR;
3065 goto invalid;
3068 status = nvme_zrm_auto(n, ns, iocb->zone);
3069 if (status) {
3070 goto invalid;
3074 iocb->req = req;
3075 iocb->bh = qemu_bh_new(nvme_copy_bh, iocb);
3076 iocb->ret = 0;
3077 iocb->nr = nr;
3078 iocb->idx = 0;
3079 iocb->reftag = le32_to_cpu(copy->reftag);
3080 iocb->reftag |= (uint64_t)le32_to_cpu(copy->cdw3) << 32;
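    /*
     * A single bounce buffer sized for the largest permitted source range
     * (MSSRL logical blocks plus their metadata); data is placed at the
     * start of the buffer and metadata after it.
     */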
3081 iocb->bounce = g_malloc_n(le16_to_cpu(ns->id_ns.mssrl),
3082 ns->lbasz + ns->lbaf.ms);
3084 qemu_iovec_init(&iocb->iov, 1);
3086 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.read, 0,
3087 BLOCK_ACCT_READ);
3088 block_acct_start(blk_get_stats(ns->blkconf.blk), &iocb->acct.write, 0,
3089 BLOCK_ACCT_WRITE);
3091 req->aiocb = &iocb->common;
3092 nvme_copy_cb(iocb, 0);
3094 return NVME_NO_COMPLETE;
3096 invalid:
3097 g_free(iocb->ranges);
3098 qemu_aio_unref(iocb);
3099 return status;
3102 static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
3104 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3105 NvmeNamespace *ns = req->ns;
3106 BlockBackend *blk = ns->blkconf.blk;
3107 uint64_t slba = le64_to_cpu(rw->slba);
3108 uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
3109 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3110 size_t data_len = nvme_l2b(ns, nlb);
3111 size_t len = data_len;
3112 int64_t offset = nvme_l2b(ns, slba);
3113 struct nvme_compare_ctx *ctx = NULL;
3114 uint16_t status;
3116 trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);
3118 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
3119 return NVME_INVALID_PROT_INFO | NVME_DNR;
3122 if (nvme_ns_ext(ns)) {
3123 len += nvme_m2b(ns, nlb);
3126 status = nvme_check_mdts(n, len);
3127 if (status) {
3128 return status;
3131 status = nvme_check_bounds(ns, slba, nlb);
3132 if (status) {
3133 return status;
3136 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3137 status = nvme_check_dulbe(ns, slba, nlb);
3138 if (status) {
3139 return status;
3143 status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
3144 if (status) {
3145 return status;
3148 ctx = g_new(struct nvme_compare_ctx, 1);
3149 ctx->data.bounce = g_malloc(data_len);
3151 req->opaque = ctx;
3153 qemu_iovec_init(&ctx->data.iov, 1);
3154 qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);
3156 block_acct_start(blk_get_stats(blk), &req->acct, data_len,
3157 BLOCK_ACCT_READ);
3158 req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
3159 nvme_compare_data_cb, req);
3161 return NVME_NO_COMPLETE;
3164 typedef struct NvmeFlushAIOCB {
3165 BlockAIOCB common;
3166 BlockAIOCB *aiocb;
3167 NvmeRequest *req;
3168 QEMUBH *bh;
3169 int ret;
3171 NvmeNamespace *ns;
3172 uint32_t nsid;
3173 bool broadcast;
3174 } NvmeFlushAIOCB;
3176 static void nvme_flush_cancel(BlockAIOCB *acb)
3178 NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);
3180 iocb->ret = -ECANCELED;
3182 if (iocb->aiocb) {
3183 blk_aio_cancel_async(iocb->aiocb);
3187 static const AIOCBInfo nvme_flush_aiocb_info = {
3188 .aiocb_size = sizeof(NvmeFlushAIOCB),
3189 .cancel_async = nvme_flush_cancel,
3190 .get_aio_context = nvme_get_aio_context,
3193 static void nvme_flush_ns_cb(void *opaque, int ret)
3195 NvmeFlushAIOCB *iocb = opaque;
3196 NvmeNamespace *ns = iocb->ns;
3198 if (ret < 0) {
3199 iocb->ret = ret;
3200 goto out;
3201 } else if (iocb->ret < 0) {
3202 goto out;
3205 if (ns) {
3206 trace_pci_nvme_flush_ns(iocb->nsid);
3208 iocb->ns = NULL;
3209 iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
3210 return;
3213 out:
3214 iocb->aiocb = NULL;
3215 qemu_bh_schedule(iocb->bh);
3218 static void nvme_flush_bh(void *opaque)
3220 NvmeFlushAIOCB *iocb = opaque;
3221 NvmeRequest *req = iocb->req;
3222 NvmeCtrl *n = nvme_ctrl(req);
3223 int i;
3225 if (iocb->ret < 0) {
3226 goto done;
3229 if (iocb->broadcast) {
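        /*
         * Pick the next active namespace; each one is flushed in turn by
         * nvme_flush_ns_cb until none remain.
         */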
3230 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
3231 iocb->ns = nvme_ns(n, i);
3232 if (iocb->ns) {
3233 iocb->nsid = i;
3234 break;
3239 if (!iocb->ns) {
3240 goto done;
3243 nvme_flush_ns_cb(iocb, 0);
3244 return;
3246 done:
3247 qemu_bh_delete(iocb->bh);
3248 iocb->bh = NULL;
3250 iocb->common.cb(iocb->common.opaque, iocb->ret);
3252 qemu_aio_unref(iocb);
3254 return;
3257 static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
3259 NvmeFlushAIOCB *iocb;
3260 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
3261 uint16_t status;
3263 iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);
3265 iocb->req = req;
3266 iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
3267 iocb->ret = 0;
3268 iocb->ns = NULL;
3269 iocb->nsid = 0;
3270 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
3272 if (!iocb->broadcast) {
3273 if (!nvme_nsid_valid(n, nsid)) {
3274 status = NVME_INVALID_NSID | NVME_DNR;
3275 goto out;
3278 iocb->ns = nvme_ns(n, nsid);
3279 if (!iocb->ns) {
3280 status = NVME_INVALID_FIELD | NVME_DNR;
3281 goto out;
3284 iocb->nsid = nsid;
3287 req->aiocb = &iocb->common;
3288 qemu_bh_schedule(iocb->bh);
3290 return NVME_NO_COMPLETE;
3292 out:
3293 qemu_bh_delete(iocb->bh);
3294 iocb->bh = NULL;
3295 qemu_aio_unref(iocb);
3297 return status;
3300 static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
3302 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3303 NvmeNamespace *ns = req->ns;
3304 uint64_t slba = le64_to_cpu(rw->slba);
3305 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3306 uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
3307 uint64_t data_size = nvme_l2b(ns, nlb);
3308 uint64_t mapped_size = data_size;
3309 uint64_t data_offset;
3310 BlockBackend *blk = ns->blkconf.blk;
3311 uint16_t status;
3313 if (nvme_ns_ext(ns)) {
3314 mapped_size += nvme_m2b(ns, nlb);
3316 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3317 bool pract = prinfo & NVME_PRINFO_PRACT;
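            /*
             * With PRACT set and the metadata consisting solely of the
             * protection information, the PI is handled by the controller
             * and never transferred, so only the data size counts against
             * MDTS.
             */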
3319 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3320 mapped_size = data_size;
3325 trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);
3327 status = nvme_check_mdts(n, mapped_size);
3328 if (status) {
3329 goto invalid;
3332 status = nvme_check_bounds(ns, slba, nlb);
3333 if (status) {
3334 goto invalid;
3337 if (ns->params.zoned) {
3338 status = nvme_check_zone_read(ns, slba, nlb);
3339 if (status) {
3340 trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
3341 goto invalid;
3345 if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
3346 status = nvme_check_dulbe(ns, slba, nlb);
3347 if (status) {
3348 goto invalid;
3352 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3353 return nvme_dif_rw(n, req);
3356 status = nvme_map_data(n, nlb, req);
3357 if (status) {
3358 goto invalid;
3361 data_offset = nvme_l2b(ns, slba);
3363 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3364 BLOCK_ACCT_READ);
3365 nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
3366 return NVME_NO_COMPLETE;
3368 invalid:
3369 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
3370 return status | NVME_DNR;
3373 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
3374 bool wrz)
3376 NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
3377 NvmeNamespace *ns = req->ns;
3378 uint64_t slba = le64_to_cpu(rw->slba);
3379 uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
3380 uint16_t ctrl = le16_to_cpu(rw->control);
3381 uint8_t prinfo = NVME_RW_PRINFO(ctrl);
3382 uint64_t data_size = nvme_l2b(ns, nlb);
3383 uint64_t mapped_size = data_size;
3384 uint64_t data_offset;
3385 NvmeZone *zone;
3386 NvmeZonedResult *res = (NvmeZonedResult *)&req->cqe;
3387 BlockBackend *blk = ns->blkconf.blk;
3388 uint16_t status;
3390 if (nvme_ns_ext(ns)) {
3391 mapped_size += nvme_m2b(ns, nlb);
3393 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3394 bool pract = prinfo & NVME_PRINFO_PRACT;
3396 if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
3397 mapped_size -= nvme_m2b(ns, nlb);
3402 trace_pci_nvme_write(nvme_cid(req), nvme_io_opc_str(rw->opcode),
3403 nvme_nsid(ns), nlb, mapped_size, slba);
3405 if (!wrz) {
3406 status = nvme_check_mdts(n, mapped_size);
3407 if (status) {
3408 goto invalid;
3412 status = nvme_check_bounds(ns, slba, nlb);
3413 if (status) {
3414 goto invalid;
3417 if (ns->params.zoned) {
3418 zone = nvme_get_zone_by_slba(ns, slba);
3419 assert(zone);
3421 if (append) {
3422 bool piremap = !!(ctrl & NVME_RW_PIREMAP);
3424 if (unlikely(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3425 return NVME_INVALID_ZONE_OP | NVME_DNR;
3428 if (unlikely(slba != zone->d.zslba)) {
3429 trace_pci_nvme_err_append_not_at_start(slba, zone->d.zslba);
3430 status = NVME_INVALID_FIELD;
3431 goto invalid;
3434 if (n->params.zasl &&
3435 data_size > (uint64_t)n->page_size << n->params.zasl) {
3436 trace_pci_nvme_err_zasl(data_size);
3437 return NVME_INVALID_FIELD | NVME_DNR;
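            /*
             * Zone Append writes at the zone's current write pointer;
             * rewrite the command's SLBA accordingly and report the LBA that
             * was actually written back to the host in the completion entry.
             */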
3440 slba = zone->w_ptr;
3441 rw->slba = cpu_to_le64(slba);
3442 res->slba = cpu_to_le64(slba);
3444 switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3445 case NVME_ID_NS_DPS_TYPE_1:
3446 if (!piremap) {
3447 return NVME_INVALID_PROT_INFO | NVME_DNR;
3450 /* fallthrough */
3452 case NVME_ID_NS_DPS_TYPE_2:
3453 if (piremap) {
3454 uint32_t reftag = le32_to_cpu(rw->reftag);
3455 rw->reftag = cpu_to_le32(reftag + (slba - zone->d.zslba));
3458 break;
3460 case NVME_ID_NS_DPS_TYPE_3:
3461 if (piremap) {
3462 return NVME_INVALID_PROT_INFO | NVME_DNR;
3465 break;
3469 status = nvme_check_zone_write(ns, zone, slba, nlb);
3470 if (status) {
3471 goto invalid;
3474 status = nvme_zrm_auto(n, ns, zone);
3475 if (status) {
3476 goto invalid;
3479 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3480 zone->w_ptr += nlb;
3484 data_offset = nvme_l2b(ns, slba);
3486 if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
3487 return nvme_dif_rw(n, req);
3490 if (!wrz) {
3491 status = nvme_map_data(n, nlb, req);
3492 if (status) {
3493 goto invalid;
3496 block_acct_start(blk_get_stats(blk), &req->acct, data_size,
3497 BLOCK_ACCT_WRITE);
3498 nvme_blk_write(blk, data_offset, nvme_rw_cb, req);
3499 } else {
3500 req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
3501 BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
3502 req);
3505 return NVME_NO_COMPLETE;
3507 invalid:
3508 block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
3509 return status | NVME_DNR;
3512 static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
3514 return nvme_do_write(n, req, false, false);
3517 static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
3519 return nvme_do_write(n, req, false, true);
3522 static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
3524 return nvme_do_write(n, req, true, false);
3527 static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
3528 uint64_t *slba, uint32_t *zone_idx)
3530 uint32_t dw10 = le32_to_cpu(c->cdw10);
3531 uint32_t dw11 = le32_to_cpu(c->cdw11);
3533 if (!ns->params.zoned) {
3534 trace_pci_nvme_err_invalid_opc(c->opcode);
3535 return NVME_INVALID_OPCODE | NVME_DNR;
3538 *slba = ((uint64_t)dw11) << 32 | dw10;
3539 if (unlikely(*slba >= ns->id_ns.nsze)) {
3540 trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
3541 *slba = 0;
3542 return NVME_LBA_RANGE | NVME_DNR;
3545 *zone_idx = nvme_zone_idx(ns, *slba);
3546 assert(*zone_idx < ns->num_zones);
3548 return NVME_SUCCESS;
3551 typedef uint16_t (*op_handler_t)(NvmeNamespace *, NvmeZone *, NvmeZoneState,
3552 NvmeRequest *);
3554 enum NvmeZoneProcessingMask {
3555 NVME_PROC_CURRENT_ZONE = 0,
3556 NVME_PROC_OPENED_ZONES = 1 << 0,
3557 NVME_PROC_CLOSED_ZONES = 1 << 1,
3558 NVME_PROC_READ_ONLY_ZONES = 1 << 2,
3559 NVME_PROC_FULL_ZONES = 1 << 3,
3562 static uint16_t nvme_open_zone(NvmeNamespace *ns, NvmeZone *zone,
3563 NvmeZoneState state, NvmeRequest *req)
3565 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3566 int flags = 0;
3568 if (cmd->zsflags & NVME_ZSFLAG_ZRWA_ALLOC) {
3569 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3571 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3572 return NVME_INVALID_ZONE_OP | NVME_DNR;
3575 if (zone->w_ptr % ns->zns.zrwafg) {
3576 return NVME_NOZRWA | NVME_DNR;
3579 flags = NVME_ZRM_ZRWA;
3582 return nvme_zrm_open_flags(nvme_ctrl(req), ns, zone, flags);
3585 static uint16_t nvme_close_zone(NvmeNamespace *ns, NvmeZone *zone,
3586 NvmeZoneState state, NvmeRequest *req)
3588 return nvme_zrm_close(ns, zone);
3591 static uint16_t nvme_finish_zone(NvmeNamespace *ns, NvmeZone *zone,
3592 NvmeZoneState state, NvmeRequest *req)
3594 return nvme_zrm_finish(ns, zone);
3597 static uint16_t nvme_offline_zone(NvmeNamespace *ns, NvmeZone *zone,
3598 NvmeZoneState state, NvmeRequest *req)
3600 switch (state) {
3601 case NVME_ZONE_STATE_READ_ONLY:
3602 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_OFFLINE);
3603 /* fall through */
3604 case NVME_ZONE_STATE_OFFLINE:
3605 return NVME_SUCCESS;
3606 default:
3607 return NVME_ZONE_INVAL_TRANSITION;
3611 static uint16_t nvme_set_zd_ext(NvmeNamespace *ns, NvmeZone *zone)
3613 uint16_t status;
3614 uint8_t state = nvme_get_zone_state(zone);
3616 if (state == NVME_ZONE_STATE_EMPTY) {
3617 status = nvme_aor_check(ns, 1, 0);
3618 if (status) {
3619 return status;
3621 nvme_aor_inc_active(ns);
3622 zone->d.za |= NVME_ZA_ZD_EXT_VALID;
3623 nvme_assign_zone_state(ns, zone, NVME_ZONE_STATE_CLOSED);
3624 return NVME_SUCCESS;
3627 return NVME_ZONE_INVAL_TRANSITION;
3630 static uint16_t nvme_bulk_proc_zone(NvmeNamespace *ns, NvmeZone *zone,
3631 enum NvmeZoneProcessingMask proc_mask,
3632 op_handler_t op_hndlr, NvmeRequest *req)
3634 uint16_t status = NVME_SUCCESS;
3635 NvmeZoneState zs = nvme_get_zone_state(zone);
3636 bool proc_zone;
3638 switch (zs) {
3639 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3640 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3641 proc_zone = proc_mask & NVME_PROC_OPENED_ZONES;
3642 break;
3643 case NVME_ZONE_STATE_CLOSED:
3644 proc_zone = proc_mask & NVME_PROC_CLOSED_ZONES;
3645 break;
3646 case NVME_ZONE_STATE_READ_ONLY:
3647 proc_zone = proc_mask & NVME_PROC_READ_ONLY_ZONES;
3648 break;
3649 case NVME_ZONE_STATE_FULL:
3650 proc_zone = proc_mask & NVME_PROC_FULL_ZONES;
3651 break;
3652 default:
3653 proc_zone = false;
3656 if (proc_zone) {
3657 status = op_hndlr(ns, zone, zs, req);
3660 return status;
3663 static uint16_t nvme_do_zone_op(NvmeNamespace *ns, NvmeZone *zone,
3664 enum NvmeZoneProcessingMask proc_mask,
3665 op_handler_t op_hndlr, NvmeRequest *req)
3667 NvmeZone *next;
3668 uint16_t status = NVME_SUCCESS;
3669 int i;
3671 if (!proc_mask) {
3672 status = op_hndlr(ns, zone, nvme_get_zone_state(zone), req);
3673 } else {
3674 if (proc_mask & NVME_PROC_CLOSED_ZONES) {
3675 QTAILQ_FOREACH_SAFE(zone, &ns->closed_zones, entry, next) {
3676 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3677 req);
3678 if (status && status != NVME_NO_COMPLETE) {
3679 goto out;
3683 if (proc_mask & NVME_PROC_OPENED_ZONES) {
3684 QTAILQ_FOREACH_SAFE(zone, &ns->imp_open_zones, entry, next) {
3685 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3686 req);
3687 if (status && status != NVME_NO_COMPLETE) {
3688 goto out;
3692 QTAILQ_FOREACH_SAFE(zone, &ns->exp_open_zones, entry, next) {
3693 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3694 req);
3695 if (status && status != NVME_NO_COMPLETE) {
3696 goto out;
3700 if (proc_mask & NVME_PROC_FULL_ZONES) {
3701 QTAILQ_FOREACH_SAFE(zone, &ns->full_zones, entry, next) {
3702 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3703 req);
3704 if (status && status != NVME_NO_COMPLETE) {
3705 goto out;
3710 if (proc_mask & NVME_PROC_READ_ONLY_ZONES) {
3711 for (i = 0; i < ns->num_zones; i++, zone++) {
3712 status = nvme_bulk_proc_zone(ns, zone, proc_mask, op_hndlr,
3713 req);
3714 if (status && status != NVME_NO_COMPLETE) {
3715 goto out;
3721 out:
3722 return status;
3725 typedef struct NvmeZoneResetAIOCB {
3726 BlockAIOCB common;
3727 BlockAIOCB *aiocb;
3728 NvmeRequest *req;
3729 QEMUBH *bh;
3730 int ret;
3732 bool all;
3733 int idx;
3734 NvmeZone *zone;
3735 } NvmeZoneResetAIOCB;
3737 static void nvme_zone_reset_cancel(BlockAIOCB *aiocb)
3739 NvmeZoneResetAIOCB *iocb = container_of(aiocb, NvmeZoneResetAIOCB, common);
3740 NvmeRequest *req = iocb->req;
3741 NvmeNamespace *ns = req->ns;
3743 iocb->idx = ns->num_zones;
3745 iocb->ret = -ECANCELED;
3747 if (iocb->aiocb) {
3748 blk_aio_cancel_async(iocb->aiocb);
3749 iocb->aiocb = NULL;
3753 static const AIOCBInfo nvme_zone_reset_aiocb_info = {
3754 .aiocb_size = sizeof(NvmeZoneResetAIOCB),
3755 .cancel_async = nvme_zone_reset_cancel,
3758 static void nvme_zone_reset_bh(void *opaque)
3760 NvmeZoneResetAIOCB *iocb = opaque;
3762 iocb->common.cb(iocb->common.opaque, iocb->ret);
3764 qemu_bh_delete(iocb->bh);
3765 iocb->bh = NULL;
3766 qemu_aio_unref(iocb);
3769 static void nvme_zone_reset_cb(void *opaque, int ret);
3771 static void nvme_zone_reset_epilogue_cb(void *opaque, int ret)
3773 NvmeZoneResetAIOCB *iocb = opaque;
3774 NvmeRequest *req = iocb->req;
3775 NvmeNamespace *ns = req->ns;
3776 int64_t moff;
3777 int count;
3779 if (ret < 0) {
3780 nvme_zone_reset_cb(iocb, ret);
3781 return;
3784 if (!ns->lbaf.ms) {
3785 nvme_zone_reset_cb(iocb, 0);
3786 return;
3789 moff = nvme_moff(ns, iocb->zone->d.zslba);
3790 count = nvme_m2b(ns, ns->zone_size);
3792 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, moff, count,
3793 BDRV_REQ_MAY_UNMAP,
3794 nvme_zone_reset_cb, iocb);
3795 return;
3798 static void nvme_zone_reset_cb(void *opaque, int ret)
3800 NvmeZoneResetAIOCB *iocb = opaque;
3801 NvmeRequest *req = iocb->req;
3802 NvmeNamespace *ns = req->ns;
3804 if (ret < 0) {
3805 iocb->ret = ret;
3806 goto done;
3809 if (iocb->zone) {
3810 nvme_zrm_reset(ns, iocb->zone);
3812 if (!iocb->all) {
3813 goto done;
3817 while (iocb->idx < ns->num_zones) {
3818 NvmeZone *zone = &ns->zone_array[iocb->idx++];
3820 switch (nvme_get_zone_state(zone)) {
3821 case NVME_ZONE_STATE_EMPTY:
3822 if (!iocb->all) {
3823 goto done;
3826 continue;
3828 case NVME_ZONE_STATE_EXPLICITLY_OPEN:
3829 case NVME_ZONE_STATE_IMPLICITLY_OPEN:
3830 case NVME_ZONE_STATE_CLOSED:
3831 case NVME_ZONE_STATE_FULL:
3832 iocb->zone = zone;
3833 break;
3835 default:
3836 continue;
3839 trace_pci_nvme_zns_zone_reset(zone->d.zslba);
3841 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk,
3842 nvme_l2b(ns, zone->d.zslba),
3843 nvme_l2b(ns, ns->zone_size),
3844 BDRV_REQ_MAY_UNMAP,
3845 nvme_zone_reset_epilogue_cb,
3846 iocb);
3847 return;
3850 done:
3851 iocb->aiocb = NULL;
3852 if (iocb->bh) {
3853 qemu_bh_schedule(iocb->bh);
3857 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl *n, NvmeZone *zone,
3858 uint64_t elba, NvmeRequest *req)
3860 NvmeNamespace *ns = req->ns;
3861 uint16_t ozcs = le16_to_cpu(ns->id_ns_zoned->ozcs);
3862 uint64_t wp = zone->d.wp;
3863 uint32_t nlb = elba - wp + 1;
3864 uint16_t status;
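    /*
     * Flushing commits the ZRWA contents up to and including elba: the
     * number of blocks committed must be a multiple of the ZRWA flush
     * granularity (zrwafg) and may not extend beyond the ZRWA window
     * (zrwas).
     */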
3867 if (!(ozcs & NVME_ID_NS_ZONED_OZCS_ZRWASUP)) {
3868 return NVME_INVALID_ZONE_OP | NVME_DNR;
3871 if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
3872 return NVME_INVALID_FIELD | NVME_DNR;
3875 if (elba < wp || elba > wp + ns->zns.zrwas) {
3876 return NVME_ZONE_BOUNDARY_ERROR | NVME_DNR;
3879 if (nlb % ns->zns.zrwafg) {
3880 return NVME_INVALID_FIELD | NVME_DNR;
3883 status = nvme_zrm_auto(n, ns, zone);
3884 if (status) {
3885 return status;
3888 zone->w_ptr += nlb;
3890 nvme_advance_zone_wp(ns, zone, nlb);
3892 return NVME_SUCCESS;
3895 static uint16_t nvme_zone_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
3897 NvmeZoneSendCmd *cmd = (NvmeZoneSendCmd *)&req->cmd;
3898 NvmeNamespace *ns = req->ns;
3899 NvmeZone *zone;
3900 NvmeZoneResetAIOCB *iocb;
3901 uint8_t *zd_ext;
3902 uint64_t slba = 0;
3903 uint32_t zone_idx = 0;
3904 uint16_t status;
3905 uint8_t action = cmd->zsa;
3906 bool all;
3907 enum NvmeZoneProcessingMask proc_mask = NVME_PROC_CURRENT_ZONE;
3909 all = cmd->zsflags & NVME_ZSFLAG_SELECT_ALL;
3911 req->status = NVME_SUCCESS;
3913 if (!all) {
3914 status = nvme_get_mgmt_zone_slba_idx(ns, &req->cmd, &slba, &zone_idx);
3915 if (status) {
3916 return status;
3920 zone = &ns->zone_array[zone_idx];
3921 if (slba != zone->d.zslba && action != NVME_ZONE_ACTION_ZRWA_FLUSH) {
3922 trace_pci_nvme_err_unaligned_zone_cmd(action, slba, zone->d.zslba);
3923 return NVME_INVALID_FIELD | NVME_DNR;
3926 switch (action) {
3928 case NVME_ZONE_ACTION_OPEN:
3929 if (all) {
3930 proc_mask = NVME_PROC_CLOSED_ZONES;
3932 trace_pci_nvme_open_zone(slba, zone_idx, all);
3933 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_open_zone, req);
3934 break;
3936 case NVME_ZONE_ACTION_CLOSE:
3937 if (all) {
3938 proc_mask = NVME_PROC_OPENED_ZONES;
3940 trace_pci_nvme_close_zone(slba, zone_idx, all);
3941 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_close_zone, req);
3942 break;
3944 case NVME_ZONE_ACTION_FINISH:
3945 if (all) {
3946 proc_mask = NVME_PROC_OPENED_ZONES | NVME_PROC_CLOSED_ZONES;
3948 trace_pci_nvme_finish_zone(slba, zone_idx, all);
3949 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_finish_zone, req);
3950 break;
3952 case NVME_ZONE_ACTION_RESET:
3953 trace_pci_nvme_reset_zone(slba, zone_idx, all);
3955 iocb = blk_aio_get(&nvme_zone_reset_aiocb_info, ns->blkconf.blk,
3956 nvme_misc_cb, req);
3958 iocb->req = req;
3959 iocb->bh = qemu_bh_new(nvme_zone_reset_bh, iocb);
3960 iocb->ret = 0;
3961 iocb->all = all;
3962 iocb->idx = zone_idx;
3963 iocb->zone = NULL;
3965 req->aiocb = &iocb->common;
3966 nvme_zone_reset_cb(iocb, 0);
3968 return NVME_NO_COMPLETE;
3970 case NVME_ZONE_ACTION_OFFLINE:
3971 if (all) {
3972 proc_mask = NVME_PROC_READ_ONLY_ZONES;
3974 trace_pci_nvme_offline_zone(slba, zone_idx, all);
3975 status = nvme_do_zone_op(ns, zone, proc_mask, nvme_offline_zone, req);
3976 break;
3978 case NVME_ZONE_ACTION_SET_ZD_EXT:
3979 trace_pci_nvme_set_descriptor_extension(slba, zone_idx);
3980 if (all || !ns->params.zd_extension_size) {
3981 return NVME_INVALID_FIELD | NVME_DNR;
3983 zd_ext = nvme_get_zd_extension(ns, zone_idx);
3984 status = nvme_h2c(n, zd_ext, ns->params.zd_extension_size, req);
3985 if (status) {
3986 trace_pci_nvme_err_zd_extension_map_error(zone_idx);
3987 return status;
3990 status = nvme_set_zd_ext(ns, zone);
3991 if (status == NVME_SUCCESS) {
3992 trace_pci_nvme_zd_extension_set(zone_idx);
3993 return status;
3995 break;
3997 case NVME_ZONE_ACTION_ZRWA_FLUSH:
3998 if (all) {
3999 return NVME_INVALID_FIELD | NVME_DNR;
4002 return nvme_zone_mgmt_send_zrwa_flush(n, zone, slba, req);
4004 default:
4005 trace_pci_nvme_err_invalid_mgmt_action(action);
4006 status = NVME_INVALID_FIELD;
4009 if (status == NVME_ZONE_INVAL_TRANSITION) {
4010 trace_pci_nvme_err_invalid_zone_state_transition(action, slba,
4011 zone->d.za);
4013 if (status) {
4014 status |= NVME_DNR;
4017 return status;
4020 static bool nvme_zone_matches_filter(uint32_t zafs, NvmeZone *zl)
4022 NvmeZoneState zs = nvme_get_zone_state(zl);
4024 switch (zafs) {
4025 case NVME_ZONE_REPORT_ALL:
4026 return true;
4027 case NVME_ZONE_REPORT_EMPTY:
4028 return zs == NVME_ZONE_STATE_EMPTY;
4029 case NVME_ZONE_REPORT_IMPLICITLY_OPEN:
4030 return zs == NVME_ZONE_STATE_IMPLICITLY_OPEN;
4031 case NVME_ZONE_REPORT_EXPLICITLY_OPEN:
4032 return zs == NVME_ZONE_STATE_EXPLICITLY_OPEN;
4033 case NVME_ZONE_REPORT_CLOSED:
4034 return zs == NVME_ZONE_STATE_CLOSED;
4035 case NVME_ZONE_REPORT_FULL:
4036 return zs == NVME_ZONE_STATE_FULL;
4037 case NVME_ZONE_REPORT_READ_ONLY:
4038 return zs == NVME_ZONE_STATE_READ_ONLY;
4039 case NVME_ZONE_REPORT_OFFLINE:
4040 return zs == NVME_ZONE_STATE_OFFLINE;
4041 default:
4042 return false;
4046 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
4048 NvmeCmd *cmd = (NvmeCmd *)&req->cmd;
4049 NvmeNamespace *ns = req->ns;
4050     /* cdw12 is the zero-based number of dwords to return; convert to bytes */
4051 uint32_t data_size = (le32_to_cpu(cmd->cdw12) + 1) << 2;
4052 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4053 uint32_t zone_idx, zra, zrasf, partial;
4054 uint64_t max_zones, nr_zones = 0;
4055 uint16_t status;
4056 uint64_t slba;
4057 NvmeZoneDescr *z;
4058 NvmeZone *zone;
4059 NvmeZoneReportHeader *header;
4060 void *buf, *buf_p;
4061 size_t zone_entry_sz;
4062 int i;
4064 req->status = NVME_SUCCESS;
4066 status = nvme_get_mgmt_zone_slba_idx(ns, cmd, &slba, &zone_idx);
4067 if (status) {
4068 return status;
4071 zra = dw13 & 0xff;
4072 if (zra != NVME_ZONE_REPORT && zra != NVME_ZONE_REPORT_EXTENDED) {
4073 return NVME_INVALID_FIELD | NVME_DNR;
4075 if (zra == NVME_ZONE_REPORT_EXTENDED && !ns->params.zd_extension_size) {
4076 return NVME_INVALID_FIELD | NVME_DNR;
4079 zrasf = (dw13 >> 8) & 0xff;
4080 if (zrasf > NVME_ZONE_REPORT_OFFLINE) {
4081 return NVME_INVALID_FIELD | NVME_DNR;
4084 if (data_size < sizeof(NvmeZoneReportHeader)) {
4085 return NVME_INVALID_FIELD | NVME_DNR;
4088 status = nvme_check_mdts(n, data_size);
4089 if (status) {
4090 return status;
4093 partial = (dw13 >> 16) & 0x01;
4095 zone_entry_sz = sizeof(NvmeZoneDescr);
4096 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4097 zone_entry_sz += ns->params.zd_extension_size;
4100 max_zones = (data_size - sizeof(NvmeZoneReportHeader)) / zone_entry_sz;
4101 buf = g_malloc0(data_size);
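    /*
     * First pass: count the zones matching the filter starting at zone_idx
     * (capped at max_zones only for a partial report) so the header carries
     * the total. The second pass below fills in as many descriptors as fit
     * in the buffer.
     */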
4103 zone = &ns->zone_array[zone_idx];
4104 for (i = zone_idx; i < ns->num_zones; i++) {
4105 if (partial && nr_zones >= max_zones) {
4106 break;
4108 if (nvme_zone_matches_filter(zrasf, zone++)) {
4109 nr_zones++;
4112 header = (NvmeZoneReportHeader *)buf;
4113 header->nr_zones = cpu_to_le64(nr_zones);
4115 buf_p = buf + sizeof(NvmeZoneReportHeader);
4116 for (; zone_idx < ns->num_zones && max_zones > 0; zone_idx++) {
4117 zone = &ns->zone_array[zone_idx];
4118 if (nvme_zone_matches_filter(zrasf, zone)) {
4119 z = (NvmeZoneDescr *)buf_p;
4120 buf_p += sizeof(NvmeZoneDescr);
4122 z->zt = zone->d.zt;
4123 z->zs = zone->d.zs;
4124 z->zcap = cpu_to_le64(zone->d.zcap);
4125 z->zslba = cpu_to_le64(zone->d.zslba);
4126 z->za = zone->d.za;
4128 if (nvme_wp_is_valid(zone)) {
4129 z->wp = cpu_to_le64(zone->d.wp);
4130 } else {
4131 z->wp = cpu_to_le64(~0ULL);
4134 if (zra == NVME_ZONE_REPORT_EXTENDED) {
4135 if (zone->d.za & NVME_ZA_ZD_EXT_VALID) {
4136 memcpy(buf_p, nvme_get_zd_extension(ns, zone_idx),
4137 ns->params.zd_extension_size);
4139 buf_p += ns->params.zd_extension_size;
4142 max_zones--;
4146 status = nvme_c2h(n, (uint8_t *)buf, data_size, req);
4148 g_free(buf);
4150 return status;
4153 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
4155 NvmeNamespace *ns;
4156 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4158 trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
4159 req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));
4161 if (!nvme_nsid_valid(n, nsid)) {
4162 return NVME_INVALID_NSID | NVME_DNR;
4166 * In the base NVM command set, Flush may apply to all namespaces
4167 * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
4168 * along with TP 4056 (Namespace Types), it may be pretty screwed up.
4170 * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
4171 * opcode with a specific command since we cannot determine a unique I/O
4172  * command set. Opcode 0h could mean something entirely different from a
4173  * flush in some other command set - say it DOES have completely different
4174  * semantics there - does an NSID of FFFFFFFFh then mean "for all namespaces,
4175  * apply whatever command-set-specific command uses the 0h opcode"? Or does
4176  * it mean "for all namespaces, apply whatever command uses the 0h opcode
4177  * if, and only if, that command allows NSID to be FFFFFFFFh"?
4178  *
4180 * Anyway (and luckily), for now, we do not care about this since the
4181  * device only supports namespace types that include the NVM Flush command
4182 * (NVM and Zoned), so always do an NVM Flush.
4184 if (req->cmd.opcode == NVME_CMD_FLUSH) {
4185 return nvme_flush(n, req);
4188 ns = nvme_ns(n, nsid);
4189 if (unlikely(!ns)) {
4190 return NVME_INVALID_FIELD | NVME_DNR;
4193 if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
4194 trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
4195 return NVME_INVALID_OPCODE | NVME_DNR;
4198 if (ns->status) {
4199 return ns->status;
4202 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
4203 return NVME_INVALID_FIELD;
4206 req->ns = ns;
4208 switch (req->cmd.opcode) {
4209 case NVME_CMD_WRITE_ZEROES:
4210 return nvme_write_zeroes(n, req);
4211 case NVME_CMD_ZONE_APPEND:
4212 return nvme_zone_append(n, req);
4213 case NVME_CMD_WRITE:
4214 return nvme_write(n, req);
4215 case NVME_CMD_READ:
4216 return nvme_read(n, req);
4217 case NVME_CMD_COMPARE:
4218 return nvme_compare(n, req);
4219 case NVME_CMD_DSM:
4220 return nvme_dsm(n, req);
4221 case NVME_CMD_VERIFY:
4222 return nvme_verify(n, req);
4223 case NVME_CMD_COPY:
4224 return nvme_copy(n, req);
4225 case NVME_CMD_ZONE_MGMT_SEND:
4226 return nvme_zone_mgmt_send(n, req);
4227 case NVME_CMD_ZONE_MGMT_RECV:
4228 return nvme_zone_mgmt_recv(n, req);
4229 default:
4230 assert(false);
4233 return NVME_INVALID_OPCODE | NVME_DNR;
4236 static void nvme_cq_notifier(EventNotifier *e)
4238 NvmeCQueue *cq = container_of(e, NvmeCQueue, notifier);
4239 NvmeCtrl *n = cq->ctrl;
4241 if (!event_notifier_test_and_clear(e)) {
4242 return;
4245 nvme_update_cq_head(cq);
4247 if (cq->tail == cq->head) {
4248 if (cq->irq_enabled) {
4249 n->cq_pending--;
4252 nvme_irq_deassert(n, cq);
4255 nvme_post_cqes(cq);
4258 static int nvme_init_cq_ioeventfd(NvmeCQueue *cq)
4260 NvmeCtrl *n = cq->ctrl;
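    /*
     * Doorbell registers start at BAR0 offset 0x1000; each queue pair's
     * doorbells occupy 8 bytes, with the CQ head doorbell 4 bytes after the
     * corresponding SQ tail doorbell.
     */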
4261 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4262 int ret;
4264 ret = event_notifier_init(&cq->notifier, 0);
4265 if (ret < 0) {
4266 return ret;
4269 event_notifier_set_handler(&cq->notifier, nvme_cq_notifier);
4270 memory_region_add_eventfd(&n->iomem,
4271 0x1000 + offset, 4, false, 0, &cq->notifier);
4273 return 0;
4276 static void nvme_sq_notifier(EventNotifier *e)
4278 NvmeSQueue *sq = container_of(e, NvmeSQueue, notifier);
4280 if (!event_notifier_test_and_clear(e)) {
4281 return;
4284 nvme_process_sq(sq);
4287 static int nvme_init_sq_ioeventfd(NvmeSQueue *sq)
4289 NvmeCtrl *n = sq->ctrl;
4290 uint16_t offset = sq->sqid << 3;
4291 int ret;
4293 ret = event_notifier_init(&sq->notifier, 0);
4294 if (ret < 0) {
4295 return ret;
4298 event_notifier_set_handler(&sq->notifier, nvme_sq_notifier);
4299 memory_region_add_eventfd(&n->iomem,
4300 0x1000 + offset, 4, false, 0, &sq->notifier);
4302 return 0;
4305 static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
4307 uint16_t offset = sq->sqid << 3;
4309 n->sq[sq->sqid] = NULL;
4310 timer_free(sq->timer);
4311 if (sq->ioeventfd_enabled) {
4312 memory_region_del_eventfd(&n->iomem,
4313 0x1000 + offset, 4, false, 0, &sq->notifier);
4314 event_notifier_set_handler(&sq->notifier, NULL);
4315 event_notifier_cleanup(&sq->notifier);
4317 g_free(sq->io_req);
4318 if (sq->sqid) {
4319 g_free(sq);
4323 static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
4325 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4326 NvmeRequest *r, *next;
4327 NvmeSQueue *sq;
4328 NvmeCQueue *cq;
4329 uint16_t qid = le16_to_cpu(c->qid);
4331 if (unlikely(!qid || nvme_check_sqid(n, qid))) {
4332 trace_pci_nvme_err_invalid_del_sq(qid);
4333 return NVME_INVALID_QID | NVME_DNR;
4336 trace_pci_nvme_del_sq(qid);
4338 sq = n->sq[qid];
4339 while (!QTAILQ_EMPTY(&sq->out_req_list)) {
4340 r = QTAILQ_FIRST(&sq->out_req_list);
4341 assert(r->aiocb);
4342 blk_aio_cancel(r->aiocb);
4345 assert(QTAILQ_EMPTY(&sq->out_req_list));
4347 if (!nvme_check_cqid(n, sq->cqid)) {
4348 cq = n->cq[sq->cqid];
4349 QTAILQ_REMOVE(&cq->sq_list, sq, entry);
4351 nvme_post_cqes(cq);
4352 QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
4353 if (r->sq == sq) {
4354 QTAILQ_REMOVE(&cq->req_list, r, entry);
4355 QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
4360 nvme_free_sq(sq, n);
4361 return NVME_SUCCESS;
4364 static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
4365 uint16_t sqid, uint16_t cqid, uint16_t size)
4367 int i;
4368 NvmeCQueue *cq;
4370 sq->ctrl = n;
4371 sq->dma_addr = dma_addr;
4372 sq->sqid = sqid;
4373 sq->size = size;
4374 sq->cqid = cqid;
4375 sq->head = sq->tail = 0;
4376 sq->io_req = g_new0(NvmeRequest, sq->size);
4378 QTAILQ_INIT(&sq->req_list);
4379 QTAILQ_INIT(&sq->out_req_list);
4380 for (i = 0; i < sq->size; i++) {
4381 sq->io_req[i].sq = sq;
4382 QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
4384 sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
4386 if (n->dbbuf_enabled) {
4387 sq->db_addr = n->dbbuf_dbs + (sqid << 3);
4388 sq->ei_addr = n->dbbuf_eis + (sqid << 3);
4390 if (n->params.ioeventfd && sq->sqid != 0) {
4391 if (!nvme_init_sq_ioeventfd(sq)) {
4392 sq->ioeventfd_enabled = true;
4397 assert(n->cq[cqid]);
4398 cq = n->cq[cqid];
4399 QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
4400 n->sq[sqid] = sq;
4403 static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
4405 NvmeSQueue *sq;
4406 NvmeCreateSq *c = (NvmeCreateSq *)&req->cmd;
4408 uint16_t cqid = le16_to_cpu(c->cqid);
4409 uint16_t sqid = le16_to_cpu(c->sqid);
4410 uint16_t qsize = le16_to_cpu(c->qsize);
4411 uint16_t qflags = le16_to_cpu(c->sq_flags);
4412 uint64_t prp1 = le64_to_cpu(c->prp1);
4414 trace_pci_nvme_create_sq(prp1, sqid, cqid, qsize, qflags);
4416 if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
4417 trace_pci_nvme_err_invalid_create_sq_cqid(cqid);
4418 return NVME_INVALID_CQID | NVME_DNR;
4420 if (unlikely(!sqid || sqid > n->conf_ioqpairs || n->sq[sqid] != NULL)) {
4421 trace_pci_nvme_err_invalid_create_sq_sqid(sqid);
4422 return NVME_INVALID_QID | NVME_DNR;
4424 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4425 trace_pci_nvme_err_invalid_create_sq_size(qsize);
4426 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4428 if (unlikely(prp1 & (n->page_size - 1))) {
4429 trace_pci_nvme_err_invalid_create_sq_addr(prp1);
4430 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4432 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags)))) {
4433 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags));
4434 return NVME_INVALID_FIELD | NVME_DNR;
4436 sq = g_malloc0(sizeof(*sq));
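    /* qsize is a 0's based value, so the actual queue depth is qsize + 1 */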
4437 nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
4438 return NVME_SUCCESS;
4441 struct nvme_stats {
4442 uint64_t units_read;
4443 uint64_t units_written;
4444 uint64_t read_commands;
4445 uint64_t write_commands;
4448 static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
4450 BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);
4452 stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
4453 stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
4454 stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
4455 stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
4458 static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4459 uint64_t off, NvmeRequest *req)
4461 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
4462 struct nvme_stats stats = { 0 };
4463 NvmeSmartLog smart = { 0 };
4464 uint32_t trans_len;
4465 NvmeNamespace *ns;
4466 time_t current_ms;
4468 if (off >= sizeof(smart)) {
4469 return NVME_INVALID_FIELD | NVME_DNR;
4472 if (nsid != 0xffffffff) {
4473 ns = nvme_ns(n, nsid);
4474 if (!ns) {
4475 return NVME_INVALID_NSID | NVME_DNR;
4477 nvme_set_blk_stats(ns, &stats);
4478 } else {
4479 int i;
4481 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
4482 ns = nvme_ns(n, i);
4483 if (!ns) {
4484 continue;
4486 nvme_set_blk_stats(ns, &stats);
4490 trans_len = MIN(sizeof(smart) - off, buf_len);
4491 smart.critical_warning = n->smart_critical_warning;
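    /*
     * Data Units Read/Written are reported in thousands of 512-byte units,
     * rounded up, per the SMART / Health Information log page definition.
     */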
4493 smart.data_units_read[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_read,
4494 1000));
4495 smart.data_units_written[0] = cpu_to_le64(DIV_ROUND_UP(stats.units_written,
4496 1000));
4497 smart.host_read_commands[0] = cpu_to_le64(stats.read_commands);
4498 smart.host_write_commands[0] = cpu_to_le64(stats.write_commands);
4500 smart.temperature = cpu_to_le16(n->temperature);
4502 if ((n->temperature >= n->features.temp_thresh_hi) ||
4503 (n->temperature <= n->features.temp_thresh_low)) {
4504 smart.critical_warning |= NVME_SMART_TEMPERATURE;
4507 current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
4508 smart.power_on_hours[0] =
4509 cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
4511 if (!rae) {
4512 nvme_clear_events(n, NVME_AER_TYPE_SMART);
4515 return nvme_c2h(n, (uint8_t *) &smart + off, trans_len, req);
4518 static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
4519 NvmeRequest *req)
4521 uint32_t trans_len;
4522 NvmeFwSlotInfoLog fw_log = {
4523 .afi = 0x1,
4526 if (off >= sizeof(fw_log)) {
4527 return NVME_INVALID_FIELD | NVME_DNR;
4530 strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
4531 trans_len = MIN(sizeof(fw_log) - off, buf_len);
4533 return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
4536 static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4537 uint64_t off, NvmeRequest *req)
4539 uint32_t trans_len;
4540 NvmeErrorLog errlog;
4542 if (off >= sizeof(errlog)) {
4543 return NVME_INVALID_FIELD | NVME_DNR;
4546 if (!rae) {
4547 nvme_clear_events(n, NVME_AER_TYPE_ERROR);
4550 memset(&errlog, 0x0, sizeof(errlog));
4551 trans_len = MIN(sizeof(errlog) - off, buf_len);
4553 return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
4556 static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
4557 uint64_t off, NvmeRequest *req)
4559 uint32_t nslist[1024];
4560 uint32_t trans_len;
4561 int i = 0;
4562 uint32_t nsid;
4564 if (off >= sizeof(nslist)) {
4565 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(nslist));
4566 return NVME_INVALID_FIELD | NVME_DNR;
4569 memset(nslist, 0x0, sizeof(nslist));
4570 trans_len = MIN(sizeof(nslist) - off, buf_len);
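    /*
     * Walk the changed-namespace bitmap, copying each set nsid into the log
     * page and clearing its bit as it is consumed.
     */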
4572 while ((nsid = find_first_bit(n->changed_nsids, NVME_CHANGED_NSID_SIZE)) !=
4573 NVME_CHANGED_NSID_SIZE) {
4575 * If more than 1024 namespaces have changed, the first entry in the
4576 * log page is set to FFFFFFFFh and the remaining entries to 0, as the spec requires.
4578 if (i == ARRAY_SIZE(nslist)) {
4579 memset(nslist, 0x0, sizeof(nslist));
4580 nslist[0] = 0xffffffff;
4581 break;
4584 nslist[i++] = nsid;
4585 clear_bit(nsid, n->changed_nsids);
4589 * Clear all remaining changed-namespace bits when the loop above exited
4590 * early because more than 1024 namespaces have changed.
4592 if (nslist[0] == 0xffffffff) {
4593 bitmap_zero(n->changed_nsids, NVME_CHANGED_NSID_SIZE);
4596 if (!rae) {
4597 nvme_clear_events(n, NVME_AER_TYPE_NOTICE);
4600 return nvme_c2h(n, ((uint8_t *)nslist) + off, trans_len, req);
4603 static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
4604 uint64_t off, NvmeRequest *req)
4606 NvmeEffectsLog log = {};
4607 const uint32_t *src_iocs = NULL;
4608 uint32_t trans_len;
4610 if (off >= sizeof(log)) {
4611 trace_pci_nvme_err_invalid_log_page_offset(off, sizeof(log));
4612 return NVME_INVALID_FIELD | NVME_DNR;
4615 switch (NVME_CC_CSS(ldl_le_p(&n->bar.cc))) {
4616 case NVME_CC_CSS_NVM:
4617 src_iocs = nvme_cse_iocs_nvm;
4618 /* fall through */
4619 case NVME_CC_CSS_ADMIN_ONLY:
4620 break;
4621 case NVME_CC_CSS_CSI:
4622 switch (csi) {
4623 case NVME_CSI_NVM:
4624 src_iocs = nvme_cse_iocs_nvm;
4625 break;
4626 case NVME_CSI_ZONED:
4627 src_iocs = nvme_cse_iocs_zoned;
4628 break;
4632 memcpy(log.acs, nvme_cse_acs, sizeof(nvme_cse_acs));
4634 if (src_iocs) {
4635 memcpy(log.iocs, src_iocs, sizeof(log.iocs));
4638 trans_len = MIN(sizeof(log) - off, buf_len);
4640 return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
4643 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
4645 NvmeCmd *cmd = &req->cmd;
4647 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
4648 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
4649 uint32_t dw12 = le32_to_cpu(cmd->cdw12);
4650 uint32_t dw13 = le32_to_cpu(cmd->cdw13);
4651 uint8_t lid = dw10 & 0xff;
4652 uint8_t lsp = (dw10 >> 8) & 0xf;
4653 uint8_t rae = (dw10 >> 15) & 0x1;
4654 uint8_t csi = le32_to_cpu(cmd->cdw14) >> 24;
4655 uint32_t numdl, numdu;
4656 uint64_t off, lpol, lpou;
4657 size_t len;
4658 uint16_t status;
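    /*
     * NUMD is a 0's based dword count split across NUMDL (CDW10) and NUMDU
     * (CDW11); the log page offset is the 64-bit LPOL/LPOU pair and must be
     * dword aligned.
     */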
4660 numdl = (dw10 >> 16);
4661 numdu = (dw11 & 0xffff);
4662 lpol = dw12;
4663 lpou = dw13;
4665 len = (((numdu << 16) | numdl) + 1) << 2;
4666 off = (lpou << 32ULL) | lpol;
4668 if (off & 0x3) {
4669 return NVME_INVALID_FIELD | NVME_DNR;
4672 trace_pci_nvme_get_log(nvme_cid(req), lid, lsp, rae, len, off);
4674 status = nvme_check_mdts(n, len);
4675 if (status) {
4676 return status;
4679 switch (lid) {
4680 case NVME_LOG_ERROR_INFO:
4681 return nvme_error_info(n, rae, len, off, req);
4682 case NVME_LOG_SMART_INFO:
4683 return nvme_smart_info(n, rae, len, off, req);
4684 case NVME_LOG_FW_SLOT_INFO:
4685 return nvme_fw_log_info(n, len, off, req);
4686 case NVME_LOG_CHANGED_NSLIST:
4687 return nvme_changed_nslist(n, rae, len, off, req);
4688 case NVME_LOG_CMD_EFFECTS:
4689 return nvme_cmd_effects(n, csi, len, off, req);
4690 default:
4691 trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
4692 return NVME_INVALID_FIELD | NVME_DNR;
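/*
 * Tear down a completion queue. With CAP.DSTRD == 0 its doorbell lives at
 * 0x1000 + (2 * cqid + 1) * 4, which is the offset used to unregister the
 * ioeventfd below. The admin CQ (cqid 0) is embedded in NvmeCtrl and is not
 * freed.
 */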
4696 static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
4698 uint16_t offset = (cq->cqid << 3) + (1 << 2);
4700 n->cq[cq->cqid] = NULL;
4701 timer_free(cq->timer);
4702 if (cq->ioeventfd_enabled) {
4703 memory_region_del_eventfd(&n->iomem,
4704 0x1000 + offset, 4, false, 0, &cq->notifier);
4705 event_notifier_set_handler(&cq->notifier, NULL);
4706 event_notifier_cleanup(&cq->notifier);
4708 if (msix_enabled(&n->parent_obj)) {
4709 msix_vector_unuse(&n->parent_obj, cq->vector);
4711 if (cq->cqid) {
4712 g_free(cq);
4716 static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeRequest *req)
4718 NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
4719 NvmeCQueue *cq;
4720 uint16_t qid = le16_to_cpu(c->qid);
4722 if (unlikely(!qid || nvme_check_cqid(n, qid))) {
4723 trace_pci_nvme_err_invalid_del_cq_cqid(qid);
4724 return NVME_INVALID_CQID | NVME_DNR;
4727 cq = n->cq[qid];
4728 if (unlikely(!QTAILQ_EMPTY(&cq->sq_list))) {
4729 trace_pci_nvme_err_invalid_del_cq_notempty(qid);
4730 return NVME_INVALID_QUEUE_DEL;
4733 if (cq->irq_enabled && cq->tail != cq->head) {
4734 n->cq_pending--;
4737 nvme_irq_deassert(n, cq);
4738 trace_pci_nvme_del_cq(qid);
4739 nvme_free_cq(cq, n);
4740 return NVME_SUCCESS;
4743 static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
4744 uint16_t cqid, uint16_t vector, uint16_t size,
4745 uint16_t irq_enabled)
4747 if (msix_enabled(&n->parent_obj)) {
4748 msix_vector_use(&n->parent_obj, vector);
4750 cq->ctrl = n;
4751 cq->cqid = cqid;
4752 cq->size = size;
4753 cq->dma_addr = dma_addr;
4754 cq->phase = 1;
4755 cq->irq_enabled = irq_enabled;
4756 cq->vector = vector;
4757 cq->head = cq->tail = 0;
4758 QTAILQ_INIT(&cq->req_list);
4759 QTAILQ_INIT(&cq->sq_list);
4760 if (n->dbbuf_enabled) {
4761 cq->db_addr = n->dbbuf_dbs + (cqid << 3) + (1 << 2);
4762 cq->ei_addr = n->dbbuf_eis + (cqid << 3) + (1 << 2);
4764 if (n->params.ioeventfd && cqid != 0) {
4765 if (!nvme_init_cq_ioeventfd(cq)) {
4766 cq->ioeventfd_enabled = true;
4770 n->cq[cqid] = cq;
4771 cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
4774 static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
4776 NvmeCQueue *cq;
4777 NvmeCreateCq *c = (NvmeCreateCq *)&req->cmd;
4778 uint16_t cqid = le16_to_cpu(c->cqid);
4779 uint16_t vector = le16_to_cpu(c->irq_vector);
4780 uint16_t qsize = le16_to_cpu(c->qsize);
4781 uint16_t qflags = le16_to_cpu(c->cq_flags);
4782 uint64_t prp1 = le64_to_cpu(c->prp1);
4784 trace_pci_nvme_create_cq(prp1, cqid, vector, qsize, qflags,
4785 NVME_CQ_FLAGS_IEN(qflags) != 0);
4787 if (unlikely(!cqid || cqid > n->conf_ioqpairs || n->cq[cqid] != NULL)) {
4788 trace_pci_nvme_err_invalid_create_cq_cqid(cqid);
4789 return NVME_INVALID_QID | NVME_DNR;
4791 if (unlikely(!qsize || qsize > NVME_CAP_MQES(ldq_le_p(&n->bar.cap)))) {
4792 trace_pci_nvme_err_invalid_create_cq_size(qsize);
4793 return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
4795 if (unlikely(prp1 & (n->page_size - 1))) {
4796 trace_pci_nvme_err_invalid_create_cq_addr(prp1);
4797 return NVME_INVALID_PRP_OFFSET | NVME_DNR;
4799 if (unlikely(!msix_enabled(&n->parent_obj) && vector)) {
4800 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4801 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4803 if (unlikely(vector >= n->conf_msix_qsize)) {
4804 trace_pci_nvme_err_invalid_create_cq_vector(vector);
4805 return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
4807 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags)))) {
4808 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags));
4809 return NVME_INVALID_FIELD | NVME_DNR;
4812 cq = g_malloc0(sizeof(*cq));
4813 nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
4814 NVME_CQ_FLAGS_IEN(qflags));
4817 * It is only required to set qs_created when creating a completion queue;
4818 * creating a submission queue without a matching completion queue will
4819 * fail.
4821 n->qs_created = true;
4822 return NVME_SUCCESS;
4825 static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
4827 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4829 return nvme_c2h(n, id, sizeof(id), req);
4832 static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
4834 trace_pci_nvme_identify_ctrl();
4836 return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
4839 static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
4841 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4842 uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
4843 NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;
4845 trace_pci_nvme_identify_ctrl_csi(c->csi);
4847 switch (c->csi) {
4848 case NVME_CSI_NVM:
4849 id_nvm->vsl = n->params.vsl;
4850 id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
4851 break;
4853 case NVME_CSI_ZONED:
4854 ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
4855 break;
4857 default:
4858 return NVME_INVALID_FIELD | NVME_DNR;
4861 return nvme_c2h(n, id, sizeof(id), req);
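/*
 * Identify Namespace. With active == false the lookup falls back to the
 * subsystem, so namespaces that exist but are not attached to this controller
 * are reported as well; a missing namespace yields the all-zeroes identify
 * structure.
 */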
4864 static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req, bool active)
4866 NvmeNamespace *ns;
4867 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4868 uint32_t nsid = le32_to_cpu(c->nsid);
4870 trace_pci_nvme_identify_ns(nsid);
4872 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4873 return NVME_INVALID_NSID | NVME_DNR;
4876 ns = nvme_ns(n, nsid);
4877 if (unlikely(!ns)) {
4878 if (!active) {
4879 ns = nvme_subsys_ns(n->subsys, nsid);
4880 if (!ns) {
4881 return nvme_rpt_empty_id_struct(n, req);
4883 } else {
4884 return nvme_rpt_empty_id_struct(n, req);
4888 if (active || ns->csi == NVME_CSI_NVM) {
4889 return nvme_c2h(n, (uint8_t *)&ns->id_ns, sizeof(NvmeIdNs), req);
4892 return NVME_INVALID_CMD_SET | NVME_DNR;
4895 static uint16_t nvme_identify_ctrl_list(NvmeCtrl *n, NvmeRequest *req,
4896 bool attached)
4898 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4899 uint32_t nsid = le32_to_cpu(c->nsid);
4900 uint16_t min_id = le16_to_cpu(c->ctrlid);
4901 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
4902 uint16_t *ids = &list[1];
4903 NvmeNamespace *ns;
4904 NvmeCtrl *ctrl;
4905 int cntlid, nr_ids = 0;
4907 trace_pci_nvme_identify_ctrl_list(c->cns, min_id);
4909 if (!n->subsys) {
4910 return NVME_INVALID_FIELD | NVME_DNR;
4913 if (attached) {
4914 if (nsid == NVME_NSID_BROADCAST) {
4915 return NVME_INVALID_FIELD | NVME_DNR;
4918 ns = nvme_subsys_ns(n->subsys, nsid);
4919 if (!ns) {
4920 return NVME_INVALID_FIELD | NVME_DNR;
4924 for (cntlid = min_id; cntlid < ARRAY_SIZE(n->subsys->ctrls); cntlid++) {
4925 ctrl = nvme_subsys_ctrl(n->subsys, cntlid);
4926 if (!ctrl) {
4927 continue;
4930 if (attached && !nvme_ns(ctrl, nsid)) {
4931 continue;
4934 ids[nr_ids++] = cntlid;
4937 list[0] = nr_ids;
4939 return nvme_c2h(n, (uint8_t *)list, sizeof(list), req);
4942 static uint16_t nvme_identify_pri_ctrl_cap(NvmeCtrl *n, NvmeRequest *req)
4944 trace_pci_nvme_identify_pri_ctrl_cap(le16_to_cpu(n->pri_ctrl_cap.cntlid));
4946 return nvme_c2h(n, (uint8_t *)&n->pri_ctrl_cap,
4947 sizeof(NvmePriCtrlCap), req);
4950 static uint16_t nvme_identify_sec_ctrl_list(NvmeCtrl *n, NvmeRequest *req)
4952 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4953 uint16_t pri_ctrl_id = le16_to_cpu(n->pri_ctrl_cap.cntlid);
4954 uint16_t min_id = le16_to_cpu(c->ctrlid);
4955 uint8_t num_sec_ctrl = n->sec_ctrl_list.numcntl;
4956 NvmeSecCtrlList list = {0};
4957 uint8_t i;
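    /*
     * Report the secondary controller entries starting from the first entry
     * whose controller id is greater than or equal to the requested CNTID.
     */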
4959 for (i = 0; i < num_sec_ctrl; i++) {
4960 if (n->sec_ctrl_list.sec[i].scid >= min_id) {
4961 list.numcntl = num_sec_ctrl - i;
4962 memcpy(&list.sec, n->sec_ctrl_list.sec + i,
4963 list.numcntl * sizeof(NvmeSecCtrlEntry));
4964 break;
4968 trace_pci_nvme_identify_sec_ctrl_list(pri_ctrl_id, list.numcntl);
4970 return nvme_c2h(n, (uint8_t *)&list, sizeof(list), req);
4973 static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, NvmeRequest *req,
4974 bool active)
4976 NvmeNamespace *ns;
4977 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
4978 uint32_t nsid = le32_to_cpu(c->nsid);
4980 trace_pci_nvme_identify_ns_csi(nsid, c->csi);
4982 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
4983 return NVME_INVALID_NSID | NVME_DNR;
4986 ns = nvme_ns(n, nsid);
4987 if (unlikely(!ns)) {
4988 if (!active) {
4989 ns = nvme_subsys_ns(n->subsys, nsid);
4990 if (!ns) {
4991 return nvme_rpt_empty_id_struct(n, req);
4993 } else {
4994 return nvme_rpt_empty_id_struct(n, req);
4998 if (c->csi == NVME_CSI_NVM) {
4999 return nvme_c2h(n, (uint8_t *)&ns->id_ns_nvm, sizeof(NvmeIdNsNvm),
5000 req);
5001 } else if (c->csi == NVME_CSI_ZONED && ns->csi == NVME_CSI_ZONED) {
5002 return nvme_c2h(n, (uint8_t *)ns->id_ns_zoned, sizeof(NvmeIdNsZoned),
5003 req);
5006 return NVME_INVALID_FIELD | NVME_DNR;
5009 static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req,
5010 bool active)
5012 NvmeNamespace *ns;
5013 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5014 uint32_t min_nsid = le32_to_cpu(c->nsid);
5015 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5016 static const int data_len = sizeof(list);
5017 uint32_t *list_ptr = (uint32_t *)list;
5018 int i, j = 0;
5020 trace_pci_nvme_identify_nslist(min_nsid);
5023 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFEh are invalid values
5024 * since the Active Namespace ID List should return namespaces with ids
5025 * *higher* than the NSID specified in the command. This is also specified
5026 * in the spec (NVM Express v1.3d, Section 5.15.4).
5028 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5029 return NVME_INVALID_NSID | NVME_DNR;
5032 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5033 ns = nvme_ns(n, i);
5034 if (!ns) {
5035 if (!active) {
5036 ns = nvme_subsys_ns(n->subsys, i);
5037 if (!ns) {
5038 continue;
5040 } else {
5041 continue;
5044 if (ns->params.nsid <= min_nsid) {
5045 continue;
5047 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5048 if (j == data_len / sizeof(uint32_t)) {
5049 break;
5053 return nvme_c2h(n, list, data_len, req);
5056 static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, NvmeRequest *req,
5057 bool active)
5059 NvmeNamespace *ns;
5060 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5061 uint32_t min_nsid = le32_to_cpu(c->nsid);
5062 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5063 static const int data_len = sizeof(list);
5064 uint32_t *list_ptr = (uint32_t *)list;
5065 int i, j = 0;
5067 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
5070 * Same as in nvme_identify_nslist(): FFFFFFFFh and FFFFFFFEh are invalid.
5072 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
5073 return NVME_INVALID_NSID | NVME_DNR;
5076 if (c->csi != NVME_CSI_NVM && c->csi != NVME_CSI_ZONED) {
5077 return NVME_INVALID_FIELD | NVME_DNR;
5080 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5081 ns = nvme_ns(n, i);
5082 if (!ns) {
5083 if (!active) {
5084 ns = nvme_subsys_ns(n->subsys, i);
5085 if (!ns) {
5086 continue;
5088 } else {
5089 continue;
5092 if (ns->params.nsid <= min_nsid || c->csi != ns->csi) {
5093 continue;
5095 list_ptr[j++] = cpu_to_le32(ns->params.nsid);
5096 if (j == data_len / sizeof(uint32_t)) {
5097 break;
5101 return nvme_c2h(n, list, data_len, req);
5104 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
5106 NvmeNamespace *ns;
5107 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5108 uint32_t nsid = le32_to_cpu(c->nsid);
5109 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5110 uint8_t *pos = list;
5111 struct {
5112 NvmeIdNsDescr hdr;
5113 uint8_t v[NVME_NIDL_UUID];
5114 } QEMU_PACKED uuid = {};
5115 struct {
5116 NvmeIdNsDescr hdr;
5117 uint64_t v;
5118 } QEMU_PACKED eui64 = {};
5119 struct {
5120 NvmeIdNsDescr hdr;
5121 uint8_t v;
5122 } QEMU_PACKED csi = {};
5124 trace_pci_nvme_identify_ns_descr_list(nsid);
5126 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5127 return NVME_INVALID_NSID | NVME_DNR;
5130 ns = nvme_ns(n, nsid);
5131 if (unlikely(!ns)) {
5132 return NVME_INVALID_FIELD | NVME_DNR;
5135 if (!qemu_uuid_is_null(&ns->params.uuid)) {
5136 uuid.hdr.nidt = NVME_NIDT_UUID;
5137 uuid.hdr.nidl = NVME_NIDL_UUID;
5138 memcpy(uuid.v, ns->params.uuid.data, NVME_NIDL_UUID);
5139 memcpy(pos, &uuid, sizeof(uuid));
5140 pos += sizeof(uuid);
5143 if (ns->params.eui64) {
5144 eui64.hdr.nidt = NVME_NIDT_EUI64;
5145 eui64.hdr.nidl = NVME_NIDL_EUI64;
5146 eui64.v = cpu_to_be64(ns->params.eui64);
5147 memcpy(pos, &eui64, sizeof(eui64));
5148 pos += sizeof(eui64);
5151 csi.hdr.nidt = NVME_NIDT_CSI;
5152 csi.hdr.nidl = NVME_NIDL_CSI;
5153 csi.v = ns->csi;
5154 memcpy(pos, &csi, sizeof(csi));
5155 pos += sizeof(csi);
5157 return nvme_c2h(n, list, sizeof(list), req);
5160 static uint16_t nvme_identify_cmd_set(NvmeCtrl *n, NvmeRequest *req)
5162 uint8_t list[NVME_IDENTIFY_DATA_SIZE] = {};
5163 static const int data_len = sizeof(list);
5165 trace_pci_nvme_identify_cmd_set();
5167 NVME_SET_CSI(*list, NVME_CSI_NVM);
5168 NVME_SET_CSI(*list, NVME_CSI_ZONED);
5170 return nvme_c2h(n, list, data_len, req);
5173 static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
5175 NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
5177 trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
5178 c->csi);
5180 switch (c->cns) {
5181 case NVME_ID_CNS_NS:
5182 return nvme_identify_ns(n, req, true);
5183 case NVME_ID_CNS_NS_PRESENT:
5184 return nvme_identify_ns(n, req, false);
5185 case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
5186 return nvme_identify_ctrl_list(n, req, true);
5187 case NVME_ID_CNS_CTRL_LIST:
5188 return nvme_identify_ctrl_list(n, req, false);
5189 case NVME_ID_CNS_PRIMARY_CTRL_CAP:
5190 return nvme_identify_pri_ctrl_cap(n, req);
5191 case NVME_ID_CNS_SECONDARY_CTRL_LIST:
5192 return nvme_identify_sec_ctrl_list(n, req);
5193 case NVME_ID_CNS_CS_NS:
5194 return nvme_identify_ns_csi(n, req, true);
5195 case NVME_ID_CNS_CS_NS_PRESENT:
5196 return nvme_identify_ns_csi(n, req, false);
5197 case NVME_ID_CNS_CTRL:
5198 return nvme_identify_ctrl(n, req);
5199 case NVME_ID_CNS_CS_CTRL:
5200 return nvme_identify_ctrl_csi(n, req);
5201 case NVME_ID_CNS_NS_ACTIVE_LIST:
5202 return nvme_identify_nslist(n, req, true);
5203 case NVME_ID_CNS_NS_PRESENT_LIST:
5204 return nvme_identify_nslist(n, req, false);
5205 case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
5206 return nvme_identify_nslist_csi(n, req, true);
5207 case NVME_ID_CNS_CS_NS_PRESENT_LIST:
5208 return nvme_identify_nslist_csi(n, req, false);
5209 case NVME_ID_CNS_NS_DESCR_LIST:
5210 return nvme_identify_ns_descr_list(n, req);
5211 case NVME_ID_CNS_IO_COMMAND_SET:
5212 return nvme_identify_cmd_set(n, req);
5213 default:
5214 trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
5215 return NVME_INVALID_FIELD | NVME_DNR;
5219 static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
5221 uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;
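    /*
     * Abort is implemented as a no-op: Dword 0 bit 0 of the completion is set
     * to indicate that the specified command was not aborted.
     */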
5223 req->cqe.result = 1;
5224 if (nvme_check_sqid(n, sqid)) {
5225 return NVME_INVALID_FIELD | NVME_DNR;
5228 return NVME_SUCCESS;
5231 static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
5233 trace_pci_nvme_setfeat_timestamp(ts);
5235 n->host_timestamp = le64_to_cpu(ts);
5236 n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5239 static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
5241 uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
5242 uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;
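    /*
     * The reported timestamp is the last host-supplied value advanced by the
     * virtual time that has elapsed since it was set (see Set Features,
     * Timestamp).
     */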
5244 union nvme_timestamp {
5245 struct {
5246 uint64_t timestamp:48;
5247 uint64_t sync:1;
5248 uint64_t origin:3;
5249 uint64_t rsvd1:12;
5251 uint64_t all;
5254 union nvme_timestamp ts;
5255 ts.all = 0;
5256 ts.timestamp = n->host_timestamp + elapsed_time;
5258 /* If the host timestamp is non-zero, set the timestamp origin */
5259 ts.origin = n->host_timestamp ? 0x01 : 0x00;
5261 trace_pci_nvme_getfeat_timestamp(ts.all);
5263 return cpu_to_le64(ts.all);
5266 static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
5268 uint64_t timestamp = nvme_get_timestamp(n);
5270 return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
5273 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
5275 NvmeCmd *cmd = &req->cmd;
5276 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5277 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5278 uint32_t nsid = le32_to_cpu(cmd->nsid);
5279 uint32_t result;
5280 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5281 NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
5282 uint16_t iv;
5283 NvmeNamespace *ns;
5284 int i;
5286 static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
5287 [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
5290 trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);
5292 if (!nvme_feature_support[fid]) {
5293 return NVME_INVALID_FIELD | NVME_DNR;
5296 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5297 if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
5299 * The Reservation Notification Mask and Reservation Persistence
5300 * features require a status code of Invalid Field in Command when
5301 * NSID is FFFFFFFFh. Since the device does not support those
5302 * features we can always return Invalid Namespace or Format as we
5303 * should do for all other features.
5305 return NVME_INVALID_NSID | NVME_DNR;
5308 if (!nvme_ns(n, nsid)) {
5309 return NVME_INVALID_FIELD | NVME_DNR;
5313 switch (sel) {
5314 case NVME_GETFEAT_SELECT_CURRENT:
5315 break;
5316 case NVME_GETFEAT_SELECT_SAVED:
5317 /* no features are saveable by the controller; fallthrough */
5318 case NVME_GETFEAT_SELECT_DEFAULT:
5319 goto defaults;
5320 case NVME_GETFEAT_SELECT_CAP:
5321 result = nvme_feature_cap[fid];
5322 goto out;
5325 switch (fid) {
5326 case NVME_TEMPERATURE_THRESHOLD:
5327 result = 0;
5330 * The controller only implements the Composite Temperature sensor, so
5331 * return 0 for all other sensors.
5333 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5334 goto out;
5337 switch (NVME_TEMP_THSEL(dw11)) {
5338 case NVME_TEMP_THSEL_OVER:
5339 result = n->features.temp_thresh_hi;
5340 goto out;
5341 case NVME_TEMP_THSEL_UNDER:
5342 result = n->features.temp_thresh_low;
5343 goto out;
5346 return NVME_INVALID_FIELD | NVME_DNR;
5347 case NVME_ERROR_RECOVERY:
5348 if (!nvme_nsid_valid(n, nsid)) {
5349 return NVME_INVALID_NSID | NVME_DNR;
5352 ns = nvme_ns(n, nsid);
5353 if (unlikely(!ns)) {
5354 return NVME_INVALID_FIELD | NVME_DNR;
5357 result = ns->features.err_rec;
5358 goto out;
5359 case NVME_VOLATILE_WRITE_CACHE:
5360 result = 0;
5361 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5362 ns = nvme_ns(n, i);
5363 if (!ns) {
5364 continue;
5367 result = blk_enable_write_cache(ns->blkconf.blk);
5368 if (result) {
5369 break;
5372 trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
5373 goto out;
5374 case NVME_ASYNCHRONOUS_EVENT_CONF:
5375 result = n->features.async_config;
5376 goto out;
5377 case NVME_TIMESTAMP:
5378 return nvme_get_feature_timestamp(n, req);
5379 case NVME_HOST_BEHAVIOR_SUPPORT:
5380 return nvme_c2h(n, (uint8_t *)&n->features.hbs,
5381 sizeof(n->features.hbs), req);
5382 default:
5383 break;
5386 defaults:
5387 switch (fid) {
5388 case NVME_TEMPERATURE_THRESHOLD:
5389 result = 0;
5391 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5392 break;
5395 if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
5396 result = NVME_TEMPERATURE_WARNING;
5399 break;
5400 case NVME_NUMBER_OF_QUEUES:
5401 result = (n->conf_ioqpairs - 1) | ((n->conf_ioqpairs - 1) << 16);
5402 trace_pci_nvme_getfeat_numq(result);
5403 break;
5404 case NVME_INTERRUPT_VECTOR_CONF:
5405 iv = dw11 & 0xffff;
5406 if (iv >= n->conf_ioqpairs + 1) {
5407 return NVME_INVALID_FIELD | NVME_DNR;
5410 result = iv;
5411 if (iv == n->admin_cq.vector) {
5412 result |= NVME_INTVC_NOCOALESCING;
5414 break;
5415 default:
5416 result = nvme_feature_default[fid];
5417 break;
5420 out:
5421 req->cqe.result = cpu_to_le32(result);
5422 return NVME_SUCCESS;
5425 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
5427 uint16_t ret;
5428 uint64_t timestamp;
5430 ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
5431 if (ret) {
5432 return ret;
5435 nvme_set_timestamp(n, timestamp);
5437 return NVME_SUCCESS;
5440 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
5442 NvmeNamespace *ns = NULL;
5444 NvmeCmd *cmd = &req->cmd;
5445 uint32_t dw10 = le32_to_cpu(cmd->cdw10);
5446 uint32_t dw11 = le32_to_cpu(cmd->cdw11);
5447 uint32_t nsid = le32_to_cpu(cmd->nsid);
5448 uint8_t fid = NVME_GETSETFEAT_FID(dw10);
5449 uint8_t save = NVME_SETFEAT_SAVE(dw10);
5450 uint16_t status;
5451 int i;
5453 trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);
5455 if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
5456 return NVME_FID_NOT_SAVEABLE | NVME_DNR;
5459 if (!nvme_feature_support[fid]) {
5460 return NVME_INVALID_FIELD | NVME_DNR;
5463 if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
5464 if (nsid != NVME_NSID_BROADCAST) {
5465 if (!nvme_nsid_valid(n, nsid)) {
5466 return NVME_INVALID_NSID | NVME_DNR;
5469 ns = nvme_ns(n, nsid);
5470 if (unlikely(!ns)) {
5471 return NVME_INVALID_FIELD | NVME_DNR;
5474 } else if (nsid && nsid != NVME_NSID_BROADCAST) {
5475 if (!nvme_nsid_valid(n, nsid)) {
5476 return NVME_INVALID_NSID | NVME_DNR;
5479 return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
5482 if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
5483 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5486 switch (fid) {
5487 case NVME_TEMPERATURE_THRESHOLD:
5488 if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {
5489 break;
5492 switch (NVME_TEMP_THSEL(dw11)) {
5493 case NVME_TEMP_THSEL_OVER:
5494 n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
5495 break;
5496 case NVME_TEMP_THSEL_UNDER:
5497 n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
5498 break;
5499 default:
5500 return NVME_INVALID_FIELD | NVME_DNR;
5503 if ((n->temperature >= n->features.temp_thresh_hi) ||
5504 (n->temperature <= n->features.temp_thresh_low)) {
5505 nvme_smart_event(n, NVME_SMART_TEMPERATURE);
5508 break;
5509 case NVME_ERROR_RECOVERY:
5510 if (nsid == NVME_NSID_BROADCAST) {
5511 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5512 ns = nvme_ns(n, i);
5514 if (!ns) {
5515 continue;
5518 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5519 ns->features.err_rec = dw11;
5523 break;
5526 assert(ns);
5527 if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
5528 ns->features.err_rec = dw11;
5530 break;
5531 case NVME_VOLATILE_WRITE_CACHE:
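        /*
         * Apply the setting to every namespace; if the cache is being
         * disabled, flush any cached data to the backing device first.
         */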
5532 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5533 ns = nvme_ns(n, i);
5534 if (!ns) {
5535 continue;
5538 if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
5539 blk_flush(ns->blkconf.blk);
5542 blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
5545 break;
5547 case NVME_NUMBER_OF_QUEUES:
5548 if (n->qs_created) {
5549 return NVME_CMD_SEQ_ERROR | NVME_DNR;
5553 * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
5554 * and NSQR.
5556 if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
5557 return NVME_INVALID_FIELD | NVME_DNR;
5560 trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
5561 ((dw11 >> 16) & 0xffff) + 1,
5562 n->conf_ioqpairs,
5563 n->conf_ioqpairs);
5564 req->cqe.result = cpu_to_le32((n->conf_ioqpairs - 1) |
5565 ((n->conf_ioqpairs - 1) << 16));
5566 break;
5567 case NVME_ASYNCHRONOUS_EVENT_CONF:
5568 n->features.async_config = dw11;
5569 break;
5570 case NVME_TIMESTAMP:
5571 return nvme_set_feature_timestamp(n, req);
5572 case NVME_HOST_BEHAVIOR_SUPPORT:
5573 status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
5574 sizeof(n->features.hbs), req);
5575 if (status) {
5576 return status;
5579 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
5580 ns = nvme_ns(n, i);
5582 if (!ns) {
5583 continue;
5586 ns->id_ns.nlbaf = ns->nlbaf - 1;
5587 if (!n->features.hbs.lbafee) {
5588 ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
5592 return status;
5593 case NVME_COMMAND_SET_PROFILE:
5594 if (dw11 & 0x1ff) {
5595 trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
5596 return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
5598 break;
5599 default:
5600 return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
5602 return NVME_SUCCESS;
5605 static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
5607 trace_pci_nvme_aer(nvme_cid(req));
5609 if (n->outstanding_aers > n->params.aerl) {
5610 trace_pci_nvme_aer_aerl_exceeded();
5611 return NVME_AER_LIMIT_EXCEEDED;
5614 n->aer_reqs[n->outstanding_aers] = req;
5615 n->outstanding_aers++;
5617 if (!QTAILQ_EMPTY(&n->aer_queue)) {
5618 nvme_process_aers(n);
5621 return NVME_NO_COMPLETE;
5624 static void nvme_update_dmrsl(NvmeCtrl *n)
5626 int nsid;
5628 for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
5629 NvmeNamespace *ns = nvme_ns(n, nsid);
5630 if (!ns) {
5631 continue;
5634 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
5635 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
5639 static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
5641 uint32_t cc = ldl_le_p(&n->bar.cc);
5643 ns->iocs = nvme_cse_iocs_none;
5644 switch (ns->csi) {
5645 case NVME_CSI_NVM:
5646 if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
5647 ns->iocs = nvme_cse_iocs_nvm;
5649 break;
5650 case NVME_CSI_ZONED:
5651 if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
5652 ns->iocs = nvme_cse_iocs_zoned;
5653 } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
5654 ns->iocs = nvme_cse_iocs_nvm;
5656 break;
5660 static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
5662 NvmeNamespace *ns;
5663 NvmeCtrl *ctrl;
5664 uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
5665 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5666 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5667 uint8_t sel = dw10 & 0xf;
5668 uint16_t *nr_ids = &list[0];
5669 uint16_t *ids = &list[1];
5670 uint16_t ret;
5671 int i;
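    /*
     * The host passes a Controller List (a 16-bit count followed by
     * controller ids) in the data buffer; each listed controller is attached
     * to or detached from the namespace according to SEL.
     */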
5673 trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);
5675 if (!nvme_nsid_valid(n, nsid)) {
5676 return NVME_INVALID_NSID | NVME_DNR;
5679 ns = nvme_subsys_ns(n->subsys, nsid);
5680 if (!ns) {
5681 return NVME_INVALID_FIELD | NVME_DNR;
5684 ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
5685 if (ret) {
5686 return ret;
5689 if (!*nr_ids) {
5690 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5693 *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
5694 for (i = 0; i < *nr_ids; i++) {
5695 ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
5696 if (!ctrl) {
5697 return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
5700 switch (sel) {
5701 case NVME_NS_ATTACHMENT_ATTACH:
5702 if (nvme_ns(ctrl, nsid)) {
5703 return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
5706 if (ns->attached && !ns->params.shared) {
5707 return NVME_NS_PRIVATE | NVME_DNR;
5710 nvme_attach_ns(ctrl, ns);
5711 nvme_select_iocs_ns(ctrl, ns);
5713 break;
5715 case NVME_NS_ATTACHMENT_DETACH:
5716 if (!nvme_ns(ctrl, nsid)) {
5717 return NVME_NS_NOT_ATTACHED | NVME_DNR;
5720 ctrl->namespaces[nsid] = NULL;
5721 ns->attached--;
5723 nvme_update_dmrsl(ctrl);
5725 break;
5727 default:
5728 return NVME_INVALID_FIELD | NVME_DNR;
5732 * Add the namespace id to the changed namespace id list so that the event
5733 * can be cleared via the Get Log Page command.
5735 if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
5736 nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
5737 NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
5738 NVME_LOG_CHANGED_NSLIST);
5742 return NVME_SUCCESS;
5745 typedef struct NvmeFormatAIOCB {
5746 BlockAIOCB common;
5747 BlockAIOCB *aiocb;
5748 QEMUBH *bh;
5749 NvmeRequest *req;
5750 int ret;
5752 NvmeNamespace *ns;
5753 uint32_t nsid;
5754 bool broadcast;
5755 int64_t offset;
5757 uint8_t lbaf;
5758 uint8_t mset;
5759 uint8_t pi;
5760 uint8_t pil;
5761 } NvmeFormatAIOCB;
5763 static void nvme_format_bh(void *opaque);
5765 static void nvme_format_cancel(BlockAIOCB *aiocb)
5767 NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);
5769 if (iocb->aiocb) {
5770 blk_aio_cancel_async(iocb->aiocb);
5774 static const AIOCBInfo nvme_format_aiocb_info = {
5775 .aiocb_size = sizeof(NvmeFormatAIOCB),
5776 .cancel_async = nvme_format_cancel,
5777 .get_aio_context = nvme_get_aio_context,
5780 static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
5781 uint8_t pi, uint8_t pil)
5783 uint8_t lbafl = lbaf & 0xf;
5784 uint8_t lbafu = lbaf >> 4;
5786 trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);
5788 ns->id_ns.dps = (pil << 3) | pi;
5789 ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;
5791 nvme_ns_init_format(ns);
5794 static void nvme_format_ns_cb(void *opaque, int ret)
5796 NvmeFormatAIOCB *iocb = opaque;
5797 NvmeNamespace *ns = iocb->ns;
5798 int bytes;
5800 if (ret < 0) {
5801 iocb->ret = ret;
5802 goto done;
5805 assert(ns);
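    /*
     * Format proceeds by zeroing the namespace in chunks of at most
     * BDRV_REQUEST_MAX_BYTES; once the end is reached, the new LBA format is
     * applied and the bottom half is rescheduled.
     */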
5807 if (iocb->offset < ns->size) {
5808 bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);
5810 iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
5811 bytes, BDRV_REQ_MAY_UNMAP,
5812 nvme_format_ns_cb, iocb);
5814 iocb->offset += bytes;
5815 return;
5818 nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);
5819 ns->status = 0x0;
5820 iocb->ns = NULL;
5821 iocb->offset = 0;
5823 done:
5824 iocb->aiocb = NULL;
5825 qemu_bh_schedule(iocb->bh);
5828 static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
5830 if (ns->params.zoned) {
5831 return NVME_INVALID_FORMAT | NVME_DNR;
5834 if (lbaf > ns->id_ns.nlbaf) {
5835 return NVME_INVALID_FORMAT | NVME_DNR;
5838 if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
5839 return NVME_INVALID_FORMAT | NVME_DNR;
5842 if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
5843 return NVME_INVALID_FIELD | NVME_DNR;
5846 return NVME_SUCCESS;
5849 static void nvme_format_bh(void *opaque)
5851 NvmeFormatAIOCB *iocb = opaque;
5852 NvmeRequest *req = iocb->req;
5853 NvmeCtrl *n = nvme_ctrl(req);
5854 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5855 uint8_t lbaf = dw10 & 0xf;
5856 uint8_t pi = (dw10 >> 5) & 0x7;
5857 uint16_t status;
5858 int i;
5860 if (iocb->ret < 0) {
5861 goto done;
5864 if (iocb->broadcast) {
5865 for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
5866 iocb->ns = nvme_ns(n, i);
5867 if (iocb->ns) {
5868 iocb->nsid = i;
5869 break;
5874 if (!iocb->ns) {
5875 goto done;
5878 status = nvme_format_check(iocb->ns, lbaf, pi);
5879 if (status) {
5880 req->status = status;
5881 goto done;
5884 iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
5885 nvme_format_ns_cb(iocb, 0);
5886 return;
5888 done:
5889 qemu_bh_delete(iocb->bh);
5890 iocb->bh = NULL;
5892 iocb->common.cb(iocb->common.opaque, iocb->ret);
5894 qemu_aio_unref(iocb);
5897 static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
5899 NvmeFormatAIOCB *iocb;
5900 uint32_t nsid = le32_to_cpu(req->cmd.nsid);
5901 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
5902 uint8_t lbaf = dw10 & 0xf;
5903 uint8_t mset = (dw10 >> 4) & 0x1;
5904 uint8_t pi = (dw10 >> 5) & 0x7;
5905 uint8_t pil = (dw10 >> 8) & 0x1;
5906 uint8_t lbafu = (dw10 >> 12) & 0x3;
5907 uint16_t status;
5909 iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);
5911 iocb->req = req;
5912 iocb->bh = qemu_bh_new(nvme_format_bh, iocb);
5913 iocb->ret = 0;
5914 iocb->ns = NULL;
5915 iocb->nsid = 0;
5916 iocb->lbaf = lbaf;
5917 iocb->mset = mset;
5918 iocb->pi = pi;
5919 iocb->pil = pil;
5920 iocb->broadcast = (nsid == NVME_NSID_BROADCAST);
5921 iocb->offset = 0;
5923 if (n->features.hbs.lbafee) {
5924 iocb->lbaf |= lbafu << 4;
5927 if (!iocb->broadcast) {
5928 if (!nvme_nsid_valid(n, nsid)) {
5929 status = NVME_INVALID_NSID | NVME_DNR;
5930 goto out;
5933 iocb->ns = nvme_ns(n, nsid);
5934 if (!iocb->ns) {
5935 status = NVME_INVALID_FIELD | NVME_DNR;
5936 goto out;
5940 req->aiocb = &iocb->common;
5941 qemu_bh_schedule(iocb->bh);
5943 return NVME_NO_COMPLETE;
5945 out:
5946 qemu_bh_delete(iocb->bh);
5947 iocb->bh = NULL;
5948 qemu_aio_unref(iocb);
5949 return status;
5952 static void nvme_get_virt_res_num(NvmeCtrl *n, uint8_t rt, int *num_total,
5953 int *num_prim, int *num_sec)
5955 *num_total = le32_to_cpu(rt ?
5956 n->pri_ctrl_cap.vifrt : n->pri_ctrl_cap.vqfrt);
5957 *num_prim = le16_to_cpu(rt ?
5958 n->pri_ctrl_cap.virfap : n->pri_ctrl_cap.vqrfap);
5959 *num_sec = le16_to_cpu(rt ? n->pri_ctrl_cap.virfa : n->pri_ctrl_cap.vqrfa);
5962 static uint16_t nvme_assign_virt_res_to_prim(NvmeCtrl *n, NvmeRequest *req,
5963 uint16_t cntlid, uint8_t rt,
5964 int nr)
5966 int num_total, num_prim, num_sec;
5968 if (cntlid != n->cntlid) {
5969 return NVME_INVALID_CTRL_ID | NVME_DNR;
5972 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
5974 if (nr > num_total) {
5975 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
5978 if (nr > num_total - num_sec) {
5979 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
5982 if (rt) {
5983 n->next_pri_ctrl_cap.virfap = cpu_to_le16(nr);
5984 } else {
5985 n->next_pri_ctrl_cap.vqrfap = cpu_to_le16(nr);
5988 req->cqe.result = cpu_to_le32(nr);
5989 return req->status;
5992 static void nvme_update_virt_res(NvmeCtrl *n, NvmeSecCtrlEntry *sctrl,
5993 uint8_t rt, int nr)
5995 int prev_nr, prev_total;
5997 if (rt) {
5998 prev_nr = le16_to_cpu(sctrl->nvi);
5999 prev_total = le32_to_cpu(n->pri_ctrl_cap.virfa);
6000 sctrl->nvi = cpu_to_le16(nr);
6001 n->pri_ctrl_cap.virfa = cpu_to_le32(prev_total + nr - prev_nr);
6002 } else {
6003 prev_nr = le16_to_cpu(sctrl->nvq);
6004 prev_total = le32_to_cpu(n->pri_ctrl_cap.vqrfa);
6005 sctrl->nvq = cpu_to_le16(nr);
6006 n->pri_ctrl_cap.vqrfa = cpu_to_le32(prev_total + nr - prev_nr);
6010 static uint16_t nvme_assign_virt_res_to_sec(NvmeCtrl *n, NvmeRequest *req,
6011 uint16_t cntlid, uint8_t rt, int nr)
6013 int num_total, num_prim, num_sec, num_free, diff, limit;
6014 NvmeSecCtrlEntry *sctrl;
6016 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6017 if (!sctrl) {
6018 return NVME_INVALID_CTRL_ID | NVME_DNR;
6021 if (sctrl->scs) {
6022 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6025 limit = le16_to_cpu(rt ? n->pri_ctrl_cap.vifrsm : n->pri_ctrl_cap.vqfrsm);
6026 if (nr > limit) {
6027 return NVME_INVALID_NUM_RESOURCES | NVME_DNR;
6030 nvme_get_virt_res_num(n, rt, &num_total, &num_prim, &num_sec);
6031 num_free = num_total - num_prim - num_sec;
6032 diff = nr - le16_to_cpu(rt ? sctrl->nvi : sctrl->nvq);
6034 if (diff > num_free) {
6035 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6038 nvme_update_virt_res(n, sctrl, rt, nr);
6039 req->cqe.result = cpu_to_le32(nr);
6041 return req->status;
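/*
 * Online/offline a secondary controller. Going online requires at least one
 * virtual interrupt and two virtual queue resources assigned; going offline
 * releases its VQ/VI resources. Any state change resets the corresponding VF
 * controller.
 */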
6044 static uint16_t nvme_virt_set_state(NvmeCtrl *n, uint16_t cntlid, bool online)
6046 NvmeCtrl *sn = NULL;
6047 NvmeSecCtrlEntry *sctrl;
6048 int vf_index;
6050 sctrl = nvme_sctrl_for_cntlid(n, cntlid);
6051 if (!sctrl) {
6052 return NVME_INVALID_CTRL_ID | NVME_DNR;
6055 if (!pci_is_vf(&n->parent_obj)) {
6056 vf_index = le16_to_cpu(sctrl->vfn) - 1;
6057 sn = NVME(pcie_sriov_get_vf_at_index(&n->parent_obj, vf_index));
6060 if (online) {
6061 if (!sctrl->nvi || (le16_to_cpu(sctrl->nvq) < 2) || !sn) {
6062 return NVME_INVALID_SEC_CTRL_STATE | NVME_DNR;
6065 if (!sctrl->scs) {
6066 sctrl->scs = 0x1;
6067 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6069 } else {
6070 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_INTERRUPT, 0);
6071 nvme_update_virt_res(n, sctrl, NVME_VIRT_RES_QUEUE, 0);
6073 if (sctrl->scs) {
6074 sctrl->scs = 0x0;
6075 if (sn) {
6076 nvme_ctrl_reset(sn, NVME_RESET_FUNCTION);
6081 return NVME_SUCCESS;
6084 static uint16_t nvme_virt_mngmt(NvmeCtrl *n, NvmeRequest *req)
6086 uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
6087 uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
6088 uint8_t act = dw10 & 0xf;
6089 uint8_t rt = (dw10 >> 8) & 0x7;
6090 uint16_t cntlid = (dw10 >> 16) & 0xffff;
6091 int nr = dw11 & 0xffff;
6093 trace_pci_nvme_virt_mngmt(nvme_cid(req), act, cntlid, rt ? "VI" : "VQ", nr);
6095 if (rt != NVME_VIRT_RES_QUEUE && rt != NVME_VIRT_RES_INTERRUPT) {
6096 return NVME_INVALID_RESOURCE_ID | NVME_DNR;
6099 switch (act) {
6100 case NVME_VIRT_MNGMT_ACTION_SEC_ASSIGN:
6101 return nvme_assign_virt_res_to_sec(n, req, cntlid, rt, nr);
6102 case NVME_VIRT_MNGMT_ACTION_PRM_ALLOC:
6103 return nvme_assign_virt_res_to_prim(n, req, cntlid, rt, nr);
6104 case NVME_VIRT_MNGMT_ACTION_SEC_ONLINE:
6105 return nvme_virt_set_state(n, cntlid, true);
6106 case NVME_VIRT_MNGMT_ACTION_SEC_OFFLINE:
6107 return nvme_virt_set_state(n, cntlid, false);
6108 default:
6109 return NVME_INVALID_FIELD | NVME_DNR;
6113 static uint16_t nvme_dbbuf_config(NvmeCtrl *n, const NvmeRequest *req)
6115 uint64_t dbs_addr = le64_to_cpu(req->cmd.dptr.prp1);
6116 uint64_t eis_addr = le64_to_cpu(req->cmd.dptr.prp2);
6117 int i;
6119 /* Addresses should be page aligned */
6120 if (dbs_addr & (n->page_size - 1) || eis_addr & (n->page_size - 1)) {
6121 return NVME_INVALID_FIELD | NVME_DNR;
6124 /* Save shadow buffer base addr for use during queue creation */
6125 n->dbbuf_dbs = dbs_addr;
6126 n->dbbuf_eis = eis_addr;
6127 n->dbbuf_enabled = true;
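    /*
     * Hook every existing queue up to the shadow doorbell and EventIdx
     * buffers; queues created after this point pick the addresses up when
     * they are initialized (see nvme_init_cq()).
     */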
6129 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
6130 NvmeSQueue *sq = n->sq[i];
6131 NvmeCQueue *cq = n->cq[i];
6133 if (sq) {
6135 * CAP.DSTRD is 0, so the offset of the ith sq db_addr is (i << 3).
6136 * nvme_process_db() uses this hard-coded way to calculate
6137 * doorbell offsets; be consistent with that here.
6139 sq->db_addr = dbs_addr + (i << 3);
6140 sq->ei_addr = eis_addr + (i << 3);
6141 pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
6142 sizeof(sq->tail));
6144 if (n->params.ioeventfd && sq->sqid != 0) {
6145 if (!nvme_init_sq_ioeventfd(sq)) {
6146 sq->ioeventfd_enabled = true;
6151 if (cq) {
6152 /* CAP.DSTRD is 0, so offset of ith cq db_addr is (i<<3)+(1<<2) */
6153 cq->db_addr = dbs_addr + (i << 3) + (1 << 2);
6154 cq->ei_addr = eis_addr + (i << 3) + (1 << 2);
6155 pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
6156 sizeof(cq->head));
6158 if (n->params.ioeventfd && cq->cqid != 0) {
6159 if (!nvme_init_cq_ioeventfd(cq)) {
6160 cq->ioeventfd_enabled = true;
6166 trace_pci_nvme_dbbuf_config(dbs_addr, eis_addr);
6168 return NVME_SUCCESS;
6171 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
6173 trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
6174 nvme_adm_opc_str(req->cmd.opcode));
6176 if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
6177 trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
6178 return NVME_INVALID_OPCODE | NVME_DNR;
6181 /* SGLs shall not be used for Admin commands in NVMe over PCIe */
6182 if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
6183 return NVME_INVALID_FIELD | NVME_DNR;
6186 if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
6187 return NVME_INVALID_FIELD;
6190 switch (req->cmd.opcode) {
6191 case NVME_ADM_CMD_DELETE_SQ:
6192 return nvme_del_sq(n, req);
6193 case NVME_ADM_CMD_CREATE_SQ:
6194 return nvme_create_sq(n, req);
6195 case NVME_ADM_CMD_GET_LOG_PAGE:
6196 return nvme_get_log(n, req);
6197 case NVME_ADM_CMD_DELETE_CQ:
6198 return nvme_del_cq(n, req);
6199 case NVME_ADM_CMD_CREATE_CQ:
6200 return nvme_create_cq(n, req);
6201 case NVME_ADM_CMD_IDENTIFY:
6202 return nvme_identify(n, req);
6203 case NVME_ADM_CMD_ABORT:
6204 return nvme_abort(n, req);
6205 case NVME_ADM_CMD_SET_FEATURES:
6206 return nvme_set_feature(n, req);
6207 case NVME_ADM_CMD_GET_FEATURES:
6208 return nvme_get_feature(n, req);
6209 case NVME_ADM_CMD_ASYNC_EV_REQ:
6210 return nvme_aer(n, req);
6211 case NVME_ADM_CMD_NS_ATTACHMENT:
6212 return nvme_ns_attachment(n, req);
6213 case NVME_ADM_CMD_VIRT_MNGMT:
6214 return nvme_virt_mngmt(n, req);
6215 case NVME_ADM_CMD_DBBUF_CONFIG:
6216 return nvme_dbbuf_config(n, req);
6217 case NVME_ADM_CMD_FORMAT_NVM:
6218 return nvme_format(n, req);
6219 default:
6220 assert(false);
6223 return NVME_INVALID_OPCODE | NVME_DNR;
6226 static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
6228 pci_dma_write(&sq->ctrl->parent_obj, sq->ei_addr, &sq->tail,
6229 sizeof(sq->tail));
6230 trace_pci_nvme_eventidx_sq(sq->sqid, sq->tail);
6233 static void nvme_update_sq_tail(NvmeSQueue *sq)
6235 pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr, &sq->tail,
6236 sizeof(sq->tail));
6237 trace_pci_nvme_shadow_doorbell_sq(sq->sqid, sq->tail);
6240 static void nvme_process_sq(void *opaque)
6242 NvmeSQueue *sq = opaque;
6243 NvmeCtrl *n = sq->ctrl;
6244 NvmeCQueue *cq = n->cq[sq->cqid];
6246 uint16_t status;
6247 hwaddr addr;
6248 NvmeCmd cmd;
6249 NvmeRequest *req;
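    /*
     * With shadow doorbells enabled, the submission queue tail is read from
     * guest memory instead of relying on the MMIO doorbell write.
     */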
6251 if (n->dbbuf_enabled) {
6252 nvme_update_sq_tail(sq);
6255 while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
6256 addr = sq->dma_addr + sq->head * n->sqe_size;
6257 if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
6258 trace_pci_nvme_err_addr_read(addr);
6259 trace_pci_nvme_err_cfs();
6260 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
6261 break;
6263 nvme_inc_sq_head(sq);
6265 req = QTAILQ_FIRST(&sq->req_list);
6266 QTAILQ_REMOVE(&sq->req_list, req, entry);
6267 QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
6268 nvme_req_clear(req);
6269 req->cqe.cid = cmd.cid;
6270 memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
6272 status = sq->sqid ? nvme_io_cmd(n, req) :
6273 nvme_admin_cmd(n, req);
6274 if (status != NVME_NO_COMPLETE) {
6275 req->status = status;
6276 nvme_enqueue_req_completion(cq, req);
6279 if (n->dbbuf_enabled) {
6280 nvme_update_sq_eventidx(sq);
6281 nvme_update_sq_tail(sq);
6286 static void nvme_update_msixcap_ts(PCIDevice *pci_dev, uint32_t table_size)
6288 uint8_t *config;
6290 if (!msix_present(pci_dev)) {
6291 return;
6294 assert(table_size > 0 && table_size <= pci_dev->msix_entries_nr);
6296 config = pci_dev->config + pci_dev->msix_cap;
6297 pci_set_word_by_mask(config + PCI_MSIX_FLAGS, PCI_MSIX_FLAGS_QSIZE,
6298 table_size - 1);
6301 static void nvme_activate_virt_res(NvmeCtrl *n)
6303 PCIDevice *pci_dev = &n->parent_obj;
6304 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
6305 NvmeSecCtrlEntry *sctrl;
6307 /* -1 to account for the admin queue */
6308 if (pci_is_vf(pci_dev)) {
6309 sctrl = nvme_sctrl(n);
6310 cap->vqprt = sctrl->nvq;
6311 cap->viprt = sctrl->nvi;
6312 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
6313 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
6314 } else {
6315 cap->vqrfap = n->next_pri_ctrl_cap.vqrfap;
6316 cap->virfap = n->next_pri_ctrl_cap.virfap;
6317 n->conf_ioqpairs = le16_to_cpu(cap->vqprt) +
6318 le16_to_cpu(cap->vqrfap) - 1;
6319 n->conf_msix_qsize = le16_to_cpu(cap->viprt) +
6320 le16_to_cpu(cap->virfap);
6324 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst)
6326 PCIDevice *pci_dev = &n->parent_obj;
6327 NvmeSecCtrlEntry *sctrl;
6328 NvmeNamespace *ns;
6329 int i;
6331 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6332 ns = nvme_ns(n, i);
6333 if (!ns) {
6334 continue;
6337 nvme_ns_drain(ns);
6340 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
6341 if (n->sq[i] != NULL) {
6342 nvme_free_sq(n->sq[i], n);
6345 for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
6346 if (n->cq[i] != NULL) {
6347 nvme_free_cq(n->cq[i], n);
6351 while (!QTAILQ_EMPTY(&n->aer_queue)) {
6352 NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
6353 QTAILQ_REMOVE(&n->aer_queue, event, entry);
6354 g_free(event);
6357 if (n->params.sriov_max_vfs) {
6358 if (!pci_is_vf(pci_dev)) {
6359 for (i = 0; i < n->sec_ctrl_list.numcntl; i++) {
6360 sctrl = &n->sec_ctrl_list.sec[i];
6361 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
6364 if (rst != NVME_RESET_CONTROLLER) {
6365 pcie_sriov_pf_disable_vfs(pci_dev);
6369 if (rst != NVME_RESET_CONTROLLER) {
6370 nvme_activate_virt_res(n);
6374 n->aer_queued = 0;
6375 n->aer_mask = 0;
6376 n->outstanding_aers = 0;
6377 n->qs_created = false;
6379 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
6381 if (pci_is_vf(pci_dev)) {
6382 sctrl = nvme_sctrl(n);
6384 stl_le_p(&n->bar.csts, sctrl->scs ? 0 : NVME_CSTS_FAILED);
6385 } else {
6386 stl_le_p(&n->bar.csts, 0);
6389 stl_le_p(&n->bar.intms, 0);
6390 stl_le_p(&n->bar.intmc, 0);
6391 stl_le_p(&n->bar.cc, 0);
6393 n->dbbuf_dbs = 0;
6394 n->dbbuf_eis = 0;
6395 n->dbbuf_enabled = false;
6398 static void nvme_ctrl_shutdown(NvmeCtrl *n)
6400 NvmeNamespace *ns;
6401 int i;
6403 if (n->pmr.dev) {
6404 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6407 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6408 ns = nvme_ns(n, i);
6409 if (!ns) {
6410 continue;
6413 nvme_ns_shutdown(ns);
6417 static void nvme_select_iocs(NvmeCtrl *n)
6419 NvmeNamespace *ns;
6420 int i;
6422 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
6423 ns = nvme_ns(n, i);
6424 if (!ns) {
6425 continue;
6428 nvme_select_iocs_ns(n, ns);
6432 static int nvme_start_ctrl(NvmeCtrl *n)
6434 uint64_t cap = ldq_le_p(&n->bar.cap);
6435 uint32_t cc = ldl_le_p(&n->bar.cc);
6436 uint32_t aqa = ldl_le_p(&n->bar.aqa);
6437 uint64_t asq = ldq_le_p(&n->bar.asq);
6438 uint64_t acq = ldq_le_p(&n->bar.acq);
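    /*
     * CC.MPS encodes the memory page size as a power of two, starting at
     * 4 KiB (i.e. page size = 2^(12 + MPS)).
     */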
6439 uint32_t page_bits = NVME_CC_MPS(cc) + 12;
6440 uint32_t page_size = 1 << page_bits;
6441 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
6443 if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
6444 trace_pci_nvme_err_startfail_virt_state(le16_to_cpu(sctrl->nvi),
6445 le16_to_cpu(sctrl->nvq),
6446 sctrl->scs ? "ONLINE" :
6447 "OFFLINE");
6448 return -1;
6450 if (unlikely(n->cq[0])) {
6451 trace_pci_nvme_err_startfail_cq();
6452 return -1;
6454 if (unlikely(n->sq[0])) {
6455 trace_pci_nvme_err_startfail_sq();
6456 return -1;
6458 if (unlikely(asq & (page_size - 1))) {
6459 trace_pci_nvme_err_startfail_asq_misaligned(asq);
6460 return -1;
6462 if (unlikely(acq & (page_size - 1))) {
6463 trace_pci_nvme_err_startfail_acq_misaligned(acq);
6464 return -1;
6466 if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
6467 trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
6468 return -1;
6470 if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
6471 trace_pci_nvme_err_startfail_page_too_small(
6472 NVME_CC_MPS(cc),
6473 NVME_CAP_MPSMIN(cap));
6474 return -1;
6476 if (unlikely(NVME_CC_MPS(cc) >
6477 NVME_CAP_MPSMAX(cap))) {
6478 trace_pci_nvme_err_startfail_page_too_large(
6479 NVME_CC_MPS(cc),
6480 NVME_CAP_MPSMAX(cap));
6481 return -1;
6483 if (unlikely(NVME_CC_IOCQES(cc) <
6484 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
6485 trace_pci_nvme_err_startfail_cqent_too_small(
6486 NVME_CC_IOCQES(cc),
6487 NVME_CTRL_CQES_MIN(cap));
6488 return -1;
6490 if (unlikely(NVME_CC_IOCQES(cc) >
6491 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
6492 trace_pci_nvme_err_startfail_cqent_too_large(
6493 NVME_CC_IOCQES(cc),
6494 NVME_CTRL_CQES_MAX(cap));
6495 return -1;
6497 if (unlikely(NVME_CC_IOSQES(cc) <
6498 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
6499 trace_pci_nvme_err_startfail_sqent_too_small(
6500 NVME_CC_IOSQES(cc),
6501 NVME_CTRL_SQES_MIN(cap));
6502 return -1;
6504 if (unlikely(NVME_CC_IOSQES(cc) >
6505 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
6506 trace_pci_nvme_err_startfail_sqent_too_large(
6507 NVME_CC_IOSQES(cc),
6508 NVME_CTRL_SQES_MAX(cap));
6509 return -1;
6511 if (unlikely(!NVME_AQA_ASQS(aqa))) {
6512 trace_pci_nvme_err_startfail_asqent_sz_zero();
6513 return -1;
6515 if (unlikely(!NVME_AQA_ACQS(aqa))) {
6516 trace_pci_nvme_err_startfail_acqent_sz_zero();
6517 return -1;
6520 n->page_bits = page_bits;
6521 n->page_size = page_size;
6522 n->max_prp_ents = n->page_size / sizeof(uint64_t);
6523 n->cqe_size = 1 << NVME_CC_IOCQES(cc);
6524 n->sqe_size = 1 << NVME_CC_IOSQES(cc);
6525 nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
6526 nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);
6528 nvme_set_timestamp(n, 0ULL);
6530 nvme_select_iocs(n);
6532 return 0;
6535 static void nvme_cmb_enable_regs(NvmeCtrl *n)
6537 uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
6538 uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);
6540 NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
6541 NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
6542 NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
6543 stl_le_p(&n->bar.cmbloc, cmbloc);
6545 NVME_CMBSZ_SET_SQS(cmbsz, 1);
6546 NVME_CMBSZ_SET_CQS(cmbsz, 0);
6547 NVME_CMBSZ_SET_LISTS(cmbsz, 1);
6548 NVME_CMBSZ_SET_RDS(cmbsz, 1);
6549 NVME_CMBSZ_SET_WDS(cmbsz, 1);
6550 NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
6551 NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
6552 stl_le_p(&n->bar.cmbsz, cmbsz);
6555 static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
6556 unsigned size)
6558 uint64_t cap = ldq_le_p(&n->bar.cap);
6559 uint32_t cc = ldl_le_p(&n->bar.cc);
6560 uint32_t intms = ldl_le_p(&n->bar.intms);
6561 uint32_t csts = ldl_le_p(&n->bar.csts);
6562 uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);
6564 if (unlikely(offset & (sizeof(uint32_t) - 1))) {
6565 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
6566 "MMIO write not 32-bit aligned,"
6567 " offset=0x%"PRIx64"", offset);
6568 /* should be ignored, fall through for now */
6571 if (unlikely(size < sizeof(uint32_t))) {
6572 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
6573 "MMIO write smaller than 32-bits,"
6574 " offset=0x%"PRIx64", size=%u",
6575 offset, size);
6576 /* should be ignored, fall through for now */
6579 switch (offset) {
6580 case NVME_REG_INTMS:
6581 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6582 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
6583 "undefined access to interrupt mask set"
6584 " when MSI-X is enabled");
6585 /* should be ignored, fall through for now */
6587 intms |= data;
6588 stl_le_p(&n->bar.intms, intms);
6589 n->bar.intmc = n->bar.intms;
6590 trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);
6591 nvme_irq_check(n);
6592 break;
6593 case NVME_REG_INTMC:
6594 if (unlikely(msix_enabled(&(n->parent_obj)))) {
6595 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
6596 "undefined access to interrupt mask clr"
6597 " when MSI-X is enabled");
6598 /* should be ignored, fall through for now */
6600 intms &= ~data;
6601 stl_le_p(&n->bar.intms, intms);
6602 n->bar.intmc = n->bar.intms;
6603 trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);
6604 nvme_irq_check(n);
6605 break;
6606 case NVME_REG_CC:
6607 stl_le_p(&n->bar.cc, data);
6609 trace_pci_nvme_mmio_cfg(data & 0xffffffff);
6611 if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
6612 trace_pci_nvme_mmio_shutdown_set();
6613 nvme_ctrl_shutdown(n);
6614 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
6615 csts |= NVME_CSTS_SHST_COMPLETE;
6616 } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
6617 trace_pci_nvme_mmio_shutdown_cleared();
6618 csts &= ~(CSTS_SHST_MASK << CSTS_SHST_SHIFT);
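        /*
         * A 0 -> 1 transition of CC.EN starts the controller; a 1 -> 0
         * transition triggers a controller reset.
         */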
6621 if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {
6622 if (unlikely(nvme_start_ctrl(n))) {
6623 trace_pci_nvme_err_startfail();
6624 csts = NVME_CSTS_FAILED;
6625 } else {
6626 trace_pci_nvme_mmio_start_success();
6627 csts = NVME_CSTS_READY;
6629 } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
6630 trace_pci_nvme_mmio_stopped();
6631 nvme_ctrl_reset(n, NVME_RESET_CONTROLLER);
6633 break;
6636 stl_le_p(&n->bar.csts, csts);
6638 break;
6639 case NVME_REG_CSTS:
6640 if (data & (1 << 4)) {
6641 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
6642 "attempted to W1C CSTS.NSSRO"
6643 " but CAP.NSSRS is zero (not supported)");
6644 } else if (data != 0) {
6645 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
6646 "attempted to set a read only bit"
6647 " of controller status");
6649 break;
6650 case NVME_REG_NSSR:
6651 if (data == 0x4e564d65) {
6652 trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
6653 } else {
6654 /* The spec says that writes of other values have no effect */
6655 return;
6657 break;
6658 case NVME_REG_AQA:
6659 stl_le_p(&n->bar.aqa, data);
6660 trace_pci_nvme_mmio_aqattr(data & 0xffffffff);
6661 break;
6662 case NVME_REG_ASQ:
6663 stn_le_p(&n->bar.asq, size, data);
6664 trace_pci_nvme_mmio_asqaddr(data);
6665 break;
6666 case NVME_REG_ASQ + 4:
6667 stl_le_p((uint8_t *)&n->bar.asq + 4, data);
6668 trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));
6669 break;
6670 case NVME_REG_ACQ:
6671 trace_pci_nvme_mmio_acqaddr(data);
6672 stn_le_p(&n->bar.acq, size, data);
6673 break;
6674 case NVME_REG_ACQ + 4:
6675 stl_le_p((uint8_t *)&n->bar.acq + 4, data);
6676 trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));
6677 break;
6678 case NVME_REG_CMBLOC:
6679 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
6680 "invalid write to reserved CMBLOC"
6681 " when CMBSZ is zero, ignored");
6682 return;
6683 case NVME_REG_CMBSZ:
6684 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
6685 "invalid write to read only CMBSZ, ignored");
6686 return;
6687 case NVME_REG_CMBMSC:
6688 if (!NVME_CAP_CMBS(cap)) {
6689 return;
6692 stn_le_p(&n->bar.cmbmsc, size, data);
6693 n->cmb.cmse = false;
6695 if (NVME_CMBMSC_CRE(data)) {
6696 nvme_cmb_enable_regs(n);
6698 if (NVME_CMBMSC_CMSE(data)) {
6699 uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
6700 hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
6701 if (cba + int128_get64(n->cmb.mem.size) < cba) {
6702 uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
6703 NVME_CMBSTS_SET_CBAI(cmbsts, 1);
6704 stl_le_p(&n->bar.cmbsts, cmbsts);
6705 return;
6708 n->cmb.cba = cba;
6709 n->cmb.cmse = true;
6711 } else {
6712 n->bar.cmbsz = 0;
6713 n->bar.cmbloc = 0;
6716 return;
6717 case NVME_REG_CMBMSC + 4:
6718 stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);
6719 return;
6721 case NVME_REG_PMRCAP:
6722 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
6723 "invalid write to PMRCAP register, ignored");
6724 return;
6725 case NVME_REG_PMRCTL:
6726 if (!NVME_CAP_PMRS(cap)) {
6727 return;
6730 stl_le_p(&n->bar.pmrctl, data);
6731 if (NVME_PMRCTL_EN(data)) {
6732 memory_region_set_enabled(&n->pmr.dev->mr, true);
6733 pmrsts = 0;
6734 } else {
6735 memory_region_set_enabled(&n->pmr.dev->mr, false);
6736 NVME_PMRSTS_SET_NRDY(pmrsts, 1);
6737 n->pmr.cmse = false;
6739 stl_le_p(&n->bar.pmrsts, pmrsts);
6740 return;
6741 case NVME_REG_PMRSTS:
6742 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
6743 "invalid write to PMRSTS register, ignored");
6744 return;
6745 case NVME_REG_PMREBS:
6746 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
6747 "invalid write to PMREBS register, ignored");
6748 return;
6749 case NVME_REG_PMRSWTP:
6750 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
6751 "invalid write to PMRSWTP register, ignored");
6752 return;
6753 case NVME_REG_PMRMSCL:
6754 if (!NVME_CAP_PMRS(cap)) {
6755 return;
6758 stl_le_p(&n->bar.pmrmscl, data);
6759 n->pmr.cmse = false;
6761 if (NVME_PMRMSCL_CMSE(data)) {
6762 uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
6763 hwaddr cba = pmrmscu << 32 |
6764 (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
6765 if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
6766 NVME_PMRSTS_SET_CBAI(pmrsts, 1);
6767 stl_le_p(&n->bar.pmrsts, pmrsts);
6768 return;
6771 n->pmr.cmse = true;
6772 n->pmr.cba = cba;
6775 return;
6776 case NVME_REG_PMRMSCU:
6777 if (!NVME_CAP_PMRS(cap)) {
6778 return;
6781 stl_le_p(&n->bar.pmrmscu, data);
6782 return;
6783 default:
6784 NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
6785 "invalid MMIO write,"
6786 " offset=0x%"PRIx64", data=%"PRIx64"",
6787 offset, data);
6788 break;
6792 static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
6794 NvmeCtrl *n = (NvmeCtrl *)opaque;
6795 uint8_t *ptr = (uint8_t *)&n->bar;
6797 trace_pci_nvme_mmio_read(addr, size);
6799 if (unlikely(addr & (sizeof(uint32_t) - 1))) {
6800 NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
6801 "MMIO read not 32-bit aligned,"
6802 " offset=0x%"PRIx64"", addr);
6803 /* should RAZ, fall through for now */
6804 } else if (unlikely(size < sizeof(uint32_t))) {
6805 NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
6806 "MMIO read smaller than 32-bits,"
6807 " offset=0x%"PRIx64"", addr);
6808 /* should RAZ, fall through for now */
6811 if (addr > sizeof(n->bar) - size) {
6812 NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
6813 "MMIO read beyond last register,"
6814 " offset=0x%"PRIx64", returning 0", addr);
6816 return 0;
6819 if (pci_is_vf(&n->parent_obj) && !nvme_sctrl(n)->scs &&
6820 addr != NVME_REG_CSTS) {
6821 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
        return 0;
    }

    /*
     * When PMRWBM bit 1 is set, a read of PMRSTS should ensure that prior
     * writes have reached persistent media.
     */
6830 if (addr == NVME_REG_PMRSTS &&
6831 (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
6832 memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
6835 return ldn_le_p(ptr + addr, size);
6838 static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
6840 uint32_t qid;
6842 if (unlikely(addr & ((1 << 2) - 1))) {
6843 NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
6844 "doorbell write not 32-bit aligned,"
6845 " offset=0x%"PRIx64", ignoring", addr);
6846 return;
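    /*
     * Doorbell registers start at offset 0x1000 and, with the 4-byte stride
     * used by this device (CAP.DSTRD = 0), each queue pair y occupies
     * 8 bytes: the SQ y Tail doorbell at 0x1000 + (2 * y) * 4 and the CQ y
     * Head doorbell at 0x1000 + (2 * y + 1) * 4. The decoding below relies
     * on exactly that layout.
     */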
6849 if (((addr - 0x1000) >> 2) & 1) {
6850 /* Completion queue doorbell write */
6852 uint16_t new_head = val & 0xffff;
6853 int start_sqs;
6854 NvmeCQueue *cq;
6856 qid = (addr - (0x1000 + (1 << 2))) >> 3;
6857 if (unlikely(nvme_check_cqid(n, qid))) {
6858 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
6859 "completion queue doorbell write"
6860 " for nonexistent queue,"
                           " cqid=%"PRIu32", ignoring", qid);

            /*
             * NVM Express v1.3d, Section 4.1 states: "If host software
             * writes an invalid value to the Submission Queue Tail Doorbell
             * or Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, then an asynchronous
             * event is posted to the Admin Completion Queue with a status
             * code of Invalid Doorbell Write Value."
             *
             * Also note that the spec includes the "Invalid Doorbell
             * Register" status code, but nowhere does it specify when to
             * use it. However, it seems reasonable to use it here in a
             * similar fashion.
             */
6876 if (n->outstanding_aers) {
6877 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6878 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6879 NVME_LOG_ERROR_INFO);
6882 return;
6885 cq = n->cq[qid];
6886 if (unlikely(new_head >= cq->size)) {
6887 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
6888 "completion queue doorbell write value"
                           " beyond queue size, cqid=%"PRIu32","
6890 " new_head=%"PRIu16", ignoring",
6891 qid, new_head);
6893 if (n->outstanding_aers) {
6894 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6895 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6896 NVME_LOG_ERROR_INFO);
6899 return;
6902 trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);
6904 start_sqs = nvme_cq_full(cq) ? 1 : 0;
6905 cq->head = new_head;
6906 if (!qid && n->dbbuf_enabled) {
6907 pci_dma_write(&n->parent_obj, cq->db_addr, &cq->head,
6908 sizeof(cq->head));
6910 if (start_sqs) {
6911 NvmeSQueue *sq;
6912 QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
6913 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6915 timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
6918 if (cq->tail == cq->head) {
6919 if (cq->irq_enabled) {
6920 n->cq_pending--;
6923 nvme_irq_deassert(n, cq);
6925 } else {
6926 /* Submission queue doorbell write */
6928 uint16_t new_tail = val & 0xffff;
6929 NvmeSQueue *sq;
6931 qid = (addr - 0x1000) >> 3;
6932 if (unlikely(nvme_check_sqid(n, qid))) {
6933 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
6934 "submission queue doorbell write"
6935 " for nonexistent queue,"
6936 " sqid=%"PRIu32", ignoring", qid);
6938 if (n->outstanding_aers) {
6939 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6940 NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
6941 NVME_LOG_ERROR_INFO);
6944 return;
6947 sq = n->sq[qid];
6948 if (unlikely(new_tail >= sq->size)) {
6949 NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
6950 "submission queue doorbell write value"
6951 " beyond queue size, sqid=%"PRIu32","
6952 " new_tail=%"PRIu16", ignoring",
6953 qid, new_tail);
6955 if (n->outstanding_aers) {
6956 nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
6957 NVME_AER_INFO_ERR_INVALID_DB_VALUE,
6958 NVME_LOG_ERROR_INFO);
6961 return;
6964 trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);
6966 sq->tail = new_tail;
6967 if (!qid && n->dbbuf_enabled) {
            /*
             * The spec states "the host shall also update the controller's
             * corresponding doorbell property to match the value of that
             * entry in the Shadow Doorbell buffer."
             *
             * Since this context is currently a VM trap, we can safely
             * enforce the requirement from the device side in case the host
             * is misbehaving.
             *
             * Note that we should not have to do this, but various drivers,
             * including some that run on Linux, do not keep the admin
             * queue's shadow entry up to date, so we cannot trust it for an
             * appropriate SQ tail.
             */
6981 pci_dma_write(&n->parent_obj, sq->db_addr, &sq->tail,
6982 sizeof(sq->tail));
6984 timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
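        /*
         * Actual processing of the submission queue is deferred to the
         * timer (500 ns ahead on the virtual clock) instead of being done
         * synchronously in this MMIO trap; presumably this also lets
         * several back-to-back doorbell writes be handled in a single pass.
         */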
6988 static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
6989 unsigned size)
6991 NvmeCtrl *n = (NvmeCtrl *)opaque;
6993 trace_pci_nvme_mmio_write(addr, data, size);
6995 if (pci_is_vf(&n->parent_obj) && !nvme_sctrl(n)->scs &&
6996 addr != NVME_REG_CSTS) {
6997 trace_pci_nvme_err_ignored_mmio_vf_offline(addr, size);
6998 return;
7001 if (addr < sizeof(n->bar)) {
7002 nvme_write_bar(n, addr, data, size);
7003 } else {
7004 nvme_process_db(n, addr, data);
7008 static const MemoryRegionOps nvme_mmio_ops = {
7009 .read = nvme_mmio_read,
7010 .write = nvme_mmio_write,
7011 .endianness = DEVICE_LITTLE_ENDIAN,
7012 .impl = {
7013 .min_access_size = 2,
7014 .max_access_size = 8,
7018 static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
7019 unsigned size)
7021 NvmeCtrl *n = (NvmeCtrl *)opaque;
7022 stn_le_p(&n->cmb.buf[addr], size, data);
7025 static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
7027 NvmeCtrl *n = (NvmeCtrl *)opaque;
7028 return ldn_le_p(&n->cmb.buf[addr], size);
7031 static const MemoryRegionOps nvme_cmb_ops = {
7032 .read = nvme_cmb_read,
7033 .write = nvme_cmb_write,
7034 .endianness = DEVICE_LITTLE_ENDIAN,
7035 .impl = {
7036 .min_access_size = 1,
7037 .max_access_size = 8,
7041 static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
7043 NvmeParams *params = &n->params;
7045 if (params->num_queues) {
7046 warn_report("num_queues is deprecated; please use max_ioqpairs "
7047 "instead");
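        /*
         * The legacy num_queues parameter also counted the admin queue
         * pair, hence the conversion below subtracts one.
         */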
7049 params->max_ioqpairs = params->num_queues - 1;
7052 if (n->namespace.blkconf.blk && n->subsys) {
7053 error_setg(errp, "subsystem support is unavailable with legacy "
7054 "namespace ('drive' property)");
7055 return;
7058 if (params->max_ioqpairs < 1 ||
7059 params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
7060 error_setg(errp, "max_ioqpairs must be between 1 and %d",
7061 NVME_MAX_IOQPAIRS);
7062 return;
7065 if (params->msix_qsize < 1 ||
7066 params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
7067 error_setg(errp, "msix_qsize must be between 1 and %d",
7068 PCI_MSIX_FLAGS_QSIZE + 1);
7069 return;
7072 if (!params->serial) {
7073 error_setg(errp, "serial property not set");
7074 return;
7077 if (n->pmr.dev) {
7078 if (host_memory_backend_is_mapped(n->pmr.dev)) {
7079 error_setg(errp, "can't use already busy memdev: %s",
7080 object_get_canonical_path_component(OBJECT(n->pmr.dev)));
7081 return;
7084 if (!is_power_of_2(n->pmr.dev->size)) {
            error_setg(errp, "pmr backend size must be a power of 2");
7086 return;
7089 host_memory_backend_set_mapped(n->pmr.dev, true);
7092 if (n->params.zasl > n->params.mdts) {
7093 error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
7094 "than or equal to mdts (Maximum Data Transfer Size)");
7095 return;
7098 if (!n->params.vsl) {
7099 error_setg(errp, "vsl must be non-zero");
7100 return;
7103 if (params->sriov_max_vfs) {
7104 if (!n->subsys) {
7105 error_setg(errp, "subsystem is required for the use of SR-IOV");
7106 return;
7109 if (params->sriov_max_vfs > NVME_MAX_VFS) {
7110 error_setg(errp, "sriov_max_vfs must be between 0 and %d",
7111 NVME_MAX_VFS);
7112 return;
7115 if (params->cmb_size_mb) {
7116 error_setg(errp, "CMB is not supported with SR-IOV");
7117 return;
7120 if (n->pmr.dev) {
7121 error_setg(errp, "PMR is not supported with SR-IOV");
7122 return;
7125 if (!params->sriov_vq_flexible || !params->sriov_vi_flexible) {
7126 error_setg(errp, "both sriov_vq_flexible and sriov_vi_flexible"
7127 " must be set for the use of SR-IOV");
7128 return;
7131 if (params->sriov_vq_flexible < params->sriov_max_vfs * 2) {
7132 error_setg(errp, "sriov_vq_flexible must be greater than or equal"
7133 " to %d (sriov_max_vfs * 2)", params->sriov_max_vfs * 2);
7134 return;
7137 if (params->max_ioqpairs < params->sriov_vq_flexible + 2) {
7138 error_setg(errp, "(max_ioqpairs - sriov_vq_flexible) must be"
7139 " greater than or equal to 2");
7140 return;
7143 if (params->sriov_vi_flexible < params->sriov_max_vfs) {
7144 error_setg(errp, "sriov_vi_flexible must be greater than or equal"
7145 " to %d (sriov_max_vfs)", params->sriov_max_vfs);
7146 return;
7149 if (params->msix_qsize < params->sriov_vi_flexible + 1) {
7150 error_setg(errp, "(msix_qsize - sriov_vi_flexible) must be"
7151 " greater than or equal to 1");
7152 return;
7155 if (params->sriov_max_vi_per_vf &&
7156 (params->sriov_max_vi_per_vf - 1) % NVME_VF_RES_GRANULARITY) {
7157 error_setg(errp, "sriov_max_vi_per_vf must meet:"
7158 " (sriov_max_vi_per_vf - 1) %% %d == 0 and"
7159 " sriov_max_vi_per_vf >= 1", NVME_VF_RES_GRANULARITY);
7160 return;
7163 if (params->sriov_max_vq_per_vf &&
7164 (params->sriov_max_vq_per_vf < 2 ||
7165 (params->sriov_max_vq_per_vf - 1) % NVME_VF_RES_GRANULARITY)) {
7166 error_setg(errp, "sriov_max_vq_per_vf must meet:"
7167 " (sriov_max_vq_per_vf - 1) %% %d == 0 and"
7168 " sriov_max_vq_per_vf >= 2", NVME_VF_RES_GRANULARITY);
7169 return;
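    /*
     * Illustrative example (device and subsystem ids are made up): the
     * SR-IOV constraints above are satisfied by, e.g.,
     *
     *   -device nvme-subsys,id=subsys0
     *   -device nvme,serial=deadbeef,subsys=subsys0,sriov_max_vfs=4, \
     *           sriov_vq_flexible=8,sriov_vi_flexible=4
     *
     * with the default max_ioqpairs=64 and msix_qsize=65, since 8 >= 4 * 2,
     * 64 >= 8 + 2, 4 >= 4 and 65 >= 4 + 1 all hold.
     */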
7174 static void nvme_init_state(NvmeCtrl *n)
7176 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7177 NvmeSecCtrlList *list = &n->sec_ctrl_list;
7178 NvmeSecCtrlEntry *sctrl;
7179 uint8_t max_vfs;
7180 int i;
7182 if (pci_is_vf(&n->parent_obj)) {
7183 sctrl = nvme_sctrl(n);
7184 max_vfs = 0;
7185 n->conf_ioqpairs = sctrl->nvq ? le16_to_cpu(sctrl->nvq) - 1 : 0;
7186 n->conf_msix_qsize = sctrl->nvi ? le16_to_cpu(sctrl->nvi) : 1;
7187 } else {
7188 max_vfs = n->params.sriov_max_vfs;
7189 n->conf_ioqpairs = n->params.max_ioqpairs;
7190 n->conf_msix_qsize = n->params.msix_qsize;
7193 n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
7194 n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
7195 n->temperature = NVME_TEMPERATURE;
7196 n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
7197 n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
7198 n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
7199 QTAILQ_INIT(&n->aer_queue);
7201 list->numcntl = cpu_to_le16(max_vfs);
7202 for (i = 0; i < max_vfs; i++) {
7203 sctrl = &list->sec[i];
7204 sctrl->pcid = cpu_to_le16(n->cntlid);
7205 sctrl->vfn = cpu_to_le16(i + 1);
7208 cap->cntlid = cpu_to_le16(n->cntlid);
7209 cap->crt = NVME_CRT_VQ | NVME_CRT_VI;
7211 if (pci_is_vf(&n->parent_obj)) {
7212 cap->vqprt = cpu_to_le16(1 + n->conf_ioqpairs);
7213 } else {
7214 cap->vqprt = cpu_to_le16(1 + n->params.max_ioqpairs -
7215 n->params.sriov_vq_flexible);
7216 cap->vqfrt = cpu_to_le32(n->params.sriov_vq_flexible);
7217 cap->vqrfap = cap->vqfrt;
7218 cap->vqgran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
7219 cap->vqfrsm = n->params.sriov_max_vq_per_vf ?
7220 cpu_to_le16(n->params.sriov_max_vq_per_vf) :
7221 cap->vqfrt / MAX(max_vfs, 1);
7224 if (pci_is_vf(&n->parent_obj)) {
7225 cap->viprt = cpu_to_le16(n->conf_msix_qsize);
7226 } else {
7227 cap->viprt = cpu_to_le16(n->params.msix_qsize -
7228 n->params.sriov_vi_flexible);
7229 cap->vifrt = cpu_to_le32(n->params.sriov_vi_flexible);
7230 cap->virfap = cap->vifrt;
7231 cap->vigran = cpu_to_le16(NVME_VF_RES_GRANULARITY);
7232 cap->vifrsm = n->params.sriov_max_vi_per_vf ?
7233 cpu_to_le16(n->params.sriov_max_vi_per_vf) :
7234 cap->vifrt / MAX(max_vfs, 1);
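    /*
     * Using the illustrative SR-IOV configuration suggested in the comment
     * in nvme_check_constraints() above (sriov_max_vfs=4,
     * sriov_vq_flexible=8, sriov_vi_flexible=4): the PF advertises
     * VQFRT = 8 and VIFRT = 4 flexible resources, and with
     * sriov_max_vq_per_vf/sriov_max_vi_per_vf left at 0 the per-VF maxima
     * default to VQFRSM = 8 / 4 = 2 and VIFRSM = 4 / 4 = 1.
     */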
7238 static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
7240 uint64_t cmb_size = n->params.cmb_size_mb * MiB;
7241 uint64_t cap = ldq_le_p(&n->bar.cap);
7243 n->cmb.buf = g_malloc0(cmb_size);
7244 memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
7245 "nvme-cmb", cmb_size);
7246 pci_register_bar(pci_dev, NVME_CMB_BIR,
7247 PCI_BASE_ADDRESS_SPACE_MEMORY |
7248 PCI_BASE_ADDRESS_MEM_TYPE_64 |
7249 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);
7251 NVME_CAP_SET_CMBS(cap, 1);
7252 stq_le_p(&n->bar.cap, cap);
7254 if (n->params.legacy_cmb) {
7255 nvme_cmb_enable_regs(n);
7256 n->cmb.cmse = true;
7260 static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
7262 uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);
7264 NVME_PMRCAP_SET_RDS(pmrcap, 1);
7265 NVME_PMRCAP_SET_WDS(pmrcap, 1);
7266 NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
7267 /* Turn on bit 1 support */
7268 NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
7269 NVME_PMRCAP_SET_CMSS(pmrcap, 1);
7270 stl_le_p(&n->bar.pmrcap, pmrcap);
7272 pci_register_bar(pci_dev, NVME_PMR_BIR,
7273 PCI_BASE_ADDRESS_SPACE_MEMORY |
7274 PCI_BASE_ADDRESS_MEM_TYPE_64 |
7275 PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);
7277 memory_region_set_enabled(&n->pmr.dev->mr, false);
7280 static uint64_t nvme_bar_size(unsigned total_queues, unsigned total_irqs,
7281 unsigned *msix_table_offset,
7282 unsigned *msix_pba_offset)
7284 uint64_t bar_size, msix_table_size, msix_pba_size;
7286 bar_size = sizeof(NvmeBar) + 2 * total_queues * NVME_DB_SIZE;
7287 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
7289 if (msix_table_offset) {
7290 *msix_table_offset = bar_size;
7293 msix_table_size = PCI_MSIX_ENTRY_SIZE * total_irqs;
7294 bar_size += msix_table_size;
7295 bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
7297 if (msix_pba_offset) {
7298 *msix_pba_offset = bar_size;
7301 msix_pba_size = QEMU_ALIGN_UP(total_irqs, 64) / 8;
7302 bar_size += msix_pba_size;
7304 bar_size = pow2ceil(bar_size);
7305 return bar_size;
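    /*
     * Rough worked example, assuming sizeof(NvmeBar) is 4 KiB and
     * NVME_DB_SIZE is 4: for the default PF configuration (max_ioqpairs=64,
     * msix_qsize=65) the caller passes total_queues = 65 and
     * total_irqs = 65, so the register block plus doorbells is
     * 4096 + 65 * 2 * 4 = 4616 bytes, rounded up to 8 KiB where the MSI-X
     * table (65 * 16 = 1040 bytes) starts, then to 12 KiB for the PBA
     * (ALIGN(65, 64) / 8 = 16 bytes), and pow2ceil() finally yields a
     * 16 KiB BAR.
     */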
7308 static void nvme_init_sriov(NvmeCtrl *n, PCIDevice *pci_dev, uint16_t offset)
7310 uint16_t vf_dev_id = n->params.use_intel_id ?
7311 PCI_DEVICE_ID_INTEL_NVME : PCI_DEVICE_ID_REDHAT_NVME;
7312 NvmePriCtrlCap *cap = &n->pri_ctrl_cap;
7313 uint64_t bar_size = nvme_bar_size(le16_to_cpu(cap->vqfrsm),
7314 le16_to_cpu(cap->vifrsm),
7315 NULL, NULL);
7317 pcie_sriov_pf_init(pci_dev, offset, "nvme", vf_dev_id,
7318 n->params.sriov_max_vfs, n->params.sriov_max_vfs,
7319 NVME_VF_OFFSET, NVME_VF_STRIDE);
7321 pcie_sriov_pf_init_vf_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
7322 PCI_BASE_ADDRESS_MEM_TYPE_64, bar_size);
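    /*
     * The VF BAR0 is sized from the per-VF resource maxima (VQFRSM and
     * VIFRSM) rather than from the PF parameters, so whatever secondary
     * controller resources the PF later assigns are guaranteed to fit in
     * the VF's doorbell and MSI-X regions.
     */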
7325 static int nvme_add_pm_capability(PCIDevice *pci_dev, uint8_t offset)
7327 Error *err = NULL;
7328 int ret;
7330 ret = pci_add_capability(pci_dev, PCI_CAP_ID_PM, offset,
7331 PCI_PM_SIZEOF, &err);
7332 if (err) {
7333 error_report_err(err);
7334 return ret;
7337 pci_set_word(pci_dev->config + offset + PCI_PM_PMC,
7338 PCI_PM_CAP_VER_1_2);
7339 pci_set_word(pci_dev->config + offset + PCI_PM_CTRL,
7340 PCI_PM_CTRL_NO_SOFT_RESET);
7341 pci_set_word(pci_dev->wmask + offset + PCI_PM_CTRL,
7342 PCI_PM_CTRL_STATE_MASK);
7344 return 0;
7347 static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
7349 uint8_t *pci_conf = pci_dev->config;
7350 uint64_t bar_size;
7351 unsigned msix_table_offset, msix_pba_offset;
7352 int ret;
7354 Error *err = NULL;
7356 pci_conf[PCI_INTERRUPT_PIN] = 1;
7357 pci_config_set_prog_interface(pci_conf, 0x2);
7359 if (n->params.use_intel_id) {
7360 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
7361 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_INTEL_NVME);
7362 } else {
7363 pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
7364 pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
7367 pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
7368 nvme_add_pm_capability(pci_dev, 0x60);
7369 pcie_endpoint_cap_init(pci_dev, 0x80);
7370 pcie_cap_flr_init(pci_dev);
7371 if (n->params.sriov_max_vfs) {
7372 pcie_ari_init(pci_dev, 0x100, 1);
7375 /* add one to max_ioqpairs to account for the admin queue pair */
7376 bar_size = nvme_bar_size(n->params.max_ioqpairs + 1, n->params.msix_qsize,
7377 &msix_table_offset, &msix_pba_offset);
7379 memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
7380 memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
7381 msix_table_offset);
7382 memory_region_add_subregion(&n->bar0, 0, &n->iomem);
7384 if (pci_is_vf(pci_dev)) {
7385 pcie_sriov_vf_register_bar(pci_dev, 0, &n->bar0);
7386 } else {
7387 pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
7388 PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
7390 ret = msix_init(pci_dev, n->params.msix_qsize,
7391 &n->bar0, 0, msix_table_offset,
7392 &n->bar0, 0, msix_pba_offset, 0, &err);
7393 if (ret < 0) {
7394 if (ret == -ENOTSUP) {
7395 warn_report_err(err);
7396 } else {
7397 error_propagate(errp, err);
7398 return ret;
7402 nvme_update_msixcap_ts(pci_dev, n->conf_msix_qsize);
7404 if (n->params.cmb_size_mb) {
7405 nvme_init_cmb(n, pci_dev);
7408 if (n->pmr.dev) {
7409 nvme_init_pmr(n, pci_dev);
7412 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
7413 nvme_init_sriov(n, pci_dev, 0x120);
7416 return 0;
7419 static void nvme_init_subnqn(NvmeCtrl *n)
7421 NvmeSubsystem *subsys = n->subsys;
7422 NvmeIdCtrl *id = &n->id_ctrl;
7424 if (!subsys) {
7425 snprintf((char *)id->subnqn, sizeof(id->subnqn),
7426 "nqn.2019-08.org.qemu:%s", n->params.serial);
7427 } else {
7428 pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char*)subsys->subnqn);
7432 static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
7434 NvmeIdCtrl *id = &n->id_ctrl;
7435 uint8_t *pci_conf = pci_dev->config;
7436 uint64_t cap = ldq_le_p(&n->bar.cap);
7437 NvmeSecCtrlEntry *sctrl = nvme_sctrl(n);
7439 id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
7440 id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
7441 strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
7442 strpadcpy((char *)id->fr, sizeof(id->fr), QEMU_VERSION, ' ');
7443 strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');
7445 id->cntlid = cpu_to_le16(n->cntlid);
7447 id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
7448 id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);
7450 id->rab = 6;
7452 if (n->params.use_intel_id) {
7453 id->ieee[0] = 0xb3;
7454 id->ieee[1] = 0x02;
7455 id->ieee[2] = 0x00;
7456 } else {
7457 id->ieee[0] = 0x00;
7458 id->ieee[1] = 0x54;
7459 id->ieee[2] = 0x52;
7462 id->mdts = n->params.mdts;
7463 id->ver = cpu_to_le32(NVME_SPEC_VER);
7464 id->oacs =
7465 cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
7466 id->cntrltype = 0x1;
7469 * Because the controller always completes the Abort command immediately,
7470 * there can never be more than one concurrently executing Abort command,
7471 * so this value is never used for anything. Note that there can easily be
7472 * many Abort commands in the queues, but they are not considered
7473 * "executing" until processed by nvme_abort.
7475 * The specification recommends a value of 3 for Abort Command Limit (four
     * concurrently outstanding Abort commands), so let's use that though it is
7477 * inconsequential.
7479 id->acl = 3;
7480 id->aerl = n->params.aerl;
7481 id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
7482 id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;
7484 /* recommended default value (~70 C) */
7485 id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
7486 id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
7488 id->sqes = (0x6 << 4) | 0x6;
7489 id->cqes = (0x4 << 4) | 0x4;
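    /*
     * SQES/CQES hold the maximum (upper nibble) and required (lower nibble)
     * entry sizes as powers of two: 6h means 64-byte submission queue
     * entries, 4h means 16-byte completion queue entries.
     */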
7490 id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
7491 id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
7492 NVME_ONCS_FEATURES | NVME_ONCS_DSM |
7493 NVME_ONCS_COMPARE | NVME_ONCS_COPY);
7496 * NOTE: If this device ever supports a command set that does NOT use 0x0
7497 * as a Flush-equivalent operation, support for the broadcast NSID in Flush
7498 * should probably be removed.
7500 * See comment in nvme_io_cmd.
7502 id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
7504 id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1);
7505 id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN);
7507 nvme_init_subnqn(n);
7509 id->psd[0].mp = cpu_to_le16(0x9c4);
7510 id->psd[0].enlat = cpu_to_le32(0x10);
7511 id->psd[0].exlat = cpu_to_le32(0x4);
7513 if (n->subsys) {
7514 id->cmic |= NVME_CMIC_MULTI_CTRL;
7517 NVME_CAP_SET_MQES(cap, 0x7ff);
7518 NVME_CAP_SET_CQR(cap, 1);
7519 NVME_CAP_SET_TO(cap, 0xf);
7520 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
7521 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
7522 NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
7523 NVME_CAP_SET_MPSMAX(cap, 4);
7524 NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
7525 NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
7526 stq_le_p(&n->bar.cap, cap);
7528 stl_le_p(&n->bar.vs, NVME_SPEC_VER);
7529 n->bar.intmc = n->bar.intms = 0;
7531 if (pci_is_vf(&n->parent_obj) && !sctrl->scs) {
7532 stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
7536 static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
7538 int cntlid;
7540 if (!n->subsys) {
7541 return 0;
7544 cntlid = nvme_subsys_register_ctrl(n, errp);
7545 if (cntlid < 0) {
7546 return -1;
7549 n->cntlid = cntlid;
7551 return 0;
7554 void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
7556 uint32_t nsid = ns->params.nsid;
7557 assert(nsid && nsid <= NVME_MAX_NAMESPACES);
7559 n->namespaces[nsid] = ns;
7560 ns->attached++;
7562 n->dmrsl = MIN_NON_ZERO(n->dmrsl,
7563 BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
7566 static void nvme_realize(PCIDevice *pci_dev, Error **errp)
7568 NvmeCtrl *n = NVME(pci_dev);
7569 NvmeNamespace *ns;
7570 Error *local_err = NULL;
7571 NvmeCtrl *pn = NVME(pcie_sriov_get_pf(pci_dev));
7573 if (pci_is_vf(pci_dev)) {
        /*
         * VFs derive settings from the parent. The PF's lifespan exceeds
         * that of its VFs, so it is safe to share params.serial.
         */
7578 memcpy(&n->params, &pn->params, sizeof(NvmeParams));
7579 n->subsys = pn->subsys;
7582 nvme_check_constraints(n, &local_err);
7583 if (local_err) {
7584 error_propagate(errp, local_err);
7585 return;
7588 qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
7589 &pci_dev->qdev, n->parent_obj.qdev.id);
7591 if (nvme_init_subsys(n, errp)) {
7592 error_propagate(errp, local_err);
7593 return;
7595 nvme_init_state(n);
7596 if (nvme_init_pci(n, pci_dev, errp)) {
7597 return;
7599 nvme_init_ctrl(n, pci_dev);
    /* set up a namespace if the controller drive property was given */
7602 if (n->namespace.blkconf.blk) {
7603 ns = &n->namespace;
7604 ns->params.nsid = 1;
7606 if (nvme_ns_setup(ns, errp)) {
7607 return;
7610 nvme_attach_ns(n, ns);
7614 static void nvme_exit(PCIDevice *pci_dev)
7616 NvmeCtrl *n = NVME(pci_dev);
7617 NvmeNamespace *ns;
7618 int i;
7620 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
7622 if (n->subsys) {
7623 for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
7624 ns = nvme_ns(n, i);
7625 if (ns) {
7626 ns->attached--;
7630 nvme_subsys_unregister_ctrl(n->subsys, n);
7633 g_free(n->cq);
7634 g_free(n->sq);
7635 g_free(n->aer_reqs);
7637 if (n->params.cmb_size_mb) {
7638 g_free(n->cmb.buf);
7641 if (n->pmr.dev) {
7642 host_memory_backend_set_mapped(n->pmr.dev, false);
7645 if (!pci_is_vf(pci_dev) && n->params.sriov_max_vfs) {
7646 pcie_sriov_pf_exit(pci_dev);
7649 msix_uninit(pci_dev, &n->bar0, &n->bar0);
7650 memory_region_del_subregion(&n->bar0, &n->iomem);
7653 static Property nvme_props[] = {
7654 DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
7655 DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
7656 HostMemoryBackend *),
7657 DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
7658 NvmeSubsystem *),
7659 DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
7660 DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
7661 DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
7662 DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
7663 DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
7664 DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
7665 DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
7666 DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
7667 DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
7668 DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
7669 DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
7670 DEFINE_PROP_BOOL("ioeventfd", NvmeCtrl, params.ioeventfd, false),
7671 DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
7672 DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
7673 params.auto_transition_zones, true),
7674 DEFINE_PROP_UINT8("sriov_max_vfs", NvmeCtrl, params.sriov_max_vfs, 0),
7675 DEFINE_PROP_UINT16("sriov_vq_flexible", NvmeCtrl,
7676 params.sriov_vq_flexible, 0),
7677 DEFINE_PROP_UINT16("sriov_vi_flexible", NvmeCtrl,
7678 params.sriov_vi_flexible, 0),
7679 DEFINE_PROP_UINT8("sriov_max_vi_per_vf", NvmeCtrl,
7680 params.sriov_max_vi_per_vf, 0),
7681 DEFINE_PROP_UINT8("sriov_max_vq_per_vf", NvmeCtrl,
7682 params.sriov_max_vq_per_vf, 0),
7683 DEFINE_PROP_END_OF_LIST(),
7686 static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
7687 void *opaque, Error **errp)
7689 NvmeCtrl *n = NVME(obj);
7690 uint8_t value = n->smart_critical_warning;
7692 visit_type_uint8(v, name, &value, errp);
7695 static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
7696 void *opaque, Error **errp)
7698 NvmeCtrl *n = NVME(obj);
7699 uint8_t value, old_value, cap = 0, index, event;
7701 if (!visit_type_uint8(v, name, &value, errp)) {
7702 return;
7705 cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
7706 | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
7707 if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
7708 cap |= NVME_SMART_PMR_UNRELIABLE;
7711 if ((value & cap) != value) {
7712 error_setg(errp, "unsupported smart critical warning bits: 0x%x",
7713 value & ~cap);
7714 return;
7717 old_value = n->smart_critical_warning;
7718 n->smart_critical_warning = value;
7720 /* only inject new bits of smart critical warning */
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        event = 1 << index;
        if (value & ~old_value & event) {
            nvme_smart_event(n, event);
        }
    }
}
7728 static void nvme_pci_reset(DeviceState *qdev)
7730 PCIDevice *pci_dev = PCI_DEVICE(qdev);
7731 NvmeCtrl *n = NVME(pci_dev);
7733 trace_pci_nvme_pci_reset();
7734 nvme_ctrl_reset(n, NVME_RESET_FUNCTION);
7737 static void nvme_sriov_pre_write_ctrl(PCIDevice *dev, uint32_t address,
7738 uint32_t val, int len)
7740 NvmeCtrl *n = NVME(dev);
7741 NvmeSecCtrlEntry *sctrl;
7742 uint16_t sriov_cap = dev->exp.sriov_cap;
7743 uint32_t off = address - sriov_cap;
7744 int i, num_vfs;
7746 if (!sriov_cap) {
7747 return;
7750 if (range_covers_byte(off, len, PCI_SRIOV_CTRL)) {
7751 if (!(val & PCI_SRIOV_CTRL_VFE)) {
7752 num_vfs = pci_get_word(dev->config + sriov_cap + PCI_SRIOV_NUM_VF);
7753 for (i = 0; i < num_vfs; i++) {
7754 sctrl = &n->sec_ctrl_list.sec[i];
7755 nvme_virt_set_state(n, le16_to_cpu(sctrl->scid), false);
7761 static void nvme_pci_write_config(PCIDevice *dev, uint32_t address,
7762 uint32_t val, int len)
7764 nvme_sriov_pre_write_ctrl(dev, address, val, len);
7765 pci_default_write_config(dev, address, val, len);
7766 pcie_cap_flr_write_config(dev, address, val, len);
7769 static const VMStateDescription nvme_vmstate = {
7770 .name = "nvme",
7771 .unmigratable = 1,
7774 static void nvme_class_init(ObjectClass *oc, void *data)
7776 DeviceClass *dc = DEVICE_CLASS(oc);
7777 PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);
7779 pc->realize = nvme_realize;
7780 pc->config_write = nvme_pci_write_config;
7781 pc->exit = nvme_exit;
7782 pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
7783 pc->revision = 2;
7785 set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
7786 dc->desc = "Non-Volatile Memory Express";
7787 device_class_set_props(dc, nvme_props);
7788 dc->vmsd = &nvme_vmstate;
7789 dc->reset = nvme_pci_reset;
7792 static void nvme_instance_init(Object *obj)
7794 NvmeCtrl *n = NVME(obj);
7796 device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
7797 "bootindex", "/namespace@1,0",
7798 DEVICE(obj));
7800 object_property_add(obj, "smart_critical_warning", "uint8",
7801 nvme_get_smart_warning,
7802 nvme_set_smart_warning, NULL, NULL);
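    /*
     * smart_critical_warning can also be changed at run time; for example
     * (the QOM path is illustrative), via QMP:
     *
     *   { "execute": "qom-set",
     *     "arguments": { "path": "/machine/peripheral/nvme0",
     *                    "property": "smart_critical_warning",
     *                    "value": 16 } }
     *
     * Newly set bits are injected into the guest as a SMART asynchronous
     * event by nvme_set_smart_warning() above.
     */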
7805 static const TypeInfo nvme_info = {
7806 .name = TYPE_NVME,
7807 .parent = TYPE_PCI_DEVICE,
7808 .instance_size = sizeof(NvmeCtrl),
7809 .instance_init = nvme_instance_init,
7810 .class_init = nvme_class_init,
7811 .interfaces = (InterfaceInfo[]) {
7812 { INTERFACE_PCIE_DEVICE },
7817 static const TypeInfo nvme_bus_info = {
7818 .name = TYPE_NVME_BUS,
7819 .parent = TYPE_BUS,
7820 .instance_size = sizeof(NvmeBus),
7823 static void nvme_register_types(void)
7825 type_register_static(&nvme_info);
7826 type_register_static(&nvme_bus_info);
7829 type_init(nvme_register_types)