/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.4, 1.3, 1.2, 1.1, 1.0e
 *
 *   https://nvmexpress.org/developers/nvme-specification/
 *
 *
 * Notes on coding style
 * ---------------------
 * While QEMU coding style prefers lowercase hexadecimals in constants, the
 * NVMe subsystem uses the format from the NVMe specifications in the comments
 * (i.e. 'h' suffix instead of '0x' prefix).
 *
 * Usage
 * -----
 * See docs/system/nvme.rst for extensive documentation.
 *
 * Add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme-subsys,id=<subsys_id>,nqn=<nqn_id>
 *      -device nvme,serial=<serial>,id=<bus_name>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>, \
 *              [pmrdev=<mem_backend_file_id>,] \
 *              max_ioqpairs=<N[optional]>, \
 *              aerl=<N[optional]>,aer_max_queued=<N[optional]>, \
 *              mdts=<N[optional]>,vsl=<N[optional]>, \
 *              zoned.zasl=<N[optional]>, \
 *              zoned.auto_transition=<on|off[optional]>, \
 *              subsys=<subsys_id>
 *      -device nvme-ns,drive=<drive_id>,bus=<bus_name>,nsid=<nsid>, \
 *              zoned=<true|false[optional]>, \
 *              subsys=<subsys_id>,detached=<true|false[optional]>
 *
 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
 * offset 0 in BAR2 and supports only WDS, RDS and SQS for now. By default, the
 * device will use the "v1.4 CMB scheme" - use the `legacy-cmb` parameter to
 * always enable the CMBLOC and CMBSZ registers (v1.3 behavior).
 *
 * Enabling pmr emulation can be achieved by pointing to memory-backend-file.
 * For example:
 *
 *      -object memory-backend-file,id=<mem_id>,share=on,mem-path=<file_path>, \
 *       size=<size> .... -device nvme,...,pmrdev=<mem_id>
 *
 * The PMR will use BAR 4/5 exclusively.
 *
 * To place controller(s) and namespace(s) into a subsystem, provide an
 * nvme-subsys device as above.
 *
 * nvme subsystem device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `nqn`
 *   This parameter provides the `<nqn_id>` part of the string
 *   `nqn.2019-08.org.qemu:<nqn_id>` which will be reported in the SUBNQN field
 *   of subsystem controllers. Note that `<nqn_id>` should be unique per
 *   subsystem, but this is not enforced by QEMU. If not specified, it will
 *   default to the value of the `id` parameter (`<subsys_id>`).
 *
 * nvme device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~
 * - `subsys`
 *   Specifying this parameter attaches the controller to the subsystem and
 *   the SUBNQN field in the controller will report the NQN of the subsystem
 *   device. This also enables the multi controller capability represented in
 *   the Identify Controller data structure in the CMIC (Controller Multi-path
 *   I/O and Namespace Sharing Capabilities) field.
 *
 * - `aerl`
 *   The Asynchronous Event Request Limit (AERL). Indicates the maximum number
 *   of concurrently outstanding Asynchronous Event Request commands supported
 *   by the controller. This is a 0's based value.
 *
 * - `aer_max_queued`
 *   This is the maximum number of events that the device will enqueue for
 *   completion when there are no outstanding AERs. When the maximum number of
 *   enqueued events are reached, subsequent events will be dropped.
 *
 * - `mdts`
 *   Indicates the maximum data transfer size for a command that transfers data
 *   between host-accessible memory and the controller. The value is specified
 *   as a power of two (2^n) and is in units of the minimum memory page size
 *   (CAP.MPSMIN). The default value is 7 (i.e. 512 KiB).
 *
 * - `vsl`
 *   Indicates the maximum data size limit for the Verify command. Like `mdts`,
 *   this value is specified as a power of two (2^n) and is in units of the
 *   minimum memory page size (CAP.MPSMIN). The default value is 7 (i.e.
 *   512 KiB).
 *
 * - `zoned.zasl`
 *   Indicates the maximum data transfer size for the Zone Append command. Like
 *   `mdts`, the value is specified as a power of two (2^n) and is in units of
 *   the minimum memory page size (CAP.MPSMIN). The default value is 0 (i.e.
 *   defaulting to the value of `mdts`).
 *
 * - `zoned.auto_transition`
 *   Indicates if zones in zone state implicitly opened can be automatically
 *   transitioned to zone state closed for resource management purposes.
 *
 * nvme namespace device parameters
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 * - `shared`
 *   When the parent nvme device (as defined explicitly by the 'bus' parameter
 *   or implicitly by the most recently defined NvmeBus) is linked to an
 *   nvme-subsys device, the namespace will be attached to all controllers in
 *   the subsystem. If set to 'off' (the default), the namespace will remain a
 *   private namespace and may only be attached to a single controller at a
 *   time.
 *
 * - `detached`
 *   This parameter is only valid together with the `subsys` parameter. If left
 *   at the default value (`false/off`), the namespace will be attached to all
 *   controllers in the NVMe subsystem at boot-up. If set to `true/on`, the
 *   namespace will be available in the subsystem but not attached to any
 *   controllers.
 *
 * Setting `zoned` to true selects the Zoned Command Set at the namespace.
 * In this case, the following namespace properties are available to configure
 * zoned operation:
 *     zoned.zone_size=<zone size in bytes, default: 128MiB>
 *         The number may be followed by K, M, G as in kilo-, mega- or giga-.
 *
 *     zoned.zone_capacity=<zone capacity in bytes, default: zone size>
 *         The value 0 (default) forces zone capacity to be the same as zone
 *         size. The value of this property may not exceed zone size.
 *
 *     zoned.descr_ext_size=<zone descriptor extension size, default 0>
 *         This value needs to be specified in 64B units. If it is zero,
 *         namespace(s) will not support zone descriptor extensions.
 *
 *     zoned.max_active=<Maximum Active Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently active zones.
 *
 *     zoned.max_open=<Maximum Open Resources (zones), default: 0>
 *         The default value means there is no limit to the number of
 *         concurrently open zones.
 *
 *     zoned.cross_read=<enable RAZB, default: false>
 *         Setting this property to true enables Read Across Zone Boundaries.
 */
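/*
 * Example (illustrative only): a zoned namespace attached to a controller in
 * a subsystem, using the parameters documented above. The ids and sizes below
 * (nvm, subsys0, nvme0, 64M, ...) are arbitrary placeholders, not defaults.
 *
 *      -drive file=zns.img,if=none,id=nvm
 *      -device nvme-subsys,id=subsys0,nqn=subsys0
 *      -device nvme,serial=deadbeef,id=nvme0,subsys=subsys0
 *      -device nvme-ns,drive=nvm,bus=nvme0,nsid=1,zoned=true, \
 *              zoned.zone_size=64M,zoned.max_open=16,zoned.max_active=32
 *
 * See docs/system/nvme.rst for the authoritative examples.
 */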
#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "qemu/error-report.h"
#include "qemu/log.h"
#include "qemu/units.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/sysemu.h"
#include "sysemu/block-backend.h"
#include "sysemu/hostmem.h"
#include "hw/pci/msix.h"
#include "migration/vmstate.h"

#include "nvme.h"
#include "dif.h"
#include "trace.h"
#define NVME_MAX_IOQPAIRS 0xffff
#define NVME_DB_SIZE 4
#define NVME_SPEC_VER 0x00010400
#define NVME_CMB_BIR 2
#define NVME_PMR_BIR 4
#define NVME_TEMPERATURE 0x143
#define NVME_TEMPERATURE_WARNING 0x157
#define NVME_TEMPERATURE_CRITICAL 0x175
#define NVME_NUM_FW_SLOTS 1
#define NVME_DEFAULT_MAX_ZA_SIZE (128 * KiB)
#define NVME_GUEST_ERR(trace, fmt, ...) \
    do { \
        (trace_##trace)(__VA_ARGS__); \
        qemu_log_mask(LOG_GUEST_ERROR, #trace \
                      " in %s: " fmt "\n", __func__, ## __VA_ARGS__); \
    } while (0)
static const bool nvme_feature_support[NVME_FID_MAX] = {
    [NVME_ARBITRATION]              = true,
    [NVME_POWER_MANAGEMENT]         = true,
    [NVME_TEMPERATURE_THRESHOLD]    = true,
    [NVME_ERROR_RECOVERY]           = true,
    [NVME_VOLATILE_WRITE_CACHE]     = true,
    [NVME_NUMBER_OF_QUEUES]         = true,
    [NVME_INTERRUPT_COALESCING]     = true,
    [NVME_INTERRUPT_VECTOR_CONF]    = true,
    [NVME_WRITE_ATOMICITY]          = true,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = true,
    [NVME_TIMESTAMP]                = true,
    [NVME_HOST_BEHAVIOR_SUPPORT]    = true,
    [NVME_COMMAND_SET_PROFILE]      = true,
};
static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
    [NVME_TEMPERATURE_THRESHOLD]    = NVME_FEAT_CAP_CHANGE,
    [NVME_ERROR_RECOVERY]           = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
    [NVME_VOLATILE_WRITE_CACHE]     = NVME_FEAT_CAP_CHANGE,
    [NVME_NUMBER_OF_QUEUES]         = NVME_FEAT_CAP_CHANGE,
    [NVME_ASYNCHRONOUS_EVENT_CONF]  = NVME_FEAT_CAP_CHANGE,
    [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
    [NVME_HOST_BEHAVIOR_SUPPORT]    = NVME_FEAT_CAP_CHANGE,
    [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
};
static const uint32_t nvme_cse_acs[256] = {
    [NVME_ADM_CMD_DELETE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_SQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_LOG_PAGE]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_DELETE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_CREATE_CQ]        = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_IDENTIFY]         = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ABORT]            = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_SET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_GET_FEATURES]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_ASYNC_EV_REQ]     = NVME_CMD_EFF_CSUPP,
    [NVME_ADM_CMD_NS_ATTACHMENT]    = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_NIC,
    [NVME_ADM_CMD_FORMAT_NVM]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
};
static const uint32_t nvme_cse_iocs_none[256];

static const uint32_t nvme_cse_iocs_nvm[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
};
static const uint32_t nvme_cse_iocs_zoned[256] = {
    [NVME_CMD_FLUSH]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE_ZEROES]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_WRITE]                = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_READ]                 = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_DSM]                  = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
    [NVME_CMD_ZONE_APPEND]          = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_SEND]       = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
    [NVME_CMD_ZONE_MGMT_RECV]       = NVME_CMD_EFF_CSUPP,
};
static void nvme_process_sq(void *opaque);

static uint16_t nvme_sqid(NvmeRequest *req)
{
    return le16_to_cpu(req->sq->sqid);
}
static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                   NvmeZoneState state)
{
    if (QTAILQ_IN_USE(zone, entry)) {
        switch (nvme_get_zone_state(zone)) {
        case NVME_ZONE_STATE_EXPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->exp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_IMPLICITLY_OPEN:
            QTAILQ_REMOVE(&ns->imp_open_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_CLOSED:
            QTAILQ_REMOVE(&ns->closed_zones, zone, entry);
            break;
        case NVME_ZONE_STATE_FULL:
            QTAILQ_REMOVE(&ns->full_zones, zone, entry);
        default:
            ;
        }
    }

    nvme_set_zone_state(zone, state);

    switch (state) {
    case NVME_ZONE_STATE_EXPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->exp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_IMPLICITLY_OPEN:
        QTAILQ_INSERT_TAIL(&ns->imp_open_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_CLOSED:
        QTAILQ_INSERT_TAIL(&ns->closed_zones, zone, entry);
        break;
    case NVME_ZONE_STATE_FULL:
        QTAILQ_INSERT_TAIL(&ns->full_zones, zone, entry);
    case NVME_ZONE_STATE_READ_ONLY:
        break;
    default:
        zone->d.za = 0;
    }
}
static uint16_t nvme_zns_check_resources(NvmeNamespace *ns, uint32_t act,
                                         uint32_t opn, uint32_t zrwa)
{
    if (ns->params.max_active_zones != 0 &&
        ns->nr_active_zones + act > ns->params.max_active_zones) {
        trace_pci_nvme_err_insuff_active_res(ns->params.max_active_zones);
        return NVME_ZONE_TOO_MANY_ACTIVE | NVME_DNR;
    }

    if (ns->params.max_open_zones != 0 &&
        ns->nr_open_zones + opn > ns->params.max_open_zones) {
        trace_pci_nvme_err_insuff_open_res(ns->params.max_open_zones);
        return NVME_ZONE_TOO_MANY_OPEN | NVME_DNR;
    }

    if (zrwa > ns->zns.numzrwa) {
        return NVME_NOZRWA | NVME_DNR;
    }

    return NVME_SUCCESS;
}

/*
 * Check if we can open a zone without exceeding open/active limits.
 * AOR stands for "Active and Open Resources" (see TP 4053 section 2.5).
 */
static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
{
    return nvme_zns_check_resources(ns, act, opn, 0);
}
static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    if (!n->cmb.cmse) {
        return false;
    }

    lo = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    hi = lo + int128_get64(n->cmb.mem.size);

    return addr >= lo && addr < hi;
}

static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
    hwaddr base = n->params.legacy_cmb ? n->cmb.mem.addr : n->cmb.cba;
    return &n->cmb.buf[addr - base];
}

static bool nvme_addr_is_pmr(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi;

    if (!n->pmr.cmse) {
        return false;
    }

    hi = n->pmr.cba + int128_get64(n->pmr.dev->mr.size);

    return addr >= n->pmr.cba && addr < hi;
}

static inline void *nvme_addr_to_pmr(NvmeCtrl *n, hwaddr addr)
{
    return memory_region_get_ram_ptr(&n->pmr.dev->mr) + (addr - n->pmr.cba);
}

static inline bool nvme_addr_is_iomem(NvmeCtrl *n, hwaddr addr)
{
    hwaddr hi, lo;

    /*
     * The purpose of this check is to guard against invalid "local" access to
     * the iomem (i.e. controller registers). Thus, we check against the range
     * covered by the 'bar0' MemoryRegion since that is currently composed of
     * two subregions (the NVMe "MBAR" and the MSI-X table/pba). Note, however,
     * that if the device model is ever changed to allow the CMB to be located
     * in BAR0 as well, then this must be changed.
     */
    lo = n->bar0.addr;
    hi = lo + int128_get64(n->bar0.size);

    return addr >= lo && addr < hi;
}
static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(buf, nvme_addr_to_cmb(n, addr), size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(buf, nvme_addr_to_pmr(n, addr), size);
        return 0;
    }

    return pci_dma_read(&n->parent_obj, addr, buf, size);
}

static int nvme_addr_write(NvmeCtrl *n, hwaddr addr, const void *buf, int size)
{
    hwaddr hi = addr + size - 1;
    if (hi < addr) {
        return 1;
    }

    if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
        memcpy(nvme_addr_to_cmb(n, addr), buf, size);
        return 0;
    }

    if (nvme_addr_is_pmr(n, addr) && nvme_addr_is_pmr(n, hi)) {
        memcpy(nvme_addr_to_pmr(n, addr), buf, size);
        return 0;
    }

    return pci_dma_write(&n->parent_obj, addr, buf, size);
}
static bool nvme_nsid_valid(NvmeCtrl *n, uint32_t nsid)
{
    return nsid &&
        (nsid == NVME_NSID_BROADCAST || nsid <= NVME_MAX_NAMESPACES);
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}
static void nvme_irq_check(NvmeCtrl *n)
{
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(&(n->parent_obj))) {
        return;
    }
    if (~intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            trace_pci_nvme_irq_msix(cq->vector);
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            trace_pci_nvme_irq_pin();
            assert(cq->vector < 32);
            n->irq_status |= 1 << cq->vector;
            nvme_irq_check(n);
        }
    } else {
        trace_pci_nvme_irq_masked();
    }
}

static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            return;
        } else {
            assert(cq->vector < 32);
            if (!n->cq_pending) {
                n->irq_status &= ~(1 << cq->vector);
            }
            nvme_irq_check(n);
        }
    }
}
static void nvme_req_clear(NvmeRequest *req)
{
    req->ns = NULL;
    req->opaque = NULL;
    req->aiocb = NULL;
    memset(&req->cqe, 0x0, sizeof(req->cqe));
    req->status = NVME_SUCCESS;
}

static inline void nvme_sg_init(NvmeCtrl *n, NvmeSg *sg, bool dma)
{
    if (dma) {
        pci_dma_sglist_init(&sg->qsg, &n->parent_obj, 0);
        sg->flags = NVME_SG_DMA;
    } else {
        qemu_iovec_init(&sg->iov, 0);
    }

    sg->flags |= NVME_SG_ALLOC;
}

static inline void nvme_sg_unmap(NvmeSg *sg)
{
    if (!(sg->flags & NVME_SG_ALLOC)) {
        return;
    }

    if (sg->flags & NVME_SG_DMA) {
        qemu_sglist_destroy(&sg->qsg);
    } else {
        qemu_iovec_destroy(&sg->iov);
    }

    memset(sg, 0x0, sizeof(*sg));
}
/*
 * When metadata is transferred as extended LBAs, the DPTR mapped into `sg`
 * holds both data and metadata. This function splits the data and metadata
 * into two separate QSG/IOVs.
 */
559 static void nvme_sg_split(NvmeSg
*sg
, NvmeNamespace
*ns
, NvmeSg
*data
,
563 uint32_t trans_len
, count
= ns
->lbasz
;
565 bool dma
= sg
->flags
& NVME_SG_DMA
;
567 size_t sg_len
= dma
? sg
->qsg
.size
: sg
->iov
.size
;
570 assert(sg
->flags
& NVME_SG_ALLOC
);
573 sge_len
= dma
? sg
->qsg
.sg
[sg_idx
].len
: sg
->iov
.iov
[sg_idx
].iov_len
;
575 trans_len
= MIN(sg_len
, count
);
576 trans_len
= MIN(trans_len
, sge_len
- offset
);
580 qemu_sglist_add(&dst
->qsg
, sg
->qsg
.sg
[sg_idx
].base
+ offset
,
583 qemu_iovec_add(&dst
->iov
,
584 sg
->iov
.iov
[sg_idx
].iov_base
+ offset
,
594 dst
= (dst
== data
) ? mdata
: data
;
595 count
= (dst
== data
) ? ns
->lbasz
: ns
->lbaf
.ms
;
598 if (sge_len
== offset
) {
605 static uint16_t nvme_map_addr_cmb(NvmeCtrl
*n
, QEMUIOVector
*iov
, hwaddr addr
,
612 trace_pci_nvme_map_addr_cmb(addr
, len
);
614 if (!nvme_addr_is_cmb(n
, addr
) || !nvme_addr_is_cmb(n
, addr
+ len
- 1)) {
615 return NVME_DATA_TRAS_ERROR
;
618 qemu_iovec_add(iov
, nvme_addr_to_cmb(n
, addr
), len
);
623 static uint16_t nvme_map_addr_pmr(NvmeCtrl
*n
, QEMUIOVector
*iov
, hwaddr addr
,
630 if (!nvme_addr_is_pmr(n
, addr
) || !nvme_addr_is_pmr(n
, addr
+ len
- 1)) {
631 return NVME_DATA_TRAS_ERROR
;
634 qemu_iovec_add(iov
, nvme_addr_to_pmr(n
, addr
), len
);
639 static uint16_t nvme_map_addr(NvmeCtrl
*n
, NvmeSg
*sg
, hwaddr addr
, size_t len
)
641 bool cmb
= false, pmr
= false;
647 trace_pci_nvme_map_addr(addr
, len
);
649 if (nvme_addr_is_iomem(n
, addr
)) {
650 return NVME_DATA_TRAS_ERROR
;
653 if (nvme_addr_is_cmb(n
, addr
)) {
655 } else if (nvme_addr_is_pmr(n
, addr
)) {
660 if (sg
->flags
& NVME_SG_DMA
) {
661 return NVME_INVALID_USE_OF_CMB
| NVME_DNR
;
664 if (sg
->iov
.niov
+ 1 > IOV_MAX
) {
665 goto max_mappings_exceeded
;
669 return nvme_map_addr_cmb(n
, &sg
->iov
, addr
, len
);
671 return nvme_map_addr_pmr(n
, &sg
->iov
, addr
, len
);
675 if (!(sg
->flags
& NVME_SG_DMA
)) {
676 return NVME_INVALID_USE_OF_CMB
| NVME_DNR
;
679 if (sg
->qsg
.nsg
+ 1 > IOV_MAX
) {
680 goto max_mappings_exceeded
;
683 qemu_sglist_add(&sg
->qsg
, addr
, len
);
687 max_mappings_exceeded
:
688 NVME_GUEST_ERR(pci_nvme_ub_too_many_mappings
,
689 "number of mappings exceed 1024");
690 return NVME_INTERNAL_DEV_ERROR
| NVME_DNR
;
693 static inline bool nvme_addr_is_dma(NvmeCtrl
*n
, hwaddr addr
)
695 return !(nvme_addr_is_cmb(n
, addr
) || nvme_addr_is_pmr(n
, addr
));
698 static uint16_t nvme_map_prp(NvmeCtrl
*n
, NvmeSg
*sg
, uint64_t prp1
,
699 uint64_t prp2
, uint32_t len
)
701 hwaddr trans_len
= n
->page_size
- (prp1
% n
->page_size
);
702 trans_len
= MIN(len
, trans_len
);
703 int num_prps
= (len
>> n
->page_bits
) + 1;
707 trace_pci_nvme_map_prp(trans_len
, len
, prp1
, prp2
, num_prps
);
709 nvme_sg_init(n
, sg
, nvme_addr_is_dma(n
, prp1
));
711 status
= nvme_map_addr(n
, sg
, prp1
, trans_len
);
718 if (len
> n
->page_size
) {
719 uint64_t prp_list
[n
->max_prp_ents
];
720 uint32_t nents
, prp_trans
;
724 * The first PRP list entry, pointed to by PRP2 may contain offset.
725 * Hence, we need to calculate the number of entries in based on
728 nents
= (n
->page_size
- (prp2
& (n
->page_size
- 1))) >> 3;
729 prp_trans
= MIN(n
->max_prp_ents
, nents
) * sizeof(uint64_t);
730 ret
= nvme_addr_read(n
, prp2
, (void *)prp_list
, prp_trans
);
732 trace_pci_nvme_err_addr_read(prp2
);
733 status
= NVME_DATA_TRAS_ERROR
;
737 uint64_t prp_ent
= le64_to_cpu(prp_list
[i
]);
739 if (i
== nents
- 1 && len
> n
->page_size
) {
740 if (unlikely(prp_ent
& (n
->page_size
- 1))) {
741 trace_pci_nvme_err_invalid_prplist_ent(prp_ent
);
742 status
= NVME_INVALID_PRP_OFFSET
| NVME_DNR
;
747 nents
= (len
+ n
->page_size
- 1) >> n
->page_bits
;
748 nents
= MIN(nents
, n
->max_prp_ents
);
749 prp_trans
= nents
* sizeof(uint64_t);
750 ret
= nvme_addr_read(n
, prp_ent
, (void *)prp_list
,
753 trace_pci_nvme_err_addr_read(prp_ent
);
754 status
= NVME_DATA_TRAS_ERROR
;
757 prp_ent
= le64_to_cpu(prp_list
[i
]);
760 if (unlikely(prp_ent
& (n
->page_size
- 1))) {
761 trace_pci_nvme_err_invalid_prplist_ent(prp_ent
);
762 status
= NVME_INVALID_PRP_OFFSET
| NVME_DNR
;
766 trans_len
= MIN(len
, n
->page_size
);
767 status
= nvme_map_addr(n
, sg
, prp_ent
, trans_len
);
776 if (unlikely(prp2
& (n
->page_size
- 1))) {
777 trace_pci_nvme_err_invalid_prp2_align(prp2
);
778 status
= NVME_INVALID_PRP_OFFSET
| NVME_DNR
;
781 status
= nvme_map_addr(n
, sg
, prp2
, len
);
796 * Map 'nsgld' data descriptors from 'segment'. The function will subtract the
797 * number of bytes mapped in len.
799 static uint16_t nvme_map_sgl_data(NvmeCtrl
*n
, NvmeSg
*sg
,
800 NvmeSglDescriptor
*segment
, uint64_t nsgld
,
801 size_t *len
, NvmeCmd
*cmd
)
803 dma_addr_t addr
, trans_len
;
807 for (int i
= 0; i
< nsgld
; i
++) {
808 uint8_t type
= NVME_SGL_TYPE(segment
[i
].type
);
811 case NVME_SGL_DESCR_TYPE_BIT_BUCKET
:
812 if (cmd
->opcode
== NVME_CMD_WRITE
) {
815 case NVME_SGL_DESCR_TYPE_DATA_BLOCK
:
817 case NVME_SGL_DESCR_TYPE_SEGMENT
:
818 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT
:
819 return NVME_INVALID_NUM_SGL_DESCRS
| NVME_DNR
;
821 return NVME_SGL_DESCR_TYPE_INVALID
| NVME_DNR
;
824 dlen
= le32_to_cpu(segment
[i
].len
);
832 * All data has been mapped, but the SGL contains additional
833 * segments and/or descriptors. The controller might accept
834 * ignoring the rest of the SGL.
836 uint32_t sgls
= le32_to_cpu(n
->id_ctrl
.sgls
);
837 if (sgls
& NVME_CTRL_SGLS_EXCESS_LENGTH
) {
841 trace_pci_nvme_err_invalid_sgl_excess_length(dlen
);
842 return NVME_DATA_SGL_LEN_INVALID
| NVME_DNR
;
845 trans_len
= MIN(*len
, dlen
);
847 if (type
== NVME_SGL_DESCR_TYPE_BIT_BUCKET
) {
851 addr
= le64_to_cpu(segment
[i
].addr
);
853 if (UINT64_MAX
- addr
< dlen
) {
854 return NVME_DATA_SGL_LEN_INVALID
| NVME_DNR
;
857 status
= nvme_map_addr(n
, sg
, addr
, trans_len
);
869 static uint16_t nvme_map_sgl(NvmeCtrl
*n
, NvmeSg
*sg
, NvmeSglDescriptor sgl
,
870 size_t len
, NvmeCmd
*cmd
)
873 * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
874 * dynamically allocating a potentially huge SGL. The spec allows the SGL
875 * to be larger (as in number of bytes required to describe the SGL
876 * descriptors and segment chain) than the command transfer size, so it is
877 * not bounded by MDTS.
879 const int SEG_CHUNK_SIZE
= 256;
881 NvmeSglDescriptor segment
[SEG_CHUNK_SIZE
], *sgld
, *last_sgld
;
889 addr
= le64_to_cpu(sgl
.addr
);
891 trace_pci_nvme_map_sgl(NVME_SGL_TYPE(sgl
.type
), len
);
893 nvme_sg_init(n
, sg
, nvme_addr_is_dma(n
, addr
));
896 * If the entire transfer can be described with a single data block it can
897 * be mapped directly.
899 if (NVME_SGL_TYPE(sgl
.type
) == NVME_SGL_DESCR_TYPE_DATA_BLOCK
) {
900 status
= nvme_map_sgl_data(n
, sg
, sgld
, 1, &len
, cmd
);
909 switch (NVME_SGL_TYPE(sgld
->type
)) {
910 case NVME_SGL_DESCR_TYPE_SEGMENT
:
911 case NVME_SGL_DESCR_TYPE_LAST_SEGMENT
:
914 return NVME_INVALID_SGL_SEG_DESCR
| NVME_DNR
;
917 seg_len
= le32_to_cpu(sgld
->len
);
919 /* check the length of the (Last) Segment descriptor */
920 if ((!seg_len
|| seg_len
& 0xf) &&
921 (NVME_SGL_TYPE(sgld
->type
) != NVME_SGL_DESCR_TYPE_BIT_BUCKET
)) {
922 return NVME_INVALID_SGL_SEG_DESCR
| NVME_DNR
;
925 if (UINT64_MAX
- addr
< seg_len
) {
926 return NVME_DATA_SGL_LEN_INVALID
| NVME_DNR
;
929 nsgld
= seg_len
/ sizeof(NvmeSglDescriptor
);
931 while (nsgld
> SEG_CHUNK_SIZE
) {
932 if (nvme_addr_read(n
, addr
, segment
, sizeof(segment
))) {
933 trace_pci_nvme_err_addr_read(addr
);
934 status
= NVME_DATA_TRAS_ERROR
;
938 status
= nvme_map_sgl_data(n
, sg
, segment
, SEG_CHUNK_SIZE
,
944 nsgld
-= SEG_CHUNK_SIZE
;
945 addr
+= SEG_CHUNK_SIZE
* sizeof(NvmeSglDescriptor
);
948 ret
= nvme_addr_read(n
, addr
, segment
, nsgld
*
949 sizeof(NvmeSglDescriptor
));
951 trace_pci_nvme_err_addr_read(addr
);
952 status
= NVME_DATA_TRAS_ERROR
;
956 last_sgld
= &segment
[nsgld
- 1];
959 * If the segment ends with a Data Block or Bit Bucket Descriptor Type,
962 switch (NVME_SGL_TYPE(last_sgld
->type
)) {
963 case NVME_SGL_DESCR_TYPE_DATA_BLOCK
:
964 case NVME_SGL_DESCR_TYPE_BIT_BUCKET
:
965 status
= nvme_map_sgl_data(n
, sg
, segment
, nsgld
, &len
, cmd
);
977 * If the last descriptor was not a Data Block or Bit Bucket, then the
978 * current segment must not be a Last Segment.
980 if (NVME_SGL_TYPE(sgld
->type
) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT
) {
981 status
= NVME_INVALID_SGL_SEG_DESCR
| NVME_DNR
;
986 addr
= le64_to_cpu(sgld
->addr
);
989 * Do not map the last descriptor; it will be a Segment or Last Segment
990 * descriptor and is handled by the next iteration.
992 status
= nvme_map_sgl_data(n
, sg
, segment
, nsgld
- 1, &len
, cmd
);
999 /* if there is any residual left in len, the SGL was too short */
1001 status
= NVME_DATA_SGL_LEN_INVALID
| NVME_DNR
;
1005 return NVME_SUCCESS
;
1012 uint16_t nvme_map_dptr(NvmeCtrl
*n
, NvmeSg
*sg
, size_t len
,
1015 uint64_t prp1
, prp2
;
1017 switch (NVME_CMD_FLAGS_PSDT(cmd
->flags
)) {
1019 prp1
= le64_to_cpu(cmd
->dptr
.prp1
);
1020 prp2
= le64_to_cpu(cmd
->dptr
.prp2
);
1022 return nvme_map_prp(n
, sg
, prp1
, prp2
, len
);
1023 case NVME_PSDT_SGL_MPTR_CONTIGUOUS
:
1024 case NVME_PSDT_SGL_MPTR_SGL
:
1025 return nvme_map_sgl(n
, sg
, cmd
->dptr
.sgl
, len
, cmd
);
1027 return NVME_INVALID_FIELD
;
1031 static uint16_t nvme_map_mptr(NvmeCtrl
*n
, NvmeSg
*sg
, size_t len
,
1034 int psdt
= NVME_CMD_FLAGS_PSDT(cmd
->flags
);
1035 hwaddr mptr
= le64_to_cpu(cmd
->mptr
);
1038 if (psdt
== NVME_PSDT_SGL_MPTR_SGL
) {
1039 NvmeSglDescriptor sgl
;
1041 if (nvme_addr_read(n
, mptr
, &sgl
, sizeof(sgl
))) {
1042 return NVME_DATA_TRAS_ERROR
;
1045 status
= nvme_map_sgl(n
, sg
, sgl
, len
, cmd
);
1046 if (status
&& (status
& 0x7ff) == NVME_DATA_SGL_LEN_INVALID
) {
1047 status
= NVME_MD_SGL_LEN_INVALID
| NVME_DNR
;
1053 nvme_sg_init(n
, sg
, nvme_addr_is_dma(n
, mptr
));
1054 status
= nvme_map_addr(n
, sg
, mptr
, len
);
1062 static uint16_t nvme_map_data(NvmeCtrl
*n
, uint32_t nlb
, NvmeRequest
*req
)
1064 NvmeNamespace
*ns
= req
->ns
;
1065 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
1066 bool pi
= !!NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
);
1067 bool pract
= !!(le16_to_cpu(rw
->control
) & NVME_RW_PRINFO_PRACT
);
1068 size_t len
= nvme_l2b(ns
, nlb
);
1071 if (nvme_ns_ext(ns
) &&
1072 !(pi
&& pract
&& ns
->lbaf
.ms
== nvme_pi_tuple_size(ns
))) {
1075 len
+= nvme_m2b(ns
, nlb
);
1077 status
= nvme_map_dptr(n
, &sg
, len
, &req
->cmd
);
1082 nvme_sg_init(n
, &req
->sg
, sg
.flags
& NVME_SG_DMA
);
1083 nvme_sg_split(&sg
, ns
, &req
->sg
, NULL
);
1086 return NVME_SUCCESS
;
1089 return nvme_map_dptr(n
, &req
->sg
, len
, &req
->cmd
);
1092 static uint16_t nvme_map_mdata(NvmeCtrl
*n
, uint32_t nlb
, NvmeRequest
*req
)
1094 NvmeNamespace
*ns
= req
->ns
;
1095 size_t len
= nvme_m2b(ns
, nlb
);
1098 if (nvme_ns_ext(ns
)) {
1101 len
+= nvme_l2b(ns
, nlb
);
1103 status
= nvme_map_dptr(n
, &sg
, len
, &req
->cmd
);
1108 nvme_sg_init(n
, &req
->sg
, sg
.flags
& NVME_SG_DMA
);
1109 nvme_sg_split(&sg
, ns
, NULL
, &req
->sg
);
1112 return NVME_SUCCESS
;
1115 return nvme_map_mptr(n
, &req
->sg
, len
, &req
->cmd
);
1118 static uint16_t nvme_tx_interleaved(NvmeCtrl
*n
, NvmeSg
*sg
, uint8_t *ptr
,
1119 uint32_t len
, uint32_t bytes
,
1120 int32_t skip_bytes
, int64_t offset
,
1121 NvmeTxDirection dir
)
1124 uint32_t trans_len
, count
= bytes
;
1125 bool dma
= sg
->flags
& NVME_SG_DMA
;
1130 assert(sg
->flags
& NVME_SG_ALLOC
);
1133 sge_len
= dma
? sg
->qsg
.sg
[sg_idx
].len
: sg
->iov
.iov
[sg_idx
].iov_len
;
1135 if (sge_len
- offset
< 0) {
1141 if (sge_len
== offset
) {
1147 trans_len
= MIN(len
, count
);
1148 trans_len
= MIN(trans_len
, sge_len
- offset
);
1151 addr
= sg
->qsg
.sg
[sg_idx
].base
+ offset
;
1153 addr
= (hwaddr
)(uintptr_t)sg
->iov
.iov
[sg_idx
].iov_base
+ offset
;
1156 if (dir
== NVME_TX_DIRECTION_TO_DEVICE
) {
1157 ret
= nvme_addr_read(n
, addr
, ptr
, trans_len
);
1159 ret
= nvme_addr_write(n
, addr
, ptr
, trans_len
);
1163 return NVME_DATA_TRAS_ERROR
;
1169 offset
+= trans_len
;
1173 offset
+= skip_bytes
;
1177 return NVME_SUCCESS
;
1180 static uint16_t nvme_tx(NvmeCtrl
*n
, NvmeSg
*sg
, void *ptr
, uint32_t len
,
1181 NvmeTxDirection dir
)
1183 assert(sg
->flags
& NVME_SG_ALLOC
);
1185 if (sg
->flags
& NVME_SG_DMA
) {
1186 const MemTxAttrs attrs
= MEMTXATTRS_UNSPECIFIED
;
1187 dma_addr_t residual
;
1189 if (dir
== NVME_TX_DIRECTION_TO_DEVICE
) {
1190 dma_buf_write(ptr
, len
, &residual
, &sg
->qsg
, attrs
);
1192 dma_buf_read(ptr
, len
, &residual
, &sg
->qsg
, attrs
);
1195 if (unlikely(residual
)) {
1196 trace_pci_nvme_err_invalid_dma();
1197 return NVME_INVALID_FIELD
| NVME_DNR
;
1202 if (dir
== NVME_TX_DIRECTION_TO_DEVICE
) {
1203 bytes
= qemu_iovec_to_buf(&sg
->iov
, 0, ptr
, len
);
1205 bytes
= qemu_iovec_from_buf(&sg
->iov
, 0, ptr
, len
);
1208 if (unlikely(bytes
!= len
)) {
1209 trace_pci_nvme_err_invalid_dma();
1210 return NVME_INVALID_FIELD
| NVME_DNR
;
1214 return NVME_SUCCESS
;
1217 static inline uint16_t nvme_c2h(NvmeCtrl
*n
, void *ptr
, uint32_t len
,
1222 status
= nvme_map_dptr(n
, &req
->sg
, len
, &req
->cmd
);
1227 return nvme_tx(n
, &req
->sg
, ptr
, len
, NVME_TX_DIRECTION_FROM_DEVICE
);
1230 static inline uint16_t nvme_h2c(NvmeCtrl
*n
, void *ptr
, uint32_t len
,
1235 status
= nvme_map_dptr(n
, &req
->sg
, len
, &req
->cmd
);
1240 return nvme_tx(n
, &req
->sg
, ptr
, len
, NVME_TX_DIRECTION_TO_DEVICE
);
1243 uint16_t nvme_bounce_data(NvmeCtrl
*n
, void *ptr
, uint32_t len
,
1244 NvmeTxDirection dir
, NvmeRequest
*req
)
1246 NvmeNamespace
*ns
= req
->ns
;
1247 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
1248 bool pi
= !!NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
);
1249 bool pract
= !!(le16_to_cpu(rw
->control
) & NVME_RW_PRINFO_PRACT
);
1251 if (nvme_ns_ext(ns
) &&
1252 !(pi
&& pract
&& ns
->lbaf
.ms
== nvme_pi_tuple_size(ns
))) {
1253 return nvme_tx_interleaved(n
, &req
->sg
, ptr
, len
, ns
->lbasz
,
1254 ns
->lbaf
.ms
, 0, dir
);
1257 return nvme_tx(n
, &req
->sg
, ptr
, len
, dir
);
1260 uint16_t nvme_bounce_mdata(NvmeCtrl
*n
, void *ptr
, uint32_t len
,
1261 NvmeTxDirection dir
, NvmeRequest
*req
)
1263 NvmeNamespace
*ns
= req
->ns
;
1266 if (nvme_ns_ext(ns
)) {
1267 return nvme_tx_interleaved(n
, &req
->sg
, ptr
, len
, ns
->lbaf
.ms
,
1268 ns
->lbasz
, ns
->lbasz
, dir
);
1271 nvme_sg_unmap(&req
->sg
);
1273 status
= nvme_map_mptr(n
, &req
->sg
, len
, &req
->cmd
);
1278 return nvme_tx(n
, &req
->sg
, ptr
, len
, dir
);
1281 static inline void nvme_blk_read(BlockBackend
*blk
, int64_t offset
,
1282 BlockCompletionFunc
*cb
, NvmeRequest
*req
)
1284 assert(req
->sg
.flags
& NVME_SG_ALLOC
);
1286 if (req
->sg
.flags
& NVME_SG_DMA
) {
1287 req
->aiocb
= dma_blk_read(blk
, &req
->sg
.qsg
, offset
, BDRV_SECTOR_SIZE
,
1290 req
->aiocb
= blk_aio_preadv(blk
, offset
, &req
->sg
.iov
, 0, cb
, req
);
1294 static inline void nvme_blk_write(BlockBackend
*blk
, int64_t offset
,
1295 BlockCompletionFunc
*cb
, NvmeRequest
*req
)
1297 assert(req
->sg
.flags
& NVME_SG_ALLOC
);
1299 if (req
->sg
.flags
& NVME_SG_DMA
) {
1300 req
->aiocb
= dma_blk_write(blk
, &req
->sg
.qsg
, offset
, BDRV_SECTOR_SIZE
,
1303 req
->aiocb
= blk_aio_pwritev(blk
, offset
, &req
->sg
.iov
, 0, cb
, req
);
1307 static void nvme_post_cqes(void *opaque
)
1309 NvmeCQueue
*cq
= opaque
;
1310 NvmeCtrl
*n
= cq
->ctrl
;
1311 NvmeRequest
*req
, *next
;
1312 bool pending
= cq
->head
!= cq
->tail
;
1315 QTAILQ_FOREACH_SAFE(req
, &cq
->req_list
, entry
, next
) {
1319 if (nvme_cq_full(cq
)) {
1324 req
->cqe
.status
= cpu_to_le16((req
->status
<< 1) | cq
->phase
);
1325 req
->cqe
.sq_id
= cpu_to_le16(sq
->sqid
);
1326 req
->cqe
.sq_head
= cpu_to_le16(sq
->head
);
1327 addr
= cq
->dma_addr
+ cq
->tail
* n
->cqe_size
;
1328 ret
= pci_dma_write(&n
->parent_obj
, addr
, (void *)&req
->cqe
,
1331 trace_pci_nvme_err_addr_write(addr
);
1332 trace_pci_nvme_err_cfs();
1333 stl_le_p(&n
->bar
.csts
, NVME_CSTS_FAILED
);
1336 QTAILQ_REMOVE(&cq
->req_list
, req
, entry
);
1337 nvme_inc_cq_tail(cq
);
1338 nvme_sg_unmap(&req
->sg
);
1339 QTAILQ_INSERT_TAIL(&sq
->req_list
, req
, entry
);
1341 if (cq
->tail
!= cq
->head
) {
1342 if (cq
->irq_enabled
&& !pending
) {
1346 nvme_irq_assert(n
, cq
);
1350 static void nvme_enqueue_req_completion(NvmeCQueue
*cq
, NvmeRequest
*req
)
1352 assert(cq
->cqid
== req
->sq
->cqid
);
1353 trace_pci_nvme_enqueue_req_completion(nvme_cid(req
), cq
->cqid
,
1354 le32_to_cpu(req
->cqe
.result
),
1355 le32_to_cpu(req
->cqe
.dw1
),
1359 trace_pci_nvme_err_req_status(nvme_cid(req
), nvme_nsid(req
->ns
),
1360 req
->status
, req
->cmd
.opcode
);
1363 QTAILQ_REMOVE(&req
->sq
->out_req_list
, req
, entry
);
1364 QTAILQ_INSERT_TAIL(&cq
->req_list
, req
, entry
);
1365 timer_mod(cq
->timer
, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL
) + 500);
1368 static void nvme_process_aers(void *opaque
)
1370 NvmeCtrl
*n
= opaque
;
1371 NvmeAsyncEvent
*event
, *next
;
1373 trace_pci_nvme_process_aers(n
->aer_queued
);
1375 QTAILQ_FOREACH_SAFE(event
, &n
->aer_queue
, entry
, next
) {
1377 NvmeAerResult
*result
;
1379 /* can't post cqe if there is nothing to complete */
1380 if (!n
->outstanding_aers
) {
1381 trace_pci_nvme_no_outstanding_aers();
1385 /* ignore if masked (cqe posted, but event not cleared) */
1386 if (n
->aer_mask
& (1 << event
->result
.event_type
)) {
1387 trace_pci_nvme_aer_masked(event
->result
.event_type
, n
->aer_mask
);
1391 QTAILQ_REMOVE(&n
->aer_queue
, event
, entry
);
1394 n
->aer_mask
|= 1 << event
->result
.event_type
;
1395 n
->outstanding_aers
--;
1397 req
= n
->aer_reqs
[n
->outstanding_aers
];
1399 result
= (NvmeAerResult
*) &req
->cqe
.result
;
1400 result
->event_type
= event
->result
.event_type
;
1401 result
->event_info
= event
->result
.event_info
;
1402 result
->log_page
= event
->result
.log_page
;
1405 trace_pci_nvme_aer_post_cqe(result
->event_type
, result
->event_info
,
1408 nvme_enqueue_req_completion(&n
->admin_cq
, req
);
1412 static void nvme_enqueue_event(NvmeCtrl
*n
, uint8_t event_type
,
1413 uint8_t event_info
, uint8_t log_page
)
1415 NvmeAsyncEvent
*event
;
1417 trace_pci_nvme_enqueue_event(event_type
, event_info
, log_page
);
1419 if (n
->aer_queued
== n
->params
.aer_max_queued
) {
1420 trace_pci_nvme_enqueue_event_noqueue(n
->aer_queued
);
1424 event
= g_new(NvmeAsyncEvent
, 1);
1425 event
->result
= (NvmeAerResult
) {
1426 .event_type
= event_type
,
1427 .event_info
= event_info
,
1428 .log_page
= log_page
,
1431 QTAILQ_INSERT_TAIL(&n
->aer_queue
, event
, entry
);
1434 nvme_process_aers(n
);
1437 static void nvme_smart_event(NvmeCtrl
*n
, uint8_t event
)
1441 /* Ref SPEC <Asynchronous Event Information 0x2013 SMART / Health Status> */
1442 if (!(NVME_AEC_SMART(n
->features
.async_config
) & event
)) {
1447 case NVME_SMART_SPARE
:
1448 aer_info
= NVME_AER_INFO_SMART_SPARE_THRESH
;
1450 case NVME_SMART_TEMPERATURE
:
1451 aer_info
= NVME_AER_INFO_SMART_TEMP_THRESH
;
1453 case NVME_SMART_RELIABILITY
:
1454 case NVME_SMART_MEDIA_READ_ONLY
:
1455 case NVME_SMART_FAILED_VOLATILE_MEDIA
:
1456 case NVME_SMART_PMR_UNRELIABLE
:
1457 aer_info
= NVME_AER_INFO_SMART_RELIABILITY
;
1463 nvme_enqueue_event(n
, NVME_AER_TYPE_SMART
, aer_info
, NVME_LOG_SMART_INFO
);
1466 static void nvme_clear_events(NvmeCtrl
*n
, uint8_t event_type
)
1468 n
->aer_mask
&= ~(1 << event_type
);
1469 if (!QTAILQ_EMPTY(&n
->aer_queue
)) {
1470 nvme_process_aers(n
);
1474 static inline uint16_t nvme_check_mdts(NvmeCtrl
*n
, size_t len
)
1476 uint8_t mdts
= n
->params
.mdts
;
1478 if (mdts
&& len
> n
->page_size
<< mdts
) {
1479 trace_pci_nvme_err_mdts(len
);
1480 return NVME_INVALID_FIELD
| NVME_DNR
;
1483 return NVME_SUCCESS
;
1486 static inline uint16_t nvme_check_bounds(NvmeNamespace
*ns
, uint64_t slba
,
1489 uint64_t nsze
= le64_to_cpu(ns
->id_ns
.nsze
);
1491 if (unlikely(UINT64_MAX
- slba
< nlb
|| slba
+ nlb
> nsze
)) {
1492 trace_pci_nvme_err_invalid_lba_range(slba
, nlb
, nsze
);
1493 return NVME_LBA_RANGE
| NVME_DNR
;
1496 return NVME_SUCCESS
;
1499 static int nvme_block_status_all(NvmeNamespace
*ns
, uint64_t slba
,
1500 uint32_t nlb
, int flags
)
1502 BlockDriverState
*bs
= blk_bs(ns
->blkconf
.blk
);
1504 int64_t pnum
= 0, bytes
= nvme_l2b(ns
, nlb
);
1505 int64_t offset
= nvme_l2b(ns
, slba
);
1509 * `pnum` holds the number of bytes after offset that shares the same
1510 * allocation status as the byte at offset. If `pnum` is different from
1511 * `bytes`, we should check the allocation status of the next range and
1512 * continue this until all bytes have been checked.
1517 ret
= bdrv_block_status(bs
, offset
, bytes
, &pnum
, NULL
, NULL
);
1523 trace_pci_nvme_block_status(offset
, bytes
, pnum
, ret
,
1524 !!(ret
& BDRV_BLOCK_ZERO
));
1526 if (!(ret
& flags
)) {
1531 } while (pnum
!= bytes
);
1536 static uint16_t nvme_check_dulbe(NvmeNamespace
*ns
, uint64_t slba
,
1542 ret
= nvme_block_status_all(ns
, slba
, nlb
, BDRV_BLOCK_DATA
);
1545 error_setg_errno(&err
, -ret
, "unable to get block status");
1546 error_report_err(err
);
1548 return NVME_INTERNAL_DEV_ERROR
;
1554 return NVME_SUCCESS
;
1557 static void nvme_aio_err(NvmeRequest
*req
, int ret
)
1559 uint16_t status
= NVME_SUCCESS
;
1560 Error
*local_err
= NULL
;
1562 switch (req
->cmd
.opcode
) {
1564 status
= NVME_UNRECOVERED_READ
;
1566 case NVME_CMD_FLUSH
:
1567 case NVME_CMD_WRITE
:
1568 case NVME_CMD_WRITE_ZEROES
:
1569 case NVME_CMD_ZONE_APPEND
:
1570 status
= NVME_WRITE_FAULT
;
1573 status
= NVME_INTERNAL_DEV_ERROR
;
1577 trace_pci_nvme_err_aio(nvme_cid(req
), strerror(-ret
), status
);
1579 error_setg_errno(&local_err
, -ret
, "aio failed");
1580 error_report_err(local_err
);
1583 * Set the command status code to the first encountered error but allow a
1584 * subsequent Internal Device Error to trump it.
1586 if (req
->status
&& status
!= NVME_INTERNAL_DEV_ERROR
) {
1590 req
->status
= status
;
1593 static inline uint32_t nvme_zone_idx(NvmeNamespace
*ns
, uint64_t slba
)
1595 return ns
->zone_size_log2
> 0 ? slba
>> ns
->zone_size_log2
:
1596 slba
/ ns
->zone_size
;
1599 static inline NvmeZone
*nvme_get_zone_by_slba(NvmeNamespace
*ns
, uint64_t slba
)
1601 uint32_t zone_idx
= nvme_zone_idx(ns
, slba
);
1603 if (zone_idx
>= ns
->num_zones
) {
1607 return &ns
->zone_array
[zone_idx
];
1610 static uint16_t nvme_check_zone_state_for_write(NvmeZone
*zone
)
1612 uint64_t zslba
= zone
->d
.zslba
;
1614 switch (nvme_get_zone_state(zone
)) {
1615 case NVME_ZONE_STATE_EMPTY
:
1616 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
1617 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
1618 case NVME_ZONE_STATE_CLOSED
:
1619 return NVME_SUCCESS
;
1620 case NVME_ZONE_STATE_FULL
:
1621 trace_pci_nvme_err_zone_is_full(zslba
);
1622 return NVME_ZONE_FULL
;
1623 case NVME_ZONE_STATE_OFFLINE
:
1624 trace_pci_nvme_err_zone_is_offline(zslba
);
1625 return NVME_ZONE_OFFLINE
;
1626 case NVME_ZONE_STATE_READ_ONLY
:
1627 trace_pci_nvme_err_zone_is_read_only(zslba
);
1628 return NVME_ZONE_READ_ONLY
;
1633 return NVME_INTERNAL_DEV_ERROR
;
1636 static uint16_t nvme_check_zone_write(NvmeNamespace
*ns
, NvmeZone
*zone
,
1637 uint64_t slba
, uint32_t nlb
)
1639 uint64_t zcap
= nvme_zone_wr_boundary(zone
);
1642 status
= nvme_check_zone_state_for_write(zone
);
1647 if (zone
->d
.za
& NVME_ZA_ZRWA_VALID
) {
1648 uint64_t ezrwa
= zone
->w_ptr
+ 2 * ns
->zns
.zrwas
;
1650 if (slba
< zone
->w_ptr
|| slba
+ nlb
> ezrwa
) {
1651 trace_pci_nvme_err_zone_invalid_write(slba
, zone
->w_ptr
);
1652 return NVME_ZONE_INVALID_WRITE
;
1655 if (unlikely(slba
!= zone
->w_ptr
)) {
1656 trace_pci_nvme_err_write_not_at_wp(slba
, zone
->d
.zslba
,
1658 return NVME_ZONE_INVALID_WRITE
;
1662 if (unlikely((slba
+ nlb
) > zcap
)) {
1663 trace_pci_nvme_err_zone_boundary(slba
, nlb
, zcap
);
1664 return NVME_ZONE_BOUNDARY_ERROR
;
1667 return NVME_SUCCESS
;
1670 static uint16_t nvme_check_zone_state_for_read(NvmeZone
*zone
)
1672 switch (nvme_get_zone_state(zone
)) {
1673 case NVME_ZONE_STATE_EMPTY
:
1674 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
1675 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
1676 case NVME_ZONE_STATE_FULL
:
1677 case NVME_ZONE_STATE_CLOSED
:
1678 case NVME_ZONE_STATE_READ_ONLY
:
1679 return NVME_SUCCESS
;
1680 case NVME_ZONE_STATE_OFFLINE
:
1681 trace_pci_nvme_err_zone_is_offline(zone
->d
.zslba
);
1682 return NVME_ZONE_OFFLINE
;
1687 return NVME_INTERNAL_DEV_ERROR
;
1690 static uint16_t nvme_check_zone_read(NvmeNamespace
*ns
, uint64_t slba
,
1694 uint64_t bndry
, end
;
1697 zone
= nvme_get_zone_by_slba(ns
, slba
);
1700 bndry
= nvme_zone_rd_boundary(ns
, zone
);
1703 status
= nvme_check_zone_state_for_read(zone
);
1706 } else if (unlikely(end
> bndry
)) {
1707 if (!ns
->params
.cross_zone_read
) {
1708 status
= NVME_ZONE_BOUNDARY_ERROR
;
1711 * Read across zone boundary - check that all subsequent
1712 * zones that are being read have an appropriate state.
1716 status
= nvme_check_zone_state_for_read(zone
);
1720 } while (end
> nvme_zone_rd_boundary(ns
, zone
));
1727 static uint16_t nvme_zrm_finish(NvmeNamespace
*ns
, NvmeZone
*zone
)
1729 switch (nvme_get_zone_state(zone
)) {
1730 case NVME_ZONE_STATE_FULL
:
1731 return NVME_SUCCESS
;
1733 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
1734 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
1735 nvme_aor_dec_open(ns
);
1737 case NVME_ZONE_STATE_CLOSED
:
1738 nvme_aor_dec_active(ns
);
1740 if (zone
->d
.za
& NVME_ZA_ZRWA_VALID
) {
1741 zone
->d
.za
&= ~NVME_ZA_ZRWA_VALID
;
1742 if (ns
->params
.numzrwa
) {
1748 case NVME_ZONE_STATE_EMPTY
:
1749 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_FULL
);
1750 return NVME_SUCCESS
;
1753 return NVME_ZONE_INVAL_TRANSITION
;
1757 static uint16_t nvme_zrm_close(NvmeNamespace
*ns
, NvmeZone
*zone
)
1759 switch (nvme_get_zone_state(zone
)) {
1760 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
1761 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
1762 nvme_aor_dec_open(ns
);
1763 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_CLOSED
);
1765 case NVME_ZONE_STATE_CLOSED
:
1766 return NVME_SUCCESS
;
1769 return NVME_ZONE_INVAL_TRANSITION
;
1773 static uint16_t nvme_zrm_reset(NvmeNamespace
*ns
, NvmeZone
*zone
)
1775 switch (nvme_get_zone_state(zone
)) {
1776 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
1777 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
1778 nvme_aor_dec_open(ns
);
1780 case NVME_ZONE_STATE_CLOSED
:
1781 nvme_aor_dec_active(ns
);
1783 if (zone
->d
.za
& NVME_ZA_ZRWA_VALID
) {
1784 if (ns
->params
.numzrwa
) {
1790 case NVME_ZONE_STATE_FULL
:
1791 zone
->w_ptr
= zone
->d
.zslba
;
1792 zone
->d
.wp
= zone
->w_ptr
;
1793 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_EMPTY
);
1795 case NVME_ZONE_STATE_EMPTY
:
1796 return NVME_SUCCESS
;
1799 return NVME_ZONE_INVAL_TRANSITION
;
1803 static void nvme_zrm_auto_transition_zone(NvmeNamespace
*ns
)
1807 if (ns
->params
.max_open_zones
&&
1808 ns
->nr_open_zones
== ns
->params
.max_open_zones
) {
1809 zone
= QTAILQ_FIRST(&ns
->imp_open_zones
);
1812 * Automatically close this implicitly open zone.
1814 QTAILQ_REMOVE(&ns
->imp_open_zones
, zone
, entry
);
1815 nvme_zrm_close(ns
, zone
);
1821 NVME_ZRM_AUTO
= 1 << 0,
1822 NVME_ZRM_ZRWA
= 1 << 1,
1825 static uint16_t nvme_zrm_open_flags(NvmeCtrl
*n
, NvmeNamespace
*ns
,
1826 NvmeZone
*zone
, int flags
)
1831 switch (nvme_get_zone_state(zone
)) {
1832 case NVME_ZONE_STATE_EMPTY
:
1837 case NVME_ZONE_STATE_CLOSED
:
1838 if (n
->params
.auto_transition_zones
) {
1839 nvme_zrm_auto_transition_zone(ns
);
1841 status
= nvme_zns_check_resources(ns
, act
, 1,
1842 (flags
& NVME_ZRM_ZRWA
) ? 1 : 0);
1848 nvme_aor_inc_active(ns
);
1851 nvme_aor_inc_open(ns
);
1853 if (flags
& NVME_ZRM_AUTO
) {
1854 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_IMPLICITLY_OPEN
);
1855 return NVME_SUCCESS
;
1860 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
1861 if (flags
& NVME_ZRM_AUTO
) {
1862 return NVME_SUCCESS
;
1865 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_EXPLICITLY_OPEN
);
1869 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
1870 if (flags
& NVME_ZRM_ZRWA
) {
1873 zone
->d
.za
|= NVME_ZA_ZRWA_VALID
;
1876 return NVME_SUCCESS
;
1879 return NVME_ZONE_INVAL_TRANSITION
;
1883 static inline uint16_t nvme_zrm_auto(NvmeCtrl
*n
, NvmeNamespace
*ns
,
1886 return nvme_zrm_open_flags(n
, ns
, zone
, NVME_ZRM_AUTO
);
1889 static void nvme_advance_zone_wp(NvmeNamespace
*ns
, NvmeZone
*zone
,
1894 if (zone
->d
.wp
== nvme_zone_wr_boundary(zone
)) {
1895 nvme_zrm_finish(ns
, zone
);
1899 static void nvme_zoned_zrwa_implicit_flush(NvmeNamespace
*ns
, NvmeZone
*zone
,
1902 uint16_t nzrwafgs
= DIV_ROUND_UP(nlbc
, ns
->zns
.zrwafg
);
1904 nlbc
= nzrwafgs
* ns
->zns
.zrwafg
;
1906 trace_pci_nvme_zoned_zrwa_implicit_flush(zone
->d
.zslba
, nlbc
);
1908 zone
->w_ptr
+= nlbc
;
1910 nvme_advance_zone_wp(ns
, zone
, nlbc
);
1913 static void nvme_finalize_zoned_write(NvmeNamespace
*ns
, NvmeRequest
*req
)
1915 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
1920 slba
= le64_to_cpu(rw
->slba
);
1921 nlb
= le16_to_cpu(rw
->nlb
) + 1;
1922 zone
= nvme_get_zone_by_slba(ns
, slba
);
1925 if (zone
->d
.za
& NVME_ZA_ZRWA_VALID
) {
1926 uint64_t ezrwa
= zone
->w_ptr
+ ns
->zns
.zrwas
- 1;
1927 uint64_t elba
= slba
+ nlb
- 1;
1930 nvme_zoned_zrwa_implicit_flush(ns
, zone
, elba
- ezrwa
);
1936 nvme_advance_zone_wp(ns
, zone
, nlb
);
1939 static inline bool nvme_is_write(NvmeRequest
*req
)
1941 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
1943 return rw
->opcode
== NVME_CMD_WRITE
||
1944 rw
->opcode
== NVME_CMD_ZONE_APPEND
||
1945 rw
->opcode
== NVME_CMD_WRITE_ZEROES
;
1948 static AioContext
*nvme_get_aio_context(BlockAIOCB
*acb
)
1950 return qemu_get_aio_context();
1953 static void nvme_misc_cb(void *opaque
, int ret
)
1955 NvmeRequest
*req
= opaque
;
1957 trace_pci_nvme_misc_cb(nvme_cid(req
));
1960 nvme_aio_err(req
, ret
);
1963 nvme_enqueue_req_completion(nvme_cq(req
), req
);
1966 void nvme_rw_complete_cb(void *opaque
, int ret
)
1968 NvmeRequest
*req
= opaque
;
1969 NvmeNamespace
*ns
= req
->ns
;
1970 BlockBackend
*blk
= ns
->blkconf
.blk
;
1971 BlockAcctCookie
*acct
= &req
->acct
;
1972 BlockAcctStats
*stats
= blk_get_stats(blk
);
1974 trace_pci_nvme_rw_complete_cb(nvme_cid(req
), blk_name(blk
));
1977 block_acct_failed(stats
, acct
);
1978 nvme_aio_err(req
, ret
);
1980 block_acct_done(stats
, acct
);
1983 if (ns
->params
.zoned
&& nvme_is_write(req
)) {
1984 nvme_finalize_zoned_write(ns
, req
);
1987 nvme_enqueue_req_completion(nvme_cq(req
), req
);
1990 static void nvme_rw_cb(void *opaque
, int ret
)
1992 NvmeRequest
*req
= opaque
;
1993 NvmeNamespace
*ns
= req
->ns
;
1995 BlockBackend
*blk
= ns
->blkconf
.blk
;
1997 trace_pci_nvme_rw_cb(nvme_cid(req
), blk_name(blk
));
2004 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
2005 uint64_t slba
= le64_to_cpu(rw
->slba
);
2006 uint32_t nlb
= (uint32_t)le16_to_cpu(rw
->nlb
) + 1;
2007 uint64_t offset
= nvme_moff(ns
, slba
);
2009 if (req
->cmd
.opcode
== NVME_CMD_WRITE_ZEROES
) {
2010 size_t mlen
= nvme_m2b(ns
, nlb
);
2012 req
->aiocb
= blk_aio_pwrite_zeroes(blk
, offset
, mlen
,
2014 nvme_rw_complete_cb
, req
);
2018 if (nvme_ns_ext(ns
) || req
->cmd
.mptr
) {
2021 nvme_sg_unmap(&req
->sg
);
2022 status
= nvme_map_mdata(nvme_ctrl(req
), nlb
, req
);
2028 if (req
->cmd
.opcode
== NVME_CMD_READ
) {
2029 return nvme_blk_read(blk
, offset
, nvme_rw_complete_cb
, req
);
2032 return nvme_blk_write(blk
, offset
, nvme_rw_complete_cb
, req
);
2037 nvme_rw_complete_cb(req
, ret
);
2040 static void nvme_verify_cb(void *opaque
, int ret
)
2042 NvmeBounceContext
*ctx
= opaque
;
2043 NvmeRequest
*req
= ctx
->req
;
2044 NvmeNamespace
*ns
= req
->ns
;
2045 BlockBackend
*blk
= ns
->blkconf
.blk
;
2046 BlockAcctCookie
*acct
= &req
->acct
;
2047 BlockAcctStats
*stats
= blk_get_stats(blk
);
2048 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
2049 uint64_t slba
= le64_to_cpu(rw
->slba
);
2050 uint8_t prinfo
= NVME_RW_PRINFO(le16_to_cpu(rw
->control
));
2051 uint16_t apptag
= le16_to_cpu(rw
->apptag
);
2052 uint16_t appmask
= le16_to_cpu(rw
->appmask
);
2053 uint64_t reftag
= le32_to_cpu(rw
->reftag
);
2054 uint64_t cdw3
= le32_to_cpu(rw
->cdw3
);
2057 reftag
|= cdw3
<< 32;
2059 trace_pci_nvme_verify_cb(nvme_cid(req
), prinfo
, apptag
, appmask
, reftag
);
2062 block_acct_failed(stats
, acct
);
2063 nvme_aio_err(req
, ret
);
2067 block_acct_done(stats
, acct
);
2069 if (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
)) {
2070 status
= nvme_dif_mangle_mdata(ns
, ctx
->mdata
.bounce
,
2071 ctx
->mdata
.iov
.size
, slba
);
2073 req
->status
= status
;
2077 req
->status
= nvme_dif_check(ns
, ctx
->data
.bounce
, ctx
->data
.iov
.size
,
2078 ctx
->mdata
.bounce
, ctx
->mdata
.iov
.size
,
2079 prinfo
, slba
, apptag
, appmask
, &reftag
);
2083 qemu_iovec_destroy(&ctx
->data
.iov
);
2084 g_free(ctx
->data
.bounce
);
2086 qemu_iovec_destroy(&ctx
->mdata
.iov
);
2087 g_free(ctx
->mdata
.bounce
);
2091 nvme_enqueue_req_completion(nvme_cq(req
), req
);
2095 static void nvme_verify_mdata_in_cb(void *opaque
, int ret
)
2097 NvmeBounceContext
*ctx
= opaque
;
2098 NvmeRequest
*req
= ctx
->req
;
2099 NvmeNamespace
*ns
= req
->ns
;
2100 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
2101 uint64_t slba
= le64_to_cpu(rw
->slba
);
2102 uint32_t nlb
= le16_to_cpu(rw
->nlb
) + 1;
2103 size_t mlen
= nvme_m2b(ns
, nlb
);
2104 uint64_t offset
= nvme_moff(ns
, slba
);
2105 BlockBackend
*blk
= ns
->blkconf
.blk
;
2107 trace_pci_nvme_verify_mdata_in_cb(nvme_cid(req
), blk_name(blk
));
2113 ctx
->mdata
.bounce
= g_malloc(mlen
);
2115 qemu_iovec_reset(&ctx
->mdata
.iov
);
2116 qemu_iovec_add(&ctx
->mdata
.iov
, ctx
->mdata
.bounce
, mlen
);
2118 req
->aiocb
= blk_aio_preadv(blk
, offset
, &ctx
->mdata
.iov
, 0,
2119 nvme_verify_cb
, ctx
);
2123 nvme_verify_cb(ctx
, ret
);
2126 struct nvme_compare_ctx
{
2138 static void nvme_compare_mdata_cb(void *opaque
, int ret
)
2140 NvmeRequest
*req
= opaque
;
2141 NvmeNamespace
*ns
= req
->ns
;
2142 NvmeCtrl
*n
= nvme_ctrl(req
);
2143 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
2144 uint8_t prinfo
= NVME_RW_PRINFO(le16_to_cpu(rw
->control
));
2145 uint16_t apptag
= le16_to_cpu(rw
->apptag
);
2146 uint16_t appmask
= le16_to_cpu(rw
->appmask
);
2147 uint64_t reftag
= le32_to_cpu(rw
->reftag
);
2148 uint64_t cdw3
= le32_to_cpu(rw
->cdw3
);
2149 struct nvme_compare_ctx
*ctx
= req
->opaque
;
2150 g_autofree
uint8_t *buf
= NULL
;
2151 BlockBackend
*blk
= ns
->blkconf
.blk
;
2152 BlockAcctCookie
*acct
= &req
->acct
;
2153 BlockAcctStats
*stats
= blk_get_stats(blk
);
2154 uint16_t status
= NVME_SUCCESS
;
2156 reftag
|= cdw3
<< 32;
2158 trace_pci_nvme_compare_mdata_cb(nvme_cid(req
));
2161 block_acct_failed(stats
, acct
);
2162 nvme_aio_err(req
, ret
);
2166 buf
= g_malloc(ctx
->mdata
.iov
.size
);
2168 status
= nvme_bounce_mdata(n
, buf
, ctx
->mdata
.iov
.size
,
2169 NVME_TX_DIRECTION_TO_DEVICE
, req
);
2171 req
->status
= status
;
2175 if (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
)) {
2176 uint64_t slba
= le64_to_cpu(rw
->slba
);
2178 uint8_t *mbufp
= ctx
->mdata
.bounce
;
2179 uint8_t *end
= mbufp
+ ctx
->mdata
.iov
.size
;
2182 status
= nvme_dif_check(ns
, ctx
->data
.bounce
, ctx
->data
.iov
.size
,
2183 ctx
->mdata
.bounce
, ctx
->mdata
.iov
.size
, prinfo
,
2184 slba
, apptag
, appmask
, &reftag
);
2186 req
->status
= status
;
2191 * When formatted with protection information, do not compare the DIF
2194 if (!(ns
->id_ns
.dps
& NVME_ID_NS_DPS_FIRST_EIGHT
)) {
2195 pil
= ns
->lbaf
.ms
- nvme_pi_tuple_size(ns
);
2198 for (bufp
= buf
; mbufp
< end
; bufp
+= ns
->lbaf
.ms
, mbufp
+= ns
->lbaf
.ms
) {
2199 if (memcmp(bufp
+ pil
, mbufp
+ pil
, ns
->lbaf
.ms
- pil
)) {
2200 req
->status
= NVME_CMP_FAILURE
;
2208 if (memcmp(buf
, ctx
->mdata
.bounce
, ctx
->mdata
.iov
.size
)) {
2209 req
->status
= NVME_CMP_FAILURE
;
2213 block_acct_done(stats
, acct
);
2216 qemu_iovec_destroy(&ctx
->data
.iov
);
2217 g_free(ctx
->data
.bounce
);
2219 qemu_iovec_destroy(&ctx
->mdata
.iov
);
2220 g_free(ctx
->mdata
.bounce
);
2224 nvme_enqueue_req_completion(nvme_cq(req
), req
);
2227 static void nvme_compare_data_cb(void *opaque
, int ret
)
2229 NvmeRequest
*req
= opaque
;
2230 NvmeCtrl
*n
= nvme_ctrl(req
);
2231 NvmeNamespace
*ns
= req
->ns
;
2232 BlockBackend
*blk
= ns
->blkconf
.blk
;
2233 BlockAcctCookie
*acct
= &req
->acct
;
2234 BlockAcctStats
*stats
= blk_get_stats(blk
);
2236 struct nvme_compare_ctx
*ctx
= req
->opaque
;
2237 g_autofree
uint8_t *buf
= NULL
;
2240 trace_pci_nvme_compare_data_cb(nvme_cid(req
));
2243 block_acct_failed(stats
, acct
);
2244 nvme_aio_err(req
, ret
);
2248 buf
= g_malloc(ctx
->data
.iov
.size
);
2250 status
= nvme_bounce_data(n
, buf
, ctx
->data
.iov
.size
,
2251 NVME_TX_DIRECTION_TO_DEVICE
, req
);
2253 req
->status
= status
;
2257 if (memcmp(buf
, ctx
->data
.bounce
, ctx
->data
.iov
.size
)) {
2258 req
->status
= NVME_CMP_FAILURE
;
2263 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
2264 uint64_t slba
= le64_to_cpu(rw
->slba
);
2265 uint32_t nlb
= le16_to_cpu(rw
->nlb
) + 1;
2266 size_t mlen
= nvme_m2b(ns
, nlb
);
2267 uint64_t offset
= nvme_moff(ns
, slba
);
2269 ctx
->mdata
.bounce
= g_malloc(mlen
);
2271 qemu_iovec_init(&ctx
->mdata
.iov
, 1);
2272 qemu_iovec_add(&ctx
->mdata
.iov
, ctx
->mdata
.bounce
, mlen
);
2274 req
->aiocb
= blk_aio_preadv(blk
, offset
, &ctx
->mdata
.iov
, 0,
2275 nvme_compare_mdata_cb
, req
);
2279 block_acct_done(stats
, acct
);
2282 qemu_iovec_destroy(&ctx
->data
.iov
);
2283 g_free(ctx
->data
.bounce
);
2286 nvme_enqueue_req_completion(nvme_cq(req
), req
);
2289 typedef struct NvmeDSMAIOCB
{
2296 NvmeDsmRange
*range
;
2301 static void nvme_dsm_cancel(BlockAIOCB
*aiocb
)
2303 NvmeDSMAIOCB
*iocb
= container_of(aiocb
, NvmeDSMAIOCB
, common
);
2305 /* break nvme_dsm_cb loop */
2306 iocb
->idx
= iocb
->nr
;
2307 iocb
->ret
= -ECANCELED
;
2310 blk_aio_cancel_async(iocb
->aiocb
);
2314 * We only reach this if nvme_dsm_cancel() has already been called or
2315 * the command ran to completion and nvme_dsm_bh is scheduled to run.
2317 assert(iocb
->idx
== iocb
->nr
);
2321 static const AIOCBInfo nvme_dsm_aiocb_info
= {
2322 .aiocb_size
= sizeof(NvmeDSMAIOCB
),
2323 .cancel_async
= nvme_dsm_cancel
,
2326 static void nvme_dsm_bh(void *opaque
)
2328 NvmeDSMAIOCB
*iocb
= opaque
;
2330 iocb
->common
.cb(iocb
->common
.opaque
, iocb
->ret
);
2332 qemu_bh_delete(iocb
->bh
);
2334 qemu_aio_unref(iocb
);
2337 static void nvme_dsm_cb(void *opaque
, int ret
);
2339 static void nvme_dsm_md_cb(void *opaque
, int ret
)
2341 NvmeDSMAIOCB
*iocb
= opaque
;
2342 NvmeRequest
*req
= iocb
->req
;
2343 NvmeNamespace
*ns
= req
->ns
;
2344 NvmeDsmRange
*range
;
2354 nvme_dsm_cb(iocb
, 0);
2358 range
= &iocb
->range
[iocb
->idx
- 1];
2359 slba
= le64_to_cpu(range
->slba
);
2360 nlb
= le32_to_cpu(range
->nlb
);
2363 * Check that all block were discarded (zeroed); otherwise we do not zero
2367 ret
= nvme_block_status_all(ns
, slba
, nlb
, BDRV_BLOCK_ZERO
);
2374 nvme_dsm_cb(iocb
, 0);
2377 iocb
->aiocb
= blk_aio_pwrite_zeroes(ns
->blkconf
.blk
, nvme_moff(ns
, slba
),
2378 nvme_m2b(ns
, nlb
), BDRV_REQ_MAY_UNMAP
,
2384 qemu_bh_schedule(iocb
->bh
);
2387 static void nvme_dsm_cb(void *opaque
, int ret
)
2389 NvmeDSMAIOCB
*iocb
= opaque
;
2390 NvmeRequest
*req
= iocb
->req
;
2391 NvmeCtrl
*n
= nvme_ctrl(req
);
2392 NvmeNamespace
*ns
= req
->ns
;
2393 NvmeDsmRange
*range
;
2403 if (iocb
->idx
== iocb
->nr
) {
2407 range
= &iocb
->range
[iocb
->idx
++];
2408 slba
= le64_to_cpu(range
->slba
);
2409 nlb
= le32_to_cpu(range
->nlb
);
2411 trace_pci_nvme_dsm_deallocate(slba
, nlb
);
2413 if (nlb
> n
->dmrsl
) {
2414 trace_pci_nvme_dsm_single_range_limit_exceeded(nlb
, n
->dmrsl
);
2418 if (nvme_check_bounds(ns
, slba
, nlb
)) {
2419 trace_pci_nvme_err_invalid_lba_range(slba
, nlb
,
2424 iocb
->aiocb
= blk_aio_pdiscard(ns
->blkconf
.blk
, nvme_l2b(ns
, slba
),
2426 nvme_dsm_md_cb
, iocb
);
2431 qemu_bh_schedule(iocb
->bh
);
static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = req->ns;
    NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
    uint32_t attr = le32_to_cpu(dsm->attributes);
    uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
    uint16_t status = NVME_SUCCESS;

    trace_pci_nvme_dsm(nr, attr);

    if (attr & NVME_DSMGMT_AD) {
        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
                                         nvme_misc_cb, req);

        iocb->req = req;
        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
        iocb->ret = 0;
        iocb->range = g_new(NvmeDsmRange, nr);
        iocb->nr = nr;
        iocb->idx = 0;

        status = nvme_h2c(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
                          req);
        if (status) {
            return status;
        }

        req->aiocb = &iocb->common;
        nvme_dsm_cb(iocb, 0);

        return NVME_NO_COMPLETE;
    }

    return status;
}
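/*
 * Note that the Number of Ranges (NR) field of Dataset Management is 0's
 * based, so the "& 0xff) + 1" above maps NR=0 to a single range and NR=255
 * to the maximum of 256 ranges.
 */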
static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    size_t len = nvme_l2b(ns, nlb);
    int64_t offset = nvme_l2b(ns, slba);
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint32_t reftag = le32_to_cpu(rw->reftag);
    NvmeBounceContext *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_verify(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        status = nvme_check_prinfo(ns, prinfo, slba, reftag);
        if (status) {
            return status;
        }

        if (prinfo & NVME_PRINFO_PRACT) {
            return NVME_INVALID_PROT_INFO | NVME_DNR;
        }
    }

    if (len > n->page_size << n->params.vsl) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    ctx = g_new0(NvmeBounceContext, 1);
    ctx->req = req;

    ctx->data.bounce = g_malloc(len);

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, len);

    block_acct_start(blk_get_stats(blk), &req->acct, ctx->data.iov.size,
                     BLOCK_ACCT_READ);

    req->aiocb = blk_aio_preadv(ns->blkconf.blk, offset, &ctx->data.iov, 0,
                                nvme_verify_mdata_in_cb, ctx);

    return NVME_NO_COMPLETE;
}
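/*
 * The Verify data size is capped by the vsl device parameter: the limit is
 * page_size << vsl, so with a 4 KiB controller page size and vsl=7 any
 * Verify covering more than 512 KiB is rejected with Invalid Field.
 */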
2529 typedef struct NvmeCopyAIOCB
{
2537 unsigned int format
;
2544 BlockAcctCookie read
;
2545 BlockAcctCookie write
;
2554 static void nvme_copy_cancel(BlockAIOCB
*aiocb
)
2556 NvmeCopyAIOCB
*iocb
= container_of(aiocb
, NvmeCopyAIOCB
, common
);
2558 iocb
->ret
= -ECANCELED
;
2561 blk_aio_cancel_async(iocb
->aiocb
);
2566 static const AIOCBInfo nvme_copy_aiocb_info
= {
2567 .aiocb_size
= sizeof(NvmeCopyAIOCB
),
2568 .cancel_async
= nvme_copy_cancel
,
2571 static void nvme_copy_bh(void *opaque
)
2573 NvmeCopyAIOCB
*iocb
= opaque
;
2574 NvmeRequest
*req
= iocb
->req
;
2575 NvmeNamespace
*ns
= req
->ns
;
2576 BlockAcctStats
*stats
= blk_get_stats(ns
->blkconf
.blk
);
2578 if (iocb
->idx
!= iocb
->nr
) {
2579 req
->cqe
.result
= cpu_to_le32(iocb
->idx
);
2582 qemu_iovec_destroy(&iocb
->iov
);
2583 g_free(iocb
->bounce
);
2585 qemu_bh_delete(iocb
->bh
);
2588 if (iocb
->ret
< 0) {
2589 block_acct_failed(stats
, &iocb
->acct
.read
);
2590 block_acct_failed(stats
, &iocb
->acct
.write
);
2592 block_acct_done(stats
, &iocb
->acct
.read
);
2593 block_acct_done(stats
, &iocb
->acct
.write
);
2596 iocb
->common
.cb(iocb
->common
.opaque
, iocb
->ret
);
2597 qemu_aio_unref(iocb
);
2600 static void nvme_copy_cb(void *opaque
, int ret
);
2602 static void nvme_copy_source_range_parse_format0(void *ranges
, int idx
,
2603 uint64_t *slba
, uint32_t *nlb
,
2608 NvmeCopySourceRangeFormat0
*_ranges
= ranges
;
2611 *slba
= le64_to_cpu(_ranges
[idx
].slba
);
2615 *nlb
= le16_to_cpu(_ranges
[idx
].nlb
) + 1;
2619 *apptag
= le16_to_cpu(_ranges
[idx
].apptag
);
2623 *appmask
= le16_to_cpu(_ranges
[idx
].appmask
);
2627 *reftag
= le32_to_cpu(_ranges
[idx
].reftag
);
2631 static void nvme_copy_source_range_parse_format1(void *ranges
, int idx
,
2632 uint64_t *slba
, uint32_t *nlb
,
2637 NvmeCopySourceRangeFormat1
*_ranges
= ranges
;
2640 *slba
= le64_to_cpu(_ranges
[idx
].slba
);
2644 *nlb
= le16_to_cpu(_ranges
[idx
].nlb
) + 1;
2648 *apptag
= le16_to_cpu(_ranges
[idx
].apptag
);
2652 *appmask
= le16_to_cpu(_ranges
[idx
].appmask
);
2658 *reftag
|= (uint64_t)_ranges
[idx
].sr
[4] << 40;
2659 *reftag
|= (uint64_t)_ranges
[idx
].sr
[5] << 32;
2660 *reftag
|= (uint64_t)_ranges
[idx
].sr
[6] << 24;
2661 *reftag
|= (uint64_t)_ranges
[idx
].sr
[7] << 16;
2662 *reftag
|= (uint64_t)_ranges
[idx
].sr
[8] << 8;
2663 *reftag
|= (uint64_t)_ranges
[idx
].sr
[9];
2667 static void nvme_copy_source_range_parse(void *ranges
, int idx
, uint8_t format
,
2668 uint64_t *slba
, uint32_t *nlb
,
2669 uint16_t *apptag
, uint16_t *appmask
,
2673 case NVME_COPY_FORMAT_0
:
2674 nvme_copy_source_range_parse_format0(ranges
, idx
, slba
, nlb
, apptag
,
2678 case NVME_COPY_FORMAT_1
:
2679 nvme_copy_source_range_parse_format1(ranges
, idx
, slba
, nlb
, apptag
,
2688 static void nvme_copy_out_completed_cb(void *opaque
, int ret
)
2690 NvmeCopyAIOCB
*iocb
= opaque
;
2691 NvmeRequest
*req
= iocb
->req
;
2692 NvmeNamespace
*ns
= req
->ns
;
2695 nvme_copy_source_range_parse(iocb
->ranges
, iocb
->idx
, iocb
->format
, NULL
,
2696 &nlb
, NULL
, NULL
, NULL
);
2701 } else if (iocb
->ret
< 0) {
2705 if (ns
->params
.zoned
) {
2706 nvme_advance_zone_wp(ns
, iocb
->zone
, nlb
);
2712 nvme_copy_cb(iocb
, iocb
->ret
);
2715 static void nvme_copy_out_cb(void *opaque
, int ret
)
2717 NvmeCopyAIOCB
*iocb
= opaque
;
2718 NvmeRequest
*req
= iocb
->req
;
2719 NvmeNamespace
*ns
= req
->ns
;
2727 } else if (iocb
->ret
< 0) {
2732 nvme_copy_out_completed_cb(iocb
, 0);
2736 nvme_copy_source_range_parse(iocb
->ranges
, iocb
->idx
, iocb
->format
, NULL
,
2737 &nlb
, NULL
, NULL
, NULL
);
2739 mlen
= nvme_m2b(ns
, nlb
);
2740 mbounce
= iocb
->bounce
+ nvme_l2b(ns
, nlb
);
2742 qemu_iovec_reset(&iocb
->iov
);
2743 qemu_iovec_add(&iocb
->iov
, mbounce
, mlen
);
2745 iocb
->aiocb
= blk_aio_pwritev(ns
->blkconf
.blk
, nvme_moff(ns
, iocb
->slba
),
2746 &iocb
->iov
, 0, nvme_copy_out_completed_cb
,
2752 nvme_copy_cb(iocb
, ret
);
2755 static void nvme_copy_in_completed_cb(void *opaque
, int ret
)
2757 NvmeCopyAIOCB
*iocb
= opaque
;
2758 NvmeRequest
*req
= iocb
->req
;
2759 NvmeNamespace
*ns
= req
->ns
;
2762 uint16_t apptag
, appmask
;
2770 } else if (iocb
->ret
< 0) {
2774 nvme_copy_source_range_parse(iocb
->ranges
, iocb
->idx
, iocb
->format
, &slba
,
2775 &nlb
, &apptag
, &appmask
, &reftag
);
2776 len
= nvme_l2b(ns
, nlb
);
2778 trace_pci_nvme_copy_out(iocb
->slba
, nlb
);
2780 if (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
)) {
2781 NvmeCopyCmd
*copy
= (NvmeCopyCmd
*)&req
->cmd
;
2783 uint16_t prinfor
= ((copy
->control
[0] >> 4) & 0xf);
2784 uint16_t prinfow
= ((copy
->control
[2] >> 2) & 0xf);
2786 size_t mlen
= nvme_m2b(ns
, nlb
);
2787 uint8_t *mbounce
= iocb
->bounce
+ nvme_l2b(ns
, nlb
);
2789 status
= nvme_dif_check(ns
, iocb
->bounce
, len
, mbounce
, mlen
, prinfor
,
2790 slba
, apptag
, appmask
, &reftag
);
2795 apptag
= le16_to_cpu(copy
->apptag
);
2796 appmask
= le16_to_cpu(copy
->appmask
);
2798 if (prinfow
& NVME_PRINFO_PRACT
) {
2799 status
= nvme_check_prinfo(ns
, prinfow
, iocb
->slba
, iocb
->reftag
);
2804 nvme_dif_pract_generate_dif(ns
, iocb
->bounce
, len
, mbounce
, mlen
,
2805 apptag
, &iocb
->reftag
);
2807 status
= nvme_dif_check(ns
, iocb
->bounce
, len
, mbounce
, mlen
,
2808 prinfow
, iocb
->slba
, apptag
, appmask
,
2816 status
= nvme_check_bounds(ns
, iocb
->slba
, nlb
);
2821 if (ns
->params
.zoned
) {
2822 status
= nvme_check_zone_write(ns
, iocb
->zone
, iocb
->slba
, nlb
);
2827 if (!(iocb
->zone
->d
.za
& NVME_ZA_ZRWA_VALID
)) {
2828 iocb
->zone
->w_ptr
+= nlb
;
2832 qemu_iovec_reset(&iocb
->iov
);
2833 qemu_iovec_add(&iocb
->iov
, iocb
->bounce
, len
);
2835 iocb
->aiocb
= blk_aio_pwritev(ns
->blkconf
.blk
, nvme_l2b(ns
, iocb
->slba
),
2836 &iocb
->iov
, 0, nvme_copy_out_cb
, iocb
);
2841 req
->status
= status
;
2844 qemu_bh_schedule(iocb
->bh
);
2850 nvme_copy_cb(iocb
, ret
);
2853 static void nvme_copy_in_cb(void *opaque
, int ret
)
2855 NvmeCopyAIOCB
*iocb
= opaque
;
2856 NvmeRequest
*req
= iocb
->req
;
2857 NvmeNamespace
*ns
= req
->ns
;
2864 } else if (iocb
->ret
< 0) {
2869 nvme_copy_in_completed_cb(iocb
, 0);
2873 nvme_copy_source_range_parse(iocb
->ranges
, iocb
->idx
, iocb
->format
, &slba
,
2874 &nlb
, NULL
, NULL
, NULL
);
2876 qemu_iovec_reset(&iocb
->iov
);
2877 qemu_iovec_add(&iocb
->iov
, iocb
->bounce
+ nvme_l2b(ns
, nlb
),
2880 iocb
->aiocb
= blk_aio_preadv(ns
->blkconf
.blk
, nvme_moff(ns
, slba
),
2881 &iocb
->iov
, 0, nvme_copy_in_completed_cb
,
2886 nvme_copy_cb(iocb
, iocb
->ret
);
2889 static void nvme_copy_cb(void *opaque
, int ret
)
2891 NvmeCopyAIOCB
*iocb
= opaque
;
2892 NvmeRequest
*req
= iocb
->req
;
2893 NvmeNamespace
*ns
= req
->ns
;
2902 } else if (iocb
->ret
< 0) {
2906 if (iocb
->idx
== iocb
->nr
) {
2910 nvme_copy_source_range_parse(iocb
->ranges
, iocb
->idx
, iocb
->format
, &slba
,
2911 &nlb
, NULL
, NULL
, NULL
);
2912 len
= nvme_l2b(ns
, nlb
);
2914 trace_pci_nvme_copy_source_range(slba
, nlb
);
2916 if (nlb
> le16_to_cpu(ns
->id_ns
.mssrl
)) {
2917 status
= NVME_CMD_SIZE_LIMIT
| NVME_DNR
;
2921 status
= nvme_check_bounds(ns
, slba
, nlb
);
2926 if (NVME_ERR_REC_DULBE(ns
->features
.err_rec
)) {
2927 status
= nvme_check_dulbe(ns
, slba
, nlb
);
2933 if (ns
->params
.zoned
) {
2934 status
= nvme_check_zone_read(ns
, slba
, nlb
);
2940 qemu_iovec_reset(&iocb
->iov
);
2941 qemu_iovec_add(&iocb
->iov
, iocb
->bounce
, len
);
2943 iocb
->aiocb
= blk_aio_preadv(ns
->blkconf
.blk
, nvme_l2b(ns
, slba
),
2944 &iocb
->iov
, 0, nvme_copy_in_cb
, iocb
);
2948 req
->status
= status
;
2952 qemu_bh_schedule(iocb
->bh
);
2957 static uint16_t nvme_copy(NvmeCtrl
*n
, NvmeRequest
*req
)
2959 NvmeNamespace
*ns
= req
->ns
;
2960 NvmeCopyCmd
*copy
= (NvmeCopyCmd
*)&req
->cmd
;
2961 NvmeCopyAIOCB
*iocb
= blk_aio_get(&nvme_copy_aiocb_info
, ns
->blkconf
.blk
,
2963 uint16_t nr
= copy
->nr
+ 1;
2964 uint8_t format
= copy
->control
[0] & 0xf;
2965 uint16_t prinfor
= ((copy
->control
[0] >> 4) & 0xf);
2966 uint16_t prinfow
= ((copy
->control
[2] >> 2) & 0xf);
2967 size_t len
= sizeof(NvmeCopySourceRangeFormat0
);
2971 trace_pci_nvme_copy(nvme_cid(req
), nvme_nsid(ns
), nr
, format
);
2973 iocb
->ranges
= NULL
;
2976 if (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
) &&
2977 ((prinfor
& NVME_PRINFO_PRACT
) != (prinfow
& NVME_PRINFO_PRACT
))) {
2978 status
= NVME_INVALID_FIELD
| NVME_DNR
;
2982 if (!(n
->id_ctrl
.ocfs
& (1 << format
))) {
2983 trace_pci_nvme_err_copy_invalid_format(format
);
2984 status
= NVME_INVALID_FIELD
| NVME_DNR
;
2988 if (nr
> ns
->id_ns
.msrc
+ 1) {
2989 status
= NVME_CMD_SIZE_LIMIT
| NVME_DNR
;
2993 if (ns
->pif
&& format
!= 0x1) {
2994 status
= NVME_INVALID_FORMAT
| NVME_DNR
;
2999 len
= sizeof(NvmeCopySourceRangeFormat1
);
3002 iocb
->format
= format
;
3003 iocb
->ranges
= g_malloc_n(nr
, len
);
3004 status
= nvme_h2c(n
, (uint8_t *)iocb
->ranges
, len
* nr
, req
);
3009 iocb
->slba
= le64_to_cpu(copy
->sdlba
);
3011 if (ns
->params
.zoned
) {
3012 iocb
->zone
= nvme_get_zone_by_slba(ns
, iocb
->slba
);
3014 status
= NVME_LBA_RANGE
| NVME_DNR
;
3018 status
= nvme_zrm_auto(n
, ns
, iocb
->zone
);
3025 iocb
->bh
= qemu_bh_new(nvme_copy_bh
, iocb
);
3029 iocb
->reftag
= le32_to_cpu(copy
->reftag
);
3030 iocb
->reftag
|= (uint64_t)le32_to_cpu(copy
->cdw3
) << 32;
3031 iocb
->bounce
= g_malloc_n(le16_to_cpu(ns
->id_ns
.mssrl
),
3032 ns
->lbasz
+ ns
->lbaf
.ms
);
3034 qemu_iovec_init(&iocb
->iov
, 1);
3036 block_acct_start(blk_get_stats(ns
->blkconf
.blk
), &iocb
->acct
.read
, 0,
3038 block_acct_start(blk_get_stats(ns
->blkconf
.blk
), &iocb
->acct
.write
, 0,
3041 req
->aiocb
= &iocb
->common
;
3042 nvme_copy_cb(iocb
, 0);
3044 return NVME_NO_COMPLETE
;
3047 g_free(iocb
->ranges
);
3048 qemu_aio_unref(iocb
);
static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    BlockBackend *blk = ns->blkconf.blk;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    size_t data_len = nvme_l2b(ns, nlb);
    size_t len = data_len;
    int64_t offset = nvme_l2b(ns, slba);
    struct nvme_compare_ctx *ctx = NULL;
    uint16_t status;

    trace_pci_nvme_compare(nvme_cid(req), nvme_nsid(ns), slba, nlb);

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps) && (prinfo & NVME_PRINFO_PRACT)) {
        return NVME_INVALID_PROT_INFO | NVME_DNR;
    }

    if (nvme_ns_ext(ns)) {
        len += nvme_m2b(ns, nlb);
    }

    status = nvme_check_mdts(n, len);
    if (status) {
        return status;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        return status;
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            return status;
        }
    }

    status = nvme_map_dptr(n, &req->sg, len, &req->cmd);
    if (status) {
        return status;
    }

    ctx = g_new(struct nvme_compare_ctx, 1);
    ctx->data.bounce = g_malloc(data_len);

    req->opaque = ctx;

    qemu_iovec_init(&ctx->data.iov, 1);
    qemu_iovec_add(&ctx->data.iov, ctx->data.bounce, data_len);

    block_acct_start(blk_get_stats(blk), &req->acct, data_len,
                     BLOCK_ACCT_READ);
    req->aiocb = blk_aio_preadv(blk, offset, &ctx->data.iov, 0,
                                nvme_compare_data_cb, req);

    return NVME_NO_COMPLETE;
}
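/*
 * Compare works on bounce buffers: the LBA data is first read from the
 * backing device into ctx->data.bounce and checked against the host buffer
 * in nvme_compare_data_cb(); if the namespace carries metadata, a second
 * read into ctx->mdata.bounce follows and nvme_compare_mdata_cb() finishes
 * the comparison before the request is completed.
 */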
typedef struct NvmeFlushAIOCB {
    BlockAIOCB common;
    BlockAIOCB *aiocb;
    NvmeRequest *req;
    QEMUBH *bh;
    int ret;

    NvmeNamespace *ns;
    uint32_t nsid;
    bool broadcast;
} NvmeFlushAIOCB;

static void nvme_flush_cancel(BlockAIOCB *acb)
{
    NvmeFlushAIOCB *iocb = container_of(acb, NvmeFlushAIOCB, common);

    iocb->ret = -ECANCELED;

    if (iocb->aiocb) {
        blk_aio_cancel_async(iocb->aiocb);
    }
}

static const AIOCBInfo nvme_flush_aiocb_info = {
    .aiocb_size = sizeof(NvmeFlushAIOCB),
    .cancel_async = nvme_flush_cancel,
    .get_aio_context = nvme_get_aio_context,
};

static void nvme_flush_ns_cb(void *opaque, int ret)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeNamespace *ns = iocb->ns;

    if (ret < 0) {
        iocb->ret = ret;
        goto out;
    } else if (iocb->ret < 0) {
        goto out;
    }

    if (ns) {
        trace_pci_nvme_flush_ns(iocb->nsid);

        iocb->ns = NULL;
        iocb->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_flush_ns_cb, iocb);
        return;
    }

out:
    iocb->aiocb = NULL;
    qemu_bh_schedule(iocb->bh);
}

static void nvme_flush_bh(void *opaque)
{
    NvmeFlushAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    int i;

    if (iocb->ret < 0) {
        goto done;
    }

    if (iocb->broadcast) {
        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
            iocb->ns = nvme_ns(n, i);
            if (iocb->ns) {
                iocb->nsid = i;
                break;
            }
        }
    }

    if (!iocb->ns) {
        goto done;
    }

    nvme_flush_ns_cb(iocb, 0);
    return;

done:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_aio_unref(iocb);
}
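/*
 * For a broadcast flush (NSID FFFFFFFFh) the bottom half walks the namespace
 * table: each pass picks the next attached namespace after iocb->nsid and
 * re-enters nvme_flush_ns_cb(), so the per-namespace flushes run one after
 * another until no namespace is left or an error has been recorded.
 */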
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFlushAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint16_t status;

    iocb = qemu_aio_get(&nvme_flush_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->req = req;
    iocb->bh = qemu_bh_new(nvme_flush_bh, iocb);
    iocb->ret = 0;
    iocb->ns = NULL;
    iocb->nsid = 0;
    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
            goto out;
        }

        iocb->ns = nvme_ns(n, nsid);
        if (!iocb->ns) {
            status = NVME_INVALID_FIELD | NVME_DNR;
            goto out;
        }

        iocb->nsid = nsid;
    }

    req->aiocb = &iocb->common;
    qemu_bh_schedule(iocb->bh);

    return NVME_NO_COMPLETE;

out:
    qemu_bh_delete(iocb->bh);
    iocb->bh = NULL;
    qemu_aio_unref(iocb);

    return status;
}
static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
    NvmeNamespace *ns = req->ns;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
    uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
    uint64_t data_size = nvme_l2b(ns, nlb);
    uint64_t mapped_size = data_size;
    uint64_t data_offset;
    BlockBackend *blk = ns->blkconf.blk;
    uint16_t status;

    if (nvme_ns_ext(ns)) {
        mapped_size += nvme_m2b(ns, nlb);

        if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
            bool pract = prinfo & NVME_PRINFO_PRACT;

            if (pract && ns->lbaf.ms == nvme_pi_tuple_size(ns)) {
                mapped_size = data_size;
            }
        }
    }

    trace_pci_nvme_read(nvme_cid(req), nvme_nsid(ns), nlb, mapped_size, slba);

    status = nvme_check_mdts(n, mapped_size);
    if (status) {
        goto invalid;
    }

    status = nvme_check_bounds(ns, slba, nlb);
    if (status) {
        goto invalid;
    }

    if (ns->params.zoned) {
        status = nvme_check_zone_read(ns, slba, nlb);
        if (status) {
            trace_pci_nvme_err_zone_read_not_ok(slba, nlb, status);
            goto invalid;
        }
    }

    if (NVME_ERR_REC_DULBE(ns->features.err_rec)) {
        status = nvme_check_dulbe(ns, slba, nlb);
        if (status) {
            goto invalid;
        }
    }

    if (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
        return nvme_dif_rw(n, req);
    }

    status = nvme_map_data(n, nlb, req);
    if (status) {
        goto invalid;
    }

    data_offset = nvme_l2b(ns, slba);

    block_acct_start(blk_get_stats(blk), &req->acct, data_size,
                     BLOCK_ACCT_READ);
    nvme_blk_read(blk, data_offset, nvme_rw_cb, req);
    return NVME_NO_COMPLETE;

invalid:
    block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
    return status | NVME_DNR;
}
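/*
 * Read checks are ordered: MDTS first, then LBA bounds, then (for zoned
 * namespaces) the zone read rules, and finally DULBE for deallocated blocks.
 * Namespaces formatted with end-to-end protection information never reach
 * nvme_blk_read() here; they are handed off to nvme_dif_rw() instead.
 */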
3323 static uint16_t nvme_do_write(NvmeCtrl
*n
, NvmeRequest
*req
, bool append
,
3326 NvmeRwCmd
*rw
= (NvmeRwCmd
*)&req
->cmd
;
3327 NvmeNamespace
*ns
= req
->ns
;
3328 uint64_t slba
= le64_to_cpu(rw
->slba
);
3329 uint32_t nlb
= (uint32_t)le16_to_cpu(rw
->nlb
) + 1;
3330 uint16_t ctrl
= le16_to_cpu(rw
->control
);
3331 uint8_t prinfo
= NVME_RW_PRINFO(ctrl
);
3332 uint64_t data_size
= nvme_l2b(ns
, nlb
);
3333 uint64_t mapped_size
= data_size
;
3334 uint64_t data_offset
;
3336 NvmeZonedResult
*res
= (NvmeZonedResult
*)&req
->cqe
;
3337 BlockBackend
*blk
= ns
->blkconf
.blk
;
3340 if (nvme_ns_ext(ns
)) {
3341 mapped_size
+= nvme_m2b(ns
, nlb
);
3343 if (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
)) {
3344 bool pract
= prinfo
& NVME_PRINFO_PRACT
;
3346 if (pract
&& ns
->lbaf
.ms
== nvme_pi_tuple_size(ns
)) {
3347 mapped_size
-= nvme_m2b(ns
, nlb
);
3352 trace_pci_nvme_write(nvme_cid(req
), nvme_io_opc_str(rw
->opcode
),
3353 nvme_nsid(ns
), nlb
, mapped_size
, slba
);
3356 status
= nvme_check_mdts(n
, mapped_size
);
3362 status
= nvme_check_bounds(ns
, slba
, nlb
);
3367 if (ns
->params
.zoned
) {
3368 zone
= nvme_get_zone_by_slba(ns
, slba
);
3372 bool piremap
= !!(ctrl
& NVME_RW_PIREMAP
);
3374 if (unlikely(zone
->d
.za
& NVME_ZA_ZRWA_VALID
)) {
3375 return NVME_INVALID_ZONE_OP
| NVME_DNR
;
3378 if (unlikely(slba
!= zone
->d
.zslba
)) {
3379 trace_pci_nvme_err_append_not_at_start(slba
, zone
->d
.zslba
);
3380 status
= NVME_INVALID_FIELD
;
3384 if (n
->params
.zasl
&&
3385 data_size
> (uint64_t)n
->page_size
<< n
->params
.zasl
) {
3386 trace_pci_nvme_err_zasl(data_size
);
3387 return NVME_INVALID_FIELD
| NVME_DNR
;
3391 rw
->slba
= cpu_to_le64(slba
);
3392 res
->slba
= cpu_to_le64(slba
);
3394 switch (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
)) {
3395 case NVME_ID_NS_DPS_TYPE_1
:
3397 return NVME_INVALID_PROT_INFO
| NVME_DNR
;
3402 case NVME_ID_NS_DPS_TYPE_2
:
3404 uint32_t reftag
= le32_to_cpu(rw
->reftag
);
3405 rw
->reftag
= cpu_to_le32(reftag
+ (slba
- zone
->d
.zslba
));
3410 case NVME_ID_NS_DPS_TYPE_3
:
3412 return NVME_INVALID_PROT_INFO
| NVME_DNR
;
3419 status
= nvme_check_zone_write(ns
, zone
, slba
, nlb
);
3424 status
= nvme_zrm_auto(n
, ns
, zone
);
3429 if (!(zone
->d
.za
& NVME_ZA_ZRWA_VALID
)) {
3434 data_offset
= nvme_l2b(ns
, slba
);
3436 if (NVME_ID_NS_DPS_TYPE(ns
->id_ns
.dps
)) {
3437 return nvme_dif_rw(n
, req
);
3441 status
= nvme_map_data(n
, nlb
, req
);
3446 block_acct_start(blk_get_stats(blk
), &req
->acct
, data_size
,
3448 nvme_blk_write(blk
, data_offset
, nvme_rw_cb
, req
);
3450 req
->aiocb
= blk_aio_pwrite_zeroes(blk
, data_offset
, data_size
,
3451 BDRV_REQ_MAY_UNMAP
, nvme_rw_cb
,
3455 return NVME_NO_COMPLETE
;
3458 block_acct_invalid(blk_get_stats(blk
), BLOCK_ACCT_WRITE
);
3459 return status
| NVME_DNR
;
static inline uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, false);
}

static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, false, true);
}

static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
{
    return nvme_do_write(n, req, true, false);
}

static uint16_t nvme_get_mgmt_zone_slba_idx(NvmeNamespace *ns, NvmeCmd *c,
                                            uint64_t *slba, uint32_t *zone_idx)
{
    uint32_t dw10 = le32_to_cpu(c->cdw10);
    uint32_t dw11 = le32_to_cpu(c->cdw11);

    if (!ns->params.zoned) {
        trace_pci_nvme_err_invalid_opc(c->opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    *slba = ((uint64_t)dw11) << 32 | dw10;
    if (unlikely(*slba >= ns->id_ns.nsze)) {
        trace_pci_nvme_err_invalid_lba_range(*slba, 0, ns->id_ns.nsze);
        *slba = 0;
        return NVME_LBA_RANGE | NVME_DNR;
    }

    *zone_idx = nvme_zone_idx(ns, *slba);
    assert(*zone_idx < ns->num_zones);

    return NVME_SUCCESS;
}
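/*
 * The target SLBA of a Zone Management command is split across CDW10 (low 32
 * bits) and CDW11 (high 32 bits); e.g. CDW10=00001000h and CDW11=00000002h
 * select SLBA 200001000h.
 */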
3501 typedef uint16_t (*op_handler_t
)(NvmeNamespace
*, NvmeZone
*, NvmeZoneState
,
3504 enum NvmeZoneProcessingMask
{
3505 NVME_PROC_CURRENT_ZONE
= 0,
3506 NVME_PROC_OPENED_ZONES
= 1 << 0,
3507 NVME_PROC_CLOSED_ZONES
= 1 << 1,
3508 NVME_PROC_READ_ONLY_ZONES
= 1 << 2,
3509 NVME_PROC_FULL_ZONES
= 1 << 3,
3512 static uint16_t nvme_open_zone(NvmeNamespace
*ns
, NvmeZone
*zone
,
3513 NvmeZoneState state
, NvmeRequest
*req
)
3515 NvmeZoneSendCmd
*cmd
= (NvmeZoneSendCmd
*)&req
->cmd
;
3518 if (cmd
->zsflags
& NVME_ZSFLAG_ZRWA_ALLOC
) {
3519 uint16_t ozcs
= le16_to_cpu(ns
->id_ns_zoned
->ozcs
);
3521 if (!(ozcs
& NVME_ID_NS_ZONED_OZCS_ZRWASUP
)) {
3522 return NVME_INVALID_ZONE_OP
| NVME_DNR
;
3525 if (zone
->w_ptr
% ns
->zns
.zrwafg
) {
3526 return NVME_NOZRWA
| NVME_DNR
;
3529 flags
= NVME_ZRM_ZRWA
;
3532 return nvme_zrm_open_flags(nvme_ctrl(req
), ns
, zone
, flags
);
3535 static uint16_t nvme_close_zone(NvmeNamespace
*ns
, NvmeZone
*zone
,
3536 NvmeZoneState state
, NvmeRequest
*req
)
3538 return nvme_zrm_close(ns
, zone
);
3541 static uint16_t nvme_finish_zone(NvmeNamespace
*ns
, NvmeZone
*zone
,
3542 NvmeZoneState state
, NvmeRequest
*req
)
3544 return nvme_zrm_finish(ns
, zone
);
3547 static uint16_t nvme_offline_zone(NvmeNamespace
*ns
, NvmeZone
*zone
,
3548 NvmeZoneState state
, NvmeRequest
*req
)
3551 case NVME_ZONE_STATE_READ_ONLY
:
3552 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_OFFLINE
);
3554 case NVME_ZONE_STATE_OFFLINE
:
3555 return NVME_SUCCESS
;
3557 return NVME_ZONE_INVAL_TRANSITION
;
3561 static uint16_t nvme_set_zd_ext(NvmeNamespace
*ns
, NvmeZone
*zone
)
3564 uint8_t state
= nvme_get_zone_state(zone
);
3566 if (state
== NVME_ZONE_STATE_EMPTY
) {
3567 status
= nvme_aor_check(ns
, 1, 0);
3571 nvme_aor_inc_active(ns
);
3572 zone
->d
.za
|= NVME_ZA_ZD_EXT_VALID
;
3573 nvme_assign_zone_state(ns
, zone
, NVME_ZONE_STATE_CLOSED
);
3574 return NVME_SUCCESS
;
3577 return NVME_ZONE_INVAL_TRANSITION
;
3580 static uint16_t nvme_bulk_proc_zone(NvmeNamespace
*ns
, NvmeZone
*zone
,
3581 enum NvmeZoneProcessingMask proc_mask
,
3582 op_handler_t op_hndlr
, NvmeRequest
*req
)
3584 uint16_t status
= NVME_SUCCESS
;
3585 NvmeZoneState zs
= nvme_get_zone_state(zone
);
3589 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
3590 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
3591 proc_zone
= proc_mask
& NVME_PROC_OPENED_ZONES
;
3593 case NVME_ZONE_STATE_CLOSED
:
3594 proc_zone
= proc_mask
& NVME_PROC_CLOSED_ZONES
;
3596 case NVME_ZONE_STATE_READ_ONLY
:
3597 proc_zone
= proc_mask
& NVME_PROC_READ_ONLY_ZONES
;
3599 case NVME_ZONE_STATE_FULL
:
3600 proc_zone
= proc_mask
& NVME_PROC_FULL_ZONES
;
3607 status
= op_hndlr(ns
, zone
, zs
, req
);
3613 static uint16_t nvme_do_zone_op(NvmeNamespace
*ns
, NvmeZone
*zone
,
3614 enum NvmeZoneProcessingMask proc_mask
,
3615 op_handler_t op_hndlr
, NvmeRequest
*req
)
3618 uint16_t status
= NVME_SUCCESS
;
3622 status
= op_hndlr(ns
, zone
, nvme_get_zone_state(zone
), req
);
3624 if (proc_mask
& NVME_PROC_CLOSED_ZONES
) {
3625 QTAILQ_FOREACH_SAFE(zone
, &ns
->closed_zones
, entry
, next
) {
3626 status
= nvme_bulk_proc_zone(ns
, zone
, proc_mask
, op_hndlr
,
3628 if (status
&& status
!= NVME_NO_COMPLETE
) {
3633 if (proc_mask
& NVME_PROC_OPENED_ZONES
) {
3634 QTAILQ_FOREACH_SAFE(zone
, &ns
->imp_open_zones
, entry
, next
) {
3635 status
= nvme_bulk_proc_zone(ns
, zone
, proc_mask
, op_hndlr
,
3637 if (status
&& status
!= NVME_NO_COMPLETE
) {
3642 QTAILQ_FOREACH_SAFE(zone
, &ns
->exp_open_zones
, entry
, next
) {
3643 status
= nvme_bulk_proc_zone(ns
, zone
, proc_mask
, op_hndlr
,
3645 if (status
&& status
!= NVME_NO_COMPLETE
) {
3650 if (proc_mask
& NVME_PROC_FULL_ZONES
) {
3651 QTAILQ_FOREACH_SAFE(zone
, &ns
->full_zones
, entry
, next
) {
3652 status
= nvme_bulk_proc_zone(ns
, zone
, proc_mask
, op_hndlr
,
3654 if (status
&& status
!= NVME_NO_COMPLETE
) {
3660 if (proc_mask
& NVME_PROC_READ_ONLY_ZONES
) {
3661 for (i
= 0; i
< ns
->num_zones
; i
++, zone
++) {
3662 status
= nvme_bulk_proc_zone(ns
, zone
, proc_mask
, op_hndlr
,
3664 if (status
&& status
!= NVME_NO_COMPLETE
) {
3675 typedef struct NvmeZoneResetAIOCB
{
3685 } NvmeZoneResetAIOCB
;
3687 static void nvme_zone_reset_cancel(BlockAIOCB
*aiocb
)
3689 NvmeZoneResetAIOCB
*iocb
= container_of(aiocb
, NvmeZoneResetAIOCB
, common
);
3690 NvmeRequest
*req
= iocb
->req
;
3691 NvmeNamespace
*ns
= req
->ns
;
3693 iocb
->idx
= ns
->num_zones
;
3695 iocb
->ret
= -ECANCELED
;
3698 blk_aio_cancel_async(iocb
->aiocb
);
3703 static const AIOCBInfo nvme_zone_reset_aiocb_info
= {
3704 .aiocb_size
= sizeof(NvmeZoneResetAIOCB
),
3705 .cancel_async
= nvme_zone_reset_cancel
,
3708 static void nvme_zone_reset_bh(void *opaque
)
3710 NvmeZoneResetAIOCB
*iocb
= opaque
;
3712 iocb
->common
.cb(iocb
->common
.opaque
, iocb
->ret
);
3714 qemu_bh_delete(iocb
->bh
);
3716 qemu_aio_unref(iocb
);
3719 static void nvme_zone_reset_cb(void *opaque
, int ret
);
3721 static void nvme_zone_reset_epilogue_cb(void *opaque
, int ret
)
3723 NvmeZoneResetAIOCB
*iocb
= opaque
;
3724 NvmeRequest
*req
= iocb
->req
;
3725 NvmeNamespace
*ns
= req
->ns
;
3730 nvme_zone_reset_cb(iocb
, ret
);
3735 nvme_zone_reset_cb(iocb
, 0);
3739 moff
= nvme_moff(ns
, iocb
->zone
->d
.zslba
);
3740 count
= nvme_m2b(ns
, ns
->zone_size
);
3742 iocb
->aiocb
= blk_aio_pwrite_zeroes(ns
->blkconf
.blk
, moff
, count
,
3744 nvme_zone_reset_cb
, iocb
);
3748 static void nvme_zone_reset_cb(void *opaque
, int ret
)
3750 NvmeZoneResetAIOCB
*iocb
= opaque
;
3751 NvmeRequest
*req
= iocb
->req
;
3752 NvmeNamespace
*ns
= req
->ns
;
3760 nvme_zrm_reset(ns
, iocb
->zone
);
3767 while (iocb
->idx
< ns
->num_zones
) {
3768 NvmeZone
*zone
= &ns
->zone_array
[iocb
->idx
++];
3770 switch (nvme_get_zone_state(zone
)) {
3771 case NVME_ZONE_STATE_EMPTY
:
3778 case NVME_ZONE_STATE_EXPLICITLY_OPEN
:
3779 case NVME_ZONE_STATE_IMPLICITLY_OPEN
:
3780 case NVME_ZONE_STATE_CLOSED
:
3781 case NVME_ZONE_STATE_FULL
:
3789 trace_pci_nvme_zns_zone_reset(zone
->d
.zslba
);
3791 iocb
->aiocb
= blk_aio_pwrite_zeroes(ns
->blkconf
.blk
,
3792 nvme_l2b(ns
, zone
->d
.zslba
),
3793 nvme_l2b(ns
, ns
->zone_size
),
3795 nvme_zone_reset_epilogue_cb
,
3803 qemu_bh_schedule(iocb
->bh
);
3807 static uint16_t nvme_zone_mgmt_send_zrwa_flush(NvmeCtrl
*n
, NvmeZone
*zone
,
3808 uint64_t elba
, NvmeRequest
*req
)
3810 NvmeNamespace
*ns
= req
->ns
;
3811 uint16_t ozcs
= le16_to_cpu(ns
->id_ns_zoned
->ozcs
);
3812 uint64_t wp
= zone
->d
.wp
;
3813 uint32_t nlb
= elba
- wp
+ 1;
3817 if (!(ozcs
& NVME_ID_NS_ZONED_OZCS_ZRWASUP
)) {
3818 return NVME_INVALID_ZONE_OP
| NVME_DNR
;
3821 if (!(zone
->d
.za
& NVME_ZA_ZRWA_VALID
)) {
3822 return NVME_INVALID_FIELD
| NVME_DNR
;
3825 if (elba
< wp
|| elba
> wp
+ ns
->zns
.zrwas
) {
3826 return NVME_ZONE_BOUNDARY_ERROR
| NVME_DNR
;
3829 if (nlb
% ns
->zns
.zrwafg
) {
3830 return NVME_INVALID_FIELD
| NVME_DNR
;
3833 status
= nvme_zrm_auto(n
, ns
, zone
);
3840 nvme_advance_zone_wp(ns
, zone
, nlb
);
3842 return NVME_SUCCESS
;
3845 static uint16_t nvme_zone_mgmt_send(NvmeCtrl
*n
, NvmeRequest
*req
)
3847 NvmeZoneSendCmd
*cmd
= (NvmeZoneSendCmd
*)&req
->cmd
;
3848 NvmeNamespace
*ns
= req
->ns
;
3850 NvmeZoneResetAIOCB
*iocb
;
3853 uint32_t zone_idx
= 0;
3855 uint8_t action
= cmd
->zsa
;
3857 enum NvmeZoneProcessingMask proc_mask
= NVME_PROC_CURRENT_ZONE
;
3859 all
= cmd
->zsflags
& NVME_ZSFLAG_SELECT_ALL
;
3861 req
->status
= NVME_SUCCESS
;
3864 status
= nvme_get_mgmt_zone_slba_idx(ns
, &req
->cmd
, &slba
, &zone_idx
);
3870 zone
= &ns
->zone_array
[zone_idx
];
3871 if (slba
!= zone
->d
.zslba
&& action
!= NVME_ZONE_ACTION_ZRWA_FLUSH
) {
3872 trace_pci_nvme_err_unaligned_zone_cmd(action
, slba
, zone
->d
.zslba
);
3873 return NVME_INVALID_FIELD
| NVME_DNR
;
3878 case NVME_ZONE_ACTION_OPEN
:
3880 proc_mask
= NVME_PROC_CLOSED_ZONES
;
3882 trace_pci_nvme_open_zone(slba
, zone_idx
, all
);
3883 status
= nvme_do_zone_op(ns
, zone
, proc_mask
, nvme_open_zone
, req
);
3886 case NVME_ZONE_ACTION_CLOSE
:
3888 proc_mask
= NVME_PROC_OPENED_ZONES
;
3890 trace_pci_nvme_close_zone(slba
, zone_idx
, all
);
3891 status
= nvme_do_zone_op(ns
, zone
, proc_mask
, nvme_close_zone
, req
);
3894 case NVME_ZONE_ACTION_FINISH
:
3896 proc_mask
= NVME_PROC_OPENED_ZONES
| NVME_PROC_CLOSED_ZONES
;
3898 trace_pci_nvme_finish_zone(slba
, zone_idx
, all
);
3899 status
= nvme_do_zone_op(ns
, zone
, proc_mask
, nvme_finish_zone
, req
);
3902 case NVME_ZONE_ACTION_RESET
:
3903 trace_pci_nvme_reset_zone(slba
, zone_idx
, all
);
3905 iocb
= blk_aio_get(&nvme_zone_reset_aiocb_info
, ns
->blkconf
.blk
,
3909 iocb
->bh
= qemu_bh_new(nvme_zone_reset_bh
, iocb
);
3912 iocb
->idx
= zone_idx
;
3915 req
->aiocb
= &iocb
->common
;
3916 nvme_zone_reset_cb(iocb
, 0);
3918 return NVME_NO_COMPLETE
;
3920 case NVME_ZONE_ACTION_OFFLINE
:
3922 proc_mask
= NVME_PROC_READ_ONLY_ZONES
;
3924 trace_pci_nvme_offline_zone(slba
, zone_idx
, all
);
3925 status
= nvme_do_zone_op(ns
, zone
, proc_mask
, nvme_offline_zone
, req
);
3928 case NVME_ZONE_ACTION_SET_ZD_EXT
:
3929 trace_pci_nvme_set_descriptor_extension(slba
, zone_idx
);
3930 if (all
|| !ns
->params
.zd_extension_size
) {
3931 return NVME_INVALID_FIELD
| NVME_DNR
;
3933 zd_ext
= nvme_get_zd_extension(ns
, zone_idx
);
3934 status
= nvme_h2c(n
, zd_ext
, ns
->params
.zd_extension_size
, req
);
3936 trace_pci_nvme_err_zd_extension_map_error(zone_idx
);
3940 status
= nvme_set_zd_ext(ns
, zone
);
3941 if (status
== NVME_SUCCESS
) {
3942 trace_pci_nvme_zd_extension_set(zone_idx
);
3947 case NVME_ZONE_ACTION_ZRWA_FLUSH
:
3949 return NVME_INVALID_FIELD
| NVME_DNR
;
3952 return nvme_zone_mgmt_send_zrwa_flush(n
, zone
, slba
, req
);
3955 trace_pci_nvme_err_invalid_mgmt_action(action
);
3956 status
= NVME_INVALID_FIELD
;
3959 if (status
== NVME_ZONE_INVAL_TRANSITION
) {
3960 trace_pci_nvme_err_invalid_zone_state_transition(action
, slba
,
3970 static bool nvme_zone_matches_filter(uint32_t zafs
, NvmeZone
*zl
)
3972 NvmeZoneState zs
= nvme_get_zone_state(zl
);
3975 case NVME_ZONE_REPORT_ALL
:
3977 case NVME_ZONE_REPORT_EMPTY
:
3978 return zs
== NVME_ZONE_STATE_EMPTY
;
3979 case NVME_ZONE_REPORT_IMPLICITLY_OPEN
:
3980 return zs
== NVME_ZONE_STATE_IMPLICITLY_OPEN
;
3981 case NVME_ZONE_REPORT_EXPLICITLY_OPEN
:
3982 return zs
== NVME_ZONE_STATE_EXPLICITLY_OPEN
;
3983 case NVME_ZONE_REPORT_CLOSED
:
3984 return zs
== NVME_ZONE_STATE_CLOSED
;
3985 case NVME_ZONE_REPORT_FULL
:
3986 return zs
== NVME_ZONE_STATE_FULL
;
3987 case NVME_ZONE_REPORT_READ_ONLY
:
3988 return zs
== NVME_ZONE_STATE_READ_ONLY
;
3989 case NVME_ZONE_REPORT_OFFLINE
:
3990 return zs
== NVME_ZONE_STATE_OFFLINE
;
3996 static uint16_t nvme_zone_mgmt_recv(NvmeCtrl
*n
, NvmeRequest
*req
)
3998 NvmeCmd
*cmd
= (NvmeCmd
*)&req
->cmd
;
3999 NvmeNamespace
*ns
= req
->ns
;
4000 /* cdw12 is zero-based number of dwords to return. Convert to bytes */
4001 uint32_t data_size
= (le32_to_cpu(cmd
->cdw12
) + 1) << 2;
4002 uint32_t dw13
= le32_to_cpu(cmd
->cdw13
);
4003 uint32_t zone_idx
, zra
, zrasf
, partial
;
4004 uint64_t max_zones
, nr_zones
= 0;
4009 NvmeZoneReportHeader
*header
;
4011 size_t zone_entry_sz
;
4014 req
->status
= NVME_SUCCESS
;
4016 status
= nvme_get_mgmt_zone_slba_idx(ns
, cmd
, &slba
, &zone_idx
);
4022 if (zra
!= NVME_ZONE_REPORT
&& zra
!= NVME_ZONE_REPORT_EXTENDED
) {
4023 return NVME_INVALID_FIELD
| NVME_DNR
;
4025 if (zra
== NVME_ZONE_REPORT_EXTENDED
&& !ns
->params
.zd_extension_size
) {
4026 return NVME_INVALID_FIELD
| NVME_DNR
;
4029 zrasf
= (dw13
>> 8) & 0xff;
4030 if (zrasf
> NVME_ZONE_REPORT_OFFLINE
) {
4031 return NVME_INVALID_FIELD
| NVME_DNR
;
4034 if (data_size
< sizeof(NvmeZoneReportHeader
)) {
4035 return NVME_INVALID_FIELD
| NVME_DNR
;
4038 status
= nvme_check_mdts(n
, data_size
);
4043 partial
= (dw13
>> 16) & 0x01;
4045 zone_entry_sz
= sizeof(NvmeZoneDescr
);
4046 if (zra
== NVME_ZONE_REPORT_EXTENDED
) {
4047 zone_entry_sz
+= ns
->params
.zd_extension_size
;
4050 max_zones
= (data_size
- sizeof(NvmeZoneReportHeader
)) / zone_entry_sz
;
4051 buf
= g_malloc0(data_size
);
4053 zone
= &ns
->zone_array
[zone_idx
];
4054 for (i
= zone_idx
; i
< ns
->num_zones
; i
++) {
4055 if (partial
&& nr_zones
>= max_zones
) {
4058 if (nvme_zone_matches_filter(zrasf
, zone
++)) {
4062 header
= (NvmeZoneReportHeader
*)buf
;
4063 header
->nr_zones
= cpu_to_le64(nr_zones
);
4065 buf_p
= buf
+ sizeof(NvmeZoneReportHeader
);
4066 for (; zone_idx
< ns
->num_zones
&& max_zones
> 0; zone_idx
++) {
4067 zone
= &ns
->zone_array
[zone_idx
];
4068 if (nvme_zone_matches_filter(zrasf
, zone
)) {
4069 z
= (NvmeZoneDescr
*)buf_p
;
4070 buf_p
+= sizeof(NvmeZoneDescr
);
4074 z
->zcap
= cpu_to_le64(zone
->d
.zcap
);
4075 z
->zslba
= cpu_to_le64(zone
->d
.zslba
);
4078 if (nvme_wp_is_valid(zone
)) {
4079 z
->wp
= cpu_to_le64(zone
->d
.wp
);
4081 z
->wp
= cpu_to_le64(~0ULL);
4084 if (zra
== NVME_ZONE_REPORT_EXTENDED
) {
4085 if (zone
->d
.za
& NVME_ZA_ZD_EXT_VALID
) {
4086 memcpy(buf_p
, nvme_get_zd_extension(ns
, zone_idx
),
4087 ns
->params
.zd_extension_size
);
4089 buf_p
+= ns
->params
.zd_extension_size
;
4096 status
= nvme_c2h(n
, (uint8_t *)buf
, data_size
, req
);
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);

    trace_pci_nvme_io_cmd(nvme_cid(req), nsid, nvme_sqid(req),
                          req->cmd.opcode, nvme_io_opc_str(req->cmd.opcode));

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    /*
     * In the base NVM command set, Flush may apply to all namespaces
     * (indicated by NSID being set to FFFFFFFFh). But if that feature is used
     * along with TP 4056 (Namespace Types), it may be pretty screwed up.
     *
     * If NSID is indeed set to FFFFFFFFh, we simply cannot associate the
     * opcode with a specific command since we cannot determine a unique I/O
     * command set. Opcode 0h could have any other meaning than something
     * equivalent to flushing and say it DOES have completely different
     * semantics in some other command set - does an NSID of FFFFFFFFh then
     * mean "for all namespaces, apply whatever command set specific command
     * that uses the 0h opcode?" Or does it mean "for all namespaces, apply
     * whatever command that uses the 0h opcode if, and only if, it allows
     * NSID to be FFFFFFFFh"?
     *
     * Anyway (and luckily), for now, we do not care about this since the
     * device only supports namespace types that include the NVM Flush command
     * (NVM and Zoned), so always do an NVM Flush.
     */
    if (req->cmd.opcode == NVME_CMD_FLUSH) {
        return nvme_flush(n, req);
    }

    ns = nvme_ns(n, nsid);
    if (unlikely(!ns)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!(ns->iocs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
        return NVME_INVALID_FIELD;
    }

    switch (req->cmd.opcode) {
    case NVME_CMD_WRITE_ZEROES:
        return nvme_write_zeroes(n, req);
    case NVME_CMD_ZONE_APPEND:
        return nvme_zone_append(n, req);
    case NVME_CMD_WRITE:
        return nvme_write(n, req);
    case NVME_CMD_READ:
        return nvme_read(n, req);
    case NVME_CMD_COMPARE:
        return nvme_compare(n, req);
    case NVME_CMD_DSM:
        return nvme_dsm(n, req);
    case NVME_CMD_VERIFY:
        return nvme_verify(n, req);
    case NVME_CMD_COPY:
        return nvme_copy(n, req);
    case NVME_CMD_ZONE_MGMT_SEND:
        return nvme_zone_mgmt_send(n, req);
    case NVME_CMD_ZONE_MGMT_RECV:
        return nvme_zone_mgmt_recv(n, req);
    }

    return NVME_INVALID_OPCODE | NVME_DNR;
}
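/*
 * Dispatch is gated by the per-namespace command effects table: an opcode is
 * only accepted when its NVME_CMD_EFF_CSUPP bit is set in ns->iocs[], which
 * is how NVM-only and zoned namespaces end up with different admissible I/O
 * command sets behind the same switch statement.
 */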
static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)&req->cmd;
    NvmeRequest *r, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (unlikely(!qid || nvme_check_sqid(n, qid))) {
        trace_pci_nvme_err_invalid_del_sq(qid);
        return NVME_INVALID_QID | NVME_DNR;
    }

    trace_pci_nvme_del_sq(qid);

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        r = QTAILQ_FIRST(&sq->out_req_list);
        assert(r->aiocb);
        blk_aio_cancel(r->aiocb);
    }

    assert(QTAILQ_EMPTY(&sq->out_req_list));

    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(r, &cq->req_list, entry, next) {
            if (r->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, r, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, r, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}
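/*
 * Deleting a submission queue cancels its in-flight AIOs synchronously and,
 * if the paired completion queue still exists, moves any of this queue's
 * requests that are still waiting on the CQ back onto the SQ free list before
 * the queue memory is released.
 */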
4237 static void nvme_init_sq(NvmeSQueue
*sq
, NvmeCtrl
*n
, uint64_t dma_addr
,
4238 uint16_t sqid
, uint16_t cqid
, uint16_t size
)
4244 sq
->dma_addr
= dma_addr
;
4248 sq
->head
= sq
->tail
= 0;
4249 sq
->io_req
= g_new0(NvmeRequest
, sq
->size
);
4251 QTAILQ_INIT(&sq
->req_list
);
4252 QTAILQ_INIT(&sq
->out_req_list
);
4253 for (i
= 0; i
< sq
->size
; i
++) {
4254 sq
->io_req
[i
].sq
= sq
;
4255 QTAILQ_INSERT_TAIL(&(sq
->req_list
), &sq
->io_req
[i
], entry
);
4257 sq
->timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
, nvme_process_sq
, sq
);
4259 assert(n
->cq
[cqid
]);
4261 QTAILQ_INSERT_TAIL(&(cq
->sq_list
), sq
, entry
);
4265 static uint16_t nvme_create_sq(NvmeCtrl
*n
, NvmeRequest
*req
)
4268 NvmeCreateSq
*c
= (NvmeCreateSq
*)&req
->cmd
;
4270 uint16_t cqid
= le16_to_cpu(c
->cqid
);
4271 uint16_t sqid
= le16_to_cpu(c
->sqid
);
4272 uint16_t qsize
= le16_to_cpu(c
->qsize
);
4273 uint16_t qflags
= le16_to_cpu(c
->sq_flags
);
4274 uint64_t prp1
= le64_to_cpu(c
->prp1
);
4276 trace_pci_nvme_create_sq(prp1
, sqid
, cqid
, qsize
, qflags
);
4278 if (unlikely(!cqid
|| nvme_check_cqid(n
, cqid
))) {
4279 trace_pci_nvme_err_invalid_create_sq_cqid(cqid
);
4280 return NVME_INVALID_CQID
| NVME_DNR
;
4282 if (unlikely(!sqid
|| sqid
> n
->params
.max_ioqpairs
||
4283 n
->sq
[sqid
] != NULL
)) {
4284 trace_pci_nvme_err_invalid_create_sq_sqid(sqid
);
4285 return NVME_INVALID_QID
| NVME_DNR
;
4287 if (unlikely(!qsize
|| qsize
> NVME_CAP_MQES(ldq_le_p(&n
->bar
.cap
)))) {
4288 trace_pci_nvme_err_invalid_create_sq_size(qsize
);
4289 return NVME_MAX_QSIZE_EXCEEDED
| NVME_DNR
;
4291 if (unlikely(prp1
& (n
->page_size
- 1))) {
4292 trace_pci_nvme_err_invalid_create_sq_addr(prp1
);
4293 return NVME_INVALID_PRP_OFFSET
| NVME_DNR
;
4295 if (unlikely(!(NVME_SQ_FLAGS_PC(qflags
)))) {
4296 trace_pci_nvme_err_invalid_create_sq_qflags(NVME_SQ_FLAGS_PC(qflags
));
4297 return NVME_INVALID_FIELD
| NVME_DNR
;
4299 sq
= g_malloc0(sizeof(*sq
));
4300 nvme_init_sq(sq
, n
, prp1
, sqid
, cqid
, qsize
+ 1);
4301 return NVME_SUCCESS
;
struct nvme_stats {
    uint64_t units_read;
    uint64_t units_written;
    uint64_t read_commands;
    uint64_t write_commands;
};

static void nvme_set_blk_stats(NvmeNamespace *ns, struct nvme_stats *stats)
{
    BlockAcctStats *s = blk_get_stats(ns->blkconf.blk);

    stats->units_read += s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
    stats->units_written += s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
    stats->read_commands += s->nr_ops[BLOCK_ACCT_READ];
    stats->write_commands += s->nr_ops[BLOCK_ACCT_WRITE];
}
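/*
 * The accumulators above are kept in 512-byte units (nr_bytes shifted by
 * BDRV_SECTOR_BITS). nvme_smart_info() then derives the SMART Data Units
 * Read/Written fields from them, which the NVMe specification defines in
 * units of 1000 such 512-byte blocks (hence the DIV_ROUND_UP there).
 */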
4321 static uint16_t nvme_smart_info(NvmeCtrl
*n
, uint8_t rae
, uint32_t buf_len
,
4322 uint64_t off
, NvmeRequest
*req
)
4324 uint32_t nsid
= le32_to_cpu(req
->cmd
.nsid
);
4325 struct nvme_stats stats
= { 0 };
4326 NvmeSmartLog smart
= { 0 };
4331 if (off
>= sizeof(smart
)) {
4332 return NVME_INVALID_FIELD
| NVME_DNR
;
4335 if (nsid
!= 0xffffffff) {
4336 ns
= nvme_ns(n
, nsid
);
4338 return NVME_INVALID_NSID
| NVME_DNR
;
4340 nvme_set_blk_stats(ns
, &stats
);
4344 for (i
= 1; i
<= NVME_MAX_NAMESPACES
; i
++) {
4349 nvme_set_blk_stats(ns
, &stats
);
4353 trans_len
= MIN(sizeof(smart
) - off
, buf_len
);
4354 smart
.critical_warning
= n
->smart_critical_warning
;
4356 smart
.data_units_read
[0] = cpu_to_le64(DIV_ROUND_UP(stats
.units_read
,
4358 smart
.data_units_written
[0] = cpu_to_le64(DIV_ROUND_UP(stats
.units_written
,
4360 smart
.host_read_commands
[0] = cpu_to_le64(stats
.read_commands
);
4361 smart
.host_write_commands
[0] = cpu_to_le64(stats
.write_commands
);
4363 smart
.temperature
= cpu_to_le16(n
->temperature
);
4365 if ((n
->temperature
>= n
->features
.temp_thresh_hi
) ||
4366 (n
->temperature
<= n
->features
.temp_thresh_low
)) {
4367 smart
.critical_warning
|= NVME_SMART_TEMPERATURE
;
4370 current_ms
= qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL
);
4371 smart
.power_on_hours
[0] =
4372 cpu_to_le64((((current_ms
- n
->starttime_ms
) / 1000) / 60) / 60);
4375 nvme_clear_events(n
, NVME_AER_TYPE_SMART
);
4378 return nvme_c2h(n
, (uint8_t *) &smart
+ off
, trans_len
, req
);
static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
                                 NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeFwSlotInfoLog fw_log = {
        .afi = 0x1,
    };

    if (off >= sizeof(fw_log)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    strpadcpy((char *)&fw_log.frs1, sizeof(fw_log.frs1), "1.0", ' ');
    trans_len = MIN(sizeof(fw_log) - off, buf_len);

    return nvme_c2h(n, (uint8_t *) &fw_log + off, trans_len, req);
}

static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
                                uint64_t off, NvmeRequest *req)
{
    uint32_t trans_len;
    NvmeErrorLog errlog;

    if (off >= sizeof(errlog)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (!rae) {
        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
    }

    memset(&errlog, 0x0, sizeof(errlog));
    trans_len = MIN(sizeof(errlog) - off, buf_len);

    return nvme_c2h(n, (uint8_t *)&errlog, trans_len, req);
}
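/*
 * The emulated log pages are intentionally minimal: the firmware slot log
 * reports a single active slot with revision string "1.0", and the error
 * information log is always returned zeroed since the device does not record
 * error log entries.
 */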
4419 static uint16_t nvme_changed_nslist(NvmeCtrl
*n
, uint8_t rae
, uint32_t buf_len
,
4420 uint64_t off
, NvmeRequest
*req
)
4422 uint32_t nslist
[1024];
4427 if (off
>= sizeof(nslist
)) {
4428 trace_pci_nvme_err_invalid_log_page_offset(off
, sizeof(nslist
));
4429 return NVME_INVALID_FIELD
| NVME_DNR
;
4432 memset(nslist
, 0x0, sizeof(nslist
));
4433 trans_len
= MIN(sizeof(nslist
) - off
, buf_len
);
4435 while ((nsid
= find_first_bit(n
->changed_nsids
, NVME_CHANGED_NSID_SIZE
)) !=
4436 NVME_CHANGED_NSID_SIZE
) {
4438 * If more than 1024 namespaces, the first entry in the log page should
4439 * be set to FFFFFFFFh and the others to 0 as spec.
4441 if (i
== ARRAY_SIZE(nslist
)) {
4442 memset(nslist
, 0x0, sizeof(nslist
));
4443 nslist
[0] = 0xffffffff;
4448 clear_bit(nsid
, n
->changed_nsids
);
4452 * Remove all the remaining list entries in case returns directly due to
4453 * more than 1024 namespaces.
4455 if (nslist
[0] == 0xffffffff) {
4456 bitmap_zero(n
->changed_nsids
, NVME_CHANGED_NSID_SIZE
);
4460 nvme_clear_events(n
, NVME_AER_TYPE_NOTICE
);
4463 return nvme_c2h(n
, ((uint8_t *)nslist
) + off
, trans_len
, req
);
4466 static uint16_t nvme_cmd_effects(NvmeCtrl
*n
, uint8_t csi
, uint32_t buf_len
,
4467 uint64_t off
, NvmeRequest
*req
)
4469 NvmeEffectsLog log
= {};
4470 const uint32_t *src_iocs
= NULL
;
4473 if (off
>= sizeof(log
)) {
4474 trace_pci_nvme_err_invalid_log_page_offset(off
, sizeof(log
));
4475 return NVME_INVALID_FIELD
| NVME_DNR
;
4478 switch (NVME_CC_CSS(ldl_le_p(&n
->bar
.cc
))) {
4479 case NVME_CC_CSS_NVM
:
4480 src_iocs
= nvme_cse_iocs_nvm
;
4482 case NVME_CC_CSS_ADMIN_ONLY
:
4484 case NVME_CC_CSS_CSI
:
4487 src_iocs
= nvme_cse_iocs_nvm
;
4489 case NVME_CSI_ZONED
:
4490 src_iocs
= nvme_cse_iocs_zoned
;
4495 memcpy(log
.acs
, nvme_cse_acs
, sizeof(nvme_cse_acs
));
4498 memcpy(log
.iocs
, src_iocs
, sizeof(log
.iocs
));
4501 trans_len
= MIN(sizeof(log
) - off
, buf_len
);
4503 return nvme_c2h(n
, ((uint8_t *)&log
) + off
, trans_len
, req
);
4506 static uint16_t nvme_get_log(NvmeCtrl
*n
, NvmeRequest
*req
)
4508 NvmeCmd
*cmd
= &req
->cmd
;
4510 uint32_t dw10
= le32_to_cpu(cmd
->cdw10
);
4511 uint32_t dw11
= le32_to_cpu(cmd
->cdw11
);
4512 uint32_t dw12
= le32_to_cpu(cmd
->cdw12
);
4513 uint32_t dw13
= le32_to_cpu(cmd
->cdw13
);
4514 uint8_t lid
= dw10
& 0xff;
4515 uint8_t lsp
= (dw10
>> 8) & 0xf;
4516 uint8_t rae
= (dw10
>> 15) & 0x1;
4517 uint8_t csi
= le32_to_cpu(cmd
->cdw14
) >> 24;
4518 uint32_t numdl
, numdu
;
4519 uint64_t off
, lpol
, lpou
;
4523 numdl
= (dw10
>> 16);
4524 numdu
= (dw11
& 0xffff);
4528 len
= (((numdu
<< 16) | numdl
) + 1) << 2;
4529 off
= (lpou
<< 32ULL) | lpol
;
4532 return NVME_INVALID_FIELD
| NVME_DNR
;
4535 trace_pci_nvme_get_log(nvme_cid(req
), lid
, lsp
, rae
, len
, off
);
4537 status
= nvme_check_mdts(n
, len
);
4543 case NVME_LOG_ERROR_INFO
:
4544 return nvme_error_info(n
, rae
, len
, off
, req
);
4545 case NVME_LOG_SMART_INFO
:
4546 return nvme_smart_info(n
, rae
, len
, off
, req
);
4547 case NVME_LOG_FW_SLOT_INFO
:
4548 return nvme_fw_log_info(n
, len
, off
, req
);
4549 case NVME_LOG_CHANGED_NSLIST
:
4550 return nvme_changed_nslist(n
, rae
, len
, off
, req
);
4551 case NVME_LOG_CMD_EFFECTS
:
4552 return nvme_cmd_effects(n
, csi
, len
, off
, req
);
4554 trace_pci_nvme_err_invalid_log_page(nvme_cid(req
), lid
);
4555 return NVME_INVALID_FIELD
| NVME_DNR
;
4559 static void nvme_free_cq(NvmeCQueue
*cq
, NvmeCtrl
*n
)
4561 n
->cq
[cq
->cqid
] = NULL
;
4562 timer_free(cq
->timer
);
4563 if (msix_enabled(&n
->parent_obj
)) {
4564 msix_vector_unuse(&n
->parent_obj
, cq
->vector
);
4571 static uint16_t nvme_del_cq(NvmeCtrl
*n
, NvmeRequest
*req
)
4573 NvmeDeleteQ
*c
= (NvmeDeleteQ
*)&req
->cmd
;
4575 uint16_t qid
= le16_to_cpu(c
->qid
);
4577 if (unlikely(!qid
|| nvme_check_cqid(n
, qid
))) {
4578 trace_pci_nvme_err_invalid_del_cq_cqid(qid
);
4579 return NVME_INVALID_CQID
| NVME_DNR
;
4583 if (unlikely(!QTAILQ_EMPTY(&cq
->sq_list
))) {
4584 trace_pci_nvme_err_invalid_del_cq_notempty(qid
);
4585 return NVME_INVALID_QUEUE_DEL
;
4588 if (cq
->irq_enabled
&& cq
->tail
!= cq
->head
) {
4592 nvme_irq_deassert(n
, cq
);
4593 trace_pci_nvme_del_cq(qid
);
4594 nvme_free_cq(cq
, n
);
4595 return NVME_SUCCESS
;
4598 static void nvme_init_cq(NvmeCQueue
*cq
, NvmeCtrl
*n
, uint64_t dma_addr
,
4599 uint16_t cqid
, uint16_t vector
, uint16_t size
,
4600 uint16_t irq_enabled
)
4604 if (msix_enabled(&n
->parent_obj
)) {
4605 ret
= msix_vector_use(&n
->parent_obj
, vector
);
4611 cq
->dma_addr
= dma_addr
;
4613 cq
->irq_enabled
= irq_enabled
;
4614 cq
->vector
= vector
;
4615 cq
->head
= cq
->tail
= 0;
4616 QTAILQ_INIT(&cq
->req_list
);
4617 QTAILQ_INIT(&cq
->sq_list
);
4619 cq
->timer
= timer_new_ns(QEMU_CLOCK_VIRTUAL
, nvme_post_cqes
, cq
);
4622 static uint16_t nvme_create_cq(NvmeCtrl
*n
, NvmeRequest
*req
)
4625 NvmeCreateCq
*c
= (NvmeCreateCq
*)&req
->cmd
;
4626 uint16_t cqid
= le16_to_cpu(c
->cqid
);
4627 uint16_t vector
= le16_to_cpu(c
->irq_vector
);
4628 uint16_t qsize
= le16_to_cpu(c
->qsize
);
4629 uint16_t qflags
= le16_to_cpu(c
->cq_flags
);
4630 uint64_t prp1
= le64_to_cpu(c
->prp1
);
4632 trace_pci_nvme_create_cq(prp1
, cqid
, vector
, qsize
, qflags
,
4633 NVME_CQ_FLAGS_IEN(qflags
) != 0);
4635 if (unlikely(!cqid
|| cqid
> n
->params
.max_ioqpairs
||
4636 n
->cq
[cqid
] != NULL
)) {
4637 trace_pci_nvme_err_invalid_create_cq_cqid(cqid
);
4638 return NVME_INVALID_QID
| NVME_DNR
;
4640 if (unlikely(!qsize
|| qsize
> NVME_CAP_MQES(ldq_le_p(&n
->bar
.cap
)))) {
4641 trace_pci_nvme_err_invalid_create_cq_size(qsize
);
4642 return NVME_MAX_QSIZE_EXCEEDED
| NVME_DNR
;
4644 if (unlikely(prp1
& (n
->page_size
- 1))) {
4645 trace_pci_nvme_err_invalid_create_cq_addr(prp1
);
4646 return NVME_INVALID_PRP_OFFSET
| NVME_DNR
;
4648 if (unlikely(!msix_enabled(&n
->parent_obj
) && vector
)) {
4649 trace_pci_nvme_err_invalid_create_cq_vector(vector
);
4650 return NVME_INVALID_IRQ_VECTOR
| NVME_DNR
;
4652 if (unlikely(vector
>= n
->params
.msix_qsize
)) {
4653 trace_pci_nvme_err_invalid_create_cq_vector(vector
);
4654 return NVME_INVALID_IRQ_VECTOR
| NVME_DNR
;
4656 if (unlikely(!(NVME_CQ_FLAGS_PC(qflags
)))) {
4657 trace_pci_nvme_err_invalid_create_cq_qflags(NVME_CQ_FLAGS_PC(qflags
));
4658 return NVME_INVALID_FIELD
| NVME_DNR
;
4661 cq
= g_malloc0(sizeof(*cq
));
4662 nvme_init_cq(cq
, n
, prp1
, cqid
, vector
, qsize
+ 1,
4663 NVME_CQ_FLAGS_IEN(qflags
));
4666 * It is only required to set qs_created when creating a completion queue;
4667 * creating a submission queue without a matching completion queue will
4670 n
->qs_created
= true;
4671 return NVME_SUCCESS
;
static uint16_t nvme_rpt_empty_id_struct(NvmeCtrl *n, NvmeRequest *req)
{
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};

    return nvme_c2h(n, id, sizeof(id), req);
}

static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_identify_ctrl();

    return nvme_c2h(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), req);
}

static uint16_t nvme_identify_ctrl_csi(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
    uint8_t id[NVME_IDENTIFY_DATA_SIZE] = {};
    NvmeIdCtrlNvm *id_nvm = (NvmeIdCtrlNvm *)&id;

    trace_pci_nvme_identify_ctrl_csi(c->csi);

    switch (c->csi) {
    case NVME_CSI_NVM:
        id_nvm->vsl = n->params.vsl;
        id_nvm->dmrsl = cpu_to_le32(n->dmrsl);
        break;

    case NVME_CSI_ZONED:
        ((NvmeIdCtrlZoned *)&id)->zasl = n->params.zasl;
        break;

    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return nvme_c2h(n, id, sizeof(id), req);
}
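/*
 * The I/O-command-set-specific Identify Controller data mirrors the device
 * parameters: vsl and dmrsl for the NVM command set, zasl for the Zoned
 * command set. Unknown CSIs are rejected with Invalid Field.
 */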
4713 static uint16_t nvme_identify_ns(NvmeCtrl
*n
, NvmeRequest
*req
, bool active
)
4716 NvmeIdentify
*c
= (NvmeIdentify
*)&req
->cmd
;
4717 uint32_t nsid
= le32_to_cpu(c
->nsid
);
4719 trace_pci_nvme_identify_ns(nsid
);
4721 if (!nvme_nsid_valid(n
, nsid
) || nsid
== NVME_NSID_BROADCAST
) {
4722 return NVME_INVALID_NSID
| NVME_DNR
;
4725 ns
= nvme_ns(n
, nsid
);
4726 if (unlikely(!ns
)) {
4728 ns
= nvme_subsys_ns(n
->subsys
, nsid
);
4730 return nvme_rpt_empty_id_struct(n
, req
);
4733 return nvme_rpt_empty_id_struct(n
, req
);
4737 if (active
|| ns
->csi
== NVME_CSI_NVM
) {
4738 return nvme_c2h(n
, (uint8_t *)&ns
->id_ns
, sizeof(NvmeIdNs
), req
);
4741 return NVME_INVALID_CMD_SET
| NVME_DNR
;
4744 static uint16_t nvme_identify_ctrl_list(NvmeCtrl
*n
, NvmeRequest
*req
,
4747 NvmeIdentify
*c
= (NvmeIdentify
*)&req
->cmd
;
4748 uint32_t nsid
= le32_to_cpu(c
->nsid
);
4749 uint16_t min_id
= le16_to_cpu(c
->ctrlid
);
4750 uint16_t list
[NVME_CONTROLLER_LIST_SIZE
] = {};
4751 uint16_t *ids
= &list
[1];
4754 int cntlid
, nr_ids
= 0;
4756 trace_pci_nvme_identify_ctrl_list(c
->cns
, min_id
);
4759 return NVME_INVALID_FIELD
| NVME_DNR
;
4763 if (nsid
== NVME_NSID_BROADCAST
) {
4764 return NVME_INVALID_FIELD
| NVME_DNR
;
4767 ns
= nvme_subsys_ns(n
->subsys
, nsid
);
4769 return NVME_INVALID_FIELD
| NVME_DNR
;
4773 for (cntlid
= min_id
; cntlid
< ARRAY_SIZE(n
->subsys
->ctrls
); cntlid
++) {
4774 ctrl
= nvme_subsys_ctrl(n
->subsys
, cntlid
);
4779 if (attached
&& !nvme_ns(ctrl
, nsid
)) {
4783 ids
[nr_ids
++] = cntlid
;
4788 return nvme_c2h(n
, (uint8_t *)list
, sizeof(list
), req
);
4791 static uint16_t nvme_identify_ns_csi(NvmeCtrl
*n
, NvmeRequest
*req
,
4795 NvmeIdentify
*c
= (NvmeIdentify
*)&req
->cmd
;
4796 uint32_t nsid
= le32_to_cpu(c
->nsid
);
4798 trace_pci_nvme_identify_ns_csi(nsid
, c
->csi
);
4800 if (!nvme_nsid_valid(n
, nsid
) || nsid
== NVME_NSID_BROADCAST
) {
4801 return NVME_INVALID_NSID
| NVME_DNR
;
4804 ns
= nvme_ns(n
, nsid
);
4805 if (unlikely(!ns
)) {
4807 ns
= nvme_subsys_ns(n
->subsys
, nsid
);
4809 return nvme_rpt_empty_id_struct(n
, req
);
4812 return nvme_rpt_empty_id_struct(n
, req
);
4816 if (c
->csi
== NVME_CSI_NVM
) {
4817 return nvme_c2h(n
, (uint8_t *)&ns
->id_ns_nvm
, sizeof(NvmeIdNsNvm
),
4819 } else if (c
->csi
== NVME_CSI_ZONED
&& ns
->csi
== NVME_CSI_ZONED
) {
4820 return nvme_c2h(n
, (uint8_t *)ns
->id_ns_zoned
, sizeof(NvmeIdNsZoned
),
4824 return NVME_INVALID_FIELD
| NVME_DNR
;
4827 static uint16_t nvme_identify_nslist(NvmeCtrl
*n
, NvmeRequest
*req
,
4831 NvmeIdentify
*c
= (NvmeIdentify
*)&req
->cmd
;
4832 uint32_t min_nsid
= le32_to_cpu(c
->nsid
);
4833 uint8_t list
[NVME_IDENTIFY_DATA_SIZE
] = {};
4834 static const int data_len
= sizeof(list
);
4835 uint32_t *list_ptr
= (uint32_t *)list
;
4838 trace_pci_nvme_identify_nslist(min_nsid
);
4841 * Both FFFFFFFFh (NVME_NSID_BROADCAST) and FFFFFFFFEh are invalid values
4842 * since the Active Namespace ID List should return namespaces with ids
4843 * *higher* than the NSID specified in the command. This is also specified
4844 * in the spec (NVM Express v1.3d, Section 5.15.4).
4846 if (min_nsid
>= NVME_NSID_BROADCAST
- 1) {
4847 return NVME_INVALID_NSID
| NVME_DNR
;
4850 for (i
= 1; i
<= NVME_MAX_NAMESPACES
; i
++) {
4854 ns
= nvme_subsys_ns(n
->subsys
, i
);
4862 if (ns
->params
.nsid
<= min_nsid
) {
4865 list_ptr
[j
++] = cpu_to_le32(ns
->params
.nsid
);
4866 if (j
== data_len
/ sizeof(uint32_t)) {
4871 return nvme_c2h(n
, list
, data_len
, req
);
4874 static uint16_t nvme_identify_nslist_csi(NvmeCtrl
*n
, NvmeRequest
*req
,
4878 NvmeIdentify
*c
= (NvmeIdentify
*)&req
->cmd
;
4879 uint32_t min_nsid
= le32_to_cpu(c
->nsid
);
4880 uint8_t list
[NVME_IDENTIFY_DATA_SIZE
] = {};
4881 static const int data_len
= sizeof(list
);
4882 uint32_t *list_ptr
= (uint32_t *)list
;
4885 trace_pci_nvme_identify_nslist_csi(min_nsid
, c
->csi
);
4888 * Same as in nvme_identify_nslist(), FFFFFFFFh/FFFFFFFFEh are invalid.
4890 if (min_nsid
>= NVME_NSID_BROADCAST
- 1) {
4891 return NVME_INVALID_NSID
| NVME_DNR
;
4894 if (c
->csi
!= NVME_CSI_NVM
&& c
->csi
!= NVME_CSI_ZONED
) {
4895 return NVME_INVALID_FIELD
| NVME_DNR
;
4898 for (i
= 1; i
<= NVME_MAX_NAMESPACES
; i
++) {
4902 ns
= nvme_subsys_ns(n
->subsys
, i
);
4910 if (ns
->params
.nsid
<= min_nsid
|| c
->csi
!= ns
->csi
) {
4913 list_ptr
[j
++] = cpu_to_le32(ns
->params
.nsid
);
4914 if (j
== data_len
/ sizeof(uint32_t)) {
4919 return nvme_c2h(n
, list
, data_len
, req
);
4922 static uint16_t nvme_identify_ns_descr_list(NvmeCtrl
*n
, NvmeRequest
*req
)
4925 NvmeIdentify
*c
= (NvmeIdentify
*)&req
->cmd
;
4926 uint32_t nsid
= le32_to_cpu(c
->nsid
);
4927 uint8_t list
[NVME_IDENTIFY_DATA_SIZE
] = {};
4928 uint8_t *pos
= list
;
4931 uint8_t v
[NVME_NIDL_UUID
];
4932 } QEMU_PACKED uuid
= {};
4936 } QEMU_PACKED eui64
= {};
4940 } QEMU_PACKED csi
= {};
4942 trace_pci_nvme_identify_ns_descr_list(nsid
);
4944 if (!nvme_nsid_valid(n
, nsid
) || nsid
== NVME_NSID_BROADCAST
) {
4945 return NVME_INVALID_NSID
| NVME_DNR
;
4948 ns
= nvme_ns(n
, nsid
);
4949 if (unlikely(!ns
)) {
4950 return NVME_INVALID_FIELD
| NVME_DNR
;
4954 * If the EUI-64 field is 0 and the NGUID field is 0, the namespace must
4955 * provide a valid Namespace UUID in the Namespace Identification Descriptor
4956 * data structure. QEMU does not yet support setting NGUID.
4958 uuid
.hdr
.nidt
= NVME_NIDT_UUID
;
4959 uuid
.hdr
.nidl
= NVME_NIDL_UUID
;
4960 memcpy(uuid
.v
, ns
->params
.uuid
.data
, NVME_NIDL_UUID
);
4961 memcpy(pos
, &uuid
, sizeof(uuid
));
4962 pos
+= sizeof(uuid
);
4964 if (ns
->params
.eui64
) {
4965 eui64
.hdr
.nidt
= NVME_NIDT_EUI64
;
4966 eui64
.hdr
.nidl
= NVME_NIDL_EUI64
;
4967 eui64
.v
= cpu_to_be64(ns
->params
.eui64
);
4968 memcpy(pos
, &eui64
, sizeof(eui64
));
4969 pos
+= sizeof(eui64
);
4972 csi
.hdr
.nidt
= NVME_NIDT_CSI
;
4973 csi
.hdr
.nidl
= NVME_NIDL_CSI
;
4975 memcpy(pos
, &csi
, sizeof(csi
));
4978 return nvme_c2h(n
, list
, sizeof(list
), req
);
4981 static uint16_t nvme_identify_cmd_set(NvmeCtrl
*n
, NvmeRequest
*req
)
4983 uint8_t list
[NVME_IDENTIFY_DATA_SIZE
] = {};
4984 static const int data_len
= sizeof(list
);
4986 trace_pci_nvme_identify_cmd_set();
4988 NVME_SET_CSI(*list
, NVME_CSI_NVM
);
4989 NVME_SET_CSI(*list
, NVME_CSI_ZONED
);
4991 return nvme_c2h(n
, list
, data_len
, req
);
static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeIdentify *c = (NvmeIdentify *)&req->cmd;

    trace_pci_nvme_identify(nvme_cid(req), c->cns, le16_to_cpu(c->ctrlid),
                            c->csi);

    switch (c->cns) {
    case NVME_ID_CNS_NS:
        return nvme_identify_ns(n, req, true);
    case NVME_ID_CNS_NS_PRESENT:
        return nvme_identify_ns(n, req, false);
    case NVME_ID_CNS_NS_ATTACHED_CTRL_LIST:
        return nvme_identify_ctrl_list(n, req, true);
    case NVME_ID_CNS_CTRL_LIST:
        return nvme_identify_ctrl_list(n, req, false);
    case NVME_ID_CNS_CS_NS:
        return nvme_identify_ns_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT:
        return nvme_identify_ns_csi(n, req, false);
    case NVME_ID_CNS_CTRL:
        return nvme_identify_ctrl(n, req);
    case NVME_ID_CNS_CS_CTRL:
        return nvme_identify_ctrl_csi(n, req);
    case NVME_ID_CNS_NS_ACTIVE_LIST:
        return nvme_identify_nslist(n, req, true);
    case NVME_ID_CNS_NS_PRESENT_LIST:
        return nvme_identify_nslist(n, req, false);
    case NVME_ID_CNS_CS_NS_ACTIVE_LIST:
        return nvme_identify_nslist_csi(n, req, true);
    case NVME_ID_CNS_CS_NS_PRESENT_LIST:
        return nvme_identify_nslist_csi(n, req, false);
    case NVME_ID_CNS_NS_DESCR_LIST:
        return nvme_identify_ns_descr_list(n, req);
    case NVME_ID_CNS_IO_COMMAND_SET:
        return nvme_identify_cmd_set(n, req);
    }

    trace_pci_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
    return NVME_INVALID_FIELD | NVME_DNR;
}
static uint16_t nvme_abort(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t sqid = le32_to_cpu(req->cmd.cdw10) & 0xffff;

    req->cqe.result = 1;
    if (nvme_check_sqid(n, sqid)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}
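/*
 * Timestamp handling: the controller records the host-provided timestamp
 * (in milliseconds) together with the QEMU virtual clock at the moment it
 * was set; the current timestamp is then reconstructed on demand as
 * host_timestamp + (now - timestamp_set_qemu_clock_ms).
 */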
static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
{
    trace_pci_nvme_setfeat_timestamp(ts);

    n->host_timestamp = le64_to_cpu(ts);
    n->timestamp_set_qemu_clock_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
}

static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
{
    uint64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    uint64_t elapsed_time = current_time - n->timestamp_set_qemu_clock_ms;

    union nvme_timestamp {
        struct {
            uint64_t timestamp:48;
            uint64_t sync:1;
            uint64_t origin:3;
            uint64_t rsvd1:12;
        };
        uint64_t all;
    };

    union nvme_timestamp ts;

    ts.timestamp = n->host_timestamp + elapsed_time;

    /* If the host timestamp is non-zero, set the timestamp origin */
    ts.origin = n->host_timestamp ? 0x01 : 0x00;

    trace_pci_nvme_getfeat_timestamp(ts.all);

    return cpu_to_le64(ts.all);
}
static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    uint64_t timestamp = nvme_get_timestamp(n);

    return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
}
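/*
 * For Get/Set Features, CDW10 carries the Feature Identifier in its low
 * byte (extracted with NVME_GETSETFEAT_FID) and, for Get Features, the
 * select field (current/default/saved/supported capabilities) in bits
 * 10:08 (extracted with NVME_GETFEAT_SELECT).
 */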
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);

    static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
        [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
    };

    trace_pci_nvme_getfeat(nvme_cid(req), nsid, fid, sel, dw11);

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (!nvme_nsid_valid(n, nsid) || nsid == NVME_NSID_BROADCAST) {
            /*
             * The Reservation Notification Mask and Reservation Persistence
             * features require a status code of Invalid Field in Command when
             * NSID is FFFFFFFFh. Since the device does not support those
             * features we can always return Invalid Namespace or Format as we
             * should do for all other features.
             */
            return NVME_INVALID_NSID | NVME_DNR;
        }

        if (!nvme_ns(n, nsid)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }
    }

    case NVME_GETFEAT_SELECT_CURRENT:
    case NVME_GETFEAT_SELECT_SAVED:
        /* no features are saveable by the controller; fallthrough */
    case NVME_GETFEAT_SELECT_DEFAULT:
    case NVME_GETFEAT_SELECT_CAP:
        result = nvme_feature_cap[fid];

    case NVME_TEMPERATURE_THRESHOLD:
        /*
         * The controller only implements the Composite Temperature sensor, so
         * return 0 for all other sensors.
         */
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            result = n->features.temp_thresh_hi;
        case NVME_TEMP_THSEL_UNDER:
            result = n->features.temp_thresh_low;
        }

        return NVME_INVALID_FIELD | NVME_DNR;
    case NVME_ERROR_RECOVERY:
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        ns = nvme_ns(n, nsid);
        if (unlikely(!ns)) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        result = ns->features.err_rec;
    case NVME_VOLATILE_WRITE_CACHE:
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

            result = blk_enable_write_cache(ns->blkconf.blk);
        }

        trace_pci_nvme_getfeat_vwcache(result ? "enabled" : "disabled");
    case NVME_ASYNCHRONOUS_EVENT_CONF:
        result = n->features.async_config;
    case NVME_TIMESTAMP:
        return nvme_get_feature_timestamp(n, req);
    case NVME_HOST_BEHAVIOR_SUPPORT:
        return nvme_c2h(n, (uint8_t *)&n->features.hbs,
                        sizeof(n->features.hbs), req);

    case NVME_TEMPERATURE_THRESHOLD:
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {

        if (NVME_TEMP_THSEL(dw11) == NVME_TEMP_THSEL_OVER) {
            result = NVME_TEMPERATURE_WARNING;
        }

    case NVME_NUMBER_OF_QUEUES:
        result = (n->params.max_ioqpairs - 1) |
                 ((n->params.max_ioqpairs - 1) << 16);
        trace_pci_nvme_getfeat_numq(result);
    case NVME_INTERRUPT_VECTOR_CONF:
        if (iv >= n->params.max_ioqpairs + 1) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        if (iv == n->admin_cq.vector) {
            result |= NVME_INTVC_NOCOALESCING;
        }

        result = nvme_feature_default[fid];

    req->cqe.result = cpu_to_le32(result);
    return NVME_SUCCESS;
}
static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
    ret = nvme_h2c(n, (uint8_t *)&timestamp, sizeof(timestamp), req);

    nvme_set_timestamp(n, timestamp);

    return NVME_SUCCESS;
}
static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeNamespace *ns = NULL;

    NvmeCmd *cmd = &req->cmd;
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);
    uint32_t nsid = le32_to_cpu(cmd->nsid);
    uint8_t fid = NVME_GETSETFEAT_FID(dw10);
    uint8_t save = NVME_SETFEAT_SAVE(dw10);

    trace_pci_nvme_setfeat(nvme_cid(req), nsid, fid, save, dw11);

    if (save && !(nvme_feature_cap[fid] & NVME_FEAT_CAP_SAVE)) {
        return NVME_FID_NOT_SAVEABLE | NVME_DNR;
    }

    if (!nvme_feature_support[fid]) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (nvme_feature_cap[fid] & NVME_FEAT_CAP_NS) {
        if (nsid != NVME_NSID_BROADCAST) {
            if (!nvme_nsid_valid(n, nsid)) {
                return NVME_INVALID_NSID | NVME_DNR;
            }

            ns = nvme_ns(n, nsid);
            if (unlikely(!ns)) {
                return NVME_INVALID_FIELD | NVME_DNR;
            }
        }
    } else if (nsid && nsid != NVME_NSID_BROADCAST) {
        if (!nvme_nsid_valid(n, nsid)) {
            return NVME_INVALID_NSID | NVME_DNR;
        }

        return NVME_FEAT_NOT_NS_SPEC | NVME_DNR;
    }

    if (!(nvme_feature_cap[fid] & NVME_FEAT_CAP_CHANGE)) {
        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
    }

    case NVME_TEMPERATURE_THRESHOLD:
        if (NVME_TEMP_TMPSEL(dw11) != NVME_TEMP_TMPSEL_COMPOSITE) {

        switch (NVME_TEMP_THSEL(dw11)) {
        case NVME_TEMP_THSEL_OVER:
            n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
        case NVME_TEMP_THSEL_UNDER:
            n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
        }

        return NVME_INVALID_FIELD | NVME_DNR;

        if ((n->temperature >= n->features.temp_thresh_hi) ||
            (n->temperature <= n->features.temp_thresh_low)) {
            nvme_smart_event(n, NVME_AER_INFO_SMART_TEMP_THRESH);
        }

    case NVME_ERROR_RECOVERY:
        if (nsid == NVME_NSID_BROADCAST) {
            for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

                if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
                    ns->features.err_rec = dw11;
                }
            }
        }

        if (NVME_ID_NS_NSFEAT_DULBE(ns->id_ns.nsfeat)) {
            ns->features.err_rec = dw11;
        }

    case NVME_VOLATILE_WRITE_CACHE:
        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

            if (!(dw11 & 0x1) && blk_enable_write_cache(ns->blkconf.blk)) {
                blk_flush(ns->blkconf.blk);
            }

            blk_set_enable_write_cache(ns->blkconf.blk, dw11 & 1);
        }

    case NVME_NUMBER_OF_QUEUES:
        if (n->qs_created) {
            return NVME_CMD_SEQ_ERROR | NVME_DNR;
        }

        /*
         * NVMe v1.3, Section 5.21.1.7: FFFFh is not an allowed value for NCQR
         */
        if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        trace_pci_nvme_setfeat_numq((dw11 & 0xffff) + 1,
                                    ((dw11 >> 16) & 0xffff) + 1,
                                    n->params.max_ioqpairs,
                                    n->params.max_ioqpairs);
        req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
                                      ((n->params.max_ioqpairs - 1) << 16));

    case NVME_ASYNCHRONOUS_EVENT_CONF:
        n->features.async_config = dw11;
    case NVME_TIMESTAMP:
        return nvme_set_feature_timestamp(n, req);
    case NVME_HOST_BEHAVIOR_SUPPORT:
        status = nvme_h2c(n, (uint8_t *)&n->features.hbs,
                          sizeof(n->features.hbs), req);

        for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

            ns->id_ns.nlbaf = ns->nlbaf - 1;
            if (!n->features.hbs.lbafee) {
                ns->id_ns.nlbaf = MIN(ns->id_ns.nlbaf, 15);
            }
        }

    case NVME_COMMAND_SET_PROFILE:
        trace_pci_nvme_err_invalid_iocsci(dw11 & 0x1ff);
        return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;

        return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;

    return NVME_SUCCESS;
}
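/*
 * Asynchronous Event Requests do not complete immediately: the request is
 * parked in n->aer_reqs and only completed by nvme_process_aers() once an
 * event has been queued, which is why nvme_aer() returns NVME_NO_COMPLETE.
 */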
static uint16_t nvme_aer(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_aer(nvme_cid(req));

    if (n->outstanding_aers > n->params.aerl) {
        trace_pci_nvme_aer_aerl_exceeded();
        return NVME_AER_LIMIT_EXCEEDED;
    }

    n->aer_reqs[n->outstanding_aers] = req;
    n->outstanding_aers++;

    if (!QTAILQ_EMPTY(&n->aer_queue)) {
        nvme_process_aers(n);
    }

    return NVME_NO_COMPLETE;
}
static void nvme_update_dmrsl(NvmeCtrl *n)
{
    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
        NvmeNamespace *ns = nvme_ns(n, nsid);

        n->dmrsl = MIN_NON_ZERO(n->dmrsl,
                                BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
    }
}
static void nvme_select_iocs_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t cc = ldl_le_p(&n->bar.cc);

    ns->iocs = nvme_cse_iocs_none;

        if (NVME_CC_CSS(cc) != NVME_CC_CSS_ADMIN_ONLY) {
            ns->iocs = nvme_cse_iocs_nvm;
        }
    case NVME_CSI_ZONED:
        if (NVME_CC_CSS(cc) == NVME_CC_CSS_CSI) {
            ns->iocs = nvme_cse_iocs_zoned;
        } else if (NVME_CC_CSS(cc) == NVME_CC_CSS_NVM) {
            ns->iocs = nvme_cse_iocs_nvm;
        }
}
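/*
 * The Namespace Attachment host buffer is a controller list: the first
 * 16-bit entry holds the number of identifiers that follow, and the
 * remaining entries are the controller IDs to attach to or detach from.
 */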
static uint16_t nvme_ns_attachment(NvmeCtrl *n, NvmeRequest *req)
{
    uint16_t list[NVME_CONTROLLER_LIST_SIZE] = {};
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t sel = dw10 & 0xf;
    uint16_t *nr_ids = &list[0];
    uint16_t *ids = &list[1];

    trace_pci_nvme_ns_attachment(nvme_cid(req), dw10 & 0xf);

    if (!nvme_nsid_valid(n, nsid)) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = nvme_subsys_ns(n->subsys, nsid);
    if (!ns) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    ret = nvme_h2c(n, (uint8_t *)list, 4096, req);
    if (ret) {
        return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
    }

    *nr_ids = MIN(*nr_ids, NVME_CONTROLLER_LIST_SIZE - 1);
    for (i = 0; i < *nr_ids; i++) {
        ctrl = nvme_subsys_ctrl(n->subsys, ids[i]);
        if (!ctrl) {
            return NVME_NS_CTRL_LIST_INVALID | NVME_DNR;
        }

        switch (sel) {
        case NVME_NS_ATTACHMENT_ATTACH:
            if (nvme_ns(ctrl, nsid)) {
                return NVME_NS_ALREADY_ATTACHED | NVME_DNR;
            }

            if (ns->attached && !ns->params.shared) {
                return NVME_NS_PRIVATE | NVME_DNR;
            }

            nvme_attach_ns(ctrl, ns);
            nvme_select_iocs_ns(ctrl, ns);

            break;

        case NVME_NS_ATTACHMENT_DETACH:
            if (!nvme_ns(ctrl, nsid)) {
                return NVME_NS_NOT_ATTACHED | NVME_DNR;
            }

            ctrl->namespaces[nsid] = NULL;

            nvme_update_dmrsl(ctrl);

            break;

        default:
            return NVME_INVALID_FIELD | NVME_DNR;
        }

        /*
         * Add namespace id to the changed namespace id list for event clearing
         * via Get Log Page command.
         */
        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
                               NVME_LOG_CHANGED_NSLIST);
        }
    }

    return NVME_SUCCESS;
}
typedef struct NvmeFormatAIOCB {

static void nvme_format_bh(void *opaque);

static void nvme_format_cancel(BlockAIOCB *aiocb)
{
    NvmeFormatAIOCB *iocb = container_of(aiocb, NvmeFormatAIOCB, common);

    blk_aio_cancel_async(iocb->aiocb);
}

static const AIOCBInfo nvme_format_aiocb_info = {
    .aiocb_size = sizeof(NvmeFormatAIOCB),
    .cancel_async = nvme_format_cancel,
    .get_aio_context = nvme_get_aio_context,
};

static void nvme_format_set(NvmeNamespace *ns, uint8_t lbaf, uint8_t mset,
                            uint8_t pi, uint8_t pil)
{
    uint8_t lbafl = lbaf & 0xf;
    uint8_t lbafu = lbaf >> 4;

    trace_pci_nvme_format_set(ns->params.nsid, lbaf, mset, pi, pil);

    ns->id_ns.dps = (pil << 3) | pi;
    ns->id_ns.flbas = (lbafu << 5) | (mset << 4) | lbafl;

    nvme_ns_init_format(ns);
}

static void nvme_format_ns_cb(void *opaque, int ret)
{
    NvmeFormatAIOCB *iocb = opaque;
    NvmeNamespace *ns = iocb->ns;

    if (iocb->offset < ns->size) {
        bytes = MIN(BDRV_REQUEST_MAX_BYTES, ns->size - iocb->offset);

        iocb->aiocb = blk_aio_pwrite_zeroes(ns->blkconf.blk, iocb->offset,
                                            bytes, BDRV_REQ_MAY_UNMAP,
                                            nvme_format_ns_cb, iocb);

        iocb->offset += bytes;

    nvme_format_set(ns, iocb->lbaf, iocb->mset, iocb->pi, iocb->pil);

    qemu_bh_schedule(iocb->bh);
}
static uint16_t nvme_format_check(NvmeNamespace *ns, uint8_t lbaf, uint8_t pi)
{
    if (ns->params.zoned) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (lbaf > ns->id_ns.nlbaf) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && (ns->id_ns.lbaf[lbaf].ms < nvme_pi_tuple_size(ns))) {
        return NVME_INVALID_FORMAT | NVME_DNR;
    }

    if (pi && pi > NVME_ID_NS_DPS_TYPE_3) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    return NVME_SUCCESS;
}
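/*
 * Format NVM packs its parameters into CDW10: the LBA format index in bits
 * 3:0 (extended from bits 13:12 when the host has enabled extended LBA
 * formats via hbs.lbafee), metadata settings in bit 4, the protection
 * information type in bits 7:5 and the PI location in bit 8, matching the
 * decoding done in nvme_format_bh() and nvme_format() below.
 */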
static void nvme_format_bh(void *opaque)
{
    NvmeFormatAIOCB *iocb = opaque;
    NvmeRequest *req = iocb->req;
    NvmeCtrl *n = nvme_ctrl(req);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t pi = (dw10 >> 5) & 0x7;

    if (iocb->ret < 0) {

    if (iocb->broadcast) {
        for (i = iocb->nsid + 1; i <= NVME_MAX_NAMESPACES; i++) {
            iocb->ns = nvme_ns(n, i);

    status = nvme_format_check(iocb->ns, lbaf, pi);

        req->status = status;

    iocb->ns->status = NVME_FORMAT_IN_PROGRESS;
    nvme_format_ns_cb(iocb, 0);

    qemu_bh_delete(iocb->bh);

    iocb->common.cb(iocb->common.opaque, iocb->ret);

    qemu_aio_unref(iocb);
}

static uint16_t nvme_format(NvmeCtrl *n, NvmeRequest *req)
{
    NvmeFormatAIOCB *iocb;
    uint32_t nsid = le32_to_cpu(req->cmd.nsid);
    uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
    uint8_t lbaf = dw10 & 0xf;
    uint8_t mset = (dw10 >> 4) & 0x1;
    uint8_t pi = (dw10 >> 5) & 0x7;
    uint8_t pil = (dw10 >> 8) & 0x1;
    uint8_t lbafu = (dw10 >> 12) & 0x3;

    iocb = qemu_aio_get(&nvme_format_aiocb_info, NULL, nvme_misc_cb, req);

    iocb->bh = qemu_bh_new(nvme_format_bh, iocb);

    iocb->broadcast = (nsid == NVME_NSID_BROADCAST);

    if (n->features.hbs.lbafee) {
        iocb->lbaf |= lbafu << 4;
    }

    if (!iocb->broadcast) {
        if (!nvme_nsid_valid(n, nsid)) {
            status = NVME_INVALID_NSID | NVME_DNR;
        }

        iocb->ns = nvme_ns(n, nsid);

            status = NVME_INVALID_FIELD | NVME_DNR;
    }

    req->aiocb = &iocb->common;
    qemu_bh_schedule(iocb->bh);

    return NVME_NO_COMPLETE;

    qemu_bh_delete(iocb->bh);

    qemu_aio_unref(iocb);
}
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
{
    trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), req->cmd.opcode,
                             nvme_adm_opc_str(req->cmd.opcode));

    if (!(nvme_cse_acs[req->cmd.opcode] & NVME_CMD_EFF_CSUPP)) {
        trace_pci_nvme_err_invalid_admin_opc(req->cmd.opcode);
        return NVME_INVALID_OPCODE | NVME_DNR;
    }

    /* SGLs shall not be used for Admin commands in NVMe over PCIe */
    if (NVME_CMD_FLAGS_PSDT(req->cmd.flags) != NVME_PSDT_PRP) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    if (NVME_CMD_FLAGS_FUSE(req->cmd.flags)) {
        return NVME_INVALID_FIELD;
    }

    switch (req->cmd.opcode) {
    case NVME_ADM_CMD_DELETE_SQ:
        return nvme_del_sq(n, req);
    case NVME_ADM_CMD_CREATE_SQ:
        return nvme_create_sq(n, req);
    case NVME_ADM_CMD_GET_LOG_PAGE:
        return nvme_get_log(n, req);
    case NVME_ADM_CMD_DELETE_CQ:
        return nvme_del_cq(n, req);
    case NVME_ADM_CMD_CREATE_CQ:
        return nvme_create_cq(n, req);
    case NVME_ADM_CMD_IDENTIFY:
        return nvme_identify(n, req);
    case NVME_ADM_CMD_ABORT:
        return nvme_abort(n, req);
    case NVME_ADM_CMD_SET_FEATURES:
        return nvme_set_feature(n, req);
    case NVME_ADM_CMD_GET_FEATURES:
        return nvme_get_feature(n, req);
    case NVME_ADM_CMD_ASYNC_EV_REQ:
        return nvme_aer(n, req);
    case NVME_ADM_CMD_NS_ATTACHMENT:
        return nvme_ns_attachment(n, req);
    case NVME_ADM_CMD_FORMAT_NVM:
        return nvme_format(n, req);
    }

    return NVME_INVALID_OPCODE | NVME_DNR;
}
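/*
 * nvme_process_sq() drains a submission queue: each SQE is fetched from
 * guest memory at dma_addr + head * sqe_size, the head pointer is advanced,
 * the command is dispatched to nvme_io_cmd() or nvme_admin_cmd() depending
 * on whether the queue is an I/O queue or the admin queue, and any request
 * that completes immediately is posted to the associated completion queue.
 */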
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        if (nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd))) {
            trace_pci_nvme_err_addr_read(addr);
            trace_pci_nvme_err_cfs();
            stl_le_p(&n->bar.csts, NVME_CSTS_FAILED);
            break;
        }
        nvme_inc_sq_head(sq);

        req = QTAILQ_FIRST(&sq->req_list);
        QTAILQ_REMOVE(&sq->req_list, req, entry);
        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
        nvme_req_clear(req);
        req->cqe.cid = cmd.cid;
        memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));

        status = sq->sqid ? nvme_io_cmd(n, req) :
                            nvme_admin_cmd(n, req);
        if (status != NVME_NO_COMPLETE) {
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
    }
}
static void nvme_ctrl_reset(NvmeCtrl *n)
{
    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
        if (n->sq[i] != NULL) {
            nvme_free_sq(n->sq[i], n);
        }
    }

    for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
        if (n->cq[i] != NULL) {
            nvme_free_cq(n->cq[i], n);
        }
    }

    while (!QTAILQ_EMPTY(&n->aer_queue)) {
        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
        QTAILQ_REMOVE(&n->aer_queue, event, entry);
    }

    n->outstanding_aers = 0;
    n->qs_created = false;
}
static void nvme_ctrl_shutdown(NvmeCtrl *n)
{
    memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

        nvme_ns_shutdown(ns);
    }
}

static void nvme_select_iocs(NvmeCtrl *n)
{
    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

        nvme_select_iocs_ns(n, ns);
    }
}
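/*
 * On controller enable, the memory page size is derived from CC.MPS as
 * 2^(12 + MPS) bytes and validated against CAP.MPSMIN/MPSMAX; the queue
 * entry sizes are likewise taken from CC.IOSQES/IOCQES before the admin
 * queues are initialized from AQA/ASQ/ACQ.
 */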
static int nvme_start_ctrl(NvmeCtrl *n)
{
    uint64_t cap = ldq_le_p(&n->bar.cap);
    uint32_t cc = ldl_le_p(&n->bar.cc);
    uint32_t aqa = ldl_le_p(&n->bar.aqa);
    uint64_t asq = ldq_le_p(&n->bar.asq);
    uint64_t acq = ldq_le_p(&n->bar.acq);
    uint32_t page_bits = NVME_CC_MPS(cc) + 12;
    uint32_t page_size = 1 << page_bits;

    if (unlikely(n->cq[0])) {
        trace_pci_nvme_err_startfail_cq();
    }
    if (unlikely(n->sq[0])) {
        trace_pci_nvme_err_startfail_sq();
    }
    if (unlikely(asq & (page_size - 1))) {
        trace_pci_nvme_err_startfail_asq_misaligned(asq);
    }
    if (unlikely(acq & (page_size - 1))) {
        trace_pci_nvme_err_startfail_acq_misaligned(acq);
    }
    if (unlikely(!(NVME_CAP_CSS(cap) & (1 << NVME_CC_CSS(cc))))) {
        trace_pci_nvme_err_startfail_css(NVME_CC_CSS(cc));
    }
    if (unlikely(NVME_CC_MPS(cc) < NVME_CAP_MPSMIN(cap))) {
        trace_pci_nvme_err_startfail_page_too_small(
            NVME_CAP_MPSMIN(cap));
    }
    if (unlikely(NVME_CC_MPS(cc) >
                 NVME_CAP_MPSMAX(cap))) {
        trace_pci_nvme_err_startfail_page_too_large(
            NVME_CAP_MPSMAX(cap));
    }
    if (unlikely(NVME_CC_IOCQES(cc) <
                 NVME_CTRL_CQES_MIN(n->id_ctrl.cqes))) {
        trace_pci_nvme_err_startfail_cqent_too_small(
            NVME_CTRL_CQES_MIN(cap));
    }
    if (unlikely(NVME_CC_IOCQES(cc) >
                 NVME_CTRL_CQES_MAX(n->id_ctrl.cqes))) {
        trace_pci_nvme_err_startfail_cqent_too_large(
            NVME_CTRL_CQES_MAX(cap));
    }
    if (unlikely(NVME_CC_IOSQES(cc) <
                 NVME_CTRL_SQES_MIN(n->id_ctrl.sqes))) {
        trace_pci_nvme_err_startfail_sqent_too_small(
            NVME_CTRL_SQES_MIN(cap));
    }
    if (unlikely(NVME_CC_IOSQES(cc) >
                 NVME_CTRL_SQES_MAX(n->id_ctrl.sqes))) {
        trace_pci_nvme_err_startfail_sqent_too_large(
            NVME_CTRL_SQES_MAX(cap));
    }
    if (unlikely(!NVME_AQA_ASQS(aqa))) {
        trace_pci_nvme_err_startfail_asqent_sz_zero();
    }
    if (unlikely(!NVME_AQA_ACQS(aqa))) {
        trace_pci_nvme_err_startfail_acqent_sz_zero();
    }

    n->page_bits = page_bits;
    n->page_size = page_size;
    n->max_prp_ents = n->page_size / sizeof(uint64_t);
    n->cqe_size = 1 << NVME_CC_IOCQES(cc);
    n->sqe_size = 1 << NVME_CC_IOSQES(cc);
    nvme_init_cq(&n->admin_cq, n, acq, 0, 0, NVME_AQA_ACQS(aqa) + 1, 1);
    nvme_init_sq(&n->admin_sq, n, asq, 0, 0, NVME_AQA_ASQS(aqa) + 1);

    nvme_set_timestamp(n, 0ULL);

    QTAILQ_INIT(&n->aer_queue);

    nvme_select_iocs(n);
}
static void nvme_cmb_enable_regs(NvmeCtrl *n)
{
    uint32_t cmbloc = ldl_le_p(&n->bar.cmbloc);
    uint32_t cmbsz = ldl_le_p(&n->bar.cmbsz);

    NVME_CMBLOC_SET_CDPCILS(cmbloc, 1);
    NVME_CMBLOC_SET_CDPMLS(cmbloc, 1);
    NVME_CMBLOC_SET_BIR(cmbloc, NVME_CMB_BIR);
    stl_le_p(&n->bar.cmbloc, cmbloc);

    NVME_CMBSZ_SET_SQS(cmbsz, 1);
    NVME_CMBSZ_SET_CQS(cmbsz, 0);
    NVME_CMBSZ_SET_LISTS(cmbsz, 1);
    NVME_CMBSZ_SET_RDS(cmbsz, 1);
    NVME_CMBSZ_SET_WDS(cmbsz, 1);
    NVME_CMBSZ_SET_SZU(cmbsz, 2); /* MBs */
    NVME_CMBSZ_SET_SZ(cmbsz, n->params.cmb_size_mb);
    stl_le_p(&n->bar.cmbsz, cmbsz);
}
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
                           unsigned size)
{
    uint64_t cap = ldq_le_p(&n->bar.cap);
    uint32_t cc = ldl_le_p(&n->bar.cc);
    uint32_t intms = ldl_le_p(&n->bar.intms);
    uint32_t csts = ldl_le_p(&n->bar.csts);
    uint32_t pmrsts = ldl_le_p(&n->bar.pmrsts);

    if (unlikely(offset & (sizeof(uint32_t) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_misaligned32,
                       "MMIO write not 32-bit aligned,"
                       " offset=0x%"PRIx64"", offset);
        /* should be ignored, fall through for now */
    }

    if (unlikely(size < sizeof(uint32_t))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_toosmall,
                       "MMIO write smaller than 32-bits,"
                       " offset=0x%"PRIx64", size=%u",
        /* should be ignored, fall through for now */
    }

    case NVME_REG_INTMS:
        if (unlikely(msix_enabled(&(n->parent_obj)))) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
                           "undefined access to interrupt mask set"
                           " when MSI-X is enabled");
            /* should be ignored, fall through for now */
        }
        stl_le_p(&n->bar.intms, intms);
        n->bar.intmc = n->bar.intms;
        trace_pci_nvme_mmio_intm_set(data & 0xffffffff, intms);

    case NVME_REG_INTMC:
        if (unlikely(msix_enabled(&(n->parent_obj)))) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_intmask_with_msix,
                           "undefined access to interrupt mask clr"
                           " when MSI-X is enabled");
            /* should be ignored, fall through for now */
        }
        stl_le_p(&n->bar.intms, intms);
        n->bar.intmc = n->bar.intms;
        trace_pci_nvme_mmio_intm_clr(data & 0xffffffff, intms);

        trace_pci_nvme_mmio_cfg(data & 0xffffffff);

        /* Windows first sends data, then sends enable bit */
        if (!NVME_CC_EN(data) && !NVME_CC_EN(cc) &&
            !NVME_CC_SHN(data) && !NVME_CC_SHN(cc))

        if (NVME_CC_EN(data) && !NVME_CC_EN(cc)) {

            /* flush CC since nvme_start_ctrl() needs the value */
            stl_le_p(&n->bar.cc, cc);
            if (unlikely(nvme_start_ctrl(n))) {
                trace_pci_nvme_err_startfail();
                csts = NVME_CSTS_FAILED;
            } else {
                trace_pci_nvme_mmio_start_success();
                csts = NVME_CSTS_READY;
            }
        } else if (!NVME_CC_EN(data) && NVME_CC_EN(cc)) {
            trace_pci_nvme_mmio_stopped();

            csts &= ~NVME_CSTS_READY;
        }

        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(cc))) {
            trace_pci_nvme_mmio_shutdown_set();
            nvme_ctrl_shutdown(n);
            csts |= NVME_CSTS_SHST_COMPLETE;
        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(cc)) {
            trace_pci_nvme_mmio_shutdown_cleared();
            csts &= ~NVME_CSTS_SHST_COMPLETE;
        }

        stl_le_p(&n->bar.cc, cc);
        stl_le_p(&n->bar.csts, csts);

        if (data & (1 << 4)) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ssreset_w1c_unsupported,
                           "attempted to W1C CSTS.NSSRO"
                           " but CAP.NSSRS is zero (not supported)");
        } else if (data != 0) {
            NVME_GUEST_ERR(pci_nvme_ub_mmiowr_ro_csts,
                           "attempted to set a read only bit"
                           " of controller status");
        }

        if (data == 0x4e564d65) {
            trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
            /* The spec says that writes of other values have no effect */
        }

        stl_le_p(&n->bar.aqa, data);
        trace_pci_nvme_mmio_aqattr(data & 0xffffffff);

        stn_le_p(&n->bar.asq, size, data);
        trace_pci_nvme_mmio_asqaddr(data);

    case NVME_REG_ASQ + 4:
        stl_le_p((uint8_t *)&n->bar.asq + 4, data);
        trace_pci_nvme_mmio_asqaddr_hi(data, ldq_le_p(&n->bar.asq));

        trace_pci_nvme_mmio_acqaddr(data);
        stn_le_p(&n->bar.acq, size, data);

    case NVME_REG_ACQ + 4:
        stl_le_p((uint8_t *)&n->bar.acq + 4, data);
        trace_pci_nvme_mmio_acqaddr_hi(data, ldq_le_p(&n->bar.acq));

    case NVME_REG_CMBLOC:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbloc_reserved,
                       "invalid write to reserved CMBLOC"
                       " when CMBSZ is zero, ignored");

    case NVME_REG_CMBSZ:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_cmbsz_readonly,
                       "invalid write to read only CMBSZ, ignored");

    case NVME_REG_CMBMSC:
        if (!NVME_CAP_CMBS(cap)) {

        stn_le_p(&n->bar.cmbmsc, size, data);
        n->cmb.cmse = false;

        if (NVME_CMBMSC_CRE(data)) {
            nvme_cmb_enable_regs(n);
        }

        if (NVME_CMBMSC_CMSE(data)) {
            uint64_t cmbmsc = ldq_le_p(&n->bar.cmbmsc);
            hwaddr cba = NVME_CMBMSC_CBA(cmbmsc) << CMBMSC_CBA_SHIFT;
            if (cba + int128_get64(n->cmb.mem.size) < cba) {
                uint32_t cmbsts = ldl_le_p(&n->bar.cmbsts);
                NVME_CMBSTS_SET_CBAI(cmbsts, 1);
                stl_le_p(&n->bar.cmbsts, cmbsts);
            }
        }

    case NVME_REG_CMBMSC + 4:
        stl_le_p((uint8_t *)&n->bar.cmbmsc + 4, data);

    case NVME_REG_PMRCAP:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
                       "invalid write to PMRCAP register, ignored");

    case NVME_REG_PMRCTL:
        if (!NVME_CAP_PMRS(cap)) {

        stl_le_p(&n->bar.pmrctl, data);
        if (NVME_PMRCTL_EN(data)) {
            memory_region_set_enabled(&n->pmr.dev->mr, true);
        } else {
            memory_region_set_enabled(&n->pmr.dev->mr, false);
            NVME_PMRSTS_SET_NRDY(pmrsts, 1);
            n->pmr.cmse = false;
        }
        stl_le_p(&n->bar.pmrsts, pmrsts);

    case NVME_REG_PMRSTS:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
                       "invalid write to PMRSTS register, ignored");

    case NVME_REG_PMREBS:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
                       "invalid write to PMREBS register, ignored");

    case NVME_REG_PMRSWTP:
        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
                       "invalid write to PMRSWTP register, ignored");

    case NVME_REG_PMRMSCL:
        if (!NVME_CAP_PMRS(cap)) {

        stl_le_p(&n->bar.pmrmscl, data);
        n->pmr.cmse = false;

        if (NVME_PMRMSCL_CMSE(data)) {
            uint64_t pmrmscu = ldl_le_p(&n->bar.pmrmscu);
            hwaddr cba = pmrmscu << 32 |
                         (NVME_PMRMSCL_CBA(data) << PMRMSCL_CBA_SHIFT);
            if (cba + int128_get64(n->pmr.dev->mr.size) < cba) {
                NVME_PMRSTS_SET_CBAI(pmrsts, 1);
                stl_le_p(&n->bar.pmrsts, pmrsts);
            }
        }

    case NVME_REG_PMRMSCU:
        if (!NVME_CAP_PMRS(cap)) {

        stl_le_p(&n->bar.pmrmscu, data);

        NVME_GUEST_ERR(pci_nvme_ub_mmiowr_invalid,
                       "invalid MMIO write,"
                       " offset=0x%"PRIx64", data=%"PRIx64"",
}
static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    uint8_t *ptr = (uint8_t *)&n->bar;

    trace_pci_nvme_mmio_read(addr, size);

    if (unlikely(addr & (sizeof(uint32_t) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiord_misaligned32,
                       "MMIO read not 32-bit aligned,"
                       " offset=0x%"PRIx64"", addr);
        /* should RAZ, fall through for now */
    } else if (unlikely(size < sizeof(uint32_t))) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiord_toosmall,
                       "MMIO read smaller than 32-bits,"
                       " offset=0x%"PRIx64"", addr);
        /* should RAZ, fall through for now */
    }

    if (addr > sizeof(n->bar) - size) {
        NVME_GUEST_ERR(pci_nvme_ub_mmiord_invalid_ofs,
                       "MMIO read beyond last register,"
                       " offset=0x%"PRIx64", returning 0", addr);
    }

    /*
     * When PMRWBM bit 1 is set then a read from PMRSTS should ensure prior
     * writes made it to persistent media
     */
    if (addr == NVME_REG_PMRSTS &&
        (NVME_PMRCAP_PMRWBM(ldl_le_p(&n->bar.pmrcap)) & 0x02)) {
        memory_region_msync(&n->pmr.dev->mr, 0, n->pmr.dev->size);
    }

    return ldn_le_p(ptr + addr, size);
}
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    if (unlikely(addr & ((1 << 2) - 1))) {
        NVME_GUEST_ERR(pci_nvme_ub_db_wr_misaligned,
                       "doorbell write not 32-bit aligned,"
                       " offset=0x%"PRIx64", ignoring", addr);
    }

    if (((addr - 0x1000) >> 2) & 1) {
        /* Completion queue doorbell write */

        uint16_t new_head = val & 0xffff;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (unlikely(nvme_check_cqid(n, qid))) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cq,
                           "completion queue doorbell write"
                           " for nonexistent queue,"
                           " sqid=%"PRIu32", ignoring", qid);

            /*
             * NVM Express v1.3d, Section 4.1 states: "If host software writes
             * an invalid value to the Submission Queue Tail Doorbell or
             * Completion Queue Head Doorbell register and an Asynchronous
             * Event Request command is outstanding, then an asynchronous
             * event is posted to the Admin Completion Queue with a status
             * code of Invalid Doorbell Write Value."
             *
             * Also note that the spec includes the "Invalid Doorbell Register"
             * status code, but nowhere does it specify when to use it.
             * However, it seems reasonable to use it here in a similar
             */
            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
                                   NVME_LOG_ERROR_INFO);
            }

        if (unlikely(new_head >= cq->size)) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_cqhead,
                           "completion queue doorbell write value"
                           " beyond queue size, sqid=%"PRIu32","
                           " new_head=%"PRIu16", ignoring",

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
                                   NVME_LOG_ERROR_INFO);
            }

        trace_pci_nvme_mmio_doorbell_cq(cq->cqid, new_head);

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;

        QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
            timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }
        timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);

        if (cq->tail == cq->head) {
            if (cq->irq_enabled) {

            nvme_irq_deassert(n, cq);
        }
    } else {
        /* Submission queue doorbell write */

        uint16_t new_tail = val & 0xffff;

        qid = (addr - 0x1000) >> 3;
        if (unlikely(nvme_check_sqid(n, qid))) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sq,
                           "submission queue doorbell write"
                           " for nonexistent queue,"
                           " sqid=%"PRIu32", ignoring", qid);

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
                                   NVME_LOG_ERROR_INFO);
            }

        if (unlikely(new_tail >= sq->size)) {
            NVME_GUEST_ERR(pci_nvme_ub_db_wr_invalid_sqtail,
                           "submission queue doorbell write value"
                           " beyond queue size, sqid=%"PRIu32","
                           " new_tail=%"PRIu16", ignoring",

            if (n->outstanding_aers) {
                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
                                   NVME_LOG_ERROR_INFO);
            }

        trace_pci_nvme_mmio_doorbell_sq(sq->sqid, new_tail);

        sq->tail = new_tail;
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}
static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
                            unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;

    trace_pci_nvme_mmio_write(addr, data, size);

    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};

static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
                           unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    stn_le_p(&n->cmb.buf[addr], size, data);
}

static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    return ldn_le_p(&n->cmb.buf[addr], size);
}

static const MemoryRegionOps nvme_cmb_ops = {
    .read = nvme_cmb_read,
    .write = nvme_cmb_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 1,
        .max_access_size = 8,
    },
};
static void nvme_check_constraints(NvmeCtrl *n, Error **errp)
{
    NvmeParams *params = &n->params;

    if (params->num_queues) {
        warn_report("num_queues is deprecated; please use max_ioqpairs "

        params->max_ioqpairs = params->num_queues - 1;
    }

    if (n->namespace.blkconf.blk && n->subsys) {
        error_setg(errp, "subsystem support is unavailable with legacy "
                   "namespace ('drive' property)");
    }

    if (params->max_ioqpairs < 1 ||
        params->max_ioqpairs > NVME_MAX_IOQPAIRS) {
        error_setg(errp, "max_ioqpairs must be between 1 and %d",
    }

    if (params->msix_qsize < 1 ||
        params->msix_qsize > PCI_MSIX_FLAGS_QSIZE + 1) {
        error_setg(errp, "msix_qsize must be between 1 and %d",
                   PCI_MSIX_FLAGS_QSIZE + 1);
    }

    if (!params->serial) {
        error_setg(errp, "serial property not set");
    }

    if (host_memory_backend_is_mapped(n->pmr.dev)) {
        error_setg(errp, "can't use already busy memdev: %s",
                   object_get_canonical_path_component(OBJECT(n->pmr.dev)));
    }

    if (!is_power_of_2(n->pmr.dev->size)) {
        error_setg(errp, "pmr backend size needs to be power of 2 in size");
    }

    host_memory_backend_set_mapped(n->pmr.dev, true);

    if (n->params.zasl > n->params.mdts) {
        error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
                   "than or equal to mdts (Maximum Data Transfer Size)");
    }

    if (!n->params.vsl) {
        error_setg(errp, "vsl must be non-zero");
    }
}
static void nvme_init_state(NvmeCtrl *n)
{
    /* add one to max_ioqpairs to account for the admin queue pair */
    n->reg_size = pow2ceil(sizeof(NvmeBar) +
                           2 * (n->params.max_ioqpairs + 1) * NVME_DB_SIZE);
    n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
    n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
    n->temperature = NVME_TEMPERATURE;
    n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
    n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
}
static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
{
    uint64_t cmb_size = n->params.cmb_size_mb * MiB;
    uint64_t cap = ldq_le_p(&n->bar.cap);

    n->cmb.buf = g_malloc0(cmb_size);
    memory_region_init_io(&n->cmb.mem, OBJECT(n), &nvme_cmb_ops, n,
                          "nvme-cmb", cmb_size);
    pci_register_bar(pci_dev, NVME_CMB_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->cmb.mem);

    NVME_CAP_SET_CMBS(cap, 1);
    stq_le_p(&n->bar.cap, cap);

    if (n->params.legacy_cmb) {
        nvme_cmb_enable_regs(n);
    }
}
static void nvme_init_pmr(NvmeCtrl *n, PCIDevice *pci_dev)
{
    uint32_t pmrcap = ldl_le_p(&n->bar.pmrcap);

    NVME_PMRCAP_SET_RDS(pmrcap, 1);
    NVME_PMRCAP_SET_WDS(pmrcap, 1);
    NVME_PMRCAP_SET_BIR(pmrcap, NVME_PMR_BIR);
    /* Turn on bit 1 support */
    NVME_PMRCAP_SET_PMRWBM(pmrcap, 0x02);
    NVME_PMRCAP_SET_CMSS(pmrcap, 1);
    stl_le_p(&n->bar.pmrcap, pmrcap);

    pci_register_bar(pci_dev, NVME_PMR_BIR,
                     PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64 |
                     PCI_BASE_ADDRESS_MEM_PREFETCH, &n->pmr.dev->mr);

    memory_region_set_enabled(&n->pmr.dev->mr, false);
}
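/*
 * BAR0 is laid out as the NVMe register region followed by the MSI-X
 * table and PBA, each aligned to 4 KiB, with the total size rounded up to
 * a power of two before the BAR is registered.
 */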
static int nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev, Error **errp)
{
    uint8_t *pci_conf = pci_dev->config;
    uint64_t bar_size, msix_table_size, msix_pba_size;
    unsigned msix_table_offset, msix_pba_offset;

    pci_conf[PCI_INTERRUPT_PIN] = 1;
    pci_config_set_prog_interface(pci_conf, 0x2);

    if (n->params.use_intel_id) {
        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
        pci_config_set_device_id(pci_conf, 0x5845);
    } else {
        pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
        pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
    }

    pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
    pcie_endpoint_cap_init(pci_dev, 0x80);

    bar_size = QEMU_ALIGN_UP(n->reg_size, 4 * KiB);
    msix_table_offset = bar_size;
    msix_table_size = PCI_MSIX_ENTRY_SIZE * n->params.msix_qsize;

    bar_size += msix_table_size;
    bar_size = QEMU_ALIGN_UP(bar_size, 4 * KiB);
    msix_pba_offset = bar_size;
    msix_pba_size = QEMU_ALIGN_UP(n->params.msix_qsize, 64) / 8;

    bar_size += msix_pba_size;
    bar_size = pow2ceil(bar_size);

    memory_region_init(&n->bar0, OBJECT(n), "nvme-bar0", bar_size);
    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
    memory_region_add_subregion(&n->bar0, 0, &n->iomem);

    pci_register_bar(pci_dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY |
                     PCI_BASE_ADDRESS_MEM_TYPE_64, &n->bar0);
    ret = msix_init(pci_dev, n->params.msix_qsize,
                    &n->bar0, 0, msix_table_offset,
                    &n->bar0, 0, msix_pba_offset, 0, &err);

    if (ret == -ENOTSUP) {
        warn_report_err(err);

        error_propagate(errp, err);

    if (n->params.cmb_size_mb) {
        nvme_init_cmb(n, pci_dev);
    }

    nvme_init_pmr(n, pci_dev);
}
static void nvme_init_subnqn(NvmeCtrl *n)
{
    NvmeSubsystem *subsys = n->subsys;
    NvmeIdCtrl *id = &n->id_ctrl;

        snprintf((char *)id->subnqn, sizeof(id->subnqn),
                 "nqn.2019-08.org.qemu:%s", n->params.serial);

        pstrcpy((char *)id->subnqn, sizeof(id->subnqn), (char *)subsys->subnqn);
}

static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
{
    NvmeIdCtrl *id = &n->id_ctrl;
    uint8_t *pci_conf = pci_dev->config;
    uint64_t cap = ldq_le_p(&n->bar.cap);

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->params.serial, ' ');

    id->cntlid = cpu_to_le16(n->cntlid);

    id->oaes = cpu_to_le32(NVME_OAES_NS_ATTR);
    id->ctratt |= cpu_to_le32(NVME_CTRATT_ELBAS);

    if (n->params.use_intel_id) {

    id->mdts = n->params.mdts;
    id->ver = cpu_to_le32(NVME_SPEC_VER);
    id->oacs = cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
    id->cntrltype = 0x1;

    /*
     * Because the controller always completes the Abort command immediately,
     * there can never be more than one concurrently executing Abort command,
     * so this value is never used for anything. Note that there can easily be
     * many Abort commands in the queues, but they are not considered
     * "executing" until processed by nvme_abort.
     *
     * The specification recommends a value of 3 for Abort Command Limit (four
     * concurrently outstanding Abort commands), so let's use that though it is
     */
    id->aerl = n->params.aerl;
    id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
    id->lpa = NVME_LPA_NS_SMART | NVME_LPA_CSE | NVME_LPA_EXTENDED;

    /* recommended default value (~70 C) */
    id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
    id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);

    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(NVME_MAX_NAMESPACES);
    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP |
                           NVME_ONCS_FEATURES | NVME_ONCS_DSM |
                           NVME_ONCS_COMPARE | NVME_ONCS_COPY);

    /*
     * NOTE: If this device ever supports a command set that does NOT use 0x0
     * as a Flush-equivalent operation, support for the broadcast NSID in Flush
     * should probably be removed.
     *
     * See comment in nvme_io_cmd.
     */
    id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;

    id->ocfs = cpu_to_le16(NVME_OCFS_COPY_FORMAT_0 | NVME_OCFS_COPY_FORMAT_1);
    id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
                           NVME_CTRL_SGLS_BITBUCKET);

    nvme_init_subnqn(n);

    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);

    id->cmic |= NVME_CMIC_MULTI_CTRL;

    NVME_CAP_SET_MQES(cap, 0x7ff);
    NVME_CAP_SET_CQR(cap, 1);
    NVME_CAP_SET_TO(cap, 0xf);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_CSI_SUPP);
    NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_ADMIN_ONLY);
    NVME_CAP_SET_MPSMAX(cap, 4);
    NVME_CAP_SET_CMBS(cap, n->params.cmb_size_mb ? 1 : 0);
    NVME_CAP_SET_PMRS(cap, n->pmr.dev ? 1 : 0);
    stq_le_p(&n->bar.cap, cap);

    stl_le_p(&n->bar.vs, NVME_SPEC_VER);
    n->bar.intmc = n->bar.intms = 0;
}
static int nvme_init_subsys(NvmeCtrl *n, Error **errp)
{
    cntlid = nvme_subsys_register_ctrl(n, errp);
}

void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns)
{
    uint32_t nsid = ns->params.nsid;
    assert(nsid && nsid <= NVME_MAX_NAMESPACES);

    n->namespaces[nsid] = ns;

    n->dmrsl = MIN_NON_ZERO(n->dmrsl,
                            BDRV_REQUEST_MAX_BYTES / nvme_l2b(ns, 1));
}

static void nvme_realize(PCIDevice *pci_dev, Error **errp)
{
    NvmeCtrl *n = NVME(pci_dev);
    Error *local_err = NULL;

    nvme_check_constraints(n, &local_err);

        error_propagate(errp, local_err);

    qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS,
              &pci_dev->qdev, n->parent_obj.qdev.id);

    if (nvme_init_pci(n, pci_dev, errp)) {

    if (nvme_init_subsys(n, errp)) {
        error_propagate(errp, local_err);
    }

    nvme_init_ctrl(n, pci_dev);

    /* setup a namespace if the controller drive property was given */
    if (n->namespace.blkconf.blk) {
        ns->params.nsid = 1;

        if (nvme_ns_setup(ns, errp)) {

        nvme_attach_ns(n, ns);
    }
}

static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);

    for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {

    nvme_subsys_unregister_ctrl(n->subsys, n);

    g_free(n->aer_reqs);

    if (n->params.cmb_size_mb) {

    host_memory_backend_set_mapped(n->pmr.dev, false);

    msix_uninit(pci_dev, &n->bar0, &n->bar0);
    memory_region_del_subregion(&n->bar0, &n->iomem);
}
static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, namespace.blkconf),
    DEFINE_PROP_LINK("pmrdev", NvmeCtrl, pmr.dev, TYPE_MEMORY_BACKEND,
                     HostMemoryBackend *),
    DEFINE_PROP_LINK("subsys", NvmeCtrl, subsys, TYPE_NVME_SUBSYS,
    DEFINE_PROP_STRING("serial", NvmeCtrl, params.serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, params.cmb_size_mb, 0),
    DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
    DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
    DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
    DEFINE_PROP_UINT8("mdts", NvmeCtrl, params.mdts, 7),
    DEFINE_PROP_UINT8("vsl", NvmeCtrl, params.vsl, 7),
    DEFINE_PROP_BOOL("use-intel-id", NvmeCtrl, params.use_intel_id, false),
    DEFINE_PROP_BOOL("legacy-cmb", NvmeCtrl, params.legacy_cmb, false),
    DEFINE_PROP_UINT8("zoned.zasl", NvmeCtrl, params.zasl, 0),
    DEFINE_PROP_BOOL("zoned.auto_transition", NvmeCtrl,
                     params.auto_transition_zones, true),
    DEFINE_PROP_END_OF_LIST(),
};

static void nvme_get_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value = n->smart_critical_warning;

    visit_type_uint8(v, name, &value, errp);
}

static void nvme_set_smart_warning(Object *obj, Visitor *v, const char *name,
                                   void *opaque, Error **errp)
{
    NvmeCtrl *n = NVME(obj);
    uint8_t value, old_value, cap = 0, index, event;

    if (!visit_type_uint8(v, name, &value, errp)) {

    cap = NVME_SMART_SPARE | NVME_SMART_TEMPERATURE | NVME_SMART_RELIABILITY
          | NVME_SMART_MEDIA_READ_ONLY | NVME_SMART_FAILED_VOLATILE_MEDIA;
    if (NVME_CAP_PMRS(ldq_le_p(&n->bar.cap))) {
        cap |= NVME_SMART_PMR_UNRELIABLE;
    }

    if ((value & cap) != value) {
        error_setg(errp, "unsupported smart critical warning bits: 0x%x",

    old_value = n->smart_critical_warning;
    n->smart_critical_warning = value;

    /* only inject new bits of smart critical warning */
    for (index = 0; index < NVME_SMART_WARN_MAX; index++) {
        if (value & ~old_value & event)
            nvme_smart_event(n, event);
    }
}

static const VMStateDescription nvme_vmstate = {

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->realize = nvme_realize;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    device_class_set_props(dc, nvme_props);
    dc->vmsd = &nvme_vmstate;
}

static void nvme_instance_init(Object *obj)
{
    NvmeCtrl *n = NVME(obj);

    device_add_bootindex_property(obj, &n->namespace.blkconf.bootindex,
                                  "bootindex", "/namespace@1,0",

    object_property_add(obj, "smart_critical_warning", "uint8",
                        nvme_get_smart_warning,
                        nvme_set_smart_warning, NULL, NULL);
}

static const TypeInfo nvme_info = {
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .instance_init = nvme_instance_init,
    .class_init = nvme_class_init,
    .interfaces = (InterfaceInfo[]) {
        { INTERFACE_PCIE_DEVICE },
    },
};

static const TypeInfo nvme_bus_info = {
    .name = TYPE_NVME_BUS,
    .instance_size = sizeof(NvmeBus),
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
    type_register_static(&nvme_bus_info);
}

type_init(nvme_register_types)