/*
 * QEMU NVM Express Virtual Namespace
 *
 * Copyright (c) 2019 CNEX Labs
 * Copyright (c) 2020 Samsung Electronics
 *
 * Klaus Jensen <k.jensen@samsung.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See the
 * COPYING file in the top-level directory.
 */
15 #include "qemu/osdep.h"
16 #include "qemu/units.h"
17 #include "qemu/cutils.h"
19 #include "qemu/error-report.h"
20 #include "hw/block/block.h"
21 #include "hw/pci/pci.h"
22 #include "sysemu/sysemu.h"
23 #include "sysemu/block-backend.h"
24 #include "qapi/error.h"
26 #include "hw/qdev-properties.h"
27 #include "hw/qdev-core.h"
/*
 * Floor, in bytes, for the discard granularity chosen by nvme_ns_init_blk()
 * when the block configuration leaves the value unset (-1).
 */
#define MIN_DISCARD_GRANULARITY (4 * KiB)
35 static int nvme_ns_init(NvmeNamespace
*ns
, Error
**errp
)
38 NvmeIdNs
*id_ns
= &ns
->id_ns
;
39 int lba_index
= NVME_ID_NS_FLBAS_INDEX(ns
->id_ns
.flbas
);
42 ns
->id_ns
.dlfeat
= 0x9;
44 id_ns
->lbaf
[lba_index
].ds
= 31 - clz32(ns
->blkconf
.logical_block_size
);
46 id_ns
->nsze
= cpu_to_le64(nvme_ns_nlbas(ns
));
48 ns
->csi
= NVME_CSI_NVM
;
50 /* no thin provisioning */
51 id_ns
->ncap
= id_ns
->nsze
;
52 id_ns
->nuse
= id_ns
->ncap
;
54 /* support DULBE and I/O optimization fields */
55 id_ns
->nsfeat
|= (0x4 | 0x10);
57 npdg
= ns
->blkconf
.discard_granularity
/ ns
->blkconf
.logical_block_size
;
59 if (bdrv_get_info(blk_bs(ns
->blkconf
.blk
), &bdi
) >= 0 &&
60 bdi
.cluster_size
> ns
->blkconf
.discard_granularity
) {
61 npdg
= bdi
.cluster_size
/ ns
->blkconf
.logical_block_size
;
64 id_ns
->npda
= id_ns
->npdg
= npdg
- 1;
69 static int nvme_ns_init_blk(NvmeNamespace
*ns
, Error
**errp
)
73 if (!blkconf_blocksizes(&ns
->blkconf
, errp
)) {
77 read_only
= !blk_supports_write_perm(ns
->blkconf
.blk
);
78 if (!blkconf_apply_backend_options(&ns
->blkconf
, read_only
, false, errp
)) {
82 if (ns
->blkconf
.discard_granularity
== -1) {
83 ns
->blkconf
.discard_granularity
=
84 MAX(ns
->blkconf
.logical_block_size
, MIN_DISCARD_GRANULARITY
);
87 ns
->size
= blk_getlength(ns
->blkconf
.blk
);
89 error_setg_errno(errp
, -ns
->size
, "could not get blockdev size");
96 static int nvme_ns_zoned_check_calc_geometry(NvmeNamespace
*ns
, Error
**errp
)
98 uint64_t zone_size
, zone_cap
;
99 uint32_t lbasz
= ns
->blkconf
.logical_block_size
;
101 /* Make sure that the values of ZNS properties are sane */
102 if (ns
->params
.zone_size_bs
) {
103 zone_size
= ns
->params
.zone_size_bs
;
105 zone_size
= NVME_DEFAULT_ZONE_SIZE
;
107 if (ns
->params
.zone_cap_bs
) {
108 zone_cap
= ns
->params
.zone_cap_bs
;
110 zone_cap
= zone_size
;
112 if (zone_cap
> zone_size
) {
113 error_setg(errp
, "zone capacity %"PRIu64
"B exceeds "
114 "zone size %"PRIu64
"B", zone_cap
, zone_size
);
117 if (zone_size
< lbasz
) {
118 error_setg(errp
, "zone size %"PRIu64
"B too small, "
119 "must be at least %"PRIu32
"B", zone_size
, lbasz
);
122 if (zone_cap
< lbasz
) {
123 error_setg(errp
, "zone capacity %"PRIu64
"B too small, "
124 "must be at least %"PRIu32
"B", zone_cap
, lbasz
);
129 * Save the main zone geometry values to avoid
130 * calculating them later again.
132 ns
->zone_size
= zone_size
/ lbasz
;
133 ns
->zone_capacity
= zone_cap
/ lbasz
;
134 ns
->num_zones
= ns
->size
/ lbasz
/ ns
->zone_size
;
136 /* Do a few more sanity checks of ZNS properties */
137 if (!ns
->num_zones
) {
139 "insufficient drive capacity, must be at least the size "
140 "of one zone (%"PRIu64
"B)", zone_size
);
144 if (ns
->params
.max_open_zones
> ns
->num_zones
) {
146 "max_open_zones value %u exceeds the number of zones %u",
147 ns
->params
.max_open_zones
, ns
->num_zones
);
150 if (ns
->params
.max_active_zones
> ns
->num_zones
) {
152 "max_active_zones value %u exceeds the number of zones %u",
153 ns
->params
.max_active_zones
, ns
->num_zones
);
157 if (ns
->params
.zd_extension_size
) {
158 if (ns
->params
.zd_extension_size
& 0x3f) {
160 "zone descriptor extension size must be a multiple of 64B");
163 if ((ns
->params
.zd_extension_size
>> 6) > 0xff) {
164 error_setg(errp
, "zone descriptor extension size is too large");
/*
 * Allocate and initialize the per-zone runtime state of a zoned namespace:
 * the zone array, the optional zone descriptor extension buffer and the
 * zone lists, then cache log2 of the zone size when it is a power of two.
 *
 * NOTE(review): in the text under review the declarations of `i` and `zone`,
 * the right-hand operand of the extension-buffer size and the end of the loop
 * body are not visible; the notes below only describe what is shown.
 */
172 static void nvme_ns_zoned_init_state(NvmeNamespace
*ns
)
174 uint64_t start
= 0, zone_size
= ns
->zone_size
;
/* Total number of logical blocks covered by whole zones. */
175 uint64_t capacity
= ns
->num_zones
* zone_size
;
/* g_new0() zero-fills, so every zone descriptor starts out cleared. */
179 ns
->zone_array
= g_new0(NvmeZone
, ns
->num_zones
);
/*
 * NOTE(review): the second factor of this product is cut off here. If it is
 * a 32-bit count (presumably the number of zones), the multiplication could
 * wrap before reaching g_malloc0() - confirm and widen if so.
 */
180 if (ns
->params
.zd_extension_size
) {
181 ns
->zd_extensions
= g_malloc0(ns
->params
.zd_extension_size
*
/* All zone lists start empty. */
185 QTAILQ_INIT(&ns
->exp_open_zones
);
186 QTAILQ_INIT(&ns
->imp_open_zones
);
187 QTAILQ_INIT(&ns
->closed_zones
);
188 QTAILQ_INIT(&ns
->full_zones
);
190 zone
= ns
->zone_array
;
191 for (i
= 0; i
< ns
->num_zones
; i
++, zone
++) {
/* Clamp the zone so that it does not extend past the capacity. */
192 if (start
+ zone_size
> capacity
) {
193 zone_size
= capacity
- start
;
/* Every zone is sequential-write-required and starts in the Empty state. */
195 zone
->d
.zt
= NVME_ZONE_TYPE_SEQ_WRITE
;
196 nvme_set_zone_state(zone
, NVME_ZONE_STATE_EMPTY
);
198 zone
->d
.zcap
= ns
->zone_capacity
;
199 zone
->d
.zslba
= start
;
/*
 * zone_size_log2 stays 0 unless the zone size is a power of two, in which
 * case it holds log2(zone size).
 */
205 ns
->zone_size_log2
= 0;
206 if (is_power_of_2(ns
->zone_size
)) {
207 ns
->zone_size_log2
= 63 - clz64(ns
->zone_size
);
211 static void nvme_ns_init_zoned(NvmeNamespace
*ns
, int lba_index
)
213 NvmeIdNsZoned
*id_ns_z
;
215 nvme_ns_zoned_init_state(ns
);
217 id_ns_z
= g_malloc0(sizeof(NvmeIdNsZoned
));
219 /* MAR/MOR are zeroes-based, 0xffffffff means no limit */
220 id_ns_z
->mar
= cpu_to_le32(ns
->params
.max_active_zones
- 1);
221 id_ns_z
->mor
= cpu_to_le32(ns
->params
.max_open_zones
- 1);
223 id_ns_z
->ozcs
= ns
->params
.cross_zone_read
? 0x01 : 0x00;
225 id_ns_z
->lbafe
[lba_index
].zsze
= cpu_to_le64(ns
->zone_size
);
226 id_ns_z
->lbafe
[lba_index
].zdes
=
227 ns
->params
.zd_extension_size
>> 6; /* Units of 64B */
229 ns
->csi
= NVME_CSI_ZONED
;
230 ns
->id_ns
.nsze
= cpu_to_le64(ns
->num_zones
* ns
->zone_size
);
231 ns
->id_ns
.ncap
= ns
->id_ns
.nsze
;
232 ns
->id_ns
.nuse
= ns
->id_ns
.ncap
;
235 * The device uses the BDRV_BLOCK_ZERO flag to determine the "deallocated"
236 * status of logical blocks. Since the spec defines that logical blocks
237 * SHALL be deallocated when then zone is in the Empty or Offline states,
238 * we can only support DULBE if the zone size is a multiple of the
241 if (ns
->zone_size
% (ns
->id_ns
.npdg
+ 1)) {
242 warn_report("the zone size (%"PRIu64
" blocks) is not a multiple of "
243 "the calculated deallocation granularity (%d blocks); "
244 "DULBE support disabled",
245 ns
->zone_size
, ns
->id_ns
.npdg
+ 1);
247 ns
->id_ns
.nsfeat
&= ~0x4;
250 ns
->id_ns_zoned
= id_ns_z
;
253 static void nvme_clear_zone(NvmeNamespace
*ns
, NvmeZone
*zone
)
257 zone
->w_ptr
= zone
->d
.wp
;
258 state
= nvme_get_zone_state(zone
);
259 if (zone
->d
.wp
!= zone
->d
.zslba
||
260 (zone
->d
.za
& NVME_ZA_ZD_EXT_VALID
)) {
261 if (state
!= NVME_ZONE_STATE_CLOSED
) {
262 trace_pci_nvme_clear_ns_close(state
, zone
->d
.zslba
);
263 nvme_set_zone_state(zone
, NVME_ZONE_STATE_CLOSED
);
265 nvme_aor_inc_active(ns
);
266 QTAILQ_INSERT_HEAD(&ns
->closed_zones
, zone
, entry
);
268 trace_pci_nvme_clear_ns_reset(state
, zone
->d
.zslba
);
269 nvme_set_zone_state(zone
, NVME_ZONE_STATE_EMPTY
);
274 * Close all the zones that are currently open.
276 static void nvme_zoned_ns_shutdown(NvmeNamespace
*ns
)
278 NvmeZone
*zone
, *next
;
280 QTAILQ_FOREACH_SAFE(zone
, &ns
->closed_zones
, entry
, next
) {
281 QTAILQ_REMOVE(&ns
->closed_zones
, zone
, entry
);
282 nvme_aor_dec_active(ns
);
283 nvme_clear_zone(ns
, zone
);
285 QTAILQ_FOREACH_SAFE(zone
, &ns
->imp_open_zones
, entry
, next
) {
286 QTAILQ_REMOVE(&ns
->imp_open_zones
, zone
, entry
);
287 nvme_aor_dec_open(ns
);
288 nvme_aor_dec_active(ns
);
289 nvme_clear_zone(ns
, zone
);
291 QTAILQ_FOREACH_SAFE(zone
, &ns
->exp_open_zones
, entry
, next
) {
292 QTAILQ_REMOVE(&ns
->exp_open_zones
, zone
, entry
);
293 nvme_aor_dec_open(ns
);
294 nvme_aor_dec_active(ns
);
295 nvme_clear_zone(ns
, zone
);
298 assert(ns
->nr_open_zones
== 0);
301 static int nvme_ns_check_constraints(NvmeNamespace
*ns
, Error
**errp
)
303 if (!ns
->blkconf
.blk
) {
304 error_setg(errp
, "block backend not configured");
311 int nvme_ns_setup(NvmeNamespace
*ns
, Error
**errp
)
313 if (nvme_ns_check_constraints(ns
, errp
)) {
317 if (nvme_ns_init_blk(ns
, errp
)) {
321 if (nvme_ns_init(ns
, errp
)) {
324 if (ns
->params
.zoned
) {
325 if (nvme_ns_zoned_check_calc_geometry(ns
, errp
) != 0) {
328 nvme_ns_init_zoned(ns
, 0);
334 void nvme_ns_drain(NvmeNamespace
*ns
)
336 blk_drain(ns
->blkconf
.blk
);
339 void nvme_ns_shutdown(NvmeNamespace
*ns
)
341 blk_flush(ns
->blkconf
.blk
);
342 if (ns
->params
.zoned
) {
343 nvme_zoned_ns_shutdown(ns
);
347 void nvme_ns_cleanup(NvmeNamespace
*ns
)
349 if (ns
->params
.zoned
) {
350 g_free(ns
->id_ns_zoned
);
351 g_free(ns
->zone_array
);
352 g_free(ns
->zd_extensions
);
356 static void nvme_ns_realize(DeviceState
*dev
, Error
**errp
)
358 NvmeNamespace
*ns
= NVME_NS(dev
);
359 BusState
*s
= qdev_get_parent_bus(dev
);
360 NvmeCtrl
*n
= NVME(s
->parent
);
362 if (nvme_ns_setup(ns
, errp
)) {
366 if (nvme_register_namespace(n
, ns
, errp
)) {
/*
 * User-configurable properties of the namespace device. The "zoned.*"
 * entries are validated by nvme_ns_zoned_check_calc_geometry().
 */
372 static Property nvme_ns_props
[] = {
373 DEFINE_BLOCK_PROPERTIES(NvmeNamespace
, blkconf
),
374 DEFINE_PROP_UINT32("nsid", NvmeNamespace
, params
.nsid
, 0),
375 DEFINE_PROP_UUID("uuid", NvmeNamespace
, params
.uuid
),
376 DEFINE_PROP_BOOL("zoned", NvmeNamespace
, params
.zoned
, false),
/* Zone size in bytes; a value of 0 is replaced by NVME_DEFAULT_ZONE_SIZE. */
377 DEFINE_PROP_SIZE("zoned.zone_size", NvmeNamespace
, params
.zone_size_bs
,
378 NVME_DEFAULT_ZONE_SIZE
),
/*
 * Zone capacity in bytes; a value of 0 makes the capacity equal to the zone
 * size. NOTE(review): the default-value argument of this entry is not
 * visible in the text under review.
 */
379 DEFINE_PROP_SIZE("zoned.zone_capacity", NvmeNamespace
, params
.zone_cap_bs
,
381 DEFINE_PROP_BOOL("zoned.cross_read", NvmeNamespace
,
382 params
.cross_zone_read
, false),
/* For both limits, 0 is reported to the host as "no limit". */
383 DEFINE_PROP_UINT32("zoned.max_active", NvmeNamespace
,
384 params
.max_active_zones
, 0),
385 DEFINE_PROP_UINT32("zoned.max_open", NvmeNamespace
,
386 params
.max_open_zones
, 0),
/* In bytes; must be a multiple of 64 and at most 0xff * 64. */
387 DEFINE_PROP_UINT32("zoned.descr_ext_size", NvmeNamespace
,
388 params
.zd_extension_size
, 0),
389 DEFINE_PROP_END_OF_LIST(),
392 static void nvme_ns_class_init(ObjectClass
*oc
, void *data
)
394 DeviceClass
*dc
= DEVICE_CLASS(oc
);
396 set_bit(DEVICE_CATEGORY_STORAGE
, dc
->categories
);
398 dc
->bus_type
= TYPE_NVME_BUS
;
399 dc
->realize
= nvme_ns_realize
;
400 device_class_set_props(dc
, nvme_ns_props
);
401 dc
->desc
= "Virtual NVMe namespace";
/*
 * Instance initializer: adds the "bootindex" property to the namespace
 * object, using a "/namespace@<nsid>,0" suffix built from params.nsid as it
 * is at this point.
 *
 * NOTE(review): `bootindex` is heap-allocated by g_strdup_printf(); the text
 * under review does not show it being released, nor whether
 * device_add_bootindex_property() keeps the pointer - confirm ownership.
 * NOTE(review): nsid is declared as a UINT32 property but is formatted with
 * "%d" - confirm this is intended.
 */
404 static void nvme_ns_instance_init(Object
*obj
)
406 NvmeNamespace
*ns
= NVME_NS(obj
);
407 char *bootindex
= g_strdup_printf("/namespace@%d,0", ns
->params
.nsid
);
409 device_add_bootindex_property(obj
, &ns
->bootindex
, "bootindex",
410 bootindex
, DEVICE(obj
));
415 static const TypeInfo nvme_ns_info
= {
416 .name
= TYPE_NVME_NS
,
417 .parent
= TYPE_DEVICE
,
418 .class_init
= nvme_ns_class_init
,
419 .instance_size
= sizeof(NvmeNamespace
),
420 .instance_init
= nvme_ns_instance_init
,
423 static void nvme_ns_register_types(void)
425 type_register_static(&nvme_ns_info
);
428 type_init(nvme_ns_register_types
)