2 * QEMU System Emulator block driver
4 * Copyright (c) 2003 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 #ifndef BLOCK_COMMON_H
25 #define BLOCK_COMMON_H
27 #include "qapi/qapi-types-block-core.h"
28 #include "qemu/queue.h"
31 * co_wrapper{*}: Function specifiers used by block-coroutine-wrapper.py
33 * Function specifiers, which do nothing but mark functions to be
34 * generated by scripts/block-coroutine-wrapper.py
36 * Usage: read docs/devel/block-coroutine-wrapper.rst
38 * There are 4 kinds of specifiers:
39 * - co_wrapper functions can only be called from non-coroutine context,
40 * because they always generate a new coroutine.
41 * - co_wrapper_mixed functions can be called from both coroutine and
42 * non-coroutine context.
43 * - co_wrapper_bdrv_rdlock are co_wrapper functions but automatically take and
44 * release the graph rdlock when creating a new coroutine
45 * - co_wrapper_mixed_bdrv_rdlock are co_wrapper_mixed functions but
46 * automatically take and release the graph rdlock when creating a new
49 * These functions should not be called from a coroutine_fn; instead,
50 * call the wrapped function directly.
/*
 * Marker specifiers for scripts/block-coroutine-wrapper.py.
 * Each *_bdrv_rdlock variant expands to the same annotations as its plain
 * counterpart; the difference only matters to the generator script.
 */
#define co_wrapper                      no_coroutine_fn
#define co_wrapper_mixed                no_coroutine_fn coroutine_mixed_fn
#define co_wrapper_bdrv_rdlock          no_coroutine_fn
#define co_wrapper_mixed_bdrv_rdlock    no_coroutine_fn coroutine_mixed_fn
58 * no_co_wrapper: Function specifier used by block-coroutine-wrapper.py
60 * Function specifier which does nothing but mark functions to be generated by
61 * scripts/block-coroutine-wrapper.py.
63 * A no_co_wrapper function declaration creates a coroutine_fn wrapper around
64 * functions that must not be called in coroutine context. It achieves this by
65 * scheduling a BH (bottom half) that runs the respective non-coroutine
66 * function. The coroutine yields after scheduling the BH and is reentered when
67 * the wrapped function returns.
69 * If the first parameter of the function is a BlockDriverState, BdrvChild or
70 * BlockBackend pointer, the AioContext lock for it is taken in the wrapper.
74 #include "block/blockjob.h"
/* Forward declarations so these types can be referred to by pointer. */
typedef struct BlockDriver BlockDriver;
typedef struct BdrvChild BdrvChild;
typedef struct BdrvChildClass BdrvChildClass;
81 typedef enum BlockZoneOp
{
88 typedef enum BlockZoneModel
{
89 BLK_Z_NONE
= 0x0, /* Regular block device */
90 BLK_Z_HM
= 0x1, /* Host-managed zoned block device */
91 BLK_Z_HA
= 0x2, /* Host-aware zoned block device */
94 typedef enum BlockZoneState
{
102 BLK_ZS_OFFLINE
= 0xF,
105 typedef enum BlockZoneType
{
106 BLK_ZT_CONV
= 0x1, /* Conventional random writes supported */
107 BLK_ZT_SWR
= 0x2, /* Sequential writes required */
108 BLK_ZT_SWP
= 0x3, /* Sequential writes preferred */
112 * Zone descriptor data structure.
113 * Provides information on a zone with all position and size values in bytes.
115 typedef struct BlockZoneDescriptor
{
121 BlockZoneState state
;
122 } BlockZoneDescriptor
;
125 * Track write pointers of a zone in bytes.
127 typedef struct BlockZoneWps
{
132 typedef struct BlockDriverInfo
{
133 /* in bytes, 0 if irrelevant */
136 * A fraction of cluster_size, if supported (currently QCOW2 only); if
137 * disabled or unsupported, set equal to cluster_size.
140 /* offset at which the VM state can be saved (0 if not possible) */
141 int64_t vm_state_offset
;
144 * True if this block driver only supports compressed writes
146 bool needs_compressed_writes
;
149 typedef struct BlockFragInfo
{
150 uint64_t allocated_clusters
;
151 uint64_t total_clusters
;
152 uint64_t fragmented_clusters
;
153 uint64_t compressed_clusters
;
157 BDRV_REQ_COPY_ON_READ
= 0x1,
158 BDRV_REQ_ZERO_WRITE
= 0x2,
161 * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
162 * that the block driver should unmap (discard) blocks if it is guaranteed
163 * that the result will read back as zeroes. The flag is only passed to the
164 * driver if the block device is opened with BDRV_O_UNMAP.
166 BDRV_REQ_MAY_UNMAP
= 0x4,
169 * An optimization hint when all QEMUIOVector elements are within
170 * previously registered bdrv_register_buf() memory ranges.
172 * Code that replaces the user's QEMUIOVector elements with bounce buffers
173 * must take care to clear this flag.
175 BDRV_REQ_REGISTERED_BUF
= 0x8,
178 BDRV_REQ_WRITE_COMPRESSED
= 0x20,
181 * Signifies that this write request will not change the visible disk
184 BDRV_REQ_WRITE_UNCHANGED
= 0x40,
187 * Forces request serialisation. Use only with write requests.
189 BDRV_REQ_SERIALISING
= 0x80,
192 * Execute the request only if the operation can be offloaded or otherwise
193 * be executed efficiently, but return an error instead of using a slow
196 BDRV_REQ_NO_FALLBACK
= 0x100,
199 * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
200 * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
201 * filter is involved), in which case it signals that the COR operation
202 * need not read the data into memory (qiov) but only ensure they are
203 * copied to the top layer (i.e., that COR operation is done).
205 BDRV_REQ_PREFETCH
= 0x200,
208 * If we need to wait for other requests, just fail immediately. Used
209 * only together with BDRV_REQ_SERIALISING. Used only with requests aligned
210 * to request_alignment (corresponding assertions are in block/io.c).
212 BDRV_REQ_NO_WAIT
= 0x400,
214 /* Mask of valid flags */
215 BDRV_REQ_MASK
= 0x7ff,
/*
 * Open flags.  Every value is a distinct single bit; 0x0040 is not assigned
 * in this list.
 */
#define BDRV_O_NO_SHARE     0x0001 /* don't share permissions */
#define BDRV_O_RDWR         0x0002
#define BDRV_O_RESIZE       0x0004 /* request permission for resizing the node */
/* open the file read only and save writes in a snapshot */
#define BDRV_O_SNAPSHOT     0x0008
#define BDRV_O_TEMPORARY    0x0010 /* delete the file after use */
#define BDRV_O_NOCACHE      0x0020 /* do not use the host page cache */
/* use native AIO instead of the thread pool */
#define BDRV_O_NATIVE_AIO   0x0080
#define BDRV_O_NO_BACKING   0x0100 /* don't open the backing file */
#define BDRV_O_NO_FLUSH     0x0200 /* disable flushing on this disk */
#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
#define BDRV_O_INACTIVE     0x0800 /* consistency hint for migration handoff */
#define BDRV_O_CHECK        0x1000 /* open solely for consistency check */
#define BDRV_O_ALLOW_RDWR   0x2000 /* allow reopen to change from r/o to r/w */
#define BDRV_O_UNMAP        0x4000 /* execute guest UNMAP/TRIM operations */
/*
 * if no block driver is explicitly given: select an appropriate protocol
 * driver, ignoring the format layer
 */
#define BDRV_O_PROTOCOL     0x8000
#define BDRV_O_NO_IO        0x10000 /* don't initialize for I/O */
/* degrade to read-only if opening read-write fails */
#define BDRV_O_AUTO_RDONLY  0x20000
/* use io_uring instead of the thread pool */
#define BDRV_O_IO_URING     0x40000

/* The cache-related subset of the open flags */
#define BDRV_O_CACHE_MASK   (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
/*
 * Names of the options that the block layer itself parses.
 */
#define BDRV_OPT_CACHE_WB           "cache.writeback"
#define BDRV_OPT_CACHE_DIRECT       "cache.direct"
#define BDRV_OPT_CACHE_NO_FLUSH     "cache.no-flush"
#define BDRV_OPT_READ_ONLY          "read-only"
#define BDRV_OPT_AUTO_READ_ONLY     "auto-read-only"
#define BDRV_OPT_DISCARD            "discard"
#define BDRV_OPT_FORCE_SHARE        "force-share"
/* Sector size expressed as a shift count and as a byte count (1 << 9 = 512). */
#define BDRV_SECTOR_BITS    9
#define BDRV_SECTOR_SIZE    (1ULL << BDRV_SECTOR_BITS)
/*
 * Test the most significant bit (bit 63) of the write pointer value wp.
 * A non-zero result means the conventional-zone bit is set; if the bit is
 * zero, the zone type is SWR.
 *
 * The argument is parenthesized so that an expression argument built from
 * operators with lower precedence than '&' (e.g. a | b) is evaluated as a
 * whole before the mask is applied.
 */
#define BDRV_ZT_IS_CONV(wp) ((wp) & (1ULL << 63))
/*
 * Upper bound for a single request, in sectors and in bytes.  The sector
 * limit combines SIZE_MAX and INT_MAX, each converted to sectors, through
 * MIN_CONST (presumably a compile-time minimum; defined elsewhere).
 */
#define BDRV_REQUEST_MAX_SECTORS \
    MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, INT_MAX >> BDRV_SECTOR_BITS)
#define BDRV_REQUEST_MAX_BYTES \
    (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
270 * We want to allow aligning requests and the disk length up to any alignment
271 * not exceeding BDRV_MAX_ALIGNMENT without having to worry about overflow.
272 * To achieve this, and at the same time use a reasonably round number as the
273 * maximum disk size, define the maximum "length" (a limit for any offset/bytes
274 * request and for the disk size) as INT64_MAX aligned down to BDRV_MAX_ALIGNMENT.
/* Largest alignment taken into account: 2^30 */
#define BDRV_MAX_ALIGNMENT  (1L << 30)
/* INT64_MAX passed through QEMU_ALIGN_DOWN with BDRV_MAX_ALIGNMENT */
#define BDRV_MAX_LENGTH     (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
280 * Allocation status flags for bdrv_block_status() and friends.
283 * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
284 * BDRV_BLOCK_ZERO: offset reads as zero
285 * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
286 * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
287 * layer rather than any backing, set by block layer
288 * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
289 * layer, set by block layer
292 * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
293 * that the block layer recompute the answer from the returned
294 * BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
295 * BDRV_BLOCK_RECURSE: request that the block layer will recursively search for
296 * zeroes in file child of current block node inside
297 * returned region. Only valid together with both
298 * BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
299 * appear with BDRV_BLOCK_ZERO.
301 * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
302 * host offset within the returned BDS that is allocated for the
303 * corresponding raw guest data. However, whether that offset
304 * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
306 * DATA ZERO OFFSET_VALID
307 * t t t sectors read as zero, returned file is zero at offset
308 * t f t sectors read as valid from file at offset
309 * f t t sectors preallocated, read as zero, returned file not
310 * necessarily zero at offset
311 * f f t sectors preallocated but read from backing_hd,
312 * returned file contains garbage at offset
313 * t t f sectors preallocated, read as zero, unknown offset
314 * t f f sectors read from unknown file or offset
315 * f t f not allocated or unknown offset, read as zero
316 * f f f not allocated or unknown offset, read from backing_hd
/* Allocation status bits; one distinct bit per flag. */
#define BDRV_BLOCK_DATA             0x01 /* data allocation tied to this layer */
#define BDRV_BLOCK_ZERO             0x02 /* offset reads as zero */
#define BDRV_BLOCK_OFFSET_VALID     0x04 /* an offset for raw data exists */
#define BDRV_BLOCK_RAW              0x08 /* passthrough: recompute from BDS */
#define BDRV_BLOCK_ALLOCATED        0x10 /* content determined by this layer */
#define BDRV_BLOCK_EOF              0x20 /* pnum covers through end of file */
#define BDRV_BLOCK_RECURSE          0x40 /* search file child for zeroes */
326 typedef QTAILQ_HEAD(BlockReopenQueue
, BlockReopenQueueEntry
) BlockReopenQueue
;
328 typedef struct BDRVReopenState
{
329 BlockDriverState
*bs
;
331 BlockdevDetectZeroesOptions detect_zeroes
;
332 bool backing_missing
;
333 BlockDriverState
*old_backing_bs
; /* keep pointer for permissions update */
334 BlockDriverState
*old_file_bs
; /* keep pointer for permissions update */
336 QDict
*explicit_options
;
341 * Block operation types
343 typedef enum BlockOpType
{
344 BLOCK_OP_TYPE_BACKUP_SOURCE
,
345 BLOCK_OP_TYPE_BACKUP_TARGET
,
346 BLOCK_OP_TYPE_CHANGE
,
347 BLOCK_OP_TYPE_COMMIT_SOURCE
,
348 BLOCK_OP_TYPE_COMMIT_TARGET
,
349 BLOCK_OP_TYPE_DATAPLANE
,
350 BLOCK_OP_TYPE_DRIVE_DEL
,
352 BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT
,
353 BLOCK_OP_TYPE_INTERNAL_SNAPSHOT
,
354 BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE
,
355 BLOCK_OP_TYPE_MIRROR_SOURCE
,
356 BLOCK_OP_TYPE_MIRROR_TARGET
,
357 BLOCK_OP_TYPE_RESIZE
,
358 BLOCK_OP_TYPE_STREAM
,
359 BLOCK_OP_TYPE_REPLACE
,
363 /* Block node permission constants */
366 * A user that has the "permission" of consistent reads is guaranteed that
367 * their view of the contents of the block device is complete and
368 * self-consistent, representing the contents of a disk at a specific
371 * For most block devices (including their backing files) this is true, but
372 * the property cannot be maintained in a few situations like for
373 * intermediate nodes of a commit block job.
375 BLK_PERM_CONSISTENT_READ
= 0x01,
377 /** This permission is required to change the visible disk contents. */
378 BLK_PERM_WRITE
= 0x02,
381 * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
382 * required for writes to the block node when the caller promises that
383 * the visible disk content doesn't change.
385 * As the BLK_PERM_WRITE permission is strictly stronger, either is
386 * sufficient to perform an unchanging write.
388 BLK_PERM_WRITE_UNCHANGED
= 0x04,
390 /** This permission is required to change the size of a block node. */
391 BLK_PERM_RESIZE
= 0x08,
394 * There was a now-removed bit BLK_PERM_GRAPH_MOD, with value of 0x10. QEMU
395 * 6.1 and earlier may still lock the corresponding byte in block/file-posix
396 * locking. So any new permission must be chosen carefully so that it does
397 * not interfere with this old, unused bit.
402 DEFAULT_PERM_PASSTHROUGH
= BLK_PERM_CONSISTENT_READ
404 | BLK_PERM_WRITE_UNCHANGED
407 DEFAULT_PERM_UNCHANGED
= BLK_PERM_ALL
& ~DEFAULT_PERM_PASSTHROUGH
,
411 * Flags that parent nodes assign to child nodes to specify what kind of
414 * At least one of DATA, METADATA, FILTERED, or COW must be set for
418 * = Connection with bs->children, bs->file and bs->backing fields =
422 * Filter drivers have drv->is_filter = true.
424 * Filter node has exactly one FILTERED|PRIMARY child, and may have other
425 * children which must not have these bits (one example is the
426 * copy-before-write filter, which also has its target DATA child).
428 * Filter nodes never have COW children.
430 * For most filters, the filtered child is linked in bs->file, bs->backing is
431 * NULL. For some filters (as an exception), it is the other way around; those
432 * drivers will have drv->filtered_child_is_backing set to true (see that
433 * field’s documentation for what drivers this concerns)
435 * 2. "raw" driver (block/raw-format.c)
437 * Formally it's not a filter (drv->is_filter = false)
439 * bs->backing is always NULL
441 * Only has one child, linked in bs->file. Its role is either FILTERED|PRIMARY
442 * (like filter) or DATA|PRIMARY depending on options.
446 * Don't have any FILTERED children.
448 * May have at most one COW child. In this case it's linked in bs->backing.
449 * Otherwise bs->backing is NULL. COW child is never PRIMARY.
451 * May have at most one PRIMARY child. In this case it's linked in bs->file.
452 * Otherwise bs->file is NULL.
454 * May also have some other children that don't have the PRIMARY or COW bit set.
456 enum BdrvChildRoleBits
{
458 * This child stores data.
459 * Any node may have an arbitrary number of such children.
461 BDRV_CHILD_DATA
= (1 << 0),
464 * This child stores metadata.
465 * Any node may have an arbitrary number of metadata-storing
468 BDRV_CHILD_METADATA
= (1 << 1),
471 * A child that always presents exactly the same visible data as
472 * the parent, e.g. by virtue of the parent forwarding all reads
474 * This flag is mutually exclusive with DATA, METADATA, and COW.
475 * Any node may have at most one filtered child at a time.
477 BDRV_CHILD_FILTERED
= (1 << 2),
480 * Child from which to read all data that isn't allocated in the
481 * parent (i.e., the backing child); such data is copied to the
482 * parent through COW (and optionally COR).
483 * This flag is mutually exclusive with DATA, METADATA, and
485 * Any node may have at most one such backing child at a time.
487 BDRV_CHILD_COW
= (1 << 3),
490 * The primary child. For most drivers, this is the child whose
491 * filename applies best to the parent node.
492 * Any node may have at most one primary child at a time.
494 BDRV_CHILD_PRIMARY
= (1 << 4),
496 /* Useful combination of flags */
497 BDRV_CHILD_IMAGE
= BDRV_CHILD_DATA
498 | BDRV_CHILD_METADATA
499 | BDRV_CHILD_PRIMARY
,
/* Bitmask type holding a combination of BdrvChildRoleBits values */
typedef unsigned int BdrvChildRole;
505 typedef struct BdrvCheckResult
{
509 int corruptions_fixed
;
511 int64_t image_end_offset
;
520 typedef struct BlockSizes
{
525 typedef struct HDGeometry
{
532 * Common functions that are neither I/O nor Global State.
534 * These functions must never call any function from other categories
535 * (I/O, "I/O or GS", Global State) except this one, but can be invoked by
539 char *bdrv_perm_names(uint64_t perm
);
540 uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm
);
542 void bdrv_init_with_whitelist(void);
543 bool bdrv_uses_whitelist(void);
544 int bdrv_is_whitelisted(BlockDriver
*drv
, bool read_only
);
546 int bdrv_parse_aio(const char *mode
, int *flags
);
547 int bdrv_parse_cache_mode(const char *mode
, int *flags
, bool *writethrough
);
548 int bdrv_parse_discard_flags(const char *mode
, int *flags
);
550 int path_has_protocol(const char *path
);
551 int path_is_absolute(const char *path
);
552 char *path_combine(const char *base_path
, const char *filename
);
554 char *bdrv_get_full_backing_filename_from_filename(const char *backed
,
558 #endif /* BLOCK_COMMON_H */