2 * preallocate filter driver
4 * The driver performs preallocate operation: it is injected above
5 * some node, and before each write over EOF it does additional preallocating
6 * write-zeroes request.
8 * Copyright (c) 2020 Virtuozzo International GmbH.
11 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
13 * This program is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * This program is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program. If not, see <http://www.gnu.org/licenses/>.
27 #include "qemu/osdep.h"
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block_int.h"
/*
 * User-configurable parameters of the preallocate filter.
 *
 * Both values are filled from the "prealloc-size" / "prealloc-align" runtime
 * options and are used when computing the end offset of a preallocation.
 */
typedef struct PreallocateOpts {
    /* Extra space to preallocate past the end of a write. */
    int64_t prealloc_size;
    /* The preallocated end offset is rounded up to a multiple of this. */
    int64_t prealloc_align;
} PreallocateOpts;
/*
 * Per-node state of the preallocate filter; the driver callbacks obtain it
 * from bs->opaque.
 *
 * NOTE(review): this listing has lost lines (the embedded numbers jump, e.g.
 * 41 -> 45 and 50 -> 55), so the member declarations are not visible here.
 * The rest of the file accesses four members through this type: opts,
 * data_end, zero_start and file_end. The text fragments below are the
 * remains of the comments describing the last three; several sentences are
 * cut off ("If < 0 the status is ..."). Restore the missing lines from a
 * pristine copy before building.
 */
41 typedef struct BDRVPreallocateState
{
45 * Track real data end, to crop preallocation on close. If < 0 the status is
48 * @data_end is a maximum of file size on open (or when we get write/resize
49 * permissions) and all write request ends after it. So it's safe to
50 * truncate to data_end if it is valid.
55 * Start of trailing preallocated area which reads as zero. May be smaller
56 * than data_end, if user does over-EOF write zero operation. If < 0 the
59 * If both @zero_start and @file_end are valid, the region
60 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
61 * is not valid, @zero_start doesn't make much sense.
66 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
67 * to avoid extra lseek() calls on each write operation. If < 0 the status
73 * All three states @data_end, @zero_start and @file_end are guaranteed to
74 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
75 * BLK_PERM_WRITE permissions on file child.
77 } BDRVPreallocateState
;
/* Names of the runtime options understood by the preallocate filter. */
#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
/*
 * Option schema of the filter, named "preallocate". Two options are visible,
 * both of type QEMU_OPT_SIZE: PREALLOCATE_OPT_PREALLOC_ALIGN and
 * PREALLOCATE_OPT_PREALLOC_SIZE.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 84-85, 89-91 and 95-99): the wrapper around the option descriptors, the
 * second half of the prealloc-align help string and the closing of the
 * initializer are not visible. Restore them from a pristine copy.
 */
81 static QemuOptsList runtime_opts
= {
82 .name
= "preallocate",
83 .head
= QTAILQ_HEAD_INITIALIZER(runtime_opts
.head
),
/* Descriptor for the alignment option. */
86 .name
= PREALLOCATE_OPT_PREALLOC_ALIGN
,
87 .type
= QEMU_OPT_SIZE
,
88 .help
= "on preallocation, align file length to this number, "
/* Descriptor for the size option. */
92 .name
= PREALLOCATE_OPT_PREALLOC_SIZE
,
93 .type
= QEMU_OPT_SIZE
,
94 .help
= "how much to preallocate, default 128M",
100 static bool preallocate_absorb_opts(PreallocateOpts
*dest
, QDict
*options
,
101 BlockDriverState
*child_bs
, Error
**errp
)
103 QemuOpts
*opts
= qemu_opts_create(&runtime_opts
, NULL
, 0, &error_abort
);
105 if (!qemu_opts_absorb_qdict(opts
, options
, errp
)) {
109 dest
->prealloc_align
=
110 qemu_opt_get_size(opts
, PREALLOCATE_OPT_PREALLOC_ALIGN
, 1 * MiB
);
111 dest
->prealloc_size
=
112 qemu_opt_get_size(opts
, PREALLOCATE_OPT_PREALLOC_SIZE
, 128 * MiB
);
116 if (!QEMU_IS_ALIGNED(dest
->prealloc_align
, BDRV_SECTOR_SIZE
)) {
117 error_setg(errp
, "prealloc-align parameter of preallocate filter "
118 "is not aligned to %llu", BDRV_SECTOR_SIZE
);
122 if (!QEMU_IS_ALIGNED(dest
->prealloc_align
,
123 child_bs
->bl
.request_alignment
)) {
124 error_setg(errp
, "prealloc-align parameter of preallocate filter "
125 "is not aligned to underlying node request alignment "
126 "(%" PRIi32
")", child_bs
->bl
.request_alignment
);
/*
 * Open handler of the filter (installed as .bdrv_open).
 *
 * Visible steps:
 *  - mark file_end, zero_start and data_end invalid (-EINVAL); per the
 *    fragment below they are initialized later, on permission update;
 *  - open the "file" child as a filtered, primary child;
 *  - parse the runtime options with preallocate_absorb_opts();
 *  - advertise BDRV_REQ_WRITE_UNCHANGED plus those of FUA (writes) and
 *    FUA / MAY_UNMAP / NO_FALLBACK (write-zeroes) that the child supports.
 *
 * NOTE(review): lines are missing from this listing (the embedded numbers
 * skip 134-135, 146-150, 152-154, 161-164, ...): the last parameter, the
 * remaining bdrv_open_child() arguments, the failure branches and the return
 * statements are not visible. Restore them from a pristine copy.
 */
133 static int preallocate_open(BlockDriverState
*bs
, QDict
*options
, int flags
,
136 BDRVPreallocateState
*s
= bs
->opaque
;
139 * s->data_end and friends should be initialized on permission update.
140 * For this to work, mark them invalid.
142 s
->file_end
= s
->zero_start
= s
->data_end
= -EINVAL
;
144 bs
->file
= bdrv_open_child(NULL
, options
, "file", bs
, &child_of_bds
,
145 BDRV_CHILD_FILTERED
| BDRV_CHILD_PRIMARY
,
/* Option parsing needs the child node for the request-alignment check. */
151 if (!preallocate_absorb_opts(&s
->opts
, options
, bs
->file
->bs
, errp
)) {
/* Only pass through request flags the child itself supports. */
155 bs
->supported_write_flags
= BDRV_REQ_WRITE_UNCHANGED
|
156 (BDRV_REQ_FUA
& bs
->file
->bs
->supported_write_flags
);
158 bs
->supported_zero_flags
= BDRV_REQ_WRITE_UNCHANGED
|
159 ((BDRV_REQ_FUA
| BDRV_REQ_MAY_UNMAP
| BDRV_REQ_NO_FALLBACK
) &
160 bs
->file
->bs
->supported_zero_flags
);
/*
 * Close handler of the filter (installed as .bdrv_close).
 *
 * Visible logic: if file_end is unknown it is refreshed with
 * bdrv_getlength() of the child; if data_end is smaller than file_end, the
 * child is truncated to data_end with PREALLOC_MODE_OFF, and file_end
 * becomes data_end on success or the negative error code on failure.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 166-167, 171-173, 177-180, 183, 185-190): the declaration of ret, the
 * bodies of the "< 0" checks (presumably early exits - confirm) and the last
 * bdrv_truncate() argument are not visible.
 */
165 static void preallocate_close(BlockDriverState
*bs
)
168 BDRVPreallocateState
*s
= bs
->opaque
;
/* data_end < 0 means the end of real data is not known. */
170 if (s
->data_end
< 0) {
174 if (s
->file_end
< 0) {
175 s
->file_end
= bdrv_getlength(bs
->file
->bs
);
176 if (s
->file_end
< 0) {
/* The file extends past the data end: cut the tail off. */
181 if (s
->data_end
< s
->file_end
) {
182 ret
= bdrv_truncate(bs
->file
, s
->data_end
, true, PREALLOC_MODE_OFF
, 0,
184 s
->file_end
= ret
< 0 ? ret
: s
->data_end
;
/*
 * We must implement reopen handlers, otherwise reopen simply doesn't work.
 * Handle new options and don't care about preallocation state, as it is
 * handled in set/check permission handlers.
 */
/*
 * Reopen "prepare" handler (installed as .bdrv_reopen_prepare).
 *
 * Allocates a zeroed PreallocateOpts, fills it from reopen_state->options
 * with preallocate_absorb_opts() (validated against the current file child)
 * and stores it in reopen_state->opaque, where the commit and abort
 * handlers pick it up.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 199, 201, 204-207, 209-211): the body of the failure branch and the return
 * statements are not visible. Presumably the failure branch frees opts and
 * returns a negative errno - confirm against a pristine copy.
 */
197 static int preallocate_reopen_prepare(BDRVReopenState
*reopen_state
,
198 BlockReopenQueue
*queue
, Error
**errp
)
200 PreallocateOpts
*opts
= g_new0(PreallocateOpts
, 1);
202 if (!preallocate_absorb_opts(opts
, reopen_state
->options
,
203 reopen_state
->bs
->file
->bs
, errp
)) {
/* Hand the parsed options over to the commit/abort handlers. */
208 reopen_state
->opaque
= opts
;
213 static void preallocate_reopen_commit(BDRVReopenState
*state
)
215 BDRVPreallocateState
*s
= state
->bs
->opaque
;
217 s
->opts
= *(PreallocateOpts
*)state
->opaque
;
219 g_free(state
->opaque
);
220 state
->opaque
= NULL
;
223 static void preallocate_reopen_abort(BDRVReopenState
*state
)
225 g_free(state
->opaque
);
226 state
->opaque
= NULL
;
229 static coroutine_fn
int preallocate_co_preadv_part(
230 BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
231 QEMUIOVector
*qiov
, size_t qiov_offset
, BdrvRequestFlags flags
)
233 return bdrv_co_preadv_part(bs
->file
, offset
, bytes
, qiov
, qiov_offset
,
237 static int coroutine_fn
preallocate_co_pdiscard(BlockDriverState
*bs
,
238 int64_t offset
, int64_t bytes
)
240 return bdrv_co_pdiscard(bs
->file
, offset
, bytes
);
243 static bool can_write_resize(uint64_t perm
)
245 return (perm
& BLK_PERM_WRITE
) && (perm
& BLK_PERM_RESIZE
);
248 static bool has_prealloc_perms(BlockDriverState
*bs
)
250 BDRVPreallocateState
*s
= bs
->opaque
;
252 if (can_write_resize(bs
->file
->perm
)) {
253 assert(!(bs
->file
->shared_perm
& BLK_PERM_WRITE
));
254 assert(!(bs
->file
->shared_perm
& BLK_PERM_RESIZE
));
258 assert(s
->data_end
< 0);
259 assert(s
->zero_start
< 0);
260 assert(s
->file_end
< 0);
/*
 * Call on each write. Returns true if @want_merge_zero is true and the region
 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
 * preallocation).
 *
 * want_merge_zero is used to merge write-zero request with preallocation in
 * one bdrv_co_pwrite_zeroes() call.
 */
/*
 * Core of the filter: called for every write to [offset, offset + bytes),
 * extends the file with a write-zeroes request when the write ends past the
 * cached end of file.
 *
 * Visible logic, in order:
 *  - nothing is attempted unless has_prealloc_perms() holds;
 *  - data_end (and, if unknown, file_end) is initialized from the child's
 *    length when invalid;
 *  - file_end is refreshed from the child's length when invalid;
 *  - if the write ends within file_end no preallocation is needed and the
 *    result is want_merge_zero && offset >= zero_start;
 *  - otherwise zeroes are written from prealloc_start up to
 *    end + opts.prealloc_size rounded up to opts.prealloc_align, with
 *    NO_FALLBACK | SERIALISING | NO_WAIT; on the visible path file_end is
 *    then set to prealloc_end and want_merge_zero is returned.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 278-279, 282-284, 288-290, 297-299, 301-302, 304-306, 310-313, 330-334):
 * the declaration of ret, the bodies of several branches (early returns and
 * updates of data_end / zero_start, presumably) and the handling of a failed
 * write-zeroes request are not visible. Do not rely on this listing for the
 * exact state transitions.
 */
272 static bool coroutine_fn
handle_write(BlockDriverState
*bs
, int64_t offset
,
273 int64_t bytes
, bool want_merge_zero
)
275 BDRVPreallocateState
*s
= bs
->opaque
;
276 int64_t end
= offset
+ bytes
;
277 int64_t prealloc_start
, prealloc_end
;
280 if (!has_prealloc_perms(bs
)) {
281 /* We don't have state neither should try to recover it */
/* Lazily initialize the cached data end from the child's length. */
285 if (s
->data_end
< 0) {
286 s
->data_end
= bdrv_getlength(bs
->file
->bs
);
287 if (s
->data_end
< 0) {
291 if (s
->file_end
< 0) {
292 s
->file_end
= s
->data_end
;
/* Write ends within already-known data. */
296 if (end
<= s
->data_end
) {
300 /* We have valid s->data_end, and request writes beyond it. */
303 if (s
->zero_start
< 0 || !want_merge_zero
) {
307 if (s
->file_end
< 0) {
308 s
->file_end
= bdrv_getlength(bs
->file
->bs
);
309 if (s
->file_end
< 0) {
314 /* Now s->data_end, s->zero_start and s->file_end are valid. */
316 if (end
<= s
->file_end
) {
317 /* No preallocation needed. */
318 return want_merge_zero
&& offset
>= s
->zero_start
;
321 /* Now we want new preallocation, as request writes beyond s->file_end. */
/* When merging, the zeroing starts at the request itself if that is earlier. */
323 prealloc_start
= want_merge_zero
? MIN(offset
, s
->file_end
) : s
->file_end
;
324 prealloc_end
= QEMU_ALIGN_UP(end
+ s
->opts
.prealloc_size
,
325 s
->opts
.prealloc_align
);
327 ret
= bdrv_co_pwrite_zeroes(
328 bs
->file
, prealloc_start
, prealloc_end
- prealloc_start
,
329 BDRV_REQ_NO_FALLBACK
| BDRV_REQ_SERIALISING
| BDRV_REQ_NO_WAIT
);
335 s
->file_end
= prealloc_end
;
336 return want_merge_zero
;
339 static int coroutine_fn
preallocate_co_pwrite_zeroes(BlockDriverState
*bs
,
340 int64_t offset
, int64_t bytes
, BdrvRequestFlags flags
)
342 bool want_merge_zero
=
343 !(flags
& ~(BDRV_REQ_ZERO_WRITE
| BDRV_REQ_NO_FALLBACK
));
344 if (handle_write(bs
, offset
, bytes
, want_merge_zero
)) {
348 return bdrv_co_pwrite_zeroes(bs
->file
, offset
, bytes
, flags
);
351 static coroutine_fn
int preallocate_co_pwritev_part(BlockDriverState
*bs
,
356 BdrvRequestFlags flags
)
358 handle_write(bs
, offset
, bytes
, false);
360 return bdrv_co_pwritev_part(bs
->file
, offset
, bytes
, qiov
, qiov_offset
,
/*
 * Truncate handler (installed as .bdrv_co_truncate).
 *
 * Visible logic:
 *  - only when data_end is valid and the new size is larger than data_end,
 *    file_end is refreshed if unknown ("failed to get file length" on error);
 *  - for PREALLOC_MODE_FALLOC with offset <= file_end, data_end is moved to
 *    offset (the fragment below explains that the space is already
 *    allocated);
 *  - where file_end > data_end, the child is first truncated back to
 *    data_end with PREALLOC_MODE_OFF to drop the filter's own preallocation,
 *    and the error is prefixed on failure;
 *  - the request is then forwarded to bdrv_co_truncate() on the child, and
 *    the cached offsets are set to ret or, with has_prealloc_perms(), to
 *    offset.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 368-369, 371-372, 378-381, 383, 388, 391-394, 399, 402, 406-407, 410-411,
 * 413-415, 417-418, 420, 422-424, 427-429): the declaration of ret, the
 * else/closing structure of the branches, the error checks guarding the
 * assignments and all return statements are not visible. The nesting
 * described above is therefore partly inferred - confirm against a pristine
 * copy before changing anything here.
 */
364 static int coroutine_fn
365 preallocate_co_truncate(BlockDriverState
*bs
, int64_t offset
,
366 bool exact
, PreallocMode prealloc
,
367 BdrvRequestFlags flags
, Error
**errp
)
370 BDRVPreallocateState
*s
= bs
->opaque
;
373 if (s
->data_end
>= 0 && offset
> s
->data_end
) {
374 if (s
->file_end
< 0) {
375 s
->file_end
= bdrv_getlength(bs
->file
->bs
);
376 if (s
->file_end
< 0) {
377 error_setg(errp
, "failed to get file length");
382 if (prealloc
== PREALLOC_MODE_FALLOC
) {
384 * If offset <= s->file_end, the task is already done, just
385 * update s->data_end, to move part of "filter preallocation"
386 * to "preallocation requested by user".
387 * Otherwise just proceed to preallocate missing part.
389 if (offset
<= s
->file_end
) {
390 s
->data_end
= offset
;
395 * We have to drop our preallocation, to
396 * - avoid "Cannot use preallocation for shrinking files" in
397 * case of offset < file_end
398 * - give PREALLOC_MODE_OFF a chance to keep small disk
400 * - give PREALLOC_MODE_FULL a chance to actually write the
401 * whole region as user expects
403 if (s
->file_end
> s
->data_end
) {
404 ret
= bdrv_co_truncate(bs
->file
, s
->data_end
, true,
405 PREALLOC_MODE_OFF
, 0, errp
);
408 error_prepend(errp
, "preallocate-filter: failed to drop "
409 "write-zero preallocation: ");
412 s
->file_end
= s
->data_end
;
416 s
->data_end
= offset
;
/* The actual truncate requested by the caller. */
419 ret
= bdrv_co_truncate(bs
->file
, offset
, exact
, prealloc
, flags
, errp
);
421 s
->file_end
= s
->zero_start
= s
->data_end
= ret
;
425 if (has_prealloc_perms(bs
)) {
426 s
->file_end
= s
->zero_start
= s
->data_end
= offset
;
431 static int coroutine_fn
preallocate_co_flush(BlockDriverState
*bs
)
433 return bdrv_co_flush(bs
->file
->bs
);
/*
 * Length handler (installed as .bdrv_getlength).
 *
 * Visible logic: a valid (>= 0) data_end is handled in a branch of its own;
 * otherwise the length of the child node is queried and, when
 * has_prealloc_perms() holds, cached into file_end, zero_start and data_end.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 437-438, 442-444, 449-452): the declaration of ret, the body of the
 * "data_end >= 0" branch (presumably it returns data_end - confirm) and the
 * final return are not visible.
 */
436 static int64_t preallocate_getlength(BlockDriverState
*bs
)
439 BDRVPreallocateState
*s
= bs
->opaque
;
441 if (s
->data_end
>= 0) {
445 ret
= bdrv_getlength(bs
->file
->bs
);
/* Cache the result only while the filter holds write and resize permissions. */
447 if (has_prealloc_perms(bs
)) {
448 s
->file_end
= s
->zero_start
= s
->data_end
= ret
;
/*
 * Permission check handler (installed as .bdrv_check_perm).
 *
 * Visible logic: when data_end is valid and the requested @perm no longer
 * contains both write and resize, the filter's own preallocation is dropped
 * here. file_end is refreshed if unknown ("Failed to get file length" on
 * error), and if data_end < file_end the child is truncated to data_end with
 * PREALLOC_MODE_OFF ("Failed to drop preallocation" on error), after which
 * file_end is set to data_end.
 *
 * NOTE(review): lines are missing from this listing (embedded numbers skip
 * 456, 458, 460-461, 464, 469-472, 476, 478-480, 482-486): the checks on
 * ret, the error returns and the final return value are not visible.
 */
454 static int preallocate_check_perm(BlockDriverState
*bs
,
455 uint64_t perm
, uint64_t shared
, Error
**errp
)
457 BDRVPreallocateState
*s
= bs
->opaque
;
459 if (s
->data_end
>= 0 && !can_write_resize(perm
)) {
462 * We should truncate in check_perm, as in set_perm bs->file->perm will
463 * be already changed, and we should not violate it.
465 if (s
->file_end
< 0) {
466 s
->file_end
= bdrv_getlength(bs
->file
->bs
);
467 if (s
->file_end
< 0) {
468 error_setg(errp
, "Failed to get file length");
473 if (s
->data_end
< s
->file_end
) {
/* The truncate's own error object is not requested (NULL is passed). */
474 int ret
= bdrv_truncate(bs
->file
, s
->data_end
, true,
475 PREALLOC_MODE_OFF
, 0, NULL
);
477 error_setg(errp
, "Failed to drop preallocation");
481 s
->file_end
= s
->data_end
;
488 static void preallocate_set_perm(BlockDriverState
*bs
,
489 uint64_t perm
, uint64_t shared
)
491 BDRVPreallocateState
*s
= bs
->opaque
;
493 if (can_write_resize(perm
)) {
494 if (s
->data_end
< 0) {
495 s
->data_end
= s
->file_end
= s
->zero_start
=
496 bdrv_getlength(bs
->file
->bs
);
500 * We drop our permissions, as well as allow shared
501 * permissions (see preallocate_child_perm), anyone will be able to
502 * change the child, so mark all states invalid. We'll regain control if
503 * get good permissions back.
505 s
->data_end
= s
->file_end
= s
->zero_start
= -EINVAL
;
509 static void preallocate_child_perm(BlockDriverState
*bs
, BdrvChild
*c
,
510 BdrvChildRole role
, BlockReopenQueue
*reopen_queue
,
511 uint64_t perm
, uint64_t shared
, uint64_t *nperm
, uint64_t *nshared
)
513 bdrv_default_perms(bs
, c
, role
, reopen_queue
, perm
, shared
, nperm
, nshared
);
515 if (can_write_resize(perm
)) {
516 /* This should come by default, but let's enforce: */
517 *nperm
|= BLK_PERM_WRITE
| BLK_PERM_RESIZE
;
520 * Don't share, to keep our states s->file_end, s->data_end and
521 * s->zero_start valid.
523 *nshared
&= ~(BLK_PERM_WRITE
| BLK_PERM_RESIZE
);
/*
 * Driver table of the filter: format name "preallocate", per-node state of
 * type BDRVPreallocateState, and the callbacks defined in this file for
 * length, open/close, reopen, I/O (read, write, write-zeroes, discard,
 * flush, truncate) and permission handling.
 *
 * NOTE(review): lines are missing from this listing; everything after the
 * has_variable_length entry, including the closing of the initializer and
 * any further fields, is not visible. Restore it from a pristine copy.
 */
527 BlockDriver bdrv_preallocate_filter
= {
528 .format_name
= "preallocate",
529 .instance_size
= sizeof(BDRVPreallocateState
),
531 .bdrv_getlength
= preallocate_getlength
,
532 .bdrv_open
= preallocate_open
,
533 .bdrv_close
= preallocate_close
,
/* Reopen: prepare parses new options, commit installs them, abort drops them. */
535 .bdrv_reopen_prepare
= preallocate_reopen_prepare
,
536 .bdrv_reopen_commit
= preallocate_reopen_commit
,
537 .bdrv_reopen_abort
= preallocate_reopen_abort
,
/* I/O path. */
539 .bdrv_co_preadv_part
= preallocate_co_preadv_part
,
540 .bdrv_co_pwritev_part
= preallocate_co_pwritev_part
,
541 .bdrv_co_pwrite_zeroes
= preallocate_co_pwrite_zeroes
,
542 .bdrv_co_pdiscard
= preallocate_co_pdiscard
,
543 .bdrv_co_flush
= preallocate_co_flush
,
544 .bdrv_co_truncate
= preallocate_co_truncate
,
/* Permission handling. */
546 .bdrv_check_perm
= preallocate_check_perm
,
547 .bdrv_set_perm
= preallocate_set_perm
,
548 .bdrv_child_perm
= preallocate_child_perm
,
550 .has_variable_length
= true,
554 static void bdrv_preallocate_init(void)
556 bdrv_register(&bdrv_preallocate_filter
);
559 block_init(bdrv_preallocate_init
);