/*
 * preallocate filter driver
 *
 * The driver performs preallocate operation: it is injected above
 * some node, and before each write over EOF it does additional preallocating
 * write-zeroes request.
 *
 * Copyright (c) 2020 Virtuozzo International GmbH.
 *
 * Author:
 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
27 #include "qemu/osdep.h"
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block-io.h"
34 #include "block/block_int.h"
/* User-tunable parameters of the preallocate filter. */
typedef struct PreallocateOpts {
    /* Bytes to preallocate beyond the end of a write ("prealloc-size"). */
    int64_t prealloc_size;
    /* Alignment, in bytes, of the preallocated file end ("prealloc-align"). */
    int64_t prealloc_align;
} PreallocateOpts;
42 typedef struct BDRVPreallocateState
{
46 * Track real data end, to crop preallocation on close. If < 0 the status is
49 * @data_end is a maximum of file size on open (or when we get write/resize
50 * permissions) and all write request ends after it. So it's safe to
51 * truncate to data_end if it is valid.
56 * Start of trailing preallocated area which reads as zero. May be smaller
57 * than data_end, if user does over-EOF write zero operation. If < 0 the
60 * If both @zero_start and @file_end are valid, the region
61 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
62 * is not valid, @zero_start doesn't make much sense.
67 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
68 * to avoid extra lseek() calls on each write operation. If < 0 the status
74 * All three states @data_end, @zero_start and @file_end are guaranteed to
75 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
76 * BLK_PERM_WRITE permissions on file child.
79 /* Gives up the resize permission on children when parents don't need it */
80 QEMUBH
*drop_resize_bh
;
81 } BDRVPreallocateState
;
83 static int preallocate_drop_resize(BlockDriverState
*bs
, Error
**errp
);
84 static void preallocate_drop_resize_bh(void *opaque
);
86 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
87 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
88 static QemuOptsList runtime_opts
= {
89 .name
= "preallocate",
90 .head
= QTAILQ_HEAD_INITIALIZER(runtime_opts
.head
),
93 .name
= PREALLOCATE_OPT_PREALLOC_ALIGN
,
94 .type
= QEMU_OPT_SIZE
,
95 .help
= "on preallocation, align file length to this number, "
99 .name
= PREALLOCATE_OPT_PREALLOC_SIZE
,
100 .type
= QEMU_OPT_SIZE
,
101 .help
= "how much to preallocate, default 128M",
103 { /* end of list */ }
107 static bool preallocate_absorb_opts(PreallocateOpts
*dest
, QDict
*options
,
108 BlockDriverState
*child_bs
, Error
**errp
)
110 QemuOpts
*opts
= qemu_opts_create(&runtime_opts
, NULL
, 0, &error_abort
);
112 if (!qemu_opts_absorb_qdict(opts
, options
, errp
)) {
116 dest
->prealloc_align
=
117 qemu_opt_get_size(opts
, PREALLOCATE_OPT_PREALLOC_ALIGN
, 1 * MiB
);
118 dest
->prealloc_size
=
119 qemu_opt_get_size(opts
, PREALLOCATE_OPT_PREALLOC_SIZE
, 128 * MiB
);
123 if (!QEMU_IS_ALIGNED(dest
->prealloc_align
, BDRV_SECTOR_SIZE
)) {
124 error_setg(errp
, "prealloc-align parameter of preallocate filter "
125 "is not aligned to %llu", BDRV_SECTOR_SIZE
);
129 if (!QEMU_IS_ALIGNED(dest
->prealloc_align
,
130 child_bs
->bl
.request_alignment
)) {
131 error_setg(errp
, "prealloc-align parameter of preallocate filter "
132 "is not aligned to underlying node request alignment "
133 "(%" PRIi32
")", child_bs
->bl
.request_alignment
);
140 static int preallocate_open(BlockDriverState
*bs
, QDict
*options
, int flags
,
143 BDRVPreallocateState
*s
= bs
->opaque
;
149 * s->data_end and friends should be initialized on permission update.
150 * For this to work, mark them invalid.
152 s
->file_end
= s
->zero_start
= s
->data_end
= -EINVAL
;
153 s
->drop_resize_bh
= qemu_bh_new(preallocate_drop_resize_bh
, bs
);
155 ret
= bdrv_open_file_child(NULL
, options
, "file", bs
, errp
);
160 GRAPH_RDLOCK_GUARD_MAINLOOP();
162 if (!preallocate_absorb_opts(&s
->opts
, options
, bs
->file
->bs
, errp
)) {
166 bs
->supported_write_flags
= BDRV_REQ_WRITE_UNCHANGED
|
167 (BDRV_REQ_FUA
& bs
->file
->bs
->supported_write_flags
);
169 bs
->supported_zero_flags
= BDRV_REQ_WRITE_UNCHANGED
|
170 ((BDRV_REQ_FUA
| BDRV_REQ_MAY_UNMAP
| BDRV_REQ_NO_FALLBACK
) &
171 bs
->file
->bs
->supported_zero_flags
);
176 static int GRAPH_RDLOCK
177 preallocate_truncate_to_real_size(BlockDriverState
*bs
, Error
**errp
)
179 BDRVPreallocateState
*s
= bs
->opaque
;
182 if (s
->file_end
< 0) {
183 s
->file_end
= bdrv_getlength(bs
->file
->bs
);
184 if (s
->file_end
< 0) {
185 error_setg_errno(errp
, -s
->file_end
, "Failed to get file length");
190 if (s
->data_end
< s
->file_end
) {
191 ret
= bdrv_truncate(bs
->file
, s
->data_end
, true, PREALLOC_MODE_OFF
, 0,
194 error_setg_errno(errp
, -ret
, "Failed to drop preallocation");
198 s
->file_end
= s
->data_end
;
204 static void preallocate_close(BlockDriverState
*bs
)
206 BDRVPreallocateState
*s
= bs
->opaque
;
209 GRAPH_RDLOCK_GUARD_MAINLOOP();
211 qemu_bh_cancel(s
->drop_resize_bh
);
212 qemu_bh_delete(s
->drop_resize_bh
);
214 if (s
->data_end
>= 0) {
215 preallocate_truncate_to_real_size(bs
, NULL
);
223 * We must implement reopen handlers, otherwise reopen just don't work. Handle
224 * new options and don't care about preallocation state, as it is handled in
225 * set/check permission handlers.
228 static int preallocate_reopen_prepare(BDRVReopenState
*reopen_state
,
229 BlockReopenQueue
*queue
, Error
**errp
)
231 PreallocateOpts
*opts
= g_new0(PreallocateOpts
, 1);
235 GRAPH_RDLOCK_GUARD_MAINLOOP();
237 if (!preallocate_absorb_opts(opts
, reopen_state
->options
,
238 reopen_state
->bs
->file
->bs
, errp
)) {
244 * Drop the preallocation already here if reopening read-only. The child
245 * might also be reopened read-only and then scheduling a BH during the
246 * permission update is too late.
248 if ((reopen_state
->flags
& BDRV_O_RDWR
) == 0) {
249 ret
= preallocate_drop_resize(reopen_state
->bs
, errp
);
256 reopen_state
->opaque
= opts
;
261 static void preallocate_reopen_commit(BDRVReopenState
*state
)
263 BDRVPreallocateState
*s
= state
->bs
->opaque
;
265 s
->opts
= *(PreallocateOpts
*)state
->opaque
;
267 g_free(state
->opaque
);
268 state
->opaque
= NULL
;
271 static void preallocate_reopen_abort(BDRVReopenState
*state
)
273 g_free(state
->opaque
);
274 state
->opaque
= NULL
;
277 static int coroutine_fn GRAPH_RDLOCK
278 preallocate_co_preadv_part(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
279 QEMUIOVector
*qiov
, size_t qiov_offset
,
280 BdrvRequestFlags flags
)
282 return bdrv_co_preadv_part(bs
->file
, offset
, bytes
, qiov
, qiov_offset
,
286 static int coroutine_fn GRAPH_RDLOCK
287 preallocate_co_pdiscard(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
)
289 return bdrv_co_pdiscard(bs
->file
, offset
, bytes
);
292 static bool can_write_resize(uint64_t perm
)
294 return (perm
& BLK_PERM_WRITE
) && (perm
& BLK_PERM_RESIZE
);
297 static bool GRAPH_RDLOCK
has_prealloc_perms(BlockDriverState
*bs
)
299 BDRVPreallocateState
*s
= bs
->opaque
;
301 if (can_write_resize(bs
->file
->perm
)) {
302 assert(!(bs
->file
->shared_perm
& BLK_PERM_WRITE
));
303 assert(!(bs
->file
->shared_perm
& BLK_PERM_RESIZE
));
307 assert(s
->data_end
< 0);
308 assert(s
->zero_start
< 0);
309 assert(s
->file_end
< 0);
314 * Call on each write. Returns true if @want_merge_zero is true and the region
315 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
318 * want_merge_zero is used to merge write-zero request with preallocation in
319 * one bdrv_co_pwrite_zeroes() call.
321 static bool coroutine_fn GRAPH_RDLOCK
322 handle_write(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
323 bool want_merge_zero
)
325 BDRVPreallocateState
*s
= bs
->opaque
;
326 int64_t end
= offset
+ bytes
;
327 int64_t prealloc_start
, prealloc_end
;
329 uint32_t file_align
= bs
->file
->bs
->bl
.request_alignment
;
330 uint32_t prealloc_align
= MAX(s
->opts
.prealloc_align
, file_align
);
332 assert(QEMU_IS_ALIGNED(prealloc_align
, file_align
));
334 if (!has_prealloc_perms(bs
)) {
335 /* We don't have state neither should try to recover it */
339 if (s
->data_end
< 0) {
340 s
->data_end
= bdrv_co_getlength(bs
->file
->bs
);
341 if (s
->data_end
< 0) {
345 if (s
->file_end
< 0) {
346 s
->file_end
= s
->data_end
;
350 if (end
<= s
->data_end
) {
354 /* We have valid s->data_end, and request writes beyond it. */
357 if (s
->zero_start
< 0 || !want_merge_zero
) {
361 if (s
->file_end
< 0) {
362 s
->file_end
= bdrv_co_getlength(bs
->file
->bs
);
363 if (s
->file_end
< 0) {
368 /* Now s->data_end, s->zero_start and s->file_end are valid. */
370 if (end
<= s
->file_end
) {
371 /* No preallocation needed. */
372 return want_merge_zero
&& offset
>= s
->zero_start
;
375 /* Now we want new preallocation, as request writes beyond s->file_end. */
377 prealloc_start
= QEMU_ALIGN_UP(
378 want_merge_zero
? MIN(offset
, s
->file_end
) : s
->file_end
,
380 prealloc_end
= QEMU_ALIGN_UP(
381 MAX(prealloc_start
, end
) + s
->opts
.prealloc_size
,
384 want_merge_zero
= want_merge_zero
&& (prealloc_start
<= offset
);
386 ret
= bdrv_co_pwrite_zeroes(
387 bs
->file
, prealloc_start
, prealloc_end
- prealloc_start
,
388 BDRV_REQ_NO_FALLBACK
| BDRV_REQ_SERIALISING
| BDRV_REQ_NO_WAIT
);
394 s
->file_end
= prealloc_end
;
395 return want_merge_zero
;
398 static int coroutine_fn GRAPH_RDLOCK
399 preallocate_co_pwrite_zeroes(BlockDriverState
*bs
, int64_t offset
,
400 int64_t bytes
, BdrvRequestFlags flags
)
402 bool want_merge_zero
=
403 !(flags
& ~(BDRV_REQ_ZERO_WRITE
| BDRV_REQ_NO_FALLBACK
));
404 if (handle_write(bs
, offset
, bytes
, want_merge_zero
)) {
408 return bdrv_co_pwrite_zeroes(bs
->file
, offset
, bytes
, flags
);
411 static int coroutine_fn GRAPH_RDLOCK
412 preallocate_co_pwritev_part(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
413 QEMUIOVector
*qiov
, size_t qiov_offset
,
414 BdrvRequestFlags flags
)
416 handle_write(bs
, offset
, bytes
, false);
418 return bdrv_co_pwritev_part(bs
->file
, offset
, bytes
, qiov
, qiov_offset
,
422 static int coroutine_fn GRAPH_RDLOCK
423 preallocate_co_truncate(BlockDriverState
*bs
, int64_t offset
,
424 bool exact
, PreallocMode prealloc
,
425 BdrvRequestFlags flags
, Error
**errp
)
428 BDRVPreallocateState
*s
= bs
->opaque
;
431 if (s
->data_end
>= 0 && offset
> s
->data_end
) {
432 if (s
->file_end
< 0) {
433 s
->file_end
= bdrv_co_getlength(bs
->file
->bs
);
434 if (s
->file_end
< 0) {
435 error_setg(errp
, "failed to get file length");
440 if (prealloc
== PREALLOC_MODE_FALLOC
) {
442 * If offset <= s->file_end, the task is already done, just
443 * update s->data_end, to move part of "filter preallocation"
444 * to "preallocation requested by user".
445 * Otherwise just proceed to preallocate missing part.
447 if (offset
<= s
->file_end
) {
448 s
->data_end
= offset
;
453 * We have to drop our preallocation, to
454 * - avoid "Cannot use preallocation for shrinking files" in
455 * case of offset < file_end
456 * - give PREALLOC_MODE_OFF a chance to keep small disk
458 * - give PREALLOC_MODE_FULL a chance to actually write the
459 * whole region as user expects
461 if (s
->file_end
> s
->data_end
) {
462 ret
= bdrv_co_truncate(bs
->file
, s
->data_end
, true,
463 PREALLOC_MODE_OFF
, 0, errp
);
466 error_prepend(errp
, "preallocate-filter: failed to drop "
467 "write-zero preallocation: ");
470 s
->file_end
= s
->data_end
;
474 s
->data_end
= offset
;
477 ret
= bdrv_co_truncate(bs
->file
, offset
, exact
, prealloc
, flags
, errp
);
479 s
->file_end
= s
->zero_start
= s
->data_end
= ret
;
483 if (has_prealloc_perms(bs
)) {
484 s
->file_end
= s
->zero_start
= s
->data_end
= offset
;
489 static int coroutine_fn GRAPH_RDLOCK
preallocate_co_flush(BlockDriverState
*bs
)
491 return bdrv_co_flush(bs
->file
->bs
);
494 static int64_t coroutine_fn GRAPH_RDLOCK
495 preallocate_co_getlength(BlockDriverState
*bs
)
498 BDRVPreallocateState
*s
= bs
->opaque
;
500 if (s
->data_end
>= 0) {
504 ret
= bdrv_co_getlength(bs
->file
->bs
);
506 if (has_prealloc_perms(bs
)) {
507 s
->file_end
= s
->zero_start
= s
->data_end
= ret
;
513 static int GRAPH_RDLOCK
514 preallocate_drop_resize(BlockDriverState
*bs
, Error
**errp
)
516 BDRVPreallocateState
*s
= bs
->opaque
;
519 if (s
->data_end
< 0) {
524 * Before switching children to be read-only, truncate them to remove
525 * the preallocation and let them have the real size.
527 ret
= preallocate_truncate_to_real_size(bs
, errp
);
533 * We'll drop our permissions and will allow other users to take write and
534 * resize permissions (see preallocate_child_perm). Anyone will be able to
535 * change the child, so mark all states invalid. We'll regain control if a
536 * parent requests write access again.
538 s
->data_end
= s
->file_end
= s
->zero_start
= -EINVAL
;
540 bdrv_child_refresh_perms(bs
, bs
->file
, NULL
);
545 static void preallocate_drop_resize_bh(void *opaque
)
548 GRAPH_RDLOCK_GUARD_MAINLOOP();
551 * In case of errors, we'll simply keep the exclusive lock on the image
554 preallocate_drop_resize(opaque
, NULL
);
557 static void GRAPH_RDLOCK
558 preallocate_set_perm(BlockDriverState
*bs
, uint64_t perm
, uint64_t shared
)
560 BDRVPreallocateState
*s
= bs
->opaque
;
562 if (can_write_resize(perm
)) {
563 qemu_bh_cancel(s
->drop_resize_bh
);
564 if (s
->data_end
< 0) {
565 s
->data_end
= s
->file_end
= s
->zero_start
=
566 bs
->file
->bs
->total_sectors
* BDRV_SECTOR_SIZE
;
569 qemu_bh_schedule(s
->drop_resize_bh
);
573 static void preallocate_child_perm(BlockDriverState
*bs
, BdrvChild
*c
,
574 BdrvChildRole role
, BlockReopenQueue
*reopen_queue
,
575 uint64_t perm
, uint64_t shared
, uint64_t *nperm
, uint64_t *nshared
)
577 BDRVPreallocateState
*s
= bs
->opaque
;
579 bdrv_default_perms(bs
, c
, role
, reopen_queue
, perm
, shared
, nperm
, nshared
);
582 * We need exclusive write and resize permissions on the child not only when
583 * the parent can write to it, but also after the parent gave up write
584 * permissions until preallocate_drop_resize() has completed.
586 if (can_write_resize(perm
) || s
->data_end
!= -EINVAL
) {
587 *nperm
|= BLK_PERM_WRITE
| BLK_PERM_RESIZE
;
590 * Don't share, to keep our states s->file_end, s->data_end and
591 * s->zero_start valid.
593 *nshared
&= ~(BLK_PERM_WRITE
| BLK_PERM_RESIZE
);
597 static BlockDriver bdrv_preallocate_filter
= {
598 .format_name
= "preallocate",
599 .instance_size
= sizeof(BDRVPreallocateState
),
601 .bdrv_co_getlength
= preallocate_co_getlength
,
602 .bdrv_open
= preallocate_open
,
603 .bdrv_close
= preallocate_close
,
605 .bdrv_reopen_prepare
= preallocate_reopen_prepare
,
606 .bdrv_reopen_commit
= preallocate_reopen_commit
,
607 .bdrv_reopen_abort
= preallocate_reopen_abort
,
609 .bdrv_co_preadv_part
= preallocate_co_preadv_part
,
610 .bdrv_co_pwritev_part
= preallocate_co_pwritev_part
,
611 .bdrv_co_pwrite_zeroes
= preallocate_co_pwrite_zeroes
,
612 .bdrv_co_pdiscard
= preallocate_co_pdiscard
,
613 .bdrv_co_flush
= preallocate_co_flush
,
614 .bdrv_co_truncate
= preallocate_co_truncate
,
616 .bdrv_set_perm
= preallocate_set_perm
,
617 .bdrv_child_perm
= preallocate_child_perm
,
622 static void bdrv_preallocate_init(void)
624 bdrv_register(&bdrv_preallocate_filter
);
627 block_init(bdrv_preallocate_init
);