Merge tag 'migration-20240407-pull-request' of https://gitlab.com/peterx/qemu into...
[qemu/armbru.git] / block / preallocate.c
blobd215bc5d6d02709f7d5bd324dc939d16ee582708
/*
 * preallocate filter driver
 *
 * The driver performs preallocate operation: it is injected above
 * some node, and before each write over EOF it does additional preallocating
 * write-zeroes request.
 *
 * Copyright (c) 2020 Virtuozzo International GmbH.
 *
 * Author:
 *  Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
27 #include "qemu/osdep.h"
29 #include "qapi/error.h"
30 #include "qemu/module.h"
31 #include "qemu/option.h"
32 #include "qemu/units.h"
33 #include "block/block-io.h"
34 #include "block/block_int.h"
/* User-configurable parameters of the preallocate filter. */
typedef struct PreallocateOpts {
    /* How much to preallocate (option "prealloc-size"). */
    int64_t prealloc_size;
    /* On preallocation, the file length is aligned to this number
     * (option "prealloc-align"). */
    int64_t prealloc_align;
} PreallocateOpts;
42 typedef struct BDRVPreallocateState {
43 PreallocateOpts opts;
46 * Track real data end, to crop preallocation on close. If < 0 the status is
47 * unknown.
49 * @data_end is a maximum of file size on open (or when we get write/resize
50 * permissions) and all write request ends after it. So it's safe to
51 * truncate to data_end if it is valid.
53 int64_t data_end;
56 * Start of trailing preallocated area which reads as zero. May be smaller
57 * than data_end, if user does over-EOF write zero operation. If < 0 the
58 * status is unknown.
60 * If both @zero_start and @file_end are valid, the region
61 * [@zero_start, @file_end) is known to be preallocated zeroes. If @file_end
62 * is not valid, @zero_start doesn't make much sense.
64 int64_t zero_start;
67 * Real end of file. Actually the cache for bdrv_getlength(bs->file->bs),
68 * to avoid extra lseek() calls on each write operation. If < 0 the status
69 * is unknown.
71 int64_t file_end;
74 * All three states @data_end, @zero_start and @file_end are guaranteed to
75 * be invalid (< 0) when we don't have both exclusive BLK_PERM_RESIZE and
76 * BLK_PERM_WRITE permissions on file child.
79 /* Gives up the resize permission on children when parents don't need it */
80 QEMUBH *drop_resize_bh;
81 } BDRVPreallocateState;
83 static int preallocate_drop_resize(BlockDriverState *bs, Error **errp);
84 static void preallocate_drop_resize_bh(void *opaque);
86 #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
87 #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
88 static QemuOptsList runtime_opts = {
89 .name = "preallocate",
90 .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
91 .desc = {
93 .name = PREALLOCATE_OPT_PREALLOC_ALIGN,
94 .type = QEMU_OPT_SIZE,
95 .help = "on preallocation, align file length to this number, "
96 "default 1M",
99 .name = PREALLOCATE_OPT_PREALLOC_SIZE,
100 .type = QEMU_OPT_SIZE,
101 .help = "how much to preallocate, default 128M",
103 { /* end of list */ }
107 static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
108 BlockDriverState *child_bs, Error **errp)
110 QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
112 if (!qemu_opts_absorb_qdict(opts, options, errp)) {
113 return false;
116 dest->prealloc_align =
117 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB);
118 dest->prealloc_size =
119 qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
121 qemu_opts_del(opts);
123 if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
124 error_setg(errp, "prealloc-align parameter of preallocate filter "
125 "is not aligned to %llu", BDRV_SECTOR_SIZE);
126 return false;
129 if (!QEMU_IS_ALIGNED(dest->prealloc_align,
130 child_bs->bl.request_alignment)) {
131 error_setg(errp, "prealloc-align parameter of preallocate filter "
132 "is not aligned to underlying node request alignment "
133 "(%" PRIi32 ")", child_bs->bl.request_alignment);
134 return false;
137 return true;
140 static int preallocate_open(BlockDriverState *bs, QDict *options, int flags,
141 Error **errp)
143 BDRVPreallocateState *s = bs->opaque;
144 int ret;
146 GLOBAL_STATE_CODE();
149 * s->data_end and friends should be initialized on permission update.
150 * For this to work, mark them invalid.
152 s->file_end = s->zero_start = s->data_end = -EINVAL;
153 s->drop_resize_bh = qemu_bh_new(preallocate_drop_resize_bh, bs);
155 ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
156 if (ret < 0) {
157 return ret;
160 GRAPH_RDLOCK_GUARD_MAINLOOP();
162 if (!preallocate_absorb_opts(&s->opts, options, bs->file->bs, errp)) {
163 return -EINVAL;
166 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
167 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
169 bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED |
170 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
171 bs->file->bs->supported_zero_flags);
173 return 0;
176 static int GRAPH_RDLOCK
177 preallocate_truncate_to_real_size(BlockDriverState *bs, Error **errp)
179 BDRVPreallocateState *s = bs->opaque;
180 int ret;
182 if (s->file_end < 0) {
183 s->file_end = bdrv_getlength(bs->file->bs);
184 if (s->file_end < 0) {
185 error_setg_errno(errp, -s->file_end, "Failed to get file length");
186 return s->file_end;
190 if (s->data_end < s->file_end) {
191 ret = bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0,
192 NULL);
193 if (ret < 0) {
194 error_setg_errno(errp, -ret, "Failed to drop preallocation");
195 s->file_end = ret;
196 return ret;
198 s->file_end = s->data_end;
201 return 0;
204 static void preallocate_close(BlockDriverState *bs)
206 BDRVPreallocateState *s = bs->opaque;
208 GLOBAL_STATE_CODE();
209 GRAPH_RDLOCK_GUARD_MAINLOOP();
211 qemu_bh_cancel(s->drop_resize_bh);
212 qemu_bh_delete(s->drop_resize_bh);
214 if (s->data_end >= 0) {
215 preallocate_truncate_to_real_size(bs, NULL);
221 * Handle reopen.
223 * We must implement reopen handlers, otherwise reopen just don't work. Handle
224 * new options and don't care about preallocation state, as it is handled in
225 * set/check permission handlers.
228 static int preallocate_reopen_prepare(BDRVReopenState *reopen_state,
229 BlockReopenQueue *queue, Error **errp)
231 PreallocateOpts *opts = g_new0(PreallocateOpts, 1);
232 int ret;
234 GLOBAL_STATE_CODE();
235 GRAPH_RDLOCK_GUARD_MAINLOOP();
237 if (!preallocate_absorb_opts(opts, reopen_state->options,
238 reopen_state->bs->file->bs, errp)) {
239 g_free(opts);
240 return -EINVAL;
244 * Drop the preallocation already here if reopening read-only. The child
245 * might also be reopened read-only and then scheduling a BH during the
246 * permission update is too late.
248 if ((reopen_state->flags & BDRV_O_RDWR) == 0) {
249 ret = preallocate_drop_resize(reopen_state->bs, errp);
250 if (ret < 0) {
251 g_free(opts);
252 return ret;
256 reopen_state->opaque = opts;
258 return 0;
261 static void preallocate_reopen_commit(BDRVReopenState *state)
263 BDRVPreallocateState *s = state->bs->opaque;
265 s->opts = *(PreallocateOpts *)state->opaque;
267 g_free(state->opaque);
268 state->opaque = NULL;
271 static void preallocate_reopen_abort(BDRVReopenState *state)
273 g_free(state->opaque);
274 state->opaque = NULL;
277 static int coroutine_fn GRAPH_RDLOCK
278 preallocate_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
279 QEMUIOVector *qiov, size_t qiov_offset,
280 BdrvRequestFlags flags)
282 return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
283 flags);
286 static int coroutine_fn GRAPH_RDLOCK
287 preallocate_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
289 return bdrv_co_pdiscard(bs->file, offset, bytes);
292 static bool can_write_resize(uint64_t perm)
294 return (perm & BLK_PERM_WRITE) && (perm & BLK_PERM_RESIZE);
297 static bool GRAPH_RDLOCK has_prealloc_perms(BlockDriverState *bs)
299 BDRVPreallocateState *s = bs->opaque;
301 if (can_write_resize(bs->file->perm)) {
302 assert(!(bs->file->shared_perm & BLK_PERM_WRITE));
303 assert(!(bs->file->shared_perm & BLK_PERM_RESIZE));
304 return true;
307 assert(s->data_end < 0);
308 assert(s->zero_start < 0);
309 assert(s->file_end < 0);
310 return false;
314 * Call on each write. Returns true if @want_merge_zero is true and the region
315 * [offset, offset + bytes) is zeroed (as a result of this call or earlier
316 * preallocation).
318 * want_merge_zero is used to merge write-zero request with preallocation in
319 * one bdrv_co_pwrite_zeroes() call.
321 static bool coroutine_fn GRAPH_RDLOCK
322 handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
323 bool want_merge_zero)
325 BDRVPreallocateState *s = bs->opaque;
326 int64_t end = offset + bytes;
327 int64_t prealloc_start, prealloc_end;
328 int ret;
329 uint32_t file_align = bs->file->bs->bl.request_alignment;
330 uint32_t prealloc_align = MAX(s->opts.prealloc_align, file_align);
332 assert(QEMU_IS_ALIGNED(prealloc_align, file_align));
334 if (!has_prealloc_perms(bs)) {
335 /* We don't have state neither should try to recover it */
336 return false;
339 if (s->data_end < 0) {
340 s->data_end = bdrv_co_getlength(bs->file->bs);
341 if (s->data_end < 0) {
342 return false;
345 if (s->file_end < 0) {
346 s->file_end = s->data_end;
350 if (end <= s->data_end) {
351 return false;
354 /* We have valid s->data_end, and request writes beyond it. */
356 s->data_end = end;
357 if (s->zero_start < 0 || !want_merge_zero) {
358 s->zero_start = end;
361 if (s->file_end < 0) {
362 s->file_end = bdrv_co_getlength(bs->file->bs);
363 if (s->file_end < 0) {
364 return false;
368 /* Now s->data_end, s->zero_start and s->file_end are valid. */
370 if (end <= s->file_end) {
371 /* No preallocation needed. */
372 return want_merge_zero && offset >= s->zero_start;
375 /* Now we want new preallocation, as request writes beyond s->file_end. */
377 prealloc_start = QEMU_ALIGN_UP(
378 want_merge_zero ? MIN(offset, s->file_end) : s->file_end,
379 file_align);
380 prealloc_end = QEMU_ALIGN_UP(
381 MAX(prealloc_start, end) + s->opts.prealloc_size,
382 prealloc_align);
384 want_merge_zero = want_merge_zero && (prealloc_start <= offset);
386 ret = bdrv_co_pwrite_zeroes(
387 bs->file, prealloc_start, prealloc_end - prealloc_start,
388 BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
389 if (ret < 0) {
390 s->file_end = ret;
391 return false;
394 s->file_end = prealloc_end;
395 return want_merge_zero;
398 static int coroutine_fn GRAPH_RDLOCK
399 preallocate_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset,
400 int64_t bytes, BdrvRequestFlags flags)
402 bool want_merge_zero =
403 !(flags & ~(BDRV_REQ_ZERO_WRITE | BDRV_REQ_NO_FALLBACK));
404 if (handle_write(bs, offset, bytes, want_merge_zero)) {
405 return 0;
408 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
411 static int coroutine_fn GRAPH_RDLOCK
412 preallocate_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
413 QEMUIOVector *qiov, size_t qiov_offset,
414 BdrvRequestFlags flags)
416 handle_write(bs, offset, bytes, false);
418 return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
419 flags);
422 static int coroutine_fn GRAPH_RDLOCK
423 preallocate_co_truncate(BlockDriverState *bs, int64_t offset,
424 bool exact, PreallocMode prealloc,
425 BdrvRequestFlags flags, Error **errp)
427 ERRP_GUARD();
428 BDRVPreallocateState *s = bs->opaque;
429 int ret;
431 if (s->data_end >= 0 && offset > s->data_end) {
432 if (s->file_end < 0) {
433 s->file_end = bdrv_co_getlength(bs->file->bs);
434 if (s->file_end < 0) {
435 error_setg(errp, "failed to get file length");
436 return s->file_end;
440 if (prealloc == PREALLOC_MODE_FALLOC) {
442 * If offset <= s->file_end, the task is already done, just
443 * update s->data_end, to move part of "filter preallocation"
444 * to "preallocation requested by user".
445 * Otherwise just proceed to preallocate missing part.
447 if (offset <= s->file_end) {
448 s->data_end = offset;
449 return 0;
451 } else {
453 * We have to drop our preallocation, to
454 * - avoid "Cannot use preallocation for shrinking files" in
455 * case of offset < file_end
456 * - give PREALLOC_MODE_OFF a chance to keep small disk
457 * usage
458 * - give PREALLOC_MODE_FULL a chance to actually write the
459 * whole region as user expects
461 if (s->file_end > s->data_end) {
462 ret = bdrv_co_truncate(bs->file, s->data_end, true,
463 PREALLOC_MODE_OFF, 0, errp);
464 if (ret < 0) {
465 s->file_end = ret;
466 error_prepend(errp, "preallocate-filter: failed to drop "
467 "write-zero preallocation: ");
468 return ret;
470 s->file_end = s->data_end;
474 s->data_end = offset;
477 ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, errp);
478 if (ret < 0) {
479 s->file_end = s->zero_start = s->data_end = ret;
480 return ret;
483 if (has_prealloc_perms(bs)) {
484 s->file_end = s->zero_start = s->data_end = offset;
486 return 0;
489 static int coroutine_fn GRAPH_RDLOCK preallocate_co_flush(BlockDriverState *bs)
491 return bdrv_co_flush(bs->file->bs);
494 static int64_t coroutine_fn GRAPH_RDLOCK
495 preallocate_co_getlength(BlockDriverState *bs)
497 int64_t ret;
498 BDRVPreallocateState *s = bs->opaque;
500 if (s->data_end >= 0) {
501 return s->data_end;
504 ret = bdrv_co_getlength(bs->file->bs);
506 if (has_prealloc_perms(bs)) {
507 s->file_end = s->zero_start = s->data_end = ret;
510 return ret;
513 static int GRAPH_RDLOCK
514 preallocate_drop_resize(BlockDriverState *bs, Error **errp)
516 BDRVPreallocateState *s = bs->opaque;
517 int ret;
519 if (s->data_end < 0) {
520 return 0;
524 * Before switching children to be read-only, truncate them to remove
525 * the preallocation and let them have the real size.
527 ret = preallocate_truncate_to_real_size(bs, errp);
528 if (ret < 0) {
529 return ret;
533 * We'll drop our permissions and will allow other users to take write and
534 * resize permissions (see preallocate_child_perm). Anyone will be able to
535 * change the child, so mark all states invalid. We'll regain control if a
536 * parent requests write access again.
538 s->data_end = s->file_end = s->zero_start = -EINVAL;
540 bdrv_child_refresh_perms(bs, bs->file, NULL);
542 return 0;
545 static void preallocate_drop_resize_bh(void *opaque)
547 GLOBAL_STATE_CODE();
548 GRAPH_RDLOCK_GUARD_MAINLOOP();
551 * In case of errors, we'll simply keep the exclusive lock on the image
552 * indefinitely.
554 preallocate_drop_resize(opaque, NULL);
557 static void GRAPH_RDLOCK
558 preallocate_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared)
560 BDRVPreallocateState *s = bs->opaque;
562 if (can_write_resize(perm)) {
563 qemu_bh_cancel(s->drop_resize_bh);
564 if (s->data_end < 0) {
565 s->data_end = s->file_end = s->zero_start =
566 bs->file->bs->total_sectors * BDRV_SECTOR_SIZE;
568 } else {
569 qemu_bh_schedule(s->drop_resize_bh);
573 static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c,
574 BdrvChildRole role, BlockReopenQueue *reopen_queue,
575 uint64_t perm, uint64_t shared, uint64_t *nperm, uint64_t *nshared)
577 BDRVPreallocateState *s = bs->opaque;
579 bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, nshared);
582 * We need exclusive write and resize permissions on the child not only when
583 * the parent can write to it, but also after the parent gave up write
584 * permissions until preallocate_drop_resize() has completed.
586 if (can_write_resize(perm) || s->data_end != -EINVAL) {
587 *nperm |= BLK_PERM_WRITE | BLK_PERM_RESIZE;
590 * Don't share, to keep our states s->file_end, s->data_end and
591 * s->zero_start valid.
593 *nshared &= ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);
597 static BlockDriver bdrv_preallocate_filter = {
598 .format_name = "preallocate",
599 .instance_size = sizeof(BDRVPreallocateState),
601 .bdrv_co_getlength = preallocate_co_getlength,
602 .bdrv_open = preallocate_open,
603 .bdrv_close = preallocate_close,
605 .bdrv_reopen_prepare = preallocate_reopen_prepare,
606 .bdrv_reopen_commit = preallocate_reopen_commit,
607 .bdrv_reopen_abort = preallocate_reopen_abort,
609 .bdrv_co_preadv_part = preallocate_co_preadv_part,
610 .bdrv_co_pwritev_part = preallocate_co_pwritev_part,
611 .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes,
612 .bdrv_co_pdiscard = preallocate_co_pdiscard,
613 .bdrv_co_flush = preallocate_co_flush,
614 .bdrv_co_truncate = preallocate_co_truncate,
616 .bdrv_set_perm = preallocate_set_perm,
617 .bdrv_child_perm = preallocate_child_perm,
619 .is_filter = true,
622 static void bdrv_preallocate_init(void)
624 bdrv_register(&bdrv_preallocate_filter);
627 block_init(bdrv_preallocate_init);