2 * copy-before-write filter driver
4 * The driver performs Copy-Before-Write (CBW) operation: it is injected above
5 * some node, and before each write it copies _old_ data to the target node.
7 * Copyright (c) 2018-2021 Virtuozzo International GmbH.
10 * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
12 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with this program. If not, see <http://www.gnu.org/licenses/>.
26 #include "qemu/osdep.h"
28 #include "sysemu/block-backend.h"
29 #include "qemu/cutils.h"
30 #include "qapi/error.h"
31 #include "block/block_int.h"
32 #include "block/qdict.h"
33 #include "block/block-copy.h"
35 #include "block/copy-before-write.h"
36 #include "block/reqlist.h"
38 #include "qapi/qapi-visit-block-core.h"
/*
 * Per-node state of the copy-before-write filter; the functions in this file
 * obtain it from bs->opaque.
 * NOTE(review): this listing appears to have lost lines (comment delimiters
 * and some members: code in this file also reads s->bcs, s->target and
 * s->lock, which are not declared in the visible part) - confirm against the
 * full file.
 */
40 typedef struct BDRVCopyBeforeWriteState
{
45 * @lock: protects access to @access_bitmap, @done_bitmap and
51 * @access_bitmap: represents areas allowed for reading by fleecing user.
52 * Reading from non-dirty areas leads to -EACCES.
/* Tested in cbw_snapshot_read_lock(), reset by cbw_co_pdiscard_snapshot(). */
54 BdrvDirtyBitmap
*access_bitmap
;
57 * @done_bitmap: represents areas that was successfully copied to @target by
58 * copy-before-write operations.
/* Set in cbw_do_copy_before_write() for the cluster-aligned range. */
60 BdrvDirtyBitmap
*done_bitmap
;
63 * @frozen_read_reqs: current read requests for fleecing user in bs->file
64 * node. These areas must not be rewritten by guest.
/* Filled by cbw_snapshot_read_lock(), waited on in cbw_do_copy_before_write(). */
66 BlockReqList frozen_read_reqs
;
67 } BDRVCopyBeforeWriteState
;
/*
 * Read handler of the filter: passes the request through unchanged to the
 * child bs->file via bdrv_co_preadv() and returns its result.
 */
69 static coroutine_fn
int cbw_co_preadv(
70 BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
71 QEMUIOVector
*qiov
, BdrvRequestFlags flags
)
73 return bdrv_co_preadv(bs
->file
, offset
, bytes
, qiov
, flags
);
77 * Do copy-before-write operation.
79 * On failure guest request must be failed too.
81 * On success, we also wait for all in-flight fleecing read requests in source
82 * node, and it's guaranteed that after cbw_do_copy_before_write() successful
83 * return there are no such requests and they will never appear.
/*
 * Run the copy-before-write step for the byte range starting at @offset and
 * spanning @bytes.
 *
 * The range is widened to cluster boundaries (QEMU_ALIGN_DOWN / QEMU_ALIGN_UP
 * with the cluster size reported by block_copy_cluster_size()) and handed to
 * block_copy() on s->bcs. Then, under s->lock, the widened range is marked in
 * s->done_bitmap and reqlist_wait_all() is called on s->frozen_read_reqs for
 * the same range.
 *
 * @flags: tested against BDRV_REQ_WRITE_UNCHANGED.
 * NOTE(review): the body of that test, the declarations of ret/off/end and
 * the handling of the block_copy() result are not visible in this listing;
 * presumably an unchanged write returns early and a negative ret is
 * propagated - confirm against the full file.
 */
85 static coroutine_fn
int cbw_do_copy_before_write(BlockDriverState
*bs
,
86 uint64_t offset
, uint64_t bytes
, BdrvRequestFlags flags
)
88 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
91 int64_t cluster_size
= block_copy_cluster_size(s
->bcs
);
93 if (flags
& BDRV_REQ_WRITE_UNCHANGED
) {
/* Widen the request to whole clusters: [off, end). */
97 off
= QEMU_ALIGN_DOWN(offset
, cluster_size
);
98 end
= QEMU_ALIGN_UP(offset
+ bytes
, cluster_size
);
100 ret
= block_copy(s
->bcs
, off
, end
- off
, true);
/* Bitmap update and the wait for frozen reads both happen under s->lock. */
105 WITH_QEMU_LOCK_GUARD(&s
->lock
) {
106 bdrv_set_dirty_bitmap(s
->done_bitmap
, off
, end
- off
);
107 reqlist_wait_all(&s
->frozen_read_reqs
, off
, end
- off
, &s
->lock
);
/*
 * Discard handler: runs cbw_do_copy_before_write() on the range (with flags
 * 0) and then forwards the discard to bs->file with bdrv_co_pdiscard().
 * NOTE(review): the check of ret between the two calls is not visible in
 * this listing; presumably a failed copy is returned to the caller instead
 * of discarding - confirm.
 */
113 static int coroutine_fn
cbw_co_pdiscard(BlockDriverState
*bs
,
114 int64_t offset
, int64_t bytes
)
116 int ret
= cbw_do_copy_before_write(bs
, offset
, bytes
, 0);
121 return bdrv_co_pdiscard(bs
->file
, offset
, bytes
);
/*
 * Write-zeroes handler: runs cbw_do_copy_before_write() on the range with
 * the caller flags and then forwards the request, flags included, to
 * bs->file with bdrv_co_pwrite_zeroes().
 * NOTE(review): the check of ret between the two calls is not visible in
 * this listing; presumably a failed copy fails the request - confirm.
 */
124 static int coroutine_fn
cbw_co_pwrite_zeroes(BlockDriverState
*bs
,
125 int64_t offset
, int64_t bytes
, BdrvRequestFlags flags
)
127 int ret
= cbw_do_copy_before_write(bs
, offset
, bytes
, flags
);
132 return bdrv_co_pwrite_zeroes(bs
->file
, offset
, bytes
, flags
);
/*
 * Write handler: runs cbw_do_copy_before_write() on the range with the
 * caller flags and then forwards the write to bs->file with
 * bdrv_co_pwritev().
 * NOTE(review): part of the parameter list (offset, bytes and qiov are used
 * below but not declared in the visible lines) and the check of ret are
 * missing from this listing - confirm against the full file.
 */
135 static coroutine_fn
int cbw_co_pwritev(BlockDriverState
*bs
,
139 BdrvRequestFlags flags
)
141 int ret
= cbw_do_copy_before_write(bs
, offset
, bytes
, flags
);
146 return bdrv_co_pwritev(bs
->file
, offset
, bytes
, qiov
, flags
);
/*
 * Flush handler: forwards the flush to the node behind bs->file via
 * bdrv_co_flush() and returns its result.
 * NOTE(review): several lines between the signature and the return are
 * missing from this listing (possibly a guard on bs->file) - confirm.
 */
149 static int coroutine_fn
cbw_co_flush(BlockDriverState
*bs
)
155 return bdrv_co_flush(bs
->file
->bs
);
159 * If @offset is not accessible, return NULL.
161 * Otherwise, set @pnum to some number of bytes that are accessible from @file
162 * (@file is set to bs->file or to s->target). Return a newly allocated BlockReq
163 * object that should then be passed to cbw_snapshot_read_unlock().
165 * It's guaranteed that guest writes will not interact with the region until
166 * cbw_snapshot_read_unlock() is called.
/*
 * Prepare a snapshot (fleecing) read of the range at @offset / @bytes.
 *
 * Visible steps: a BlockReq is allocated with g_new(); s->lock is taken with
 * QEMU_LOCK_GUARD; s->access_bitmap is searched for a clear bit inside the
 * range; s->done_bitmap status is queried, which also fills @pnum. One path
 * stores a marker request with offset and bytes both -1, another registers
 * the request in s->frozen_read_reqs through reqlist_init_req().
 *
 * @pnum: output, filled by bdrv_dirty_bitmap_status().
 * @file: output child pointer; its assignment is not visible here.
 * NOTE(review): the branch bodies, the declaration of done and the return
 * statements are missing from this listing. Presumably a clear access bit
 * frees req and returns NULL, the marker request is used when done is true
 * (read from s->target) and the frozen request otherwise (read from
 * bs->file) - confirm against the full file.
 */
168 static BlockReq
*cbw_snapshot_read_lock(BlockDriverState
*bs
,
169 int64_t offset
, int64_t bytes
,
170 int64_t *pnum
, BdrvChild
**file
)
172 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
173 BlockReq
*req
= g_new(BlockReq
, 1);
176 QEMU_LOCK_GUARD(&s
->lock
);
/* Any clear bit in the access bitmap inside the range takes this branch. */
178 if (bdrv_dirty_bitmap_next_zero(s
->access_bitmap
, offset
, bytes
) != -1) {
183 done
= bdrv_dirty_bitmap_status(s
->done_bitmap
, offset
, bytes
, pnum
);
186 * Special invalid BlockReq, that is handled in
187 * cbw_snapshot_read_unlock(). We don't need to lock something to read
190 *req
= (BlockReq
) {.offset
= -1, .bytes
= -1};
193 reqlist_init_req(&s
->frozen_read_reqs
, req
, offset
, bytes
);
/*
 * Release a request obtained from cbw_snapshot_read_lock().
 *
 * A request whose offset and bytes are both -1 is the marker request created
 * by cbw_snapshot_read_lock() and is handled separately; other requests are
 * removed with reqlist_remove_req() while s->lock is held.
 * NOTE(review): the body of the marker branch and any freeing of req are not
 * visible in this listing - confirm against the full file.
 */
200 static void cbw_snapshot_read_unlock(BlockDriverState
*bs
, BlockReq
*req
)
202 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
204 if (req
->offset
== -1 && req
->bytes
== -1) {
209 QEMU_LOCK_GUARD(&s
->lock
);
211 reqlist_remove_req(req
);
/*
 * Snapshot read handler (installed as .bdrv_co_preadv_snapshot).
 *
 * Visible steps: cbw_snapshot_read_lock() selects a child and a chunk length
 * cur_bytes; bdrv_co_preadv_part() reads that chunk into @qiov at
 * @qiov_offset; cbw_snapshot_read_unlock() releases the request; qiov_offset
 * advances by cur_bytes.
 * NOTE(review): local declarations, the loop header, error checks and the
 * updates of offset and bytes are missing from this listing; presumably the
 * steps repeat until the whole range is read and a NULL request or negative
 * ret ends the read with an error - confirm against the full file.
 */
215 static coroutine_fn
int
216 cbw_co_preadv_snapshot(BlockDriverState
*bs
, int64_t offset
, int64_t bytes
,
217 QEMUIOVector
*qiov
, size_t qiov_offset
)
223 /* TODO: upgrade to async loop using AioTask */
227 req
= cbw_snapshot_read_lock(bs
, offset
, bytes
, &cur_bytes
, &file
);
232 ret
= bdrv_co_preadv_part(file
, offset
, cur_bytes
,
233 qiov
, qiov_offset
, 0);
234 cbw_snapshot_read_unlock(bs
, req
);
241 qiov_offset
+= cur_bytes
;
/*
 * Snapshot block-status handler (installed as
 * .bdrv_co_snapshot_block_status).
 *
 * Visible steps: cbw_snapshot_read_lock() selects a child and a length
 * cur_bytes for the range; bdrv_block_status() is queried on that child and
 * fills @pnum, @map and @file; when the child is s->target the result is
 * asserted to contain BDRV_BLOCK_ALLOCATED; the request is then released with
 * cbw_snapshot_read_unlock().
 * NOTE(review): local declarations, the handling of a NULL request and the
 * return statement are missing from this listing; @want_zero is not used in
 * the visible lines - confirm against the full file.
 */
247 static int coroutine_fn
248 cbw_co_snapshot_block_status(BlockDriverState
*bs
,
249 bool want_zero
, int64_t offset
, int64_t bytes
,
250 int64_t *pnum
, int64_t *map
,
251 BlockDriverState
**file
)
253 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
259 req
= cbw_snapshot_read_lock(bs
, offset
, bytes
, &cur_bytes
, &child
);
264 ret
= bdrv_block_status(child
->bs
, offset
, cur_bytes
, pnum
, map
, file
);
265 if (child
== s
->target
) {
267 * We refer to s->target only for areas that we've written to it.
268 * And we can not report unallocated blocks in s->target: this will
269 * break generic block-status-above logic, that will go to
270 * copy-before-write filtered child in this case.
272 assert(ret
& BDRV_BLOCK_ALLOCATED
);
275 cbw_snapshot_read_unlock(bs
, req
);
/*
 * Snapshot discard handler (installed as .bdrv_co_pdiscard_snapshot).
 *
 * Under s->lock the range is cleared in s->access_bitmap, so later calls to
 * cbw_snapshot_read_lock() see clear bits there. The range is then passed to
 * block_copy_reset() on s->bcs and finally discarded on s->target with
 * bdrv_co_pdiscard(), whose result is returned.
 * NOTE(review): lines are missing between the visible statements (at least
 * the closing of the lock-guard block) - confirm against the full file.
 */
280 static int coroutine_fn
cbw_co_pdiscard_snapshot(BlockDriverState
*bs
,
281 int64_t offset
, int64_t bytes
)
283 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
285 WITH_QEMU_LOCK_GUARD(&s
->lock
) {
286 bdrv_reset_dirty_bitmap(s
->access_bitmap
, offset
, bytes
);
289 block_copy_reset(s
->bcs
, offset
, bytes
);
291 return bdrv_co_pdiscard(s
->target
, offset
, bytes
);
/*
 * Filename refresh callback: copies the filename of the node behind bs->file
 * into bs->exact_filename with pstrcpy(), bounded by the size of the
 * destination buffer.
 */
294 static void cbw_refresh_filename(BlockDriverState
*bs
)
296 pstrcpy(bs
->exact_filename
, sizeof(bs
->exact_filename
),
297 bs
->file
->bs
->filename
);
/*
 * Permission callback of the filter (installed as .bdrv_child_perm).
 *
 * For a child whose role lacks BDRV_CHILD_FILTERED, *nshared becomes every
 * permission except BLK_PERM_RESIZE and *nperm becomes BLK_PERM_WRITE.
 * The visible code also calls bdrv_default_perms() and, when the node has
 * parents, adds BLK_PERM_CONSISTENT_READ to *nperm if BLK_PERM_WRITE is
 * requested in @perm and removes WRITE and RESIZE from *nshared.
 * NOTE(review): the role parameter (used below) and the else structure are
 * not visible in this listing; presumably the bdrv_default_perms() path is
 * the one taken for the filtered child - confirm against the full file.
 */
300 static void cbw_child_perm(BlockDriverState
*bs
, BdrvChild
*c
,
302 BlockReopenQueue
*reopen_queue
,
303 uint64_t perm
, uint64_t shared
,
304 uint64_t *nperm
, uint64_t *nshared
)
306 if (!(role
& BDRV_CHILD_FILTERED
)) {
310 * Share write to target (child_file), to not interfere
311 * with guest writes to its disk which may be in target backing chain.
312 * Can't resize during a backup block job because we check the size
315 *nshared
= BLK_PERM_ALL
& ~BLK_PERM_RESIZE
;
316 *nperm
= BLK_PERM_WRITE
;
319 bdrv_default_perms(bs
, c
, role
, reopen_queue
,
320 perm
, shared
, nperm
, nshared
);
322 if (!QLIST_EMPTY(&bs
->parents
)) {
323 if (perm
& BLK_PERM_WRITE
) {
324 *nperm
= *nperm
| BLK_PERM_CONSISTENT_READ
;
326 *nshared
&= ~(BLK_PERM_WRITE
| BLK_PERM_RESIZE
);
/*
 * Parse the optional bitmap sub-options of the filter.
 *
 * Visible steps: the keys with the bitmap prefix are extracted from @options
 * into a separate QDict; an empty result is handled by a dedicated branch;
 * otherwise a flat input visitor is created over the sub-dict, a
 * BlockDirtyBitmap parameter is visited from it, and the bitmap is looked up
 * by node and name and stored in *@bitmap. The parameter object and the
 * sub-dict are released at the end.
 * NOTE(review): the rest of the parameter list (errp is used below), the
 * declaration of v, the error checks and the return statements are missing
 * from this listing; presumably the function returns true on success and
 * when no bitmap option is given, and false on error - confirm.
 */
331 static bool cbw_parse_bitmap_option(QDict
*options
, BdrvDirtyBitmap
**bitmap
,
334 QDict
*bitmap_qdict
= NULL
;
335 BlockDirtyBitmap
*bmp_param
= NULL
;
341 qdict_extract_subqdict(options
, &bitmap_qdict
, "bitmap.");
342 if (!qdict_size(bitmap_qdict
)) {
347 v
= qobject_input_visitor_new_flat_confused(bitmap_qdict
, errp
);
352 visit_type_BlockDirtyBitmap(v
, NULL
, &bmp_param
, errp
);
357 *bitmap
= block_dirty_bitmap_lookup(bmp_param
->node
, bmp_param
->name
, NULL
,
366 qapi_free_BlockDirtyBitmap(bmp_param
);
368 qobject_unref(bitmap_qdict
);
/*
 * Open callback (installed as .bdrv_open): sets up the filter node from
 * @options.
 *
 * Visible steps, in order:
 *  - open the file child with roles BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY
 *    and the target child with role BDRV_CHILD_DATA;
 *  - parse the optional bitmap sub-options via cbw_parse_bitmap_option();
 *  - copy total_sectors from the file child and advertise write/zero flags:
 *    BDRV_REQ_WRITE_UNCHANGED always, the remaining flags only when the file
 *    child reports them as supported;
 *  - create the block-copy state from bs->file to s->target, passing the
 *    parsed bitmap;
 *  - create done_bitmap and access_bitmap with the block-copy cluster size,
 *    disable both, and merge the block-copy dirty bitmap into access_bitmap;
 *  - initialise s->lock and the frozen_read_reqs list.
 * NOTE(review): the tail of the parameter list (errp is used below), the
 * failure checks after each step and the return statements are missing from
 * this listing; @flags is not used in the visible lines - confirm against the
 * full file.
 */
373 static int cbw_open(BlockDriverState
*bs
, QDict
*options
, int flags
,
376 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
377 BdrvDirtyBitmap
*bitmap
= NULL
;
378 int64_t cluster_size
;
380 bs
->file
= bdrv_open_child(NULL
, options
, "file", bs
, &child_of_bds
,
381 BDRV_CHILD_FILTERED
| BDRV_CHILD_PRIMARY
,
387 s
->target
= bdrv_open_child(NULL
, options
, "target", bs
, &child_of_bds
,
388 BDRV_CHILD_DATA
, false, errp
);
393 if (!cbw_parse_bitmap_option(options
, &bitmap
, errp
)) {
397 bs
->total_sectors
= bs
->file
->bs
->total_sectors
;
398 bs
->supported_write_flags
= BDRV_REQ_WRITE_UNCHANGED
|
399 (BDRV_REQ_FUA
& bs
->file
->bs
->supported_write_flags
);
400 bs
->supported_zero_flags
= BDRV_REQ_WRITE_UNCHANGED
|
401 ((BDRV_REQ_FUA
| BDRV_REQ_MAY_UNMAP
| BDRV_REQ_NO_FALLBACK
) &
402 bs
->file
->bs
->supported_zero_flags
);
404 s
->bcs
= block_copy_state_new(bs
->file
, s
->target
, bitmap
, errp
);
406 error_prepend(errp
, "Cannot create block-copy-state: ");
410 cluster_size
= block_copy_cluster_size(s
->bcs
);
412 s
->done_bitmap
= bdrv_create_dirty_bitmap(bs
, cluster_size
, NULL
, errp
);
413 if (!s
->done_bitmap
) {
416 bdrv_disable_dirty_bitmap(s
->done_bitmap
);
418 /* s->access_bitmap starts equal to bcs bitmap */
419 s
->access_bitmap
= bdrv_create_dirty_bitmap(bs
, cluster_size
, NULL
, errp
);
420 if (!s
->access_bitmap
) {
423 bdrv_disable_dirty_bitmap(s
->access_bitmap
);
424 bdrv_dirty_bitmap_merge_internal(s
->access_bitmap
,
425 block_copy_dirty_bitmap(s
->bcs
), NULL
,
428 qemu_co_mutex_init(&s
->lock
);
429 QLIST_INIT(&s
->frozen_read_reqs
);
/*
 * Close callback (installed as .bdrv_close): releases the two dirty bitmaps
 * created in cbw_open() (access_bitmap, done_bitmap) and frees the block-copy
 * state s->bcs.
 * NOTE(review): lines may be missing from this listing (for example resetting
 * s->bcs afterwards) - confirm against the full file.
 */
434 static void cbw_close(BlockDriverState
*bs
)
436 BDRVCopyBeforeWriteState
*s
= bs
->opaque
;
438 bdrv_release_dirty_bitmap(s
->access_bitmap
);
439 bdrv_release_dirty_bitmap(s
->done_bitmap
);
441 block_copy_state_free(s
->bcs
);
/*
 * Driver table of the copy-before-write filter. It names the format, sizes
 * the per-node state as BDRVCopyBeforeWriteState and wires the open/close,
 * guest I/O, snapshot-access, filename and permission callbacks defined in
 * this file. The table is registered by cbw_init() through bdrv_register().
 * NOTE(review): the end of the initializer is not visible in this listing, so
 * further fields may exist - confirm against the full file.
 */
445 BlockDriver bdrv_cbw_filter
= {
446 .format_name
= "copy-before-write",
447 .instance_size
= sizeof(BDRVCopyBeforeWriteState
),
449 .bdrv_open
= cbw_open
,
450 .bdrv_close
= cbw_close
,
452 .bdrv_co_preadv
= cbw_co_preadv
,
453 .bdrv_co_pwritev
= cbw_co_pwritev
,
454 .bdrv_co_pwrite_zeroes
= cbw_co_pwrite_zeroes
,
455 .bdrv_co_pdiscard
= cbw_co_pdiscard
,
456 .bdrv_co_flush
= cbw_co_flush
,
458 .bdrv_co_preadv_snapshot
= cbw_co_preadv_snapshot
,
459 .bdrv_co_pdiscard_snapshot
= cbw_co_pdiscard_snapshot
,
460 .bdrv_co_snapshot_block_status
= cbw_co_snapshot_block_status
,
462 .bdrv_refresh_filename
= cbw_refresh_filename
,
464 .bdrv_child_perm
= cbw_child_perm
,
/*
 * Insert a copy-before-write filter node above @source with @target as the
 * copy destination.
 *
 * Visible steps: assert that @source and @target have the same
 * total_sectors; build an options dict that selects the copy-before-write
 * driver, sets the node name when @filter_node_name is non-NULL, and refers
 * to @source and @target by node name; insert the node with
 * bdrv_insert_node() opened BDRV_O_RDWR.
 *
 * @bcs: output pointer; its assignment is not visible in this listing.
 * NOTE(review): the tail of the parameter list (errp is used below), the
 * creation of opts, the handling of a NULL top, the use of state and the
 * return statement are missing from this listing; presumably the new filter
 * node is returned and *bcs receives its block-copy state - confirm.
 */
469 BlockDriverState
*bdrv_cbw_append(BlockDriverState
*source
,
470 BlockDriverState
*target
,
471 const char *filter_node_name
,
472 BlockCopyState
**bcs
,
476 BDRVCopyBeforeWriteState
*state
;
477 BlockDriverState
*top
;
480 assert(source
->total_sectors
== target
->total_sectors
);
484 qdict_put_str(opts
, "driver", "copy-before-write");
485 if (filter_node_name
) {
486 qdict_put_str(opts
, "node-name", filter_node_name
);
488 qdict_put_str(opts
, "file", bdrv_get_node_name(source
));
489 qdict_put_str(opts
, "target", bdrv_get_node_name(target
));
491 top
= bdrv_insert_node(source
, opts
, BDRV_O_RDWR
, errp
);
/*
 * Remove the filter node @bs from the graph with bdrv_drop_filter(). The
 * call is made with error_abort, so no error is reported back to the caller.
 * NOTE(review): lines around the visible call are missing from this listing
 * (possibly a reference drop on @bs) - confirm against the full file.
 */
502 void bdrv_cbw_drop(BlockDriverState
*bs
)
505 bdrv_drop_filter(bs
, &error_abort
);
/*
 * Registers the copy-before-write driver table with the block layer via
 * bdrv_register(). The function is handed to block_init() below.
 */
509 static void cbw_init(void)
511 bdrv_register(&bdrv_cbw_filter
);
514 block_init(cbw_init
);