2 * Replication Block filter
4 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
5 * Copyright (c) 2016 Intel Corporation
6 * Copyright (c) 2016 FUJITSU LIMITED
9 * Wen Congyang <wency@cn.fujitsu.com>
11 * This work is licensed under the terms of the GNU GPL, version 2 or later.
12 * See the COPYING file in the top-level directory.
15 #include "qemu/osdep.h"
16 #include "qemu/module.h"
17 #include "qemu/option.h"
18 #include "block/nbd.h"
19 #include "block/blockjob.h"
20 #include "block/block_int.h"
21 #include "block/block_backup.h"
22 #include "sysemu/block-backend.h"
23 #include "qapi/error.h"
24 #include "qapi/qmp/qdict.h"
25 #include "replication.h"
28 BLOCK_REPLICATION_NONE
, /* block replication is not started */
29 BLOCK_REPLICATION_RUNNING
, /* block replication is running */
30 BLOCK_REPLICATION_FAILOVER
, /* failover is running in background */
31 BLOCK_REPLICATION_FAILOVER_FAILED
, /* failover failed */
32 BLOCK_REPLICATION_DONE
, /* block replication is done */
35 typedef struct BDRVReplicationState
{
37 ReplicationStage stage
;
38 BdrvChild
*active_disk
;
40 BdrvChild
*hidden_disk
;
41 BdrvChild
*secondary_disk
;
46 bool orig_hidden_read_only
;
47 bool orig_secondary_read_only
;
49 } BDRVReplicationState
;
51 static void replication_start(ReplicationState
*rs
, ReplicationMode mode
,
53 static void replication_do_checkpoint(ReplicationState
*rs
, Error
**errp
);
54 static void replication_get_error(ReplicationState
*rs
, Error
**errp
);
55 static void replication_stop(ReplicationState
*rs
, bool failover
,
58 #define REPLICATION_MODE "mode"
59 #define REPLICATION_TOP_ID "top-id"
60 static QemuOptsList replication_runtime_opts
= {
61 .name
= "replication",
62 .head
= QTAILQ_HEAD_INITIALIZER(replication_runtime_opts
.head
),
65 .name
= REPLICATION_MODE
,
66 .type
= QEMU_OPT_STRING
,
69 .name
= REPLICATION_TOP_ID
,
70 .type
= QEMU_OPT_STRING
,
76 static ReplicationOps replication_ops
= {
77 .start
= replication_start
,
78 .checkpoint
= replication_do_checkpoint
,
79 .get_error
= replication_get_error
,
80 .stop
= replication_stop
,
83 static int replication_open(BlockDriverState
*bs
, QDict
*options
,
84 int flags
, Error
**errp
)
87 BDRVReplicationState
*s
= bs
->opaque
;
88 Error
*local_err
= NULL
;
89 QemuOpts
*opts
= NULL
;
93 bs
->file
= bdrv_open_child(NULL
, options
, "file", bs
, &child_of_bds
,
94 BDRV_CHILD_FILTERED
| BDRV_CHILD_PRIMARY
,
101 opts
= qemu_opts_create(&replication_runtime_opts
, NULL
, 0, &error_abort
);
102 qemu_opts_absorb_qdict(opts
, options
, &local_err
);
107 mode
= qemu_opt_get(opts
, REPLICATION_MODE
);
109 error_setg(&local_err
, "Missing the option mode");
113 if (!strcmp(mode
, "primary")) {
114 s
->mode
= REPLICATION_MODE_PRIMARY
;
115 top_id
= qemu_opt_get(opts
, REPLICATION_TOP_ID
);
117 error_setg(&local_err
, "The primary side does not support option top-id");
120 } else if (!strcmp(mode
, "secondary")) {
121 s
->mode
= REPLICATION_MODE_SECONDARY
;
122 top_id
= qemu_opt_get(opts
, REPLICATION_TOP_ID
);
123 s
->top_id
= g_strdup(top_id
);
125 error_setg(&local_err
, "Missing the option top-id");
129 error_setg(&local_err
,
130 "The option mode's value should be primary or secondary");
134 s
->rs
= replication_new(bs
, &replication_ops
);
140 error_propagate(errp
, local_err
);
145 static void replication_close(BlockDriverState
*bs
)
147 BDRVReplicationState
*s
= bs
->opaque
;
150 if (s
->stage
== BLOCK_REPLICATION_RUNNING
) {
151 replication_stop(s
->rs
, false, NULL
);
153 if (s
->stage
== BLOCK_REPLICATION_FAILOVER
) {
154 commit_job
= &s
->commit_job
->job
;
155 assert(commit_job
->aio_context
== qemu_get_current_aio_context());
156 job_cancel_sync(commit_job
);
159 if (s
->mode
== REPLICATION_MODE_SECONDARY
) {
163 replication_remove(s
->rs
);
166 static void replication_child_perm(BlockDriverState
*bs
, BdrvChild
*c
,
168 BlockReopenQueue
*reopen_queue
,
169 uint64_t perm
, uint64_t shared
,
170 uint64_t *nperm
, uint64_t *nshared
)
172 *nperm
= BLK_PERM_CONSISTENT_READ
;
173 if ((bs
->open_flags
& (BDRV_O_INACTIVE
| BDRV_O_RDWR
)) == BDRV_O_RDWR
) {
174 *nperm
|= BLK_PERM_WRITE
;
176 *nshared
= BLK_PERM_CONSISTENT_READ
178 | BLK_PERM_WRITE_UNCHANGED
;
182 static int64_t replication_getlength(BlockDriverState
*bs
)
184 return bdrv_getlength(bs
->file
->bs
);
187 static int replication_get_io_status(BDRVReplicationState
*s
)
190 case BLOCK_REPLICATION_NONE
:
192 case BLOCK_REPLICATION_RUNNING
:
194 case BLOCK_REPLICATION_FAILOVER
:
195 return s
->mode
== REPLICATION_MODE_PRIMARY
? -EIO
: 0;
196 case BLOCK_REPLICATION_FAILOVER_FAILED
:
197 return s
->mode
== REPLICATION_MODE_PRIMARY
? -EIO
: 1;
198 case BLOCK_REPLICATION_DONE
:
200 * active commit job completes, and active disk and secondary_disk
201 * is swapped, so we can operate bs->file directly
203 return s
->mode
== REPLICATION_MODE_PRIMARY
? -EIO
: 0;
209 static int replication_return_value(BDRVReplicationState
*s
, int ret
)
211 if (s
->mode
== REPLICATION_MODE_SECONDARY
) {
223 static coroutine_fn
int replication_co_readv(BlockDriverState
*bs
,
225 int remaining_sectors
,
228 BDRVReplicationState
*s
= bs
->opaque
;
231 if (s
->mode
== REPLICATION_MODE_PRIMARY
) {
232 /* We only use it to forward primary write requests */
236 ret
= replication_get_io_status(s
);
241 ret
= bdrv_co_preadv(bs
->file
, sector_num
* BDRV_SECTOR_SIZE
,
242 remaining_sectors
* BDRV_SECTOR_SIZE
, qiov
, 0);
244 return replication_return_value(s
, ret
);
247 static coroutine_fn
int replication_co_writev(BlockDriverState
*bs
,
249 int remaining_sectors
,
253 BDRVReplicationState
*s
= bs
->opaque
;
254 QEMUIOVector hd_qiov
;
255 uint64_t bytes_done
= 0;
256 BdrvChild
*top
= bs
->file
;
257 BdrvChild
*base
= s
->secondary_disk
;
263 ret
= replication_get_io_status(s
);
269 ret
= bdrv_co_pwritev(top
, sector_num
* BDRV_SECTOR_SIZE
,
270 remaining_sectors
* BDRV_SECTOR_SIZE
, qiov
, 0);
271 return replication_return_value(s
, ret
);
275 * Failover failed, only write to active disk if the sectors
276 * have already been allocated in active disk/hidden disk.
278 qemu_iovec_init(&hd_qiov
, qiov
->niov
);
279 while (remaining_sectors
> 0) {
282 ret
= bdrv_is_allocated_above(top
->bs
, base
->bs
, false,
283 sector_num
* BDRV_SECTOR_SIZE
,
284 remaining_sectors
* BDRV_SECTOR_SIZE
,
290 assert(QEMU_IS_ALIGNED(count
, BDRV_SECTOR_SIZE
));
291 n
= count
>> BDRV_SECTOR_BITS
;
292 qemu_iovec_reset(&hd_qiov
);
293 qemu_iovec_concat(&hd_qiov
, qiov
, bytes_done
, count
);
295 target
= ret
? top
: base
;
296 ret
= bdrv_co_pwritev(target
, sector_num
* BDRV_SECTOR_SIZE
,
297 n
* BDRV_SECTOR_SIZE
, &hd_qiov
, 0);
302 remaining_sectors
-= n
;
308 qemu_iovec_destroy(&hd_qiov
);
313 static void secondary_do_checkpoint(BDRVReplicationState
*s
, Error
**errp
)
315 Error
*local_err
= NULL
;
318 if (!s
->backup_job
) {
319 error_setg(errp
, "Backup job was cancelled unexpectedly");
323 backup_do_checkpoint(s
->backup_job
, &local_err
);
325 error_propagate(errp
, local_err
);
329 if (!s
->active_disk
->bs
->drv
) {
330 error_setg(errp
, "Active disk %s is ejected",
331 s
->active_disk
->bs
->node_name
);
335 ret
= bdrv_make_empty(s
->active_disk
, errp
);
340 if (!s
->hidden_disk
->bs
->drv
) {
341 error_setg(errp
, "Hidden disk %s is ejected",
342 s
->hidden_disk
->bs
->node_name
);
346 BlockBackend
*blk
= blk_new(qemu_get_current_aio_context(),
347 BLK_PERM_WRITE
, BLK_PERM_ALL
);
348 blk_insert_bs(blk
, s
->hidden_disk
->bs
, &local_err
);
350 error_propagate(errp
, local_err
);
355 ret
= blk_make_empty(blk
, errp
);
362 /* This function is supposed to be called twice:
363 * first with writable = true, then with writable = false.
364 * The first call puts s->hidden_disk and s->secondary_disk in
365 * r/w mode, and the second puts them back in their original state.
367 static void reopen_backing_file(BlockDriverState
*bs
, bool writable
,
370 BDRVReplicationState
*s
= bs
->opaque
;
371 BlockReopenQueue
*reopen_queue
= NULL
;
372 Error
*local_err
= NULL
;
375 s
->orig_hidden_read_only
= bdrv_is_read_only(s
->hidden_disk
->bs
);
376 s
->orig_secondary_read_only
= bdrv_is_read_only(s
->secondary_disk
->bs
);
379 bdrv_subtree_drained_begin(s
->hidden_disk
->bs
);
380 bdrv_subtree_drained_begin(s
->secondary_disk
->bs
);
382 if (s
->orig_hidden_read_only
) {
383 QDict
*opts
= qdict_new();
384 qdict_put_bool(opts
, BDRV_OPT_READ_ONLY
, !writable
);
385 reopen_queue
= bdrv_reopen_queue(reopen_queue
, s
->hidden_disk
->bs
,
389 if (s
->orig_secondary_read_only
) {
390 QDict
*opts
= qdict_new();
391 qdict_put_bool(opts
, BDRV_OPT_READ_ONLY
, !writable
);
392 reopen_queue
= bdrv_reopen_queue(reopen_queue
, s
->secondary_disk
->bs
,
397 bdrv_reopen_multiple(reopen_queue
, &local_err
);
398 error_propagate(errp
, local_err
);
401 bdrv_subtree_drained_end(s
->hidden_disk
->bs
);
402 bdrv_subtree_drained_end(s
->secondary_disk
->bs
);
405 static void backup_job_cleanup(BlockDriverState
*bs
)
407 BDRVReplicationState
*s
= bs
->opaque
;
408 BlockDriverState
*top_bs
;
410 s
->backup_job
= NULL
;
412 top_bs
= bdrv_lookup_bs(s
->top_id
, s
->top_id
, NULL
);
416 bdrv_op_unblock_all(top_bs
, s
->blocker
);
417 error_free(s
->blocker
);
418 reopen_backing_file(bs
, false, NULL
);
421 static void backup_job_completed(void *opaque
, int ret
)
423 BlockDriverState
*bs
= opaque
;
424 BDRVReplicationState
*s
= bs
->opaque
;
426 if (s
->stage
!= BLOCK_REPLICATION_FAILOVER
) {
427 /* The backup job is cancelled unexpectedly */
431 backup_job_cleanup(bs
);
434 static bool check_top_bs(BlockDriverState
*top_bs
, BlockDriverState
*bs
)
438 /* The bs itself is the top_bs */
443 /* Iterate over top_bs's children */
444 QLIST_FOREACH(child
, &top_bs
->children
, next
) {
445 if (child
->bs
== bs
|| check_top_bs(child
->bs
, bs
)) {
453 static void replication_start(ReplicationState
*rs
, ReplicationMode mode
,
456 BlockDriverState
*bs
= rs
->opaque
;
457 BDRVReplicationState
*s
;
458 BlockDriverState
*top_bs
;
459 int64_t active_length
, hidden_length
, disk_length
;
460 AioContext
*aio_context
;
461 Error
*local_err
= NULL
;
463 aio_context
= bdrv_get_aio_context(bs
);
464 aio_context_acquire(aio_context
);
467 if (s
->stage
== BLOCK_REPLICATION_DONE
||
468 s
->stage
== BLOCK_REPLICATION_FAILOVER
) {
470 * This case happens when a secondary is promoted to primary.
471 * Ignore the request because the secondary side of replication
472 * doesn't have to do anything anymore.
474 aio_context_release(aio_context
);
478 if (s
->stage
!= BLOCK_REPLICATION_NONE
) {
479 error_setg(errp
, "Block replication is running or done");
480 aio_context_release(aio_context
);
484 if (s
->mode
!= mode
) {
485 error_setg(errp
, "The parameter mode's value is invalid, needs %d,"
486 " but got %d", s
->mode
, mode
);
487 aio_context_release(aio_context
);
492 case REPLICATION_MODE_PRIMARY
:
494 case REPLICATION_MODE_SECONDARY
:
495 s
->active_disk
= bs
->file
;
496 if (!s
->active_disk
|| !s
->active_disk
->bs
||
497 !s
->active_disk
->bs
->backing
) {
498 error_setg(errp
, "Active disk doesn't have backing file");
499 aio_context_release(aio_context
);
503 s
->hidden_disk
= s
->active_disk
->bs
->backing
;
504 if (!s
->hidden_disk
->bs
|| !s
->hidden_disk
->bs
->backing
) {
505 error_setg(errp
, "Hidden disk doesn't have backing file");
506 aio_context_release(aio_context
);
510 s
->secondary_disk
= s
->hidden_disk
->bs
->backing
;
511 if (!s
->secondary_disk
->bs
|| !bdrv_has_blk(s
->secondary_disk
->bs
)) {
512 error_setg(errp
, "The secondary disk doesn't have block backend");
513 aio_context_release(aio_context
);
517 /* verify the length */
518 active_length
= bdrv_getlength(s
->active_disk
->bs
);
519 hidden_length
= bdrv_getlength(s
->hidden_disk
->bs
);
520 disk_length
= bdrv_getlength(s
->secondary_disk
->bs
);
521 if (active_length
< 0 || hidden_length
< 0 || disk_length
< 0 ||
522 active_length
!= hidden_length
|| hidden_length
!= disk_length
) {
523 error_setg(errp
, "Active disk, hidden disk, secondary disk's length"
524 " are not the same");
525 aio_context_release(aio_context
);
529 /* Must be true, or the bdrv_getlength() calls would have failed */
530 assert(s
->active_disk
->bs
->drv
&& s
->hidden_disk
->bs
->drv
);
532 if (!s
->active_disk
->bs
->drv
->bdrv_make_empty
||
533 !s
->hidden_disk
->bs
->drv
->bdrv_make_empty
) {
535 "Active disk or hidden disk doesn't support make_empty");
536 aio_context_release(aio_context
);
540 /* reopen the backing file in r/w mode */
541 reopen_backing_file(bs
, true, &local_err
);
543 error_propagate(errp
, local_err
);
544 aio_context_release(aio_context
);
548 /* start backup job now */
549 error_setg(&s
->blocker
,
550 "Block device is in use by internal backup job");
552 top_bs
= bdrv_lookup_bs(s
->top_id
, s
->top_id
, NULL
);
553 if (!top_bs
|| !bdrv_is_root_node(top_bs
) ||
554 !check_top_bs(top_bs
, bs
)) {
555 error_setg(errp
, "No top_bs or it is invalid");
556 reopen_backing_file(bs
, false, NULL
);
557 aio_context_release(aio_context
);
560 bdrv_op_block_all(top_bs
, s
->blocker
);
561 bdrv_op_unblock(top_bs
, BLOCK_OP_TYPE_DATAPLANE
, s
->blocker
);
563 s
->backup_job
= backup_job_create(
564 NULL
, s
->secondary_disk
->bs
, s
->hidden_disk
->bs
,
565 0, MIRROR_SYNC_MODE_NONE
, NULL
, 0, false, NULL
,
566 BLOCKDEV_ON_ERROR_REPORT
,
567 BLOCKDEV_ON_ERROR_REPORT
, JOB_INTERNAL
,
568 backup_job_completed
, bs
, NULL
, &local_err
);
570 error_propagate(errp
, local_err
);
571 backup_job_cleanup(bs
);
572 aio_context_release(aio_context
);
575 job_start(&s
->backup_job
->job
);
578 aio_context_release(aio_context
);
582 s
->stage
= BLOCK_REPLICATION_RUNNING
;
584 if (s
->mode
== REPLICATION_MODE_SECONDARY
) {
585 secondary_do_checkpoint(s
, errp
);
589 aio_context_release(aio_context
);
592 static void replication_do_checkpoint(ReplicationState
*rs
, Error
**errp
)
594 BlockDriverState
*bs
= rs
->opaque
;
595 BDRVReplicationState
*s
;
596 AioContext
*aio_context
;
598 aio_context
= bdrv_get_aio_context(bs
);
599 aio_context_acquire(aio_context
);
602 if (s
->stage
== BLOCK_REPLICATION_DONE
||
603 s
->stage
== BLOCK_REPLICATION_FAILOVER
) {
605 * This case happens when a secondary was promoted to primary.
606 * Ignore the request because the secondary side of replication
607 * doesn't have to do anything anymore.
609 aio_context_release(aio_context
);
613 if (s
->mode
== REPLICATION_MODE_SECONDARY
) {
614 secondary_do_checkpoint(s
, errp
);
616 aio_context_release(aio_context
);
619 static void replication_get_error(ReplicationState
*rs
, Error
**errp
)
621 BlockDriverState
*bs
= rs
->opaque
;
622 BDRVReplicationState
*s
;
623 AioContext
*aio_context
;
625 aio_context
= bdrv_get_aio_context(bs
);
626 aio_context_acquire(aio_context
);
629 if (s
->stage
== BLOCK_REPLICATION_NONE
) {
630 error_setg(errp
, "Block replication is not running");
631 aio_context_release(aio_context
);
636 error_setg(errp
, "I/O error occurred");
637 aio_context_release(aio_context
);
640 aio_context_release(aio_context
);
643 static void replication_done(void *opaque
, int ret
)
645 BlockDriverState
*bs
= opaque
;
646 BDRVReplicationState
*s
= bs
->opaque
;
649 s
->stage
= BLOCK_REPLICATION_DONE
;
651 s
->active_disk
= NULL
;
652 s
->secondary_disk
= NULL
;
653 s
->hidden_disk
= NULL
;
656 s
->stage
= BLOCK_REPLICATION_FAILOVER_FAILED
;
661 static void replication_stop(ReplicationState
*rs
, bool failover
, Error
**errp
)
663 BlockDriverState
*bs
= rs
->opaque
;
664 BDRVReplicationState
*s
;
665 AioContext
*aio_context
;
667 aio_context
= bdrv_get_aio_context(bs
);
668 aio_context_acquire(aio_context
);
671 if (s
->stage
== BLOCK_REPLICATION_DONE
||
672 s
->stage
== BLOCK_REPLICATION_FAILOVER
) {
674 * This case happens when a secondary was promoted to primary.
675 * Ignore the request because the secondary side of replication
676 * doesn't have to do anything anymore.
678 aio_context_release(aio_context
);
682 if (s
->stage
!= BLOCK_REPLICATION_RUNNING
) {
683 error_setg(errp
, "Block replication is not running");
684 aio_context_release(aio_context
);
689 case REPLICATION_MODE_PRIMARY
:
690 s
->stage
= BLOCK_REPLICATION_DONE
;
693 case REPLICATION_MODE_SECONDARY
:
695 * This BDS will be closed, and the job should be completed
696 * before the BDS is closed, because we will access hidden
697 * disk, secondary disk in backup_job_completed().
700 job_cancel_sync(&s
->backup_job
->job
);
704 secondary_do_checkpoint(s
, errp
);
705 s
->stage
= BLOCK_REPLICATION_DONE
;
706 aio_context_release(aio_context
);
710 s
->stage
= BLOCK_REPLICATION_FAILOVER
;
711 s
->commit_job
= commit_active_start(
712 NULL
, s
->active_disk
->bs
, s
->secondary_disk
->bs
,
713 JOB_INTERNAL
, 0, BLOCKDEV_ON_ERROR_REPORT
,
714 NULL
, replication_done
, bs
, true, errp
);
717 aio_context_release(aio_context
);
720 aio_context_release(aio_context
);
723 static const char *const replication_strong_runtime_opts
[] = {
730 static BlockDriver bdrv_replication
= {
731 .format_name
= "replication",
732 .instance_size
= sizeof(BDRVReplicationState
),
734 .bdrv_open
= replication_open
,
735 .bdrv_close
= replication_close
,
736 .bdrv_child_perm
= replication_child_perm
,
738 .bdrv_getlength
= replication_getlength
,
739 .bdrv_co_readv
= replication_co_readv
,
740 .bdrv_co_writev
= replication_co_writev
,
744 .has_variable_length
= true,
745 .strong_runtime_opts
= replication_strong_runtime_opts
,
748 static void bdrv_replication_init(void)
750 bdrv_register(&bdrv_replication
);
753 block_init(bdrv_replication_init
);