Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
[qemu/ar7.git] / block / replication.c
blob97be7ef4de54686705c833dced03187f5cb222b8
/*
 * Replication Block filter
 *
 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
 * Copyright (c) 2016 Intel Corporation
 * Copyright (c) 2016 FUJITSU LIMITED
 *
 * Author:
 *   Wen Congyang <wency@cn.fujitsu.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
15 #include "qemu/osdep.h"
16 #include "qemu/module.h"
17 #include "qemu/option.h"
18 #include "block/nbd.h"
19 #include "block/blockjob.h"
20 #include "block/block_int.h"
21 #include "block/block_backup.h"
22 #include "sysemu/block-backend.h"
23 #include "qapi/error.h"
24 #include "qapi/qmp/qdict.h"
25 #include "replication.h"
/*
 * Stage of the replication state machine, kept in
 * BDRVReplicationState.stage.
 */
typedef enum {
    /* block replication is not started */
    BLOCK_REPLICATION_NONE,
    /* block replication is running */
    BLOCK_REPLICATION_RUNNING,
    /* failover is running in background */
    BLOCK_REPLICATION_FAILOVER,
    /* failover failed */
    BLOCK_REPLICATION_FAILOVER_FAILED,
    /* block replication is done */
    BLOCK_REPLICATION_DONE,
} ReplicationStage;
35 typedef struct BDRVReplicationState {
36     ReplicationMode mode;
37     ReplicationStage stage;
38     BdrvChild *active_disk;
39     BlockJob *commit_job;
40     BdrvChild *hidden_disk;
41     BdrvChild *secondary_disk;
42     BlockJob *backup_job;
43     char *top_id;
44     ReplicationState *rs;
45     Error *blocker;
46     bool orig_hidden_read_only;
47     bool orig_secondary_read_only;
48     int error;
49 } BDRVReplicationState;
51 static void replication_start(ReplicationState *rs, ReplicationMode mode,
52                               Error **errp);
53 static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
54 static void replication_get_error(ReplicationState *rs, Error **errp);
55 static void replication_stop(ReplicationState *rs, bool failover,
56                              Error **errp);
58 #define REPLICATION_MODE        "mode"
59 #define REPLICATION_TOP_ID      "top-id"
60 static QemuOptsList replication_runtime_opts = {
61     .name = "replication",
62     .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
63     .desc = {
64         {
65             .name = REPLICATION_MODE,
66             .type = QEMU_OPT_STRING,
67         },
68         {
69             .name = REPLICATION_TOP_ID,
70             .type = QEMU_OPT_STRING,
71         },
72         { /* end of list */ }
73     },
76 static ReplicationOps replication_ops = {
77     .start = replication_start,
78     .checkpoint = replication_do_checkpoint,
79     .get_error = replication_get_error,
80     .stop = replication_stop,
83 static int replication_open(BlockDriverState *bs, QDict *options,
84                             int flags, Error **errp)
86     int ret;
87     BDRVReplicationState *s = bs->opaque;
88     QemuOpts *opts = NULL;
89     const char *mode;
90     const char *top_id;
92     bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
93                                BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
94                                false, errp);
95     if (!bs->file) {
96         return -EINVAL;
97     }
99     ret = -EINVAL;
100     opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
101     if (!qemu_opts_absorb_qdict(opts, options, errp)) {
102         goto fail;
103     }
105     mode = qemu_opt_get(opts, REPLICATION_MODE);
106     if (!mode) {
107         error_setg(errp, "Missing the option mode");
108         goto fail;
109     }
111     if (!strcmp(mode, "primary")) {
112         s->mode = REPLICATION_MODE_PRIMARY;
113         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
114         if (top_id) {
115             error_setg(errp,
116                        "The primary side does not support option top-id");
117             goto fail;
118         }
119     } else if (!strcmp(mode, "secondary")) {
120         s->mode = REPLICATION_MODE_SECONDARY;
121         top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
122         s->top_id = g_strdup(top_id);
123         if (!s->top_id) {
124             error_setg(errp, "Missing the option top-id");
125             goto fail;
126         }
127     } else {
128         error_setg(errp,
129                    "The option mode's value should be primary or secondary");
130         goto fail;
131     }
133     s->rs = replication_new(bs, &replication_ops);
135     ret = 0;
137 fail:
138     qemu_opts_del(opts);
139     return ret;
142 static void replication_close(BlockDriverState *bs)
144     BDRVReplicationState *s = bs->opaque;
145     Job *commit_job;
147     if (s->stage == BLOCK_REPLICATION_RUNNING) {
148         replication_stop(s->rs, false, NULL);
149     }
150     if (s->stage == BLOCK_REPLICATION_FAILOVER) {
151         commit_job = &s->commit_job->job;
152         assert(commit_job->aio_context == qemu_get_current_aio_context());
153         job_cancel_sync(commit_job);
154     }
156     if (s->mode == REPLICATION_MODE_SECONDARY) {
157         g_free(s->top_id);
158     }
160     replication_remove(s->rs);
163 static void replication_child_perm(BlockDriverState *bs, BdrvChild *c,
164                                    BdrvChildRole role,
165                                    BlockReopenQueue *reopen_queue,
166                                    uint64_t perm, uint64_t shared,
167                                    uint64_t *nperm, uint64_t *nshared)
169     *nperm = BLK_PERM_CONSISTENT_READ;
170     if ((bs->open_flags & (BDRV_O_INACTIVE | BDRV_O_RDWR)) == BDRV_O_RDWR) {
171         *nperm |= BLK_PERM_WRITE;
172     }
173     *nshared = BLK_PERM_CONSISTENT_READ
174                | BLK_PERM_WRITE
175                | BLK_PERM_WRITE_UNCHANGED;
176     return;
179 static int64_t replication_getlength(BlockDriverState *bs)
181     return bdrv_getlength(bs->file->bs);
184 static int replication_get_io_status(BDRVReplicationState *s)
186     switch (s->stage) {
187     case BLOCK_REPLICATION_NONE:
188         return -EIO;
189     case BLOCK_REPLICATION_RUNNING:
190         return 0;
191     case BLOCK_REPLICATION_FAILOVER:
192         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
193     case BLOCK_REPLICATION_FAILOVER_FAILED:
194         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 1;
195     case BLOCK_REPLICATION_DONE:
196         /*
197          * active commit job completes, and active disk and secondary_disk
198          * is swapped, so we can operate bs->file directly
199          */
200         return s->mode == REPLICATION_MODE_PRIMARY ? -EIO : 0;
201     default:
202         abort();
203     }
206 static int replication_return_value(BDRVReplicationState *s, int ret)
208     if (s->mode == REPLICATION_MODE_SECONDARY) {
209         return ret;
210     }
212     if (ret < 0) {
213         s->error = ret;
214         ret = 0;
215     }
217     return ret;
220 static coroutine_fn int replication_co_readv(BlockDriverState *bs,
221                                              int64_t sector_num,
222                                              int remaining_sectors,
223                                              QEMUIOVector *qiov)
225     BDRVReplicationState *s = bs->opaque;
226     int ret;
228     if (s->mode == REPLICATION_MODE_PRIMARY) {
229         /* We only use it to forward primary write requests */
230         return -EIO;
231     }
233     ret = replication_get_io_status(s);
234     if (ret < 0) {
235         return ret;
236     }
238     ret = bdrv_co_preadv(bs->file, sector_num * BDRV_SECTOR_SIZE,
239                          remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
241     return replication_return_value(s, ret);
244 static coroutine_fn int replication_co_writev(BlockDriverState *bs,
245                                               int64_t sector_num,
246                                               int remaining_sectors,
247                                               QEMUIOVector *qiov,
248                                               int flags)
250     BDRVReplicationState *s = bs->opaque;
251     QEMUIOVector hd_qiov;
252     uint64_t bytes_done = 0;
253     BdrvChild *top = bs->file;
254     BdrvChild *base = s->secondary_disk;
255     BdrvChild *target;
256     int ret;
257     int64_t n;
259     assert(!flags);
260     ret = replication_get_io_status(s);
261     if (ret < 0) {
262         goto out;
263     }
265     if (ret == 0) {
266         ret = bdrv_co_pwritev(top, sector_num * BDRV_SECTOR_SIZE,
267                               remaining_sectors * BDRV_SECTOR_SIZE, qiov, 0);
268         return replication_return_value(s, ret);
269     }
271     /*
272      * Failover failed, only write to active disk if the sectors
273      * have already been allocated in active disk/hidden disk.
274      */
275     qemu_iovec_init(&hd_qiov, qiov->niov);
276     while (remaining_sectors > 0) {
277         int64_t count;
279         ret = bdrv_is_allocated_above(top->bs, base->bs, false,
280                                       sector_num * BDRV_SECTOR_SIZE,
281                                       remaining_sectors * BDRV_SECTOR_SIZE,
282                                       &count);
283         if (ret < 0) {
284             goto out1;
285         }
287         assert(QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE));
288         n = count >> BDRV_SECTOR_BITS;
289         qemu_iovec_reset(&hd_qiov);
290         qemu_iovec_concat(&hd_qiov, qiov, bytes_done, count);
292         target = ret ? top : base;
293         ret = bdrv_co_pwritev(target, sector_num * BDRV_SECTOR_SIZE,
294                               n * BDRV_SECTOR_SIZE, &hd_qiov, 0);
295         if (ret < 0) {
296             goto out1;
297         }
299         remaining_sectors -= n;
300         sector_num += n;
301         bytes_done += count;
302     }
304 out1:
305     qemu_iovec_destroy(&hd_qiov);
306 out:
307     return ret;
310 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
312     Error *local_err = NULL;
313     int ret;
315     if (!s->backup_job) {
316         error_setg(errp, "Backup job was cancelled unexpectedly");
317         return;
318     }
320     backup_do_checkpoint(s->backup_job, &local_err);
321     if (local_err) {
322         error_propagate(errp, local_err);
323         return;
324     }
326     if (!s->active_disk->bs->drv) {
327         error_setg(errp, "Active disk %s is ejected",
328                    s->active_disk->bs->node_name);
329         return;
330     }
332     ret = bdrv_make_empty(s->active_disk, errp);
333     if (ret < 0) {
334         return;
335     }
337     if (!s->hidden_disk->bs->drv) {
338         error_setg(errp, "Hidden disk %s is ejected",
339                    s->hidden_disk->bs->node_name);
340         return;
341     }
343     BlockBackend *blk = blk_new(qemu_get_current_aio_context(),
344                                 BLK_PERM_WRITE, BLK_PERM_ALL);
345     blk_insert_bs(blk, s->hidden_disk->bs, &local_err);
346     if (local_err) {
347         error_propagate(errp, local_err);
348         blk_unref(blk);
349         return;
350     }
352     ret = blk_make_empty(blk, errp);
353     blk_unref(blk);
354     if (ret < 0) {
355         return;
356     }
359 /* This function is supposed to be called twice:
360  * first with writable = true, then with writable = false.
361  * The first call puts s->hidden_disk and s->secondary_disk in
362  * r/w mode, and the second puts them back in their original state.
363  */
364 static void reopen_backing_file(BlockDriverState *bs, bool writable,
365                                 Error **errp)
367     BDRVReplicationState *s = bs->opaque;
368     BlockReopenQueue *reopen_queue = NULL;
370     if (writable) {
371         s->orig_hidden_read_only = bdrv_is_read_only(s->hidden_disk->bs);
372         s->orig_secondary_read_only = bdrv_is_read_only(s->secondary_disk->bs);
373     }
375     bdrv_subtree_drained_begin(s->hidden_disk->bs);
376     bdrv_subtree_drained_begin(s->secondary_disk->bs);
378     if (s->orig_hidden_read_only) {
379         QDict *opts = qdict_new();
380         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
381         reopen_queue = bdrv_reopen_queue(reopen_queue, s->hidden_disk->bs,
382                                          opts, true);
383     }
385     if (s->orig_secondary_read_only) {
386         QDict *opts = qdict_new();
387         qdict_put_bool(opts, BDRV_OPT_READ_ONLY, !writable);
388         reopen_queue = bdrv_reopen_queue(reopen_queue, s->secondary_disk->bs,
389                                          opts, true);
390     }
392     if (reopen_queue) {
393         bdrv_reopen_multiple(reopen_queue, errp);
394     }
396     bdrv_subtree_drained_end(s->hidden_disk->bs);
397     bdrv_subtree_drained_end(s->secondary_disk->bs);
400 static void backup_job_cleanup(BlockDriverState *bs)
402     BDRVReplicationState *s = bs->opaque;
403     BlockDriverState *top_bs;
405     s->backup_job = NULL;
407     top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
408     if (!top_bs) {
409         return;
410     }
411     bdrv_op_unblock_all(top_bs, s->blocker);
412     error_free(s->blocker);
413     reopen_backing_file(bs, false, NULL);
416 static void backup_job_completed(void *opaque, int ret)
418     BlockDriverState *bs = opaque;
419     BDRVReplicationState *s = bs->opaque;
421     if (s->stage != BLOCK_REPLICATION_FAILOVER) {
422         /* The backup job is cancelled unexpectedly */
423         s->error = -EIO;
424     }
426     backup_job_cleanup(bs);
429 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
431     BdrvChild *child;
433     /* The bs itself is the top_bs */
434     if (top_bs == bs) {
435         return true;
436     }
438     /* Iterate over top_bs's children */
439     QLIST_FOREACH(child, &top_bs->children, next) {
440         if (child->bs == bs || check_top_bs(child->bs, bs)) {
441             return true;
442         }
443     }
445     return false;
448 static void replication_start(ReplicationState *rs, ReplicationMode mode,
449                               Error **errp)
451     BlockDriverState *bs = rs->opaque;
452     BDRVReplicationState *s;
453     BlockDriverState *top_bs;
454     int64_t active_length, hidden_length, disk_length;
455     AioContext *aio_context;
456     Error *local_err = NULL;
457     BackupPerf perf = { .use_copy_range = true, .max_workers = 1 };
459     aio_context = bdrv_get_aio_context(bs);
460     aio_context_acquire(aio_context);
461     s = bs->opaque;
463     if (s->stage == BLOCK_REPLICATION_DONE ||
464         s->stage == BLOCK_REPLICATION_FAILOVER) {
465         /*
466          * This case happens when a secondary is promoted to primary.
467          * Ignore the request because the secondary side of replication
468          * doesn't have to do anything anymore.
469          */
470         aio_context_release(aio_context);
471         return;
472     }
474     if (s->stage != BLOCK_REPLICATION_NONE) {
475         error_setg(errp, "Block replication is running or done");
476         aio_context_release(aio_context);
477         return;
478     }
480     if (s->mode != mode) {
481         error_setg(errp, "The parameter mode's value is invalid, needs %d,"
482                    " but got %d", s->mode, mode);
483         aio_context_release(aio_context);
484         return;
485     }
487     switch (s->mode) {
488     case REPLICATION_MODE_PRIMARY:
489         break;
490     case REPLICATION_MODE_SECONDARY:
491         s->active_disk = bs->file;
492         if (!s->active_disk || !s->active_disk->bs ||
493                                     !s->active_disk->bs->backing) {
494             error_setg(errp, "Active disk doesn't have backing file");
495             aio_context_release(aio_context);
496             return;
497         }
499         s->hidden_disk = s->active_disk->bs->backing;
500         if (!s->hidden_disk->bs || !s->hidden_disk->bs->backing) {
501             error_setg(errp, "Hidden disk doesn't have backing file");
502             aio_context_release(aio_context);
503             return;
504         }
506         s->secondary_disk = s->hidden_disk->bs->backing;
507         if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
508             error_setg(errp, "The secondary disk doesn't have block backend");
509             aio_context_release(aio_context);
510             return;
511         }
513         /* verify the length */
514         active_length = bdrv_getlength(s->active_disk->bs);
515         hidden_length = bdrv_getlength(s->hidden_disk->bs);
516         disk_length = bdrv_getlength(s->secondary_disk->bs);
517         if (active_length < 0 || hidden_length < 0 || disk_length < 0 ||
518             active_length != hidden_length || hidden_length != disk_length) {
519             error_setg(errp, "Active disk, hidden disk, secondary disk's length"
520                        " are not the same");
521             aio_context_release(aio_context);
522             return;
523         }
525         /* Must be true, or the bdrv_getlength() calls would have failed */
526         assert(s->active_disk->bs->drv && s->hidden_disk->bs->drv);
528         if (!s->active_disk->bs->drv->bdrv_make_empty ||
529             !s->hidden_disk->bs->drv->bdrv_make_empty) {
530             error_setg(errp,
531                        "Active disk or hidden disk doesn't support make_empty");
532             aio_context_release(aio_context);
533             return;
534         }
536         /* reopen the backing file in r/w mode */
537         reopen_backing_file(bs, true, &local_err);
538         if (local_err) {
539             error_propagate(errp, local_err);
540             aio_context_release(aio_context);
541             return;
542         }
544         /* start backup job now */
545         error_setg(&s->blocker,
546                    "Block device is in use by internal backup job");
548         top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
549         if (!top_bs || !bdrv_is_root_node(top_bs) ||
550             !check_top_bs(top_bs, bs)) {
551             error_setg(errp, "No top_bs or it is invalid");
552             reopen_backing_file(bs, false, NULL);
553             aio_context_release(aio_context);
554             return;
555         }
556         bdrv_op_block_all(top_bs, s->blocker);
557         bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
559         s->backup_job = backup_job_create(
560                                 NULL, s->secondary_disk->bs, s->hidden_disk->bs,
561                                 0, MIRROR_SYNC_MODE_NONE, NULL, 0, false, NULL,
562                                 &perf,
563                                 BLOCKDEV_ON_ERROR_REPORT,
564                                 BLOCKDEV_ON_ERROR_REPORT, JOB_INTERNAL,
565                                 backup_job_completed, bs, NULL, &local_err);
566         if (local_err) {
567             error_propagate(errp, local_err);
568             backup_job_cleanup(bs);
569             aio_context_release(aio_context);
570             return;
571         }
572         job_start(&s->backup_job->job);
573         break;
574     default:
575         aio_context_release(aio_context);
576         abort();
577     }
579     s->stage = BLOCK_REPLICATION_RUNNING;
581     if (s->mode == REPLICATION_MODE_SECONDARY) {
582         secondary_do_checkpoint(s, errp);
583     }
585     s->error = 0;
586     aio_context_release(aio_context);
589 static void replication_do_checkpoint(ReplicationState *rs, Error **errp)
591     BlockDriverState *bs = rs->opaque;
592     BDRVReplicationState *s;
593     AioContext *aio_context;
595     aio_context = bdrv_get_aio_context(bs);
596     aio_context_acquire(aio_context);
597     s = bs->opaque;
599     if (s->stage == BLOCK_REPLICATION_DONE ||
600         s->stage == BLOCK_REPLICATION_FAILOVER) {
601         /*
602          * This case happens when a secondary was promoted to primary.
603          * Ignore the request because the secondary side of replication
604          * doesn't have to do anything anymore.
605          */
606         aio_context_release(aio_context);
607         return;
608     }
610     if (s->mode == REPLICATION_MODE_SECONDARY) {
611         secondary_do_checkpoint(s, errp);
612     }
613     aio_context_release(aio_context);
616 static void replication_get_error(ReplicationState *rs, Error **errp)
618     BlockDriverState *bs = rs->opaque;
619     BDRVReplicationState *s;
620     AioContext *aio_context;
622     aio_context = bdrv_get_aio_context(bs);
623     aio_context_acquire(aio_context);
624     s = bs->opaque;
626     if (s->stage == BLOCK_REPLICATION_NONE) {
627         error_setg(errp, "Block replication is not running");
628         aio_context_release(aio_context);
629         return;
630     }
632     if (s->error) {
633         error_setg(errp, "I/O error occurred");
634         aio_context_release(aio_context);
635         return;
636     }
637     aio_context_release(aio_context);
640 static void replication_done(void *opaque, int ret)
642     BlockDriverState *bs = opaque;
643     BDRVReplicationState *s = bs->opaque;
645     if (ret == 0) {
646         s->stage = BLOCK_REPLICATION_DONE;
648         s->active_disk = NULL;
649         s->secondary_disk = NULL;
650         s->hidden_disk = NULL;
651         s->error = 0;
652     } else {
653         s->stage = BLOCK_REPLICATION_FAILOVER_FAILED;
654         s->error = -EIO;
655     }
658 static void replication_stop(ReplicationState *rs, bool failover, Error **errp)
660     BlockDriverState *bs = rs->opaque;
661     BDRVReplicationState *s;
662     AioContext *aio_context;
664     aio_context = bdrv_get_aio_context(bs);
665     aio_context_acquire(aio_context);
666     s = bs->opaque;
668     if (s->stage == BLOCK_REPLICATION_DONE ||
669         s->stage == BLOCK_REPLICATION_FAILOVER) {
670         /*
671          * This case happens when a secondary was promoted to primary.
672          * Ignore the request because the secondary side of replication
673          * doesn't have to do anything anymore.
674          */
675         aio_context_release(aio_context);
676         return;
677     }
679     if (s->stage != BLOCK_REPLICATION_RUNNING) {
680         error_setg(errp, "Block replication is not running");
681         aio_context_release(aio_context);
682         return;
683     }
685     switch (s->mode) {
686     case REPLICATION_MODE_PRIMARY:
687         s->stage = BLOCK_REPLICATION_DONE;
688         s->error = 0;
689         break;
690     case REPLICATION_MODE_SECONDARY:
691         /*
692          * This BDS will be closed, and the job should be completed
693          * before the BDS is closed, because we will access hidden
694          * disk, secondary disk in backup_job_completed().
695          */
696         if (s->backup_job) {
697             job_cancel_sync(&s->backup_job->job);
698         }
700         if (!failover) {
701             secondary_do_checkpoint(s, errp);
702             s->stage = BLOCK_REPLICATION_DONE;
703             aio_context_release(aio_context);
704             return;
705         }
707         s->stage = BLOCK_REPLICATION_FAILOVER;
708         s->commit_job = commit_active_start(
709                             NULL, s->active_disk->bs, s->secondary_disk->bs,
710                             JOB_INTERNAL, 0, BLOCKDEV_ON_ERROR_REPORT,
711                             NULL, replication_done, bs, true, errp);
712         break;
713     default:
714         aio_context_release(aio_context);
715         abort();
716     }
717     aio_context_release(aio_context);
720 static const char *const replication_strong_runtime_opts[] = {
721     REPLICATION_MODE,
722     REPLICATION_TOP_ID,
724     NULL
727 static BlockDriver bdrv_replication = {
728     .format_name                = "replication",
729     .instance_size              = sizeof(BDRVReplicationState),
731     .bdrv_open                  = replication_open,
732     .bdrv_close                 = replication_close,
733     .bdrv_child_perm            = replication_child_perm,
735     .bdrv_getlength             = replication_getlength,
736     .bdrv_co_readv              = replication_co_readv,
737     .bdrv_co_writev             = replication_co_writev,
739     .is_filter                  = true,
741     .has_variable_length        = true,
742     .strong_runtime_opts        = replication_strong_runtime_opts,
745 static void bdrv_replication_init(void)
747     bdrv_register(&bdrv_replication);
750 block_init(bdrv_replication_init);