2 * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
3 * (a.k.a. Fault Tolerance or Continuous Replication)
5 * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
6 * Copyright (c) 2016 FUJITSU LIMITED
7 * Copyright (c) 2016 Intel Corporation
9 * This work is licensed under the terms of the GNU GPL, version 2 or
10 * later. See the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include "sysemu/sysemu.h"
15 #include "qapi/error.h"
16 #include "qapi/qapi-commands-migration.h"
17 #include "qemu-file-channel.h"
18 #include "migration.h"
19 #include "qemu-file.h"
21 #include "migration/colo.h"
23 #include "io/channel-buffer.h"
25 #include "qemu/error-report.h"
26 #include "migration/failover.h"
27 #include "replication.h"
28 #include "net/colo-compare.h"
30 #include "block/block.h"
31 #include "qapi/qapi-events-migration.h"
32 #include "qapi/qmp/qerror.h"
33 #include "sysemu/cpus.h"
34 #include "net/filter.h"
/*
 * File-scope state used by the COLO code in this file:
 *
 * vmstate_loading - set to true right before qemu_load_device_state() is
 *     called in the COLO incoming thread and reset to false afterwards.
 *     secondary_vm_do_failover() tests it and, while it is set, switches the
 *     failover state to RELAUNCH instead of carrying out the failover.
 * packets_compare_notifier - notifier that colo_process_checkpoint()
 *     registers via colo_compare_register_notifier(); its callback is
 *     colo_compare_notify_checkpoint().
 * COLO_BUFFER_BASE_SIZE - size (4 MiB) handed to qio_channel_buffer_new()
 *     when the channel buffers for device state are created.
 */
36 static bool vmstate_loading
;
37 static Notifier packets_compare_notifier
;
39 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
41 bool migration_in_colo_state(void)
43 MigrationState
*s
= migrate_get_current();
45 return (s
->state
== MIGRATION_STATUS_COLO
);
48 bool migration_incoming_in_colo_state(void)
50 MigrationIncomingState
*mis
= migration_incoming_get_current();
52 return mis
&& (mis
->state
== MIGRATION_STATUS_COLO
);
55 static bool colo_runstate_is_stopped(void)
57 return runstate_check(RUN_STATE_COLO
) || !runstate_is_running();
/*
 * Failover handling for the Secondary VM.
 *
 * While device state is being loaded (vmstate_loading is set) the function
 * only switches the failover state from ACTIVE to RELAUNCH and reports an
 * error if the previous state was not ACTIVE.
 *
 * The remaining visible steps are, in order: move the incoming migration
 * state from COLO to COMPLETED, stop all replication, notify the net filters
 * of COLO_EVENT_FAILOVER, shut down from_src_file and to_src_file (when set)
 * so the COLO incoming thread does not stay blocked in recv/send, switch the
 * failover state from ACTIVE to COMPLETED, post colo_incoming_sem, and enter
 * the incoming coroutine if there is one.
 *
 * NOTE(review): the line numbers embedded in this listing have gaps (the
 * declaration of old_state and the guards/returns around several calls are
 * not shown), so the exact branching between these steps must be confirmed
 * against the complete source.
 */
60 static void secondary_vm_do_failover(void)
63 MigrationIncomingState
*mis
= migration_incoming_get_current();
64 Error
*local_err
= NULL
;
66 /* Can not do failover during the process of VM's loading VMstate, Or
67 * it will break the secondary VM.
69 if (vmstate_loading
) {
70 old_state
= failover_set_state(FAILOVER_STATUS_ACTIVE
,
71 FAILOVER_STATUS_RELAUNCH
);
72 if (old_state
!= FAILOVER_STATUS_ACTIVE
) {
73 error_report("Unknown error while do failover for secondary VM,"
74 "old_state: %s", FailoverStatus_str(old_state
));
79 migrate_set_state(&mis
->state
, MIGRATION_STATUS_COLO
,
80 MIGRATION_STATUS_COMPLETED
);
82 replication_stop_all(true, &local_err
);
84 error_report_err(local_err
);
87 /* Notify all filters of all NIC to do checkpoint */
88 colo_notify_filters_event(COLO_EVENT_FAILOVER
, &local_err
);
90 error_report_err(local_err
);
94 error_report("\"-S\" qemu option will be ignored in secondary side");
95 /* recover runstate to normal migration finish state */
99 * Make sure COLO incoming thread not block in recv or send,
100 * If mis->from_src_file and mis->to_src_file use the same fd,
101 * The second shutdown() will return -1, we ignore this value,
104 if (mis
->from_src_file
) {
105 qemu_file_shutdown(mis
->from_src_file
);
107 if (mis
->to_src_file
) {
108 qemu_file_shutdown(mis
->to_src_file
);
111 old_state
= failover_set_state(FAILOVER_STATUS_ACTIVE
,
112 FAILOVER_STATUS_COMPLETED
);
113 if (old_state
!= FAILOVER_STATUS_ACTIVE
) {
114 error_report("Incorrect state (%s) while doing failover for "
115 "secondary VM", FailoverStatus_str(old_state
));
118 /* Notify COLO incoming thread that failover work is finished */
119 qemu_sem_post(&mis
->colo_incoming_sem
);
120 /* For Secondary VM, jump to incoming co */
121 if (mis
->migration_incoming_co
) {
122 qemu_coroutine_enter(mis
->migration_incoming_co
);
/*
 * Failover handling for the Primary VM.
 *
 * Visible steps, in order: move the migration state from COLO to COMPLETED,
 * call colo_checkpoint_notify() to kick a COLO thread that may be waiting on
 * colo_checkpoint_sem, shut down to_dst_file and rp_state.from_dst_file
 * (when set) to wake a thread blocked in recv()/send(), switch the failover
 * state from ACTIVE to COMPLETED (reporting an error if the previous state
 * was not ACTIVE), stop all replication, and finally post colo_exit_sem to
 * tell the COLO thread that failover work is finished.
 *
 * NOTE(review): the embedded line numbers show gaps (for example the
 * declaration of old_state and the lines around error_report_err() are not
 * shown); confirm the exact guards and early returns against the complete
 * source.
 */
126 static void primary_vm_do_failover(void)
128 MigrationState
*s
= migrate_get_current();
130 Error
*local_err
= NULL
;
132 migrate_set_state(&s
->state
, MIGRATION_STATUS_COLO
,
133 MIGRATION_STATUS_COMPLETED
);
135 * kick COLO thread which might wait at
136 * qemu_sem_wait(&s->colo_checkpoint_sem).
138 colo_checkpoint_notify(migrate_get_current());
141 * Wake up COLO thread which may blocked in recv() or send(),
142 * The s->rp_state.from_dst_file and s->to_dst_file may use the
143 * same fd, but we still shutdown the fd for twice, it is harmless.
145 if (s
->to_dst_file
) {
146 qemu_file_shutdown(s
->to_dst_file
);
148 if (s
->rp_state
.from_dst_file
) {
149 qemu_file_shutdown(s
->rp_state
.from_dst_file
);
152 old_state
= failover_set_state(FAILOVER_STATUS_ACTIVE
,
153 FAILOVER_STATUS_COMPLETED
);
154 if (old_state
!= FAILOVER_STATUS_ACTIVE
) {
155 error_report("Incorrect state (%s) while doing failover for Primary VM",
156 FailoverStatus_str(old_state
));
160 replication_stop_all(true, &local_err
);
162 error_report_err(local_err
);
166 /* Notify COLO thread that failover work is finished */
167 qemu_sem_post(&s
->colo_exit_sem
);
170 COLOMode
get_colo_mode(void)
172 if (migration_in_colo_state()) {
173 return COLO_MODE_PRIMARY
;
174 } else if (migration_incoming_in_colo_state()) {
175 return COLO_MODE_SECONDARY
;
177 return COLO_MODE_NONE
;
181 void colo_do_failover(MigrationState
*s
)
183 /* Make sure VM stopped while failover happened. */
184 if (!colo_runstate_is_stopped()) {
185 vm_stop_force_state(RUN_STATE_COLO
);
188 if (get_colo_mode() == COLO_MODE_PRIMARY
) {
189 primary_vm_do_failover();
191 secondary_vm_do_failover();
/*
 * QMP handler that starts or stops replication.
 *
 * With CONFIG_REPLICATION defined, the replication mode is chosen from
 * 'primary' (REPLICATION_MODE_PRIMARY or REPLICATION_MODE_SECONDARY).
 * Supplying 'failover' together with enable is rejected with the error
 * "Parameter 'failover' is only for stopping replication". The visible calls
 * are replication_start_all(mode, errp) and
 * replication_stop_all(failover, ...), where the error pointer passed to the
 * stop call is NULL when failover is true and errp otherwise.
 *
 * NOTE(review): the rest of the parameter list and the conditions selecting
 * start versus stop are not shown in this listing (presumably keyed on
 * 'enable'); the non-CONFIG_REPLICATION branch is not shown either. Confirm
 * against the complete source.
 */
195 void qmp_xen_set_replication(bool enable
, bool primary
,
196 bool has_failover
, bool failover
,
199 #ifdef CONFIG_REPLICATION
200 ReplicationMode mode
= primary
?
201 REPLICATION_MODE_PRIMARY
:
202 REPLICATION_MODE_SECONDARY
;
204 if (has_failover
&& enable
) {
205 error_setg(errp
, "Parameter 'failover' is only for"
206 " stopping replication");
211 replication_start_all(mode
, errp
);
216 replication_stop_all(failover
, failover
? NULL
: errp
);
/*
 * QMP handler that reports the replication status.
 *
 * With CONFIG_REPLICATION defined, a zero-initialised ReplicationStatus is
 * allocated with g_new0(), replication_get_error_all() is queried, and the
 * 'desc' field receives a g_strdup() copy of the error's pretty message.
 *
 * NOTE(review): the declaration of 'err', the condition under which 'desc'
 * is filled in, any other fields that are set, and the return statement are
 * not shown in this listing; confirm against the complete source.
 */
223 ReplicationStatus
*qmp_query_xen_replication_status(Error
**errp
)
225 #ifdef CONFIG_REPLICATION
227 ReplicationStatus
*s
= g_new0(ReplicationStatus
, 1);
229 replication_get_error_all(&err
);
233 s
->desc
= g_strdup(error_get_pretty(err
));
/*
 * QMP handler that triggers a replication checkpoint.
 *
 * With CONFIG_REPLICATION defined this forwards to
 * replication_do_checkpoint_all(), which reports failures through errp.
 *
 * NOTE(review): the branch taken when CONFIG_REPLICATION is not defined is
 * not shown in this listing.
 */
245 void qmp_xen_colo_do_checkpoint(Error
**errp
)
247 #ifdef CONFIG_REPLICATION
248 replication_do_checkpoint_all(errp
);
/*
 * QMP handler that reports the COLO status.
 *
 * Allocates a zero-initialised COLOStatus, stores get_colo_mode() in 'mode',
 * and derives 'reason' from the failover state: FAILOVER_STATUS_NONE maps to
 * COLO_EXIT_REASON_NONE, FAILOVER_STATUS_REQUIRE maps to
 * COLO_EXIT_REASON_REQUEST, and one further path assigns
 * COLO_EXIT_REASON_ERROR.
 *
 * NOTE(review): the break statements, the label of the path that assigns
 * COLO_EXIT_REASON_ERROR (presumably 'default') and the return statement are
 * not shown in this listing; confirm against the complete source.
 */
254 COLOStatus
*qmp_query_colo_status(Error
**errp
)
256 COLOStatus
*s
= g_new0(COLOStatus
, 1);
258 s
->mode
= get_colo_mode();
260 switch (failover_get_state()) {
261 case FAILOVER_STATUS_NONE
:
262 s
->reason
= COLO_EXIT_REASON_NONE
;
264 case FAILOVER_STATUS_REQUIRE
:
265 s
->reason
= COLO_EXIT_REASON_REQUEST
;
268 s
->reason
= COLO_EXIT_REASON_ERROR
;
/*
 * Send a single COLO control message on a QEMUFile.
 *
 * A message value >= COLO_MESSAGE__MAX is rejected with an
 * "<function>: Invalid message" error. Otherwise the message is written as a
 * 32-bit big-endian integer; the file's error status is then read and
 * reported through errp as "Can't send COLO message". A trace point records
 * the message name.
 *
 * NOTE(review): the end of the parameter list, the declaration of 'ret', the
 * test applied to 'ret' and any flush between the write and the error check
 * are not shown in this listing; confirm against the complete source.
 */
274 static void colo_send_message(QEMUFile
*f
, COLOMessage msg
,
279 if (msg
>= COLO_MESSAGE__MAX
) {
280 error_setg(errp
, "%s: Invalid message", __func__
);
283 qemu_put_be32(f
, msg
);
286 ret
= qemu_file_get_error(f
);
288 error_setg_errno(errp
, -ret
, "Can't send COLO message");
290 trace_colo_send_message(COLOMessage_str(msg
));
/*
 * Send a COLO message followed by a 64-bit value.
 *
 * The message itself goes out through colo_send_message(); an error from
 * that call is propagated to errp. The value is then written as a 64-bit
 * big-endian integer and the file's error status is read; a failure is
 * reported as "Failed to send value for message:<name>".
 *
 * NOTE(review): the declaration of 'ret', the guards around
 * error_propagate()/error_setg_errno() and any flush are not shown in this
 * listing; confirm against the complete source.
 */
293 static void colo_send_message_value(QEMUFile
*f
, COLOMessage msg
,
294 uint64_t value
, Error
**errp
)
296 Error
*local_err
= NULL
;
299 colo_send_message(f
, msg
, &local_err
);
301 error_propagate(errp
, local_err
);
304 qemu_put_be64(f
, value
);
307 ret
= qemu_file_get_error(f
);
309 error_setg_errno(errp
, -ret
, "Failed to send value for message:%s",
310 COLOMessage_str(msg
));
/*
 * Receive one COLO control message from a QEMUFile.
 *
 * Reads a 32-bit big-endian integer, then reads the file's error status and
 * reports a failure through errp as "Can't receive COLO message". A value
 * >= COLO_MESSAGE__MAX is reported as "<function>: Invalid message". A trace
 * point records the name of the received message.
 *
 * NOTE(review): the declarations of 'msg' and 'ret', the test applied to
 * 'ret' and the return statements are not shown in this listing; presumably
 * the received message is the return value - confirm against the complete
 * source.
 */
314 static COLOMessage
colo_receive_message(QEMUFile
*f
, Error
**errp
)
319 msg
= qemu_get_be32(f
);
320 ret
= qemu_file_get_error(f
);
322 error_setg_errno(errp
, -ret
, "Can't receive COLO message");
325 if (msg
>= COLO_MESSAGE__MAX
) {
326 error_setg(errp
, "%s: Invalid message", __func__
);
329 trace_colo_receive_message(COLOMessage_str(msg
));
/*
 * Receive a COLO message and verify that it is the expected one.
 *
 * The message is read with colo_receive_message(); an error from that call
 * is propagated to errp. If the received message differs from expect_msg an
 * "Unexpected COLO message %d, expected %d" error is set.
 *
 * NOTE(review): the end of the parameter list, the declaration of 'msg', the
 * guard around error_propagate() and the arguments of the final error_setg()
 * are not shown in this listing; confirm against the complete source.
 */
333 static void colo_receive_check_message(QEMUFile
*f
, COLOMessage expect_msg
,
337 Error
*local_err
= NULL
;
339 msg
= colo_receive_message(f
, &local_err
);
341 error_propagate(errp
, local_err
);
344 if (msg
!= expect_msg
) {
345 error_setg(errp
, "Unexpected COLO message %d, expected %d",
/*
 * Receive an expected COLO message together with its 64-bit value.
 *
 * colo_receive_check_message() first verifies that the next message is
 * expect_msg (errors are propagated to errp). A 64-bit big-endian value is
 * then read and the file's error status is checked; a failure is reported as
 * "Failed to get value for COLO message: <name>".
 *
 * NOTE(review): the end of the parameter list, the declarations of 'value'
 * and 'ret', the guards around the error calls and the return statements are
 * not shown in this listing; presumably the value read is returned - confirm
 * against the complete source.
 */
350 static uint64_t colo_receive_message_value(QEMUFile
*f
, uint32_t expect_msg
,
353 Error
*local_err
= NULL
;
357 colo_receive_check_message(f
, expect_msg
, &local_err
);
359 error_propagate(errp
, local_err
);
363 value
= qemu_get_be64(f
);
364 ret
= qemu_file_get_error(f
);
366 error_setg_errno(errp
, -ret
, "Failed to get value for COLO message: %s",
367 COLOMessage_str(expect_msg
));
/*
 * Run one checkpoint transaction from the Primary side.
 *
 * Visible steps, in order:
 *  - send COLO_MESSAGE_CHECKPOINT_REQUEST on to_dst_file and wait for
 *    COLO_MESSAGE_CHECKPOINT_REPLY on rp_state.from_dst_file;
 *  - rewind the channel buffer 'bioc' to offset 0;
 *  - under the iothread lock, check the failover state and force the VM into
 *    RUN_STATE_COLO, then re-check the failover state after unlocking;
 *  - notify colo-compare of COLO_EVENT_CHECKPOINT;
 *  - disable block migration and, under the iothread lock, run
 *    replication_do_checkpoint_all();
 *  - send COLO_MESSAGE_VMSTATE_SEND, save the device state into 'fb' and send
 *    the live state with qemu_savevm_live_state();
 *  - send COLO_MESSAGE_VMSTATE_SIZE carrying bioc->usage, write the buffered
 *    data to to_dst_file, flush it and read the file's error status;
 *  - wait for COLO_MESSAGE_VMSTATE_RECEIVED and COLO_MESSAGE_VMSTATE_LOADED;
 *  - take the iothread lock again, then trace the "stop" -> "run" change;
 *  - report a pending local error with error_report_err().
 *
 * NOTE(review): many lines of this function are not shown in this listing
 * (the end of the parameter list including 'fb', the declaration of 'ret',
 * the error checks and jumps after each step, several lock acquisitions that
 * pair with the visible unlocks, the call that restarts the VM, and the
 * return statement). Confirm the exact error paths and locking against the
 * complete source before changing anything here.
 */
372 static int colo_do_checkpoint_transaction(MigrationState
*s
,
373 QIOChannelBuffer
*bioc
,
376 Error
*local_err
= NULL
;
379 colo_send_message(s
->to_dst_file
, COLO_MESSAGE_CHECKPOINT_REQUEST
,
385 colo_receive_check_message(s
->rp_state
.from_dst_file
,
386 COLO_MESSAGE_CHECKPOINT_REPLY
, &local_err
);
390 /* Reset channel-buffer directly */
391 qio_channel_io_seek(QIO_CHANNEL(bioc
), 0, 0, NULL
);
394 qemu_mutex_lock_iothread();
395 if (failover_get_state() != FAILOVER_STATUS_NONE
) {
396 qemu_mutex_unlock_iothread();
399 vm_stop_force_state(RUN_STATE_COLO
);
400 qemu_mutex_unlock_iothread();
401 trace_colo_vm_state_change("run", "stop");
403 * Failover request bh could be called after vm_stop_force_state(),
404 * So we need check failover_request_is_active() again.
406 if (failover_get_state() != FAILOVER_STATUS_NONE
) {
410 colo_notify_compares_event(NULL
, COLO_EVENT_CHECKPOINT
, &local_err
);
415 /* Disable block migration */
416 migrate_set_block_enabled(false, &local_err
);
417 qemu_mutex_lock_iothread();
418 replication_do_checkpoint_all(&local_err
);
420 qemu_mutex_unlock_iothread();
424 colo_send_message(s
->to_dst_file
, COLO_MESSAGE_VMSTATE_SEND
, &local_err
);
426 qemu_mutex_unlock_iothread();
429 /* Note: device state is saved into buffer */
430 ret
= qemu_save_device_state(fb
);
432 qemu_mutex_unlock_iothread();
437 * Only save VM's live state, which not including device state.
438 * TODO: We may need a timeout mechanism to prevent COLO process
439 * to be blocked here.
441 qemu_savevm_live_state(s
->to_dst_file
);
446 * We need the size of the VMstate data in Secondary side,
447 * With which we can decide how much data should be read.
449 colo_send_message_value(s
->to_dst_file
, COLO_MESSAGE_VMSTATE_SIZE
,
450 bioc
->usage
, &local_err
);
455 qemu_put_buffer(s
->to_dst_file
, bioc
->data
, bioc
->usage
);
456 qemu_fflush(s
->to_dst_file
);
457 ret
= qemu_file_get_error(s
->to_dst_file
);
462 colo_receive_check_message(s
->rp_state
.from_dst_file
,
463 COLO_MESSAGE_VMSTATE_RECEIVED
, &local_err
);
468 colo_receive_check_message(s
->rp_state
.from_dst_file
,
469 COLO_MESSAGE_VMSTATE_LOADED
, &local_err
);
476 qemu_mutex_lock_iothread();
478 qemu_mutex_unlock_iothread();
479 trace_colo_vm_state_change("stop", "run");
483 error_report_err(local_err
);
488 static void colo_compare_notify_checkpoint(Notifier
*notifier
, void *data
)
490 colo_checkpoint_notify(data
);
/*
 * Main loop of the COLO thread on the Primary side.
 *
 * Visible steps, in order:
 *  - reset the failover state and open the return path
 *    (rp_state.from_dst_file) from to_dst_file, reporting an error if that
 *    fails;
 *  - install colo_compare_notify_checkpoint() as the callback of
 *    packets_compare_notifier and register it with colo-compare;
 *  - wait for COLO_MESSAGE_CHECKPOINT_READY from the Secondary;
 *  - create a channel buffer of COLO_BUFFER_BASE_SIZE and an output QEMUFile
 *    ('fb') on top of it;
 *  - start replication in primary mode under the iothread lock;
 *  - arm colo_delay_timer for current_time + x_checkpoint_delay;
 *  - while the migration state is COLO: report a failover request if the
 *    failover state is no longer NONE, wait on colo_checkpoint_sem, re-check
 *    the migration state and run colo_do_checkpoint_transaction();
 *  - after the loop: report any pending error, emit the COLO exit event
 *    (reason ERROR for failover state NONE, REQUEST for REQUIRE), wait on and
 *    destroy colo_exit_sem, unregister the notifier, delete and free the
 *    delay timer, destroy colo_checkpoint_sem, and close
 *    rp_state.from_dst_file if it is set.
 *
 * NOTE(review): many lines are not shown in this listing (declarations of
 * 'fb' and 'ret', the error checks and jumps after each step, how the loop
 * is left, remaining switch cases, release of 'fb'). Confirm the exact
 * control flow against the complete source.
 */
493 static void colo_process_checkpoint(MigrationState
*s
)
495 QIOChannelBuffer
*bioc
;
497 int64_t current_time
= qemu_clock_get_ms(QEMU_CLOCK_HOST
);
498 Error
*local_err
= NULL
;
501 failover_init_state();
503 s
->rp_state
.from_dst_file
= qemu_file_get_return_path(s
->to_dst_file
);
504 if (!s
->rp_state
.from_dst_file
) {
505 error_report("Open QEMUFile from_dst_file failed");
509 packets_compare_notifier
.notify
= colo_compare_notify_checkpoint
;
510 colo_compare_register_notifier(&packets_compare_notifier
);
513 * Wait for Secondary finish loading VM states and enter COLO
516 colo_receive_check_message(s
->rp_state
.from_dst_file
,
517 COLO_MESSAGE_CHECKPOINT_READY
, &local_err
);
521 bioc
= qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE
);
522 fb
= qemu_fopen_channel_output(QIO_CHANNEL(bioc
));
523 object_unref(OBJECT(bioc
));
525 qemu_mutex_lock_iothread();
526 replication_start_all(REPLICATION_MODE_PRIMARY
, &local_err
);
528 qemu_mutex_unlock_iothread();
533 qemu_mutex_unlock_iothread();
534 trace_colo_vm_state_change("stop", "run");
536 timer_mod(s
->colo_delay_timer
,
537 current_time
+ s
->parameters
.x_checkpoint_delay
);
539 while (s
->state
== MIGRATION_STATUS_COLO
) {
540 if (failover_get_state() != FAILOVER_STATUS_NONE
) {
541 error_report("failover request");
545 qemu_sem_wait(&s
->colo_checkpoint_sem
);
547 if (s
->state
!= MIGRATION_STATUS_COLO
) {
550 ret
= colo_do_checkpoint_transaction(s
, bioc
, fb
);
557 /* Throw the unreported error message after exited from loop */
559 error_report_err(local_err
);
567 * There are only two reasons we can get here, some error happened
568 * or the user triggered failover.
570 switch (failover_get_state()) {
571 case FAILOVER_STATUS_NONE
:
572 qapi_event_send_colo_exit(COLO_MODE_PRIMARY
,
573 COLO_EXIT_REASON_ERROR
);
575 case FAILOVER_STATUS_REQUIRE
:
576 qapi_event_send_colo_exit(COLO_MODE_PRIMARY
,
577 COLO_EXIT_REASON_REQUEST
);
583 /* Hope this not to be too long to wait here */
584 qemu_sem_wait(&s
->colo_exit_sem
);
585 qemu_sem_destroy(&s
->colo_exit_sem
);
588 * It is safe to unregister notifier after failover finished.
589 * Besides, colo_delay_timer and colo_checkpoint_sem can't be
590 * released befor unregister notifier, or there will be use-after-free
593 colo_compare_unregister_notifier(&packets_compare_notifier
);
594 timer_del(s
->colo_delay_timer
);
595 timer_free(s
->colo_delay_timer
);
596 qemu_sem_destroy(&s
->colo_checkpoint_sem
);
599 * Must be called after failover BH is completed,
600 * Or the failover BH may shutdown the wrong fd that
601 * re-used by other threads after we release here.
603 if (s
->rp_state
.from_dst_file
) {
604 qemu_fclose(s
->rp_state
.from_dst_file
);
608 void colo_checkpoint_notify(void *opaque
)
610 MigrationState
*s
= opaque
;
611 int64_t next_notify_time
;
613 qemu_sem_post(&s
->colo_checkpoint_sem
);
614 s
->colo_checkpoint_time
= qemu_clock_get_ms(QEMU_CLOCK_HOST
);
615 next_notify_time
= s
->colo_checkpoint_time
+
616 s
->parameters
.x_checkpoint_delay
;
617 timer_mod(s
->colo_delay_timer
, next_notify_time
);
620 void migrate_start_colo_process(MigrationState
*s
)
622 qemu_mutex_unlock_iothread();
623 qemu_sem_init(&s
->colo_checkpoint_sem
, 0);
624 s
->colo_delay_timer
= timer_new_ms(QEMU_CLOCK_HOST
,
625 colo_checkpoint_notify
, s
);
627 qemu_sem_init(&s
->colo_exit_sem
, 0);
628 migrate_set_state(&s
->state
, MIGRATION_STATUS_ACTIVE
,
629 MIGRATION_STATUS_COLO
);
630 colo_process_checkpoint(s
);
631 qemu_mutex_lock_iothread();
/*
 * Wait for the next COLO message on the Secondary side and classify it.
 *
 * The message is read with colo_receive_message(); an error from that call
 * is propagated to errp. COLO_MESSAGE_CHECKPOINT_REQUEST sets
 * *checkpoint_request to 1; another visible path sets it to 0 and reports
 * "Got unknown COLO message: %d".
 *
 * NOTE(review): the end of the parameter list, the declaration of 'msg', the
 * switch statement header, the break statements and the label of the
 * "unknown message" path (presumably 'default') are not shown in this
 * listing; confirm against the complete source.
 */
634 static void colo_wait_handle_message(QEMUFile
*f
, int *checkpoint_request
,
638 Error
*local_err
= NULL
;
640 msg
= colo_receive_message(f
, &local_err
);
642 error_propagate(errp
, local_err
);
647 case COLO_MESSAGE_CHECKPOINT_REQUEST
:
648 *checkpoint_request
= 1;
651 *checkpoint_request
= 0;
652 error_setg(errp
, "Got unknown COLO message: %d", msg
);
/*
 * Thread function of the COLO incoming thread on the Secondary side.
 *
 * @opaque: the MigrationIncomingState.
 *
 * Visible steps, in order:
 *  - register the thread with RCU, initialise colo_incoming_sem, switch the
 *    incoming migration state from ACTIVE to COLO and reset the failover
 *    state;
 *  - open the return path to_src_file from from_src_file and switch
 *    from_src_file to blocking mode;
 *  - create a channel buffer of COLO_BUFFER_BASE_SIZE with an input QEMUFile
 *    ('fb') on top of it, used to cache incoming device state;
 *  - start replication in secondary mode under the iothread lock and send
 *    COLO_MESSAGE_CHECKPOINT_READY;
 *  - while the state is COLO: wait for a message, report a failover request
 *    if the failover state is no longer NONE, stop the VM, send
 *    COLO_MESSAGE_CHECKPOINT_REPLY, expect COLO_MESSAGE_VMSTATE_SEND, load
 *    the live state with qemu_loadvm_state_main(), receive the device-state
 *    size (COLO_MESSAGE_VMSTATE_SIZE), grow the buffer with g_realloc() if
 *    the size exceeds its capacity, read the data into it, send
 *    COLO_MESSAGE_VMSTATE_RECEIVED, load the device state with
 *    vmstate_loading set, query replication errors, run the replication
 *    checkpoint, notify the net filters of COLO_EVENT_CHECKPOINT, clear
 *    vmstate_loading, re-activate a failover request that arrived while
 *    loading (state RELAUNCH), and send COLO_MESSAGE_VMSTATE_LOADED;
 *  - after the loop: clear vmstate_loading, report any pending error, emit
 *    the COLO exit event (reason ERROR for failover state NONE, REQUEST for
 *    REQUIRE), wait on and destroy colo_incoming_sem, close to_src_file if
 *    set, call migration_incoming_disable_colo() and unregister from RCU.
 *
 * NOTE(review): many lines are not shown in this listing (declarations of
 * 'fb', 'ret', 'value', 'total_size' and 'request', the error checks and
 * jumps after each step, several lock acquisitions that pair with the
 * visible unlocks, the calls that restart the VM, and the return statement).
 * Confirm the exact error paths and locking against the complete source
 * before changing anything here.
 */
657 void *colo_process_incoming_thread(void *opaque
)
659 MigrationIncomingState
*mis
= opaque
;
661 QIOChannelBuffer
*bioc
= NULL
; /* Cache incoming device state */
664 Error
*local_err
= NULL
;
667 rcu_register_thread();
668 qemu_sem_init(&mis
->colo_incoming_sem
, 0);
670 migrate_set_state(&mis
->state
, MIGRATION_STATUS_ACTIVE
,
671 MIGRATION_STATUS_COLO
);
673 failover_init_state();
675 mis
->to_src_file
= qemu_file_get_return_path(mis
->from_src_file
);
676 if (!mis
->to_src_file
) {
677 error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
681 * Note: the communication between Primary side and Secondary side
682 * should be sequential, we set the fd to unblocked in migration incoming
683 * coroutine, and here we are in the COLO incoming thread, so it is ok to
684 * set the fd back to blocked.
686 qemu_file_set_blocking(mis
->from_src_file
, true);
688 bioc
= qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE
);
689 fb
= qemu_fopen_channel_input(QIO_CHANNEL(bioc
));
690 object_unref(OBJECT(bioc
));
692 qemu_mutex_lock_iothread();
693 replication_start_all(REPLICATION_MODE_SECONDARY
, &local_err
);
695 qemu_mutex_unlock_iothread();
699 trace_colo_vm_state_change("stop", "run");
700 qemu_mutex_unlock_iothread();
702 colo_send_message(mis
->to_src_file
, COLO_MESSAGE_CHECKPOINT_READY
,
708 while (mis
->state
== MIGRATION_STATUS_COLO
) {
711 colo_wait_handle_message(mis
->from_src_file
, &request
, &local_err
);
716 if (failover_get_state() != FAILOVER_STATUS_NONE
) {
717 error_report("failover request");
721 qemu_mutex_lock_iothread();
722 vm_stop_force_state(RUN_STATE_COLO
);
723 trace_colo_vm_state_change("run", "stop");
724 qemu_mutex_unlock_iothread();
726 /* FIXME: This is unnecessary for periodic checkpoint mode */
727 colo_send_message(mis
->to_src_file
, COLO_MESSAGE_CHECKPOINT_REPLY
,
733 colo_receive_check_message(mis
->from_src_file
,
734 COLO_MESSAGE_VMSTATE_SEND
, &local_err
);
739 qemu_mutex_lock_iothread();
740 cpu_synchronize_all_pre_loadvm();
741 ret
= qemu_loadvm_state_main(mis
->from_src_file
, mis
);
742 qemu_mutex_unlock_iothread();
745 error_report("Load VM's live state (ram) error");
749 value
= colo_receive_message_value(mis
->from_src_file
,
750 COLO_MESSAGE_VMSTATE_SIZE
, &local_err
);
756 * Read VM device state data into channel buffer,
757 * It's better to re-use the memory allocated.
758 * Here we need to handle the channel buffer directly.
760 if (value
> bioc
->capacity
) {
761 bioc
->capacity
= value
;
762 bioc
->data
= g_realloc(bioc
->data
, bioc
->capacity
);
764 total_size
= qemu_get_buffer(mis
->from_src_file
, bioc
->data
, value
);
765 if (total_size
!= value
) {
766 error_report("Got %" PRIu64
" VMState data, less than expected"
767 " %" PRIu64
, total_size
, value
);
770 bioc
->usage
= total_size
;
771 qio_channel_io_seek(QIO_CHANNEL(bioc
), 0, 0, NULL
);
773 colo_send_message(mis
->to_src_file
, COLO_MESSAGE_VMSTATE_RECEIVED
,
779 qemu_mutex_lock_iothread();
780 vmstate_loading
= true;
781 ret
= qemu_load_device_state(fb
);
783 error_report("COLO: load device state failed");
784 qemu_mutex_unlock_iothread();
788 replication_get_error_all(&local_err
);
790 qemu_mutex_unlock_iothread();
793 /* discard colo disk buffer */
794 replication_do_checkpoint_all(&local_err
);
796 qemu_mutex_unlock_iothread();
800 /* Notify all filters of all NIC to do checkpoint */
801 colo_notify_filters_event(COLO_EVENT_CHECKPOINT
, &local_err
);
804 qemu_mutex_unlock_iothread();
808 vmstate_loading
= false;
810 trace_colo_vm_state_change("stop", "run");
811 qemu_mutex_unlock_iothread();
813 if (failover_get_state() == FAILOVER_STATUS_RELAUNCH
) {
814 failover_set_state(FAILOVER_STATUS_RELAUNCH
,
815 FAILOVER_STATUS_NONE
);
816 failover_request_active(NULL
);
820 colo_send_message(mis
->to_src_file
, COLO_MESSAGE_VMSTATE_LOADED
,
828 vmstate_loading
= false;
829 /* Throw the unreported error message after exited from loop */
831 error_report_err(local_err
);
834 switch (failover_get_state()) {
835 case FAILOVER_STATUS_NONE
:
836 qapi_event_send_colo_exit(COLO_MODE_SECONDARY
,
837 COLO_EXIT_REASON_ERROR
);
839 case FAILOVER_STATUS_REQUIRE
:
840 qapi_event_send_colo_exit(COLO_MODE_SECONDARY
,
841 COLO_EXIT_REASON_REQUEST
);
851 /* Hope this not to be too long to loop here */
852 qemu_sem_wait(&mis
->colo_incoming_sem
);
853 qemu_sem_destroy(&mis
->colo_incoming_sem
);
854 /* Must be called after failover BH is completed */
855 if (mis
->to_src_file
) {
856 qemu_fclose(mis
->to_src_file
);
858 migration_incoming_disable_colo();
860 rcu_unregister_thread();