migration/colo.c

   1 /*
   2  * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
   3  * (a.k.a. Fault Tolerance or Continuous Replication)
   4  *
   5  * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
   6  * Copyright (c) 2016 FUJITSU LIMITED
   7  * Copyright (c) 2016 Intel Corporation
   8  *
   9  * This work is licensed under the terms of the GNU GPL, version 2 or
  10  * later.  See the COPYING file in the top-level directory.
  11  */
  12
  13 #include "qemu/osdep.h"
  14 #include "sysemu/sysemu.h"
  15 #include "qapi/error.h"
  16 #include "qapi/qapi-commands-migration.h"
  17 #include "qemu-file-channel.h"
  18 #include "migration.h"
  19 #include "qemu-file.h"
  20 #include "savevm.h"
  21 #include "migration/colo.h"
  22 #include "block.h"
  23 #include "io/channel-buffer.h"
  24 #include "trace.h"
  25 #include "qemu/error-report.h"
  26 #include "qemu/main-loop.h"
  27 #include "qemu/rcu.h"
  28 #include "migration/failover.h"
  29 #ifdef CONFIG_REPLICATION
  30 #include "replication.h"
  31 #endif
  32 #include "net/colo-compare.h"
  33 #include "net/colo.h"
  34 #include "block/block.h"
  35 #include "qapi/qapi-events-migration.h"
  36 #include "qapi/qmp/qerror.h"
  37 #include "sysemu/cpus.h"
  38 #include "sysemu/runstate.h"
  39 #include "net/filter.h"
  40
  41 static bool vmstate_loading;
  42 static Notifier packets_compare_notifier;
  43
  44 /* User need to know colo mode after COLO failover */
  45 static COLOMode last_colo_mode;
  46
  47 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
  48
  49 bool migration_in_colo_state(void)
  50 {
  51     MigrationState *s = migrate_get_current();
  52
  53     return (s->state == MIGRATION_STATUS_COLO);
  54 }
  55
  56 bool migration_incoming_in_colo_state(void)
  57 {
  58     MigrationIncomingState *mis = migration_incoming_get_current();
  59
  60     return mis && (mis->state == MIGRATION_STATUS_COLO);
  61 }
  62
  63 static bool colo_runstate_is_stopped(void)
  64 {
  65     return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
  66 }
  67
  68 static void secondary_vm_do_failover(void)
  69 {
  70 /* COLO needs enable block-replication */
  71 #ifdef CONFIG_REPLICATION
  72     int old_state;
  73     MigrationIncomingState *mis = migration_incoming_get_current();
  74     Error *local_err = NULL;
  75
  76     /* Can not do failover during the process of VM's loading VMstate, Or
  77      * it will break the secondary VM.
  78      */
  79     if (vmstate_loading) {
  80         old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
  81                         FAILOVER_STATUS_RELAUNCH);
  82         if (old_state != FAILOVER_STATUS_ACTIVE) {
  83             error_report("Unknown error while do failover for secondary VM,"
  84                          "old_state: %s", FailoverStatus_str(old_state));
  85         }
  86         return;
  87     }
  88
  89     migrate_set_state(&mis->state, MIGRATION_STATUS_COLO,
  90                       MIGRATION_STATUS_COMPLETED);
  91
  92     replication_stop_all(true, &local_err);
  93     if (local_err) {
  94         error_report_err(local_err);
  95     }
  96
  97     /* Notify all filters of all NIC to do checkpoint */
  98     colo_notify_filters_event(COLO_EVENT_FAILOVER, &local_err);
  99     if (local_err) {
 100         error_report_err(local_err);
 101     }
 102
 103     if (!autostart) {
 104         error_report("\"-S\" qemu option will be ignored in secondary side");
 105         /* recover runstate to normal migration finish state */
 106         autostart = true;
 107     }
 108     /*
 109      * Make sure COLO incoming thread not block in recv or send,
 110      * If mis->from_src_file and mis->to_src_file use the same fd,
 111      * The second shutdown() will return -1, we ignore this value,
 112      * It is harmless.
 113      */
 114     if (mis->from_src_file) {
 115         qemu_file_shutdown(mis->from_src_file);
 116     }
 117     if (mis->to_src_file) {
 118         qemu_file_shutdown(mis->to_src_file);
 119     }
 120
 121     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
 122                                    FAILOVER_STATUS_COMPLETED);
 123     if (old_state != FAILOVER_STATUS_ACTIVE) {
 124         error_report("Incorrect state (%s) while doing failover for "
 125                      "secondary VM", FailoverStatus_str(old_state));
 126         return;
 127     }
 128     /* Notify COLO incoming thread that failover work is finished */
 129     qemu_sem_post(&mis->colo_incoming_sem);
 130
 131     /* For Secondary VM, jump to incoming co */
 132     if (mis->migration_incoming_co) {
 133         qemu_coroutine_enter(mis->migration_incoming_co);
 134     }
 135 #else
 136     abort();
 137 #endif
 138 }
 139
 140 static void primary_vm_do_failover(void)
 141 {
 142 #ifdef CONFIG_REPLICATION
 143     MigrationState *s = migrate_get_current();
 144     int old_state;
 145     Error *local_err = NULL;
 146
 147     migrate_set_state(&s->state, MIGRATION_STATUS_COLO,
 148                       MIGRATION_STATUS_COMPLETED);
 149     /*
 150      * kick COLO thread which might wait at
 151      * qemu_sem_wait(&s->colo_checkpoint_sem).
 152      */
 153     colo_checkpoint_notify(migrate_get_current());
 154
 155     /*
 156      * Wake up COLO thread which may blocked in recv() or send(),
 157      * The s->rp_state.from_dst_file and s->to_dst_file may use the
 158      * same fd, but we still shutdown the fd for twice, it is harmless.
 159      */
 160     if (s->to_dst_file) {
 161         qemu_file_shutdown(s->to_dst_file);
 162     }
 163     if (s->rp_state.from_dst_file) {
 164         qemu_file_shutdown(s->rp_state.from_dst_file);
 165     }
 166
 167     old_state = failover_set_state(FAILOVER_STATUS_ACTIVE,
 168                                    FAILOVER_STATUS_COMPLETED);
 169     if (old_state != FAILOVER_STATUS_ACTIVE) {
 170         error_report("Incorrect state (%s) while doing failover for Primary VM",
 171                      FailoverStatus_str(old_state));
 172         return;
 173     }
 174
 175     replication_stop_all(true, &local_err);
 176     if (local_err) {
 177         error_report_err(local_err);
 178         local_err = NULL;
 179     }
 180
 181     /* Notify COLO thread that failover work is finished */
 182     qemu_sem_post(&s->colo_exit_sem);
 183 #else
 184     abort();
 185 #endif
 186 }
 187
 188 COLOMode get_colo_mode(void)
 189 {
 190     if (migration_in_colo_state()) {
 191         return COLO_MODE_PRIMARY;
 192     } else if (migration_incoming_in_colo_state()) {
 193         return COLO_MODE_SECONDARY;
 194     } else {
 195         return COLO_MODE_NONE;
 196     }
 197 }
 198
 199 void colo_do_failover(void)
 200 {
 201     /* Make sure VM stopped while failover happened. */
 202     if (!colo_runstate_is_stopped()) {
 203         vm_stop_force_state(RUN_STATE_COLO);
 204     }
 205
 206     switch (get_colo_mode()) {
 207     case COLO_MODE_PRIMARY:
 208         primary_vm_do_failover();
 209         break;
 210     case COLO_MODE_SECONDARY:
 211         secondary_vm_do_failover();
 212         break;
 213     default:
 214         error_report("colo_do_failover failed because the colo mode"
 215                      " could not be obtained");
 216     }
 217 }
 218
 219 #ifdef CONFIG_REPLICATION
 220 void qmp_xen_set_replication(bool enable, bool primary,
 221                              bool has_failover, bool failover,
 222                              Error **errp)
 223 {
 224     ReplicationMode mode = primary ?
 225                            REPLICATION_MODE_PRIMARY :
 226                            REPLICATION_MODE_SECONDARY;
 227
 228     if (has_failover && enable) {
 229         error_setg(errp, "Parameter 'failover' is only for"
 230                    " stopping replication");
 231         return;
 232     }
 233
 234     if (enable) {
 235         replication_start_all(mode, errp);
 236     } else {
 237         if (!has_failover) {
 238             failover = NULL;
 239         }
 240         replication_stop_all(failover, failover ? NULL : errp);
 241     }
 242 }
 243
 244 ReplicationStatus *qmp_query_xen_replication_status(Error **errp)
 245 {
 246     Error *err = NULL;
 247     ReplicationStatus *s = g_new0(ReplicationStatus, 1);
 248
 249     replication_get_error_all(&err);
 250     if (err) {
 251         s->error = true;
 252         s->has_desc = true;
 253         s->desc = g_strdup(error_get_pretty(err));
 254     } else {
 255         s->error = false;
 256     }
 257
 258     error_free(err);
 259     return s;
 260 }
 261
 262 void qmp_xen_colo_do_checkpoint(Error **errp)
 263 {
 264     replication_do_checkpoint_all(errp);
 265     /* Notify all filters of all NIC to do checkpoint */
 266     colo_notify_filters_event(COLO_EVENT_CHECKPOINT, errp);
 267 }
 268 #endif
 269
 270 COLOStatus *qmp_query_colo_status(Error **errp)
 271 {
 272     COLOStatus *s = g_new0(COLOStatus, 1);
 273
 274     s->mode = get_colo_mode();
 275     s->last_mode = last_colo_mode;
 276
 277     switch (failover_get_state()) {
 278     case FAILOVER_STATUS_NONE:
 279         s->reason = COLO_EXIT_REASON_NONE;
 280         break;
 281     case FAILOVER_STATUS_COMPLETED:
 282         s->reason = COLO_EXIT_REASON_REQUEST;
 283         break;
 284     default:
 285         if (migration_in_colo_state()) {
 286             s->reason = COLO_EXIT_REASON_PROCESSING;
 287         } else {
 288             s->reason = COLO_EXIT_REASON_ERROR;
 289         }
 290     }
 291
 292     return s;
 293 }
 294
 295 static void colo_send_message(QEMUFile *f, COLOMessage msg,
 296                               Error **errp)
 297 {
 298     int ret;
 299
 300     if (msg >= COLO_MESSAGE__MAX) {
 301         error_setg(errp, "%s: Invalid message", __func__);
 302         return;
 303     }
 304     qemu_put_be32(f, msg);
 305     qemu_fflush(f);
 306
 307     ret = qemu_file_get_error(f);
 308     if (ret < 0) {
 309         error_setg_errno(errp, -ret, "Can't send COLO message");
 310     }
 311     trace_colo_send_message(COLOMessage_str(msg));
 312 }
 313
 314 static void colo_send_message_value(QEMUFile *f, COLOMessage msg,
 315                                     uint64_t value, Error **errp)
 316 {
 317     Error *local_err = NULL;
 318     int ret;
 319
 320     colo_send_message(f, msg, &local_err);
 321     if (local_err) {
 322         error_propagate(errp, local_err);
 323         return;
 324     }
 325     qemu_put_be64(f, value);
 326     qemu_fflush(f);
 327
 328     ret = qemu_file_get_error(f);
 329     if (ret < 0) {
 330         error_setg_errno(errp, -ret, "Failed to send value for message:%s",
 331                          COLOMessage_str(msg));
 332     }
 333 }
 334
 335 static COLOMessage colo_receive_message(QEMUFile *f, Error **errp)
 336 {
 337     COLOMessage msg;
 338     int ret;
 339
 340     msg = qemu_get_be32(f);
 341     ret = qemu_file_get_error(f);
 342     if (ret < 0) {
 343         error_setg_errno(errp, -ret, "Can't receive COLO message");
 344         return msg;
 345     }
 346     if (msg >= COLO_MESSAGE__MAX) {
 347         error_setg(errp, "%s: Invalid message", __func__);
 348         return msg;
 349     }
 350     trace_colo_receive_message(COLOMessage_str(msg));
 351     return msg;
 352 }
 353
 354 static void colo_receive_check_message(QEMUFile *f, COLOMessage expect_msg,
 355                                        Error **errp)
 356 {
 357     COLOMessage msg;
 358     Error *local_err = NULL;
 359
 360     msg = colo_receive_message(f, &local_err);
 361     if (local_err) {
 362         error_propagate(errp, local_err);
 363         return;
 364     }
 365     if (msg != expect_msg) {
 366         error_setg(errp, "Unexpected COLO message %d, expected %d",
 367                           msg, expect_msg);
 368     }
 369 }
 370
 371 static uint64_t colo_receive_message_value(QEMUFile *f, uint32_t expect_msg,
 372                                            Error **errp)
 373 {
 374     Error *local_err = NULL;
 375     uint64_t value;
 376     int ret;
 377
 378     colo_receive_check_message(f, expect_msg, &local_err);
 379     if (local_err) {
 380         error_propagate(errp, local_err);
 381         return 0;
 382     }
 383
 384     value = qemu_get_be64(f);
 385     ret = qemu_file_get_error(f);
 386     if (ret < 0) {
 387         error_setg_errno(errp, -ret, "Failed to get value for COLO message: %s",
 388                          COLOMessage_str(expect_msg));
 389     }
 390     return value;
 391 }
 392
 393 static int colo_do_checkpoint_transaction(MigrationState *s,
 394                                           QIOChannelBuffer *bioc,
 395                                           QEMUFile *fb)
 396 {
 397     Error *local_err = NULL;
 398     int ret = -1;
 399
 400     colo_send_message(s->to_dst_file, COLO_MESSAGE_CHECKPOINT_REQUEST,
 401                       &local_err);
 402     if (local_err) {
 403         goto out;
 404     }
 405
 406     colo_receive_check_message(s->rp_state.from_dst_file,
 407                     COLO_MESSAGE_CHECKPOINT_REPLY, &local_err);
 408     if (local_err) {
 409         goto out;
 410     }
 411     /* Reset channel-buffer directly */
 412     qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 413     bioc->usage = 0;
 414
 415     qemu_mutex_lock_iothread();
 416     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 417         qemu_mutex_unlock_iothread();
 418         goto out;
 419     }
 420     vm_stop_force_state(RUN_STATE_COLO);
 421     qemu_mutex_unlock_iothread();
 422     trace_colo_vm_state_change("run", "stop");
 423     /*
 424      * Failover request bh could be called after vm_stop_force_state(),
 425      * So we need check failover_request_is_active() again.
 426      */
 427     if (failover_get_state() != FAILOVER_STATUS_NONE) {
 428         goto out;
 429     }
 430
 431     colo_notify_compares_event(NULL, COLO_EVENT_CHECKPOINT, &local_err);
 432     if (local_err) {
 433         goto out;
 434     }
 435
 436     /* Disable block migration */
 437     migrate_set_block_enabled(false, &local_err);
 438     qemu_mutex_lock_iothread();
 439
 440 #ifdef CONFIG_REPLICATION
 441     replication_do_checkpoint_all(&local_err);
 442     if (local_err) {
 443         qemu_mutex_unlock_iothread();
 444         goto out;
 445     }
 446 #else
 447         abort();
 448 #endif
 449
 450     colo_send_message(s->to_dst_file, COLO_MESSAGE_VMSTATE_SEND, &local_err);
 451     if (local_err) {
 452         qemu_mutex_unlock_iothread();
 453         goto out;
 454     }
 455     /* Note: device state is saved into buffer */
 456     ret = qemu_save_device_state(fb);
 457
 458     qemu_mutex_unlock_iothread();
 459     if (ret < 0) {
 460         goto out;
 461     }
 462     /*
 463      * Only save VM's live state, which not including device state.
 464      * TODO: We may need a timeout mechanism to prevent COLO process
 465      * to be blocked here.
 466      */
 467     qemu_savevm_live_state(s->to_dst_file);
 468
 469     qemu_fflush(fb);
 470
 471     /*
 472      * We need the size of the VMstate data in Secondary side,
 473      * With which we can decide how much data should be read.
 474      */
 475     colo_send_message_value(s->to_dst_file, COLO_MESSAGE_VMSTATE_SIZE,
 476                             bioc->usage, &local_err);
 477     if (local_err) {
 478         goto out;
 479     }
 480
 481     qemu_put_buffer(s->to_dst_file, bioc->data, bioc->usage);
 482     qemu_fflush(s->to_dst_file);
 483     ret = qemu_file_get_error(s->to_dst_file);
 484     if (ret < 0) {
 485         goto out;
 486     }
 487
 488     colo_receive_check_message(s->rp_state.from_dst_file,
 489                        COLO_MESSAGE_VMSTATE_RECEIVED, &local_err);
 490     if (local_err) {
 491         goto out;
 492     }
 493
 494     colo_receive_check_message(s->rp_state.from_dst_file,
 495                        COLO_MESSAGE_VMSTATE_LOADED, &local_err);
 496     if (local_err) {
 497         goto out;
 498     }
 499
 500     ret = 0;
 501
 502     qemu_mutex_lock_iothread();
 503     vm_start();
 504     qemu_mutex_unlock_iothread();
 505     trace_colo_vm_state_change("stop", "run");
 506
 507 out:
 508     if (local_err) {
 509         error_report_err(local_err);
 510     }
 511     return ret;
 512 }
 513
 514 static void colo_compare_notify_checkpoint(Notifier *notifier, void *data)
 515 {
 516     colo_checkpoint_notify(data);
 517 }
 518
 519 static void colo_process_checkpoint(MigrationState *s)
 520 {
 521     QIOChannelBuffer *bioc;
 522     QEMUFile *fb = NULL;
 523     int64_t current_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 524     Error *local_err = NULL;
 525     int ret;
 526
 527     last_colo_mode = get_colo_mode();
 528     if (last_colo_mode != COLO_MODE_PRIMARY) {
 529         error_report("COLO mode must be COLO_MODE_PRIMARY");
 530         return;
 531     }
 532
 533     failover_init_state();
 534
 535     s->rp_state.from_dst_file = qemu_file_get_return_path(s->to_dst_file);
 536     if (!s->rp_state.from_dst_file) {
 537         error_report("Open QEMUFile from_dst_file failed");
 538         goto out;
 539     }
 540
 541     packets_compare_notifier.notify = colo_compare_notify_checkpoint;
 542     colo_compare_register_notifier(&packets_compare_notifier);
 543
 544     /*
 545      * Wait for Secondary finish loading VM states and enter COLO
 546      * restore.
 547      */
 548     colo_receive_check_message(s->rp_state.from_dst_file,
 549                        COLO_MESSAGE_CHECKPOINT_READY, &local_err);
 550     if (local_err) {
 551         goto out;
 552     }
 553     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 554     fb = qemu_fopen_channel_output(QIO_CHANNEL(bioc));
 555     object_unref(OBJECT(bioc));
 556
 557     qemu_mutex_lock_iothread();
 558 #ifdef CONFIG_REPLICATION
 559     replication_start_all(REPLICATION_MODE_PRIMARY, &local_err);
 560     if (local_err) {
 561         qemu_mutex_unlock_iothread();
 562         goto out;
 563     }
 564 #else
 565         abort();
 566 #endif
 567
 568     vm_start();
 569     qemu_mutex_unlock_iothread();
 570     trace_colo_vm_state_change("stop", "run");
 571
 572     timer_mod(s->colo_delay_timer,
 573             current_time + s->parameters.x_checkpoint_delay);
 574
 575     while (s->state == MIGRATION_STATUS_COLO) {
 576         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 577             error_report("failover request");
 578             goto out;
 579         }
 580
 581         qemu_sem_wait(&s->colo_checkpoint_sem);
 582
 583         if (s->state != MIGRATION_STATUS_COLO) {
 584             goto out;
 585         }
 586         ret = colo_do_checkpoint_transaction(s, bioc, fb);
 587         if (ret < 0) {
 588             goto out;
 589         }
 590     }
 591
 592 out:
 593     /* Throw the unreported error message after exited from loop */
 594     if (local_err) {
 595         error_report_err(local_err);
 596     }
 597
 598     if (fb) {
 599         qemu_fclose(fb);
 600     }
 601
 602     /*
 603      * There are only two reasons we can get here, some error happened
 604      * or the user triggered failover.
 605      */
 606     switch (failover_get_state()) {
 607     case FAILOVER_STATUS_COMPLETED:
 608         qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
 609                                   COLO_EXIT_REASON_REQUEST);
 610         break;
 611     default:
 612         qapi_event_send_colo_exit(COLO_MODE_PRIMARY,
 613                                   COLO_EXIT_REASON_ERROR);
 614     }
 615
 616     /* Hope this not to be too long to wait here */
 617     qemu_sem_wait(&s->colo_exit_sem);
 618     qemu_sem_destroy(&s->colo_exit_sem);
 619
 620     /*
 621      * It is safe to unregister notifier after failover finished.
 622      * Besides, colo_delay_timer and colo_checkpoint_sem can't be
 623      * released befor unregister notifier, or there will be use-after-free
 624      * error.
 625      */
 626     colo_compare_unregister_notifier(&packets_compare_notifier);
 627     timer_del(s->colo_delay_timer);
 628     timer_free(s->colo_delay_timer);
 629     qemu_sem_destroy(&s->colo_checkpoint_sem);
 630
 631     /*
 632      * Must be called after failover BH is completed,
 633      * Or the failover BH may shutdown the wrong fd that
 634      * re-used by other threads after we release here.
 635      */
 636     if (s->rp_state.from_dst_file) {
 637         qemu_fclose(s->rp_state.from_dst_file);
 638     }
 639 }
 640
 641 void colo_checkpoint_notify(void *opaque)
 642 {
 643     MigrationState *s = opaque;
 644     int64_t next_notify_time;
 645
 646     qemu_sem_post(&s->colo_checkpoint_sem);
 647     s->colo_checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST);
 648     next_notify_time = s->colo_checkpoint_time +
 649                     s->parameters.x_checkpoint_delay;
 650     timer_mod(s->colo_delay_timer, next_notify_time);
 651 }
 652
 653 void migrate_start_colo_process(MigrationState *s)
 654 {
 655     qemu_mutex_unlock_iothread();
 656     qemu_sem_init(&s->colo_checkpoint_sem, 0);
 657     s->colo_delay_timer =  timer_new_ms(QEMU_CLOCK_HOST,
 658                                 colo_checkpoint_notify, s);
 659
 660     qemu_sem_init(&s->colo_exit_sem, 0);
 661     migrate_set_state(&s->state, MIGRATION_STATUS_ACTIVE,
 662                       MIGRATION_STATUS_COLO);
 663     colo_process_checkpoint(s);
 664     qemu_mutex_lock_iothread();
 665 }
 666
 667 static void colo_wait_handle_message(QEMUFile *f, int *checkpoint_request,
 668                                      Error **errp)
 669 {
 670     COLOMessage msg;
 671     Error *local_err = NULL;
 672
 673     msg = colo_receive_message(f, &local_err);
 674     if (local_err) {
 675         error_propagate(errp, local_err);
 676         return;
 677     }
 678
 679     switch (msg) {
 680     case COLO_MESSAGE_CHECKPOINT_REQUEST:
 681         *checkpoint_request = 1;
 682         break;
 683     default:
 684         *checkpoint_request = 0;
 685         error_setg(errp, "Got unknown COLO message: %d", msg);
 686         break;
 687     }
 688 }
 689
 690 void *colo_process_incoming_thread(void *opaque)
 691 {
 692     MigrationIncomingState *mis = opaque;
 693     QEMUFile *fb = NULL;
 694     QIOChannelBuffer *bioc = NULL; /* Cache incoming device state */
 695     uint64_t total_size;
 696     uint64_t value;
 697     Error *local_err = NULL;
 698     int ret;
 699
 700     rcu_register_thread();
 701     qemu_sem_init(&mis->colo_incoming_sem, 0);
 702
 703     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
 704                       MIGRATION_STATUS_COLO);
 705
 706     last_colo_mode = get_colo_mode();
 707     if (last_colo_mode != COLO_MODE_SECONDARY) {
 708         error_report("COLO mode must be COLO_MODE_SECONDARY");
 709         return NULL;
 710     }
 711
 712     failover_init_state();
 713
 714     mis->to_src_file = qemu_file_get_return_path(mis->from_src_file);
 715     if (!mis->to_src_file) {
 716         error_report("COLO incoming thread: Open QEMUFile to_src_file failed");
 717         goto out;
 718     }
 719     /*
 720      * Note: the communication between Primary side and Secondary side
 721      * should be sequential, we set the fd to unblocked in migration incoming
 722      * coroutine, and here we are in the COLO incoming thread, so it is ok to
 723      * set the fd back to blocked.
 724      */
 725     qemu_file_set_blocking(mis->from_src_file, true);
 726
 727     bioc = qio_channel_buffer_new(COLO_BUFFER_BASE_SIZE);
 728     fb = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
 729     object_unref(OBJECT(bioc));
 730
 731     qemu_mutex_lock_iothread();
 732 #ifdef CONFIG_REPLICATION
 733     replication_start_all(REPLICATION_MODE_SECONDARY, &local_err);
 734     if (local_err) {
 735         qemu_mutex_unlock_iothread();
 736         goto out;
 737     }
 738 #else
 739         abort();
 740 #endif
 741     vm_start();
 742     trace_colo_vm_state_change("stop", "run");
 743     qemu_mutex_unlock_iothread();
 744
 745     colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_READY,
 746                       &local_err);
 747     if (local_err) {
 748         goto out;
 749     }
 750
 751     while (mis->state == MIGRATION_STATUS_COLO) {
 752         int request = 0;
 753
 754         colo_wait_handle_message(mis->from_src_file, &request, &local_err);
 755         if (local_err) {
 756             goto out;
 757         }
 758         assert(request);
 759         if (failover_get_state() != FAILOVER_STATUS_NONE) {
 760             error_report("failover request");
 761             goto out;
 762         }
 763
 764         qemu_mutex_lock_iothread();
 765         vm_stop_force_state(RUN_STATE_COLO);
 766         trace_colo_vm_state_change("run", "stop");
 767         qemu_mutex_unlock_iothread();
 768
 769         /* FIXME: This is unnecessary for periodic checkpoint mode */
 770         colo_send_message(mis->to_src_file, COLO_MESSAGE_CHECKPOINT_REPLY,
 771                      &local_err);
 772         if (local_err) {
 773             goto out;
 774         }
 775
 776         colo_receive_check_message(mis->from_src_file,
 777                            COLO_MESSAGE_VMSTATE_SEND, &local_err);
 778         if (local_err) {
 779             goto out;
 780         }
 781
 782         qemu_mutex_lock_iothread();
 783         cpu_synchronize_all_pre_loadvm();
 784         ret = qemu_loadvm_state_main(mis->from_src_file, mis);
 785         qemu_mutex_unlock_iothread();
 786
 787         if (ret < 0) {
 788             error_report("Load VM's live state (ram) error");
 789             goto out;
 790         }
 791
 792         value = colo_receive_message_value(mis->from_src_file,
 793                                  COLO_MESSAGE_VMSTATE_SIZE, &local_err);
 794         if (local_err) {
 795             goto out;
 796         }
 797
 798         /*
 799          * Read VM device state data into channel buffer,
 800          * It's better to re-use the memory allocated.
 801          * Here we need to handle the channel buffer directly.
 802          */
 803         if (value > bioc->capacity) {
 804             bioc->capacity = value;
 805             bioc->data = g_realloc(bioc->data, bioc->capacity);
 806         }
 807         total_size = qemu_get_buffer(mis->from_src_file, bioc->data, value);
 808         if (total_size != value) {
 809             error_report("Got %" PRIu64 " VMState data, less than expected"
 810                         " %" PRIu64, total_size, value);
 811             goto out;
 812         }
 813         bioc->usage = total_size;
 814         qio_channel_io_seek(QIO_CHANNEL(bioc), 0, 0, NULL);
 815
 816         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_RECEIVED,
 817                      &local_err);
 818         if (local_err) {
 819             goto out;
 820         }
 821
 822         qemu_mutex_lock_iothread();
 823         vmstate_loading = true;
 824         ret = qemu_load_device_state(fb);
 825         if (ret < 0) {
 826             error_report("COLO: load device state failed");
 827             qemu_mutex_unlock_iothread();
 828             goto out;
 829         }
 830
 831 #ifdef CONFIG_REPLICATION
 832         replication_get_error_all(&local_err);
 833         if (local_err) {
 834             qemu_mutex_unlock_iothread();
 835             goto out;
 836         }
 837
 838         /* discard colo disk buffer */
 839         replication_do_checkpoint_all(&local_err);
 840         if (local_err) {
 841             qemu_mutex_unlock_iothread();
 842             goto out;
 843         }
 844 #else
 845         abort();
 846 #endif
 847         /* Notify all filters of all NIC to do checkpoint */
 848         colo_notify_filters_event(COLO_EVENT_CHECKPOINT, &local_err);
 849
 850         if (local_err) {
 851             qemu_mutex_unlock_iothread();
 852             goto out;
 853         }
 854
 855         vmstate_loading = false;
 856         vm_start();
 857         trace_colo_vm_state_change("stop", "run");
 858         qemu_mutex_unlock_iothread();
 859
 860         if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) {
 861             failover_set_state(FAILOVER_STATUS_RELAUNCH,
 862                             FAILOVER_STATUS_NONE);
 863             failover_request_active(NULL);
 864             goto out;
 865         }
 866
 867         colo_send_message(mis->to_src_file, COLO_MESSAGE_VMSTATE_LOADED,
 868                      &local_err);
 869         if (local_err) {
 870             goto out;
 871         }
 872     }
 873
 874 out:
 875     vmstate_loading = false;
 876     /* Throw the unreported error message after exited from loop */
 877     if (local_err) {
 878         error_report_err(local_err);
 879     }
 880
 881     /*
 882      * There are only two reasons we can get here, some error happened
 883      * or the user triggered failover.
 884      */
 885     switch (failover_get_state()) {
 886     case FAILOVER_STATUS_COMPLETED:
 887         qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
 888                                   COLO_EXIT_REASON_REQUEST);
 889         break;
 890     default:
 891         qapi_event_send_colo_exit(COLO_MODE_SECONDARY,
 892                                   COLO_EXIT_REASON_ERROR);
 893     }
 894
 895     if (fb) {
 896         qemu_fclose(fb);
 897     }
 898
 899     /* Hope this not to be too long to loop here */
 900     qemu_sem_wait(&mis->colo_incoming_sem);
 901     qemu_sem_destroy(&mis->colo_incoming_sem);
 902     /* Must be called after failover BH is completed */
 903     if (mis->to_src_file) {
 904         qemu_fclose(mis->to_src_file);
 905         mis->to_src_file = NULL;
 906     }
 907
 908     rcu_unregister_thread();
 909     return NULL;
 910 }