migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "qemu/cutils.h"
  31 #include "qemu/bitops.h"
  32 #include "qemu/bitmap.h"
  33 #include "qemu/main-loop.h"
  34 #include "xbzrle.h"
  35 #include "ram.h"
  36 #include "migration.h"
  37 #include "migration/register.h"
  38 #include "migration/misc.h"
  39 #include "qemu-file.h"
  40 #include "postcopy-ram.h"
  41 #include "page_cache.h"
  42 #include "qemu/error-report.h"
  43 #include "qapi/error.h"
  44 #include "qapi/qapi-types-migration.h"
  45 #include "qapi/qapi-events-migration.h"
  46 #include "qapi/qmp/qerror.h"
  47 #include "trace.h"
  48 #include "exec/ram_addr.h"
  49 #include "exec/target_page.h"
  50 #include "qemu/rcu_queue.h"
  51 #include "migration/colo.h"
  52 #include "block.h"
  53 #include "sysemu/cpu-throttle.h"
  54 #include "savevm.h"
  55 #include "qemu/iov.h"
  56 #include "multifd.h"
  57 #include "sysemu/runstate.h"
  58
  59 #if defined(__linux__)
  60 #include "qemu/userfaultfd.h"
  61 #endif /* defined(__linux__) */
  62
  63 /***********************************************************/
  64 /* ram save/restore */
  65
  66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  67  * worked for pages that where filled with the same char.  We switched
  68  * it to only search for the zero value.  And to avoid confusion with
  69  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  70  */
  71
  72 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  73 #define RAM_SAVE_FLAG_ZERO     0x02
  74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  75 #define RAM_SAVE_FLAG_PAGE     0x08
  76 #define RAM_SAVE_FLAG_EOS      0x10
  77 #define RAM_SAVE_FLAG_CONTINUE 0x20
  78 #define RAM_SAVE_FLAG_XBZRLE   0x40
  79 /* 0x80 is reserved in migration.h start with 0x100 next */
  80 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  81
  82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  83 {
  84     return buffer_is_zero(p, size);
  85 }
  86
  87 XBZRLECacheStats xbzrle_counters;
  88
  89 /* struct contains XBZRLE cache and a static page
  90    used by the compression */
  91 static struct {
  92     /* buffer used for XBZRLE encoding */
  93     uint8_t *encoded_buf;
  94     /* buffer for storing page content */
  95     uint8_t *current_buf;
  96     /* Cache for XBZRLE, Protected by lock. */
  97     PageCache *cache;
  98     QemuMutex lock;
  99     /* it will store a page full of zeros */
 100     uint8_t *zero_target_page;
 101     /* buffer used for XBZRLE decoding */
 102     uint8_t *decoded_buf;
 103 } XBZRLE;
 104
 105 static void XBZRLE_cache_lock(void)
 106 {
 107     if (migrate_use_xbzrle()) {
 108         qemu_mutex_lock(&XBZRLE.lock);
 109     }
 110 }
 111
 112 static void XBZRLE_cache_unlock(void)
 113 {
 114     if (migrate_use_xbzrle()) {
 115         qemu_mutex_unlock(&XBZRLE.lock);
 116     }
 117 }
 118
 119 /**
 120  * xbzrle_cache_resize: resize the xbzrle cache
 121  *
 122  * This function is called from migrate_params_apply in main
 123  * thread, possibly while a migration is in progress.  A running
 124  * migration may be using the cache and might finish during this call,
 125  * hence changes to the cache are protected by XBZRLE.lock().
 126  *
 127  * Returns 0 for success or -1 for error
 128  *
 129  * @new_size: new cache size
 130  * @errp: set *errp if the check failed, with reason
 131  */
 132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
 133 {
 134     PageCache *new_cache;
 135     int64_t ret = 0;
 136
 137     /* Check for truncation */
 138     if (new_size != (size_t)new_size) {
 139         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 140                    "exceeding address space");
 141         return -1;
 142     }
 143
 144     if (new_size == migrate_xbzrle_cache_size()) {
 145         /* nothing to do */
 146         return 0;
 147     }
 148
 149     XBZRLE_cache_lock();
 150
 151     if (XBZRLE.cache != NULL) {
 152         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 153         if (!new_cache) {
 154             ret = -1;
 155             goto out;
 156         }
 157
 158         cache_fini(XBZRLE.cache);
 159         XBZRLE.cache = new_cache;
 160     }
 161 out:
 162     XBZRLE_cache_unlock();
 163     return ret;
 164 }
 165
 166 bool ramblock_is_ignored(RAMBlock *block)
 167 {
 168     return !qemu_ram_is_migratable(block) ||
 169            (migrate_ignore_shared() && qemu_ram_is_shared(block));
 170 }
 171
 172 #undef RAMBLOCK_FOREACH
 173
 174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
 175 {
 176     RAMBlock *block;
 177     int ret = 0;
 178
 179     RCU_READ_LOCK_GUARD();
 180
 181     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 182         ret = func(block, opaque);
 183         if (ret) {
 184             break;
 185         }
 186     }
 187     return ret;
 188 }
 189
 190 static void ramblock_recv_map_init(void)
 191 {
 192     RAMBlock *rb;
 193
 194     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
 195         assert(!rb->receivedmap);
 196         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 197     }
 198 }
 199
 200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 201 {
 202     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 203                     rb->receivedmap);
 204 }
 205
 206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 207 {
 208     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 209 }
 210
 211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 212 {
 213     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 214 }
 215
 216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 217                                     size_t nr)
 218 {
 219     bitmap_set_atomic(rb->receivedmap,
 220                       ramblock_recv_bitmap_offset(host_addr, rb),
 221                       nr);
 222 }
 223
 224 #define  RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
 225
 226 /*
 227  * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 228  *
 229  * Returns >0 if success with sent bytes, or <0 if error.
 230  */
 231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
 232                                   const char *block_name)
 233 {
 234     RAMBlock *block = qemu_ram_block_by_name(block_name);
 235     unsigned long *le_bitmap, nbits;
 236     uint64_t size;
 237
 238     if (!block) {
 239         error_report("%s: invalid block name: %s", __func__, block_name);
 240         return -1;
 241     }
 242
 243     nbits = block->used_length >> TARGET_PAGE_BITS;
 244
 245     /*
 246      * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
 247      * machines we may need 4 more bytes for padding (see below
 248      * comment). So extend it a bit before hand.
 249      */
 250     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
 251
 252     /*
 253      * Always use little endian when sending the bitmap. This is
 254      * required that when source and destination VMs are not using the
 255      * same endianness. (Note: big endian won't work.)
 256      */
 257     bitmap_to_le(le_bitmap, block->receivedmap, nbits);
 258
 259     /* Size of the bitmap, in bytes */
 260     size = DIV_ROUND_UP(nbits, 8);
 261
 262     /*
 263      * size is always aligned to 8 bytes for 64bit machines, but it
 264      * may not be true for 32bit machines. We need this padding to
 265      * make sure the migration can survive even between 32bit and
 266      * 64bit machines.
 267      */
 268     size = ROUND_UP(size, 8);
 269
 270     qemu_put_be64(file, size);
 271     qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
 272     /*
 273      * Mark as an end, in case the middle part is screwed up due to
 274      * some "mysterious" reason.
 275      */
 276     qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
 277     qemu_fflush(file);
 278
 279     g_free(le_bitmap);
 280
 281     if (qemu_file_get_error(file)) {
 282         return qemu_file_get_error(file);
 283     }
 284
 285     return size + sizeof(size);
 286 }
 287
 288 /*
 289  * An outstanding page request, on the source, having been received
 290  * and queued
 291  */
 292 struct RAMSrcPageRequest {
 293     RAMBlock *rb;
 294     hwaddr    offset;
 295     hwaddr    len;
 296
 297     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 298 };
 299
 300 /* State of RAM for migration */
 301 struct RAMState {
 302     /* QEMUFile used for this migration */
 303     QEMUFile *f;
 304     /* UFFD file descriptor, used in 'write-tracking' migration */
 305     int uffdio_fd;
 306     /* Last block that we have visited searching for dirty pages */
 307     RAMBlock *last_seen_block;
 308     /* Last block from where we have sent data */
 309     RAMBlock *last_sent_block;
 310     /* Last dirty target page we have sent */
 311     ram_addr_t last_page;
 312     /* last ram version we have seen */
 313     uint32_t last_version;
 314     /* We are in the first round */
 315     bool ram_bulk_stage;
 316     /* The free page optimization is enabled */
 317     bool fpo_enabled;
 318     /* How many times we have dirty too many pages */
 319     int dirty_rate_high_cnt;
 320     /* these variables are used for bitmap sync */
 321     /* last time we did a full bitmap_sync */
 322     int64_t time_last_bitmap_sync;
 323     /* bytes transferred at start_time */
 324     uint64_t bytes_xfer_prev;
 325     /* number of dirty pages since start_time */
 326     uint64_t num_dirty_pages_period;
 327     /* xbzrle misses since the beginning of the period */
 328     uint64_t xbzrle_cache_miss_prev;
 329     /* Amount of xbzrle pages since the beginning of the period */
 330     uint64_t xbzrle_pages_prev;
 331     /* Amount of xbzrle encoded bytes since the beginning of the period */
 332     uint64_t xbzrle_bytes_prev;
 333
 334     /* compression statistics since the beginning of the period */
 335     /* amount of count that no free thread to compress data */
 336     uint64_t compress_thread_busy_prev;
 337     /* amount bytes after compression */
 338     uint64_t compressed_size_prev;
 339     /* amount of compressed pages */
 340     uint64_t compress_pages_prev;
 341
 342     /* total handled target pages at the beginning of period */
 343     uint64_t target_page_count_prev;
 344     /* total handled target pages since start */
 345     uint64_t target_page_count;
 346     /* number of dirty bits in the bitmap */
 347     uint64_t migration_dirty_pages;
 348     /* Protects modification of the bitmap and migration dirty pages */
 349     QemuMutex bitmap_mutex;
 350     /* The RAMBlock used in the last src_page_requests */
 351     RAMBlock *last_req_rb;
 352     /* Queue of outstanding page requests from the destination */
 353     QemuMutex src_page_req_mutex;
 354     QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
 355 };
 356 typedef struct RAMState RAMState;
 357
 358 static RAMState *ram_state;
 359
 360 static NotifierWithReturnList precopy_notifier_list;
 361
 362 void precopy_infrastructure_init(void)
 363 {
 364     notifier_with_return_list_init(&precopy_notifier_list);
 365 }
 366
 367 void precopy_add_notifier(NotifierWithReturn *n)
 368 {
 369     notifier_with_return_list_add(&precopy_notifier_list, n);
 370 }
 371
 372 void precopy_remove_notifier(NotifierWithReturn *n)
 373 {
 374     notifier_with_return_remove(n);
 375 }
 376
 377 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
 378 {
 379     PrecopyNotifyData pnd;
 380     pnd.reason = reason;
 381     pnd.errp = errp;
 382
 383     return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
 384 }
 385
 386 void precopy_enable_free_page_optimization(void)
 387 {
 388     if (!ram_state) {
 389         return;
 390     }
 391
 392     ram_state->fpo_enabled = true;
 393 }
 394
 395 uint64_t ram_bytes_remaining(void)
 396 {
 397     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 398                        0;
 399 }
 400
 401 MigrationStats ram_counters;
 402
 403 /* used by the search for pages to send */
 404 struct PageSearchStatus {
 405     /* Current block being searched */
 406     RAMBlock    *block;
 407     /* Current page to search from */
 408     unsigned long page;
 409     /* Set once we wrap around */
 410     bool         complete_round;
 411 };
 412 typedef struct PageSearchStatus PageSearchStatus;
 413
 414 CompressionStats compression_counters;
 415
 416 struct CompressParam {
 417     bool done;
 418     bool quit;
 419     bool zero_page;
 420     QEMUFile *file;
 421     QemuMutex mutex;
 422     QemuCond cond;
 423     RAMBlock *block;
 424     ram_addr_t offset;
 425
 426     /* internally used fields */
 427     z_stream stream;
 428     uint8_t *originbuf;
 429 };
 430 typedef struct CompressParam CompressParam;
 431
 432 struct DecompressParam {
 433     bool done;
 434     bool quit;
 435     QemuMutex mutex;
 436     QemuCond cond;
 437     void *des;
 438     uint8_t *compbuf;
 439     int len;
 440     z_stream stream;
 441 };
 442 typedef struct DecompressParam DecompressParam;
 443
 444 static CompressParam *comp_param;
 445 static QemuThread *compress_threads;
 446 /* comp_done_cond is used to wake up the migration thread when
 447  * one of the compression threads has finished the compression.
 448  * comp_done_lock is used to co-work with comp_done_cond.
 449  */
 450 static QemuMutex comp_done_lock;
 451 static QemuCond comp_done_cond;
 452 /* The empty QEMUFileOps will be used by file in CompressParam */
 453 static const QEMUFileOps empty_ops = { };
 454
 455 static QEMUFile *decomp_file;
 456 static DecompressParam *decomp_param;
 457 static QemuThread *decompress_threads;
 458 static QemuMutex decomp_done_lock;
 459 static QemuCond decomp_done_cond;
 460
 461 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 462                                  ram_addr_t offset, uint8_t *source_buf);
 463
 464 static void *do_data_compress(void *opaque)
 465 {
 466     CompressParam *param = opaque;
 467     RAMBlock *block;
 468     ram_addr_t offset;
 469     bool zero_page;
 470
 471     qemu_mutex_lock(&param->mutex);
 472     while (!param->quit) {
 473         if (param->block) {
 474             block = param->block;
 475             offset = param->offset;
 476             param->block = NULL;
 477             qemu_mutex_unlock(&param->mutex);
 478
 479             zero_page = do_compress_ram_page(param->file, &param->stream,
 480                                              block, offset, param->originbuf);
 481
 482             qemu_mutex_lock(&comp_done_lock);
 483             param->done = true;
 484             param->zero_page = zero_page;
 485             qemu_cond_signal(&comp_done_cond);
 486             qemu_mutex_unlock(&comp_done_lock);
 487
 488             qemu_mutex_lock(&param->mutex);
 489         } else {
 490             qemu_cond_wait(&param->cond, &param->mutex);
 491         }
 492     }
 493     qemu_mutex_unlock(&param->mutex);
 494
 495     return NULL;
 496 }
 497
 498 static void compress_threads_save_cleanup(void)
 499 {
 500     int i, thread_count;
 501
 502     if (!migrate_use_compression() || !comp_param) {
 503         return;
 504     }
 505
 506     thread_count = migrate_compress_threads();
 507     for (i = 0; i < thread_count; i++) {
 508         /*
 509          * we use it as a indicator which shows if the thread is
 510          * properly init'd or not
 511          */
 512         if (!comp_param[i].file) {
 513             break;
 514         }
 515
 516         qemu_mutex_lock(&comp_param[i].mutex);
 517         comp_param[i].quit = true;
 518         qemu_cond_signal(&comp_param[i].cond);
 519         qemu_mutex_unlock(&comp_param[i].mutex);
 520
 521         qemu_thread_join(compress_threads + i);
 522         qemu_mutex_destroy(&comp_param[i].mutex);
 523         qemu_cond_destroy(&comp_param[i].cond);
 524         deflateEnd(&comp_param[i].stream);
 525         g_free(comp_param[i].originbuf);
 526         qemu_fclose(comp_param[i].file);
 527         comp_param[i].file = NULL;
 528     }
 529     qemu_mutex_destroy(&comp_done_lock);
 530     qemu_cond_destroy(&comp_done_cond);
 531     g_free(compress_threads);
 532     g_free(comp_param);
 533     compress_threads = NULL;
 534     comp_param = NULL;
 535 }
 536
 537 static int compress_threads_save_setup(void)
 538 {
 539     int i, thread_count;
 540
 541     if (!migrate_use_compression()) {
 542         return 0;
 543     }
 544     thread_count = migrate_compress_threads();
 545     compress_threads = g_new0(QemuThread, thread_count);
 546     comp_param = g_new0(CompressParam, thread_count);
 547     qemu_cond_init(&comp_done_cond);
 548     qemu_mutex_init(&comp_done_lock);
 549     for (i = 0; i < thread_count; i++) {
 550         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 551         if (!comp_param[i].originbuf) {
 552             goto exit;
 553         }
 554
 555         if (deflateInit(&comp_param[i].stream,
 556                         migrate_compress_level()) != Z_OK) {
 557             g_free(comp_param[i].originbuf);
 558             goto exit;
 559         }
 560
 561         /* comp_param[i].file is just used as a dummy buffer to save data,
 562          * set its ops to empty.
 563          */
 564         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 565         comp_param[i].done = true;
 566         comp_param[i].quit = false;
 567         qemu_mutex_init(&comp_param[i].mutex);
 568         qemu_cond_init(&comp_param[i].cond);
 569         qemu_thread_create(compress_threads + i, "compress",
 570                            do_data_compress, comp_param + i,
 571                            QEMU_THREAD_JOINABLE);
 572     }
 573     return 0;
 574
 575 exit:
 576     compress_threads_save_cleanup();
 577     return -1;
 578 }
 579
 580 /**
 581  * save_page_header: write page header to wire
 582  *
 583  * If this is the 1st block, it also writes the block identification
 584  *
 585  * Returns the number of bytes written
 586  *
 587  * @f: QEMUFile where to send the data
 588  * @block: block that contains the page we want to send
 589  * @offset: offset inside the block for the page
 590  *          in the lower bits, it contains flags
 591  */
 592 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 593                                ram_addr_t offset)
 594 {
 595     size_t size, len;
 596
 597     if (block == rs->last_sent_block) {
 598         offset |= RAM_SAVE_FLAG_CONTINUE;
 599     }
 600     qemu_put_be64(f, offset);
 601     size = 8;
 602
 603     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 604         len = strlen(block->idstr);
 605         qemu_put_byte(f, len);
 606         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 607         size += 1 + len;
 608         rs->last_sent_block = block;
 609     }
 610     return size;
 611 }
 612
 613 /**
 614  * mig_throttle_guest_down: throotle down the guest
 615  *
 616  * Reduce amount of guest cpu execution to hopefully slow down memory
 617  * writes. If guest dirty memory rate is reduced below the rate at
 618  * which we can transfer pages to the destination then we should be
 619  * able to complete migration. Some workloads dirty memory way too
 620  * fast and will not effectively converge, even with auto-converge.
 621  */
 622 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
 623                                     uint64_t bytes_dirty_threshold)
 624 {
 625     MigrationState *s = migrate_get_current();
 626     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 627     uint64_t pct_increment = s->parameters.cpu_throttle_increment;
 628     bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
 629     int pct_max = s->parameters.max_cpu_throttle;
 630
 631     uint64_t throttle_now = cpu_throttle_get_percentage();
 632     uint64_t cpu_now, cpu_ideal, throttle_inc;
 633
 634     /* We have not started throttling yet. Let's start it. */
 635     if (!cpu_throttle_active()) {
 636         cpu_throttle_set(pct_initial);
 637     } else {
 638         /* Throttling already on, just increase the rate */
 639         if (!pct_tailslow) {
 640             throttle_inc = pct_increment;
 641         } else {
 642             /* Compute the ideal CPU percentage used by Guest, which may
 643              * make the dirty rate match the dirty rate threshold. */
 644             cpu_now = 100 - throttle_now;
 645             cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
 646                         bytes_dirty_period);
 647             throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
 648         }
 649         cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
 650     }
 651 }
 652
 653 /**
 654  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 655  *
 656  * @rs: current RAM state
 657  * @current_addr: address for the zero page
 658  *
 659  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 660  * The important thing is that a stale (not-yet-0'd) page be replaced
 661  * by the new data.
 662  * As a bonus, if the page wasn't in the cache it gets added so that
 663  * when a small write is made into the 0'd page it gets XBZRLE sent.
 664  */
 665 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 666 {
 667     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 668         return;
 669     }
 670
 671     /* We don't care if this fails to allocate a new cache page
 672      * as long as it updated an old one */
 673     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 674                  ram_counters.dirty_sync_count);
 675 }
 676
 677 #define ENCODING_FLAG_XBZRLE 0x1
 678
 679 /**
 680  * save_xbzrle_page: compress and send current page
 681  *
 682  * Returns: 1 means that we wrote the page
 683  *          0 means that page is identical to the one already sent
 684  *          -1 means that xbzrle would be longer than normal
 685  *
 686  * @rs: current RAM state
 687  * @current_data: pointer to the address of the page contents
 688  * @current_addr: addr of the page
 689  * @block: block that contains the page we want to send
 690  * @offset: offset inside the block for the page
 691  * @last_stage: if we are at the completion stage
 692  */
 693 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 694                             ram_addr_t current_addr, RAMBlock *block,
 695                             ram_addr_t offset, bool last_stage)
 696 {
 697     int encoded_len = 0, bytes_xbzrle;
 698     uint8_t *prev_cached_page;
 699
 700     if (!cache_is_cached(XBZRLE.cache, current_addr,
 701                          ram_counters.dirty_sync_count)) {
 702         xbzrle_counters.cache_miss++;
 703         if (!last_stage) {
 704             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 705                              ram_counters.dirty_sync_count) == -1) {
 706                 return -1;
 707             } else {
 708                 /* update *current_data when the page has been
 709                    inserted into cache */
 710                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 711             }
 712         }
 713         return -1;
 714     }
 715
 716     /*
 717      * Reaching here means the page has hit the xbzrle cache, no matter what
 718      * encoding result it is (normal encoding, overflow or skipping the page),
 719      * count the page as encoded. This is used to calculate the encoding rate.
 720      *
 721      * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
 722      * 2nd page turns out to be skipped (i.e. no new bytes written to the
 723      * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
 724      * skipped page included. In this way, the encoding rate can tell if the
 725      * guest page is good for xbzrle encoding.
 726      */
 727     xbzrle_counters.pages++;
 728     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 729
 730     /* save current buffer into memory */
 731     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 732
 733     /* XBZRLE encoding (if there is no overflow) */
 734     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 735                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 736                                        TARGET_PAGE_SIZE);
 737
 738     /*
 739      * Update the cache contents, so that it corresponds to the data
 740      * sent, in all cases except where we skip the page.
 741      */
 742     if (!last_stage && encoded_len != 0) {
 743         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 744         /*
 745          * In the case where we couldn't compress, ensure that the caller
 746          * sends the data from the cache, since the guest might have
 747          * changed the RAM since we copied it.
 748          */
 749         *current_data = prev_cached_page;
 750     }
 751
 752     if (encoded_len == 0) {
 753         trace_save_xbzrle_page_skipping();
 754         return 0;
 755     } else if (encoded_len == -1) {
 756         trace_save_xbzrle_page_overflow();
 757         xbzrle_counters.overflow++;
 758         xbzrle_counters.bytes += TARGET_PAGE_SIZE;
 759         return -1;
 760     }
 761
 762     /* Send XBZRLE based compressed page */
 763     bytes_xbzrle = save_page_header(rs, rs->f, block,
 764                                     offset | RAM_SAVE_FLAG_XBZRLE);
 765     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 766     qemu_put_be16(rs->f, encoded_len);
 767     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 768     bytes_xbzrle += encoded_len + 1 + 2;
 769     /*
 770      * Like compressed_size (please see update_compress_thread_counts),
 771      * the xbzrle encoded bytes don't count the 8 byte header with
 772      * RAM_SAVE_FLAG_CONTINUE.
 773      */
 774     xbzrle_counters.bytes += bytes_xbzrle - 8;
 775     ram_counters.transferred += bytes_xbzrle;
 776
 777     return 1;
 778 }
 779
 780 /**
 781  * migration_bitmap_find_dirty: find the next dirty page from start
 782  *
 783  * Returns the page offset within memory region of the start of a dirty page
 784  *
 785  * @rs: current RAM state
 786  * @rb: RAMBlock where to search for dirty pages
 787  * @start: page where we start the search
 788  */
 789 static inline
 790 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 791                                           unsigned long start)
 792 {
 793     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 794     unsigned long *bitmap = rb->bmap;
 795     unsigned long next;
 796
 797     if (ramblock_is_ignored(rb)) {
 798         return size;
 799     }
 800
 801     /*
 802      * When the free page optimization is enabled, we need to check the bitmap
 803      * to send the non-free pages rather than all the pages in the bulk stage.
 804      */
 805     if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
 806         next = start + 1;
 807     } else {
 808         next = find_next_bit(bitmap, size, start);
 809     }
 810
 811     return next;
 812 }
 813
 814 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 815                                                 RAMBlock *rb,
 816                                                 unsigned long page)
 817 {
 818     bool ret;
 819
 820     QEMU_LOCK_GUARD(&rs->bitmap_mutex);
 821
 822     /*
 823      * Clear dirty bitmap if needed.  This _must_ be called before we
 824      * send any of the page in the chunk because we need to make sure
 825      * we can capture further page content changes when we sync dirty
 826      * log the next time.  So as long as we are going to send any of
 827      * the page in the chunk we clear the remote dirty bitmap for all.
 828      * Clearing it earlier won't be a problem, but too late will.
 829      */
 830     if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
 831         uint8_t shift = rb->clear_bmap_shift;
 832         hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
 833         hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
 834
 835         /*
 836          * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
 837          * can make things easier sometimes since then start address
 838          * of the small chunk will always be 64 pages aligned so the
 839          * bitmap will always be aligned to unsigned long.  We should
 840          * even be able to remove this restriction but I'm simply
 841          * keeping it.
 842          */
 843         assert(shift >= 6);
 844         trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
 845         memory_region_clear_dirty_bitmap(rb->mr, start, size);
 846     }
 847
 848     ret = test_and_clear_bit(page, rb->bmap);
 849
 850     if (ret) {
 851         rs->migration_dirty_pages--;
 852     }
 853
 854     return ret;
 855 }
 856
 857 /* Called with RCU critical section */
 858 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
 859 {
 860     uint64_t new_dirty_pages =
 861         cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
 862
 863     rs->migration_dirty_pages += new_dirty_pages;
 864     rs->num_dirty_pages_period += new_dirty_pages;
 865 }
 866
 867 /**
 868  * ram_pagesize_summary: calculate all the pagesizes of a VM
 869  *
 870  * Returns a summary bitmap of the page sizes of all RAMBlocks
 871  *
 872  * For VMs with just normal pages this is equivalent to the host page
 873  * size. If it's got some huge pages then it's the OR of all the
 874  * different page sizes.
 875  */
 876 uint64_t ram_pagesize_summary(void)
 877 {
 878     RAMBlock *block;
 879     uint64_t summary = 0;
 880
 881     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 882         summary |= block->page_size;
 883     }
 884
 885     return summary;
 886 }
 887
 888 uint64_t ram_get_total_transferred_pages(void)
 889 {
 890     return  ram_counters.normal + ram_counters.duplicate +
 891                 compression_counters.pages + xbzrle_counters.pages;
 892 }
 893
 894 static void migration_update_rates(RAMState *rs, int64_t end_time)
 895 {
 896     uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
 897     double compressed_size;
 898
 899     /* calculate period counters */
 900     ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 901                 / (end_time - rs->time_last_bitmap_sync);
 902
 903     if (!page_count) {
 904         return;
 905     }
 906
 907     if (migrate_use_xbzrle()) {
 908         double encoded_size, unencoded_size;
 909
 910         xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
 911             rs->xbzrle_cache_miss_prev) / page_count;
 912         rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 913         unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
 914                          TARGET_PAGE_SIZE;
 915         encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
 916         if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
 917             xbzrle_counters.encoding_rate = 0;
 918         } else {
 919             xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
 920         }
 921         rs->xbzrle_pages_prev = xbzrle_counters.pages;
 922         rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
 923     }
 924
 925     if (migrate_use_compression()) {
 926         compression_counters.busy_rate = (double)(compression_counters.busy -
 927             rs->compress_thread_busy_prev) / page_count;
 928         rs->compress_thread_busy_prev = compression_counters.busy;
 929
 930         compressed_size = compression_counters.compressed_size -
 931                           rs->compressed_size_prev;
 932         if (compressed_size) {
 933             double uncompressed_size = (compression_counters.pages -
 934                                     rs->compress_pages_prev) * TARGET_PAGE_SIZE;
 935
 936             /* Compression-Ratio = Uncompressed-size / Compressed-size */
 937             compression_counters.compression_rate =
 938                                         uncompressed_size / compressed_size;
 939
 940             rs->compress_pages_prev = compression_counters.pages;
 941             rs->compressed_size_prev = compression_counters.compressed_size;
 942         }
 943     }
 944 }
 945
 946 static void migration_trigger_throttle(RAMState *rs)
 947 {
 948     MigrationState *s = migrate_get_current();
 949     uint64_t threshold = s->parameters.throttle_trigger_threshold;
 950
 951     uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
 952     uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
 953     uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
 954
 955     /* During block migration the auto-converge logic incorrectly detects
 956      * that ram migration makes no progress. Avoid this by disabling the
 957      * throttling logic during the bulk phase of block migration. */
 958     if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 959         /* The following detection logic can be refined later. For now:
 960            Check to see if the ratio between dirtied bytes and the approx.
 961            amount of bytes that just got transferred since the last time
 962            we were in this routine reaches the threshold. If that happens
 963            twice, start or increase throttling. */
 964
 965         if ((bytes_dirty_period > bytes_dirty_threshold) &&
 966             (++rs->dirty_rate_high_cnt >= 2)) {
 967             trace_migration_throttle();
 968             rs->dirty_rate_high_cnt = 0;
 969             mig_throttle_guest_down(bytes_dirty_period,
 970                                     bytes_dirty_threshold);
 971         }
 972     }
 973 }
 974
 975 static void migration_bitmap_sync(RAMState *rs)
 976 {
 977     RAMBlock *block;
 978     int64_t end_time;
 979
 980     ram_counters.dirty_sync_count++;
 981
 982     if (!rs->time_last_bitmap_sync) {
 983         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 984     }
 985
 986     trace_migration_bitmap_sync_start();
 987     memory_global_dirty_log_sync();
 988
 989     qemu_mutex_lock(&rs->bitmap_mutex);
 990     WITH_RCU_READ_LOCK_GUARD() {
 991         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
 992             ramblock_sync_dirty_bitmap(rs, block);
 993         }
 994         ram_counters.remaining = ram_bytes_remaining();
 995     }
 996     qemu_mutex_unlock(&rs->bitmap_mutex);
 997
 998     memory_global_after_dirty_log_sync();
 999     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1000
1001     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1002
1003     /* more than 1 second = 1000 millisecons */
1004     if (end_time > rs->time_last_bitmap_sync + 1000) {
1005         migration_trigger_throttle(rs);
1006
1007         migration_update_rates(rs, end_time);
1008
1009         rs->target_page_count_prev = rs->target_page_count;
1010
1011         /* reset period counters */
1012         rs->time_last_bitmap_sync = end_time;
1013         rs->num_dirty_pages_period = 0;
1014         rs->bytes_xfer_prev = ram_counters.transferred;
1015     }
1016     if (migrate_use_events()) {
1017         qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1018     }
1019 }
1020
1021 static void migration_bitmap_sync_precopy(RAMState *rs)
1022 {
1023     Error *local_err = NULL;
1024
1025     /*
1026      * The current notifier usage is just an optimization to migration, so we
1027      * don't stop the normal migration process in the error case.
1028      */
1029     if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1030         error_report_err(local_err);
1031         local_err = NULL;
1032     }
1033
1034     migration_bitmap_sync(rs);
1035
1036     if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1037         error_report_err(local_err);
1038     }
1039 }
1040
1041 /**
1042  * save_zero_page_to_file: send the zero page to the file
1043  *
1044  * Returns the size of data written to the file, 0 means the page is not
1045  * a zero page
1046  *
1047  * @rs: current RAM state
1048  * @file: the file where the data is saved
1049  * @block: block that contains the page we want to send
1050  * @offset: offset inside the block for the page
1051  */
1052 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1053                                   RAMBlock *block, ram_addr_t offset)
1054 {
1055     uint8_t *p = block->host + offset;
1056     int len = 0;
1057
1058     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1059         len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1060         qemu_put_byte(file, 0);
1061         len += 1;
1062     }
1063     return len;
1064 }
1065
1066 /**
1067  * save_zero_page: send the zero page to the stream
1068  *
1069  * Returns the number of pages written.
1070  *
1071  * @rs: current RAM state
1072  * @block: block that contains the page we want to send
1073  * @offset: offset inside the block for the page
1074  */
1075 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1076 {
1077     int len = save_zero_page_to_file(rs, rs->f, block, offset);
1078
1079     if (len) {
1080         ram_counters.duplicate++;
1081         ram_counters.transferred += len;
1082         return 1;
1083     }
1084     return -1;
1085 }
1086
1087 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1088 {
1089     if (!migrate_release_ram() || !migration_in_postcopy()) {
1090         return;
1091     }
1092
1093     ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1094 }
1095
1096 /*
1097  * @pages: the number of pages written by the control path,
1098  *        < 0 - error
1099  *        > 0 - number of pages written
1100  *
1101  * Return true if the pages has been saved, otherwise false is returned.
1102  */
1103 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1104                               int *pages)
1105 {
1106     uint64_t bytes_xmit = 0;
1107     int ret;
1108
1109     *pages = -1;
1110     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1111                                 &bytes_xmit);
1112     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1113         return false;
1114     }
1115
1116     if (bytes_xmit) {
1117         ram_counters.transferred += bytes_xmit;
1118         *pages = 1;
1119     }
1120
1121     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1122         return true;
1123     }
1124
1125     if (bytes_xmit > 0) {
1126         ram_counters.normal++;
1127     } else if (bytes_xmit == 0) {
1128         ram_counters.duplicate++;
1129     }
1130
1131     return true;
1132 }
1133
1134 /*
1135  * directly send the page to the stream
1136  *
1137  * Returns the number of pages written.
1138  *
1139  * @rs: current RAM state
1140  * @block: block that contains the page we want to send
1141  * @offset: offset inside the block for the page
1142  * @buf: the page to be sent
1143  * @async: send to page asyncly
1144  */
1145 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1146                             uint8_t *buf, bool async)
1147 {
1148     ram_counters.transferred += save_page_header(rs, rs->f, block,
1149                                                  offset | RAM_SAVE_FLAG_PAGE);
1150     if (async) {
1151         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1152                               migrate_release_ram() &
1153                               migration_in_postcopy());
1154     } else {
1155         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1156     }
1157     ram_counters.transferred += TARGET_PAGE_SIZE;
1158     ram_counters.normal++;
1159     return 1;
1160 }
1161
1162 /**
1163  * ram_save_page: send the given page to the stream
1164  *
1165  * Returns the number of pages written.
1166  *          < 0 - error
1167  *          >=0 - Number of pages written - this might legally be 0
1168  *                if xbzrle noticed the page was the same.
1169  *
1170  * @rs: current RAM state
1171  * @block: block that contains the page we want to send
1172  * @offset: offset inside the block for the page
1173  * @last_stage: if we are at the completion stage
1174  */
1175 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1176 {
1177     int pages = -1;
1178     uint8_t *p;
1179     bool send_async = true;
1180     RAMBlock *block = pss->block;
1181     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1182     ram_addr_t current_addr = block->offset + offset;
1183
1184     p = block->host + offset;
1185     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1186
1187     XBZRLE_cache_lock();
1188     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1189         migrate_use_xbzrle()) {
1190         pages = save_xbzrle_page(rs, &p, current_addr, block,
1191                                  offset, last_stage);
1192         if (!last_stage) {
1193             /* Can't send this cached data async, since the cache page
1194              * might get updated before it gets to the wire
1195              */
1196             send_async = false;
1197         }
1198     }
1199
1200     /* XBZRLE overflow or normal page */
1201     if (pages == -1) {
1202         pages = save_normal_page(rs, block, offset, p, send_async);
1203     }
1204
1205     XBZRLE_cache_unlock();
1206
1207     return pages;
1208 }
1209
1210 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1211                                  ram_addr_t offset)
1212 {
1213     if (multifd_queue_page(rs->f, block, offset) < 0) {
1214         return -1;
1215     }
1216     ram_counters.normal++;
1217
1218     return 1;
1219 }
1220
1221 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1222                                  ram_addr_t offset, uint8_t *source_buf)
1223 {
1224     RAMState *rs = ram_state;
1225     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1226     bool zero_page = false;
1227     int ret;
1228
1229     if (save_zero_page_to_file(rs, f, block, offset)) {
1230         zero_page = true;
1231         goto exit;
1232     }
1233
1234     save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1235
1236     /*
1237      * copy it to a internal buffer to avoid it being modified by VM
1238      * so that we can catch up the error during compression and
1239      * decompression
1240      */
1241     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1242     ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1243     if (ret < 0) {
1244         qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1245         error_report("compressed data failed!");
1246         return false;
1247     }
1248
1249 exit:
1250     ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1251     return zero_page;
1252 }
1253
1254 static void
1255 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1256 {
1257     ram_counters.transferred += bytes_xmit;
1258
1259     if (param->zero_page) {
1260         ram_counters.duplicate++;
1261         return;
1262     }
1263
1264     /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1265     compression_counters.compressed_size += bytes_xmit - 8;
1266     compression_counters.pages++;
1267 }
1268
1269 static bool save_page_use_compression(RAMState *rs);
1270
1271 static void flush_compressed_data(RAMState *rs)
1272 {
1273     int idx, len, thread_count;
1274
1275     if (!save_page_use_compression(rs)) {
1276         return;
1277     }
1278     thread_count = migrate_compress_threads();
1279
1280     qemu_mutex_lock(&comp_done_lock);
1281     for (idx = 0; idx < thread_count; idx++) {
1282         while (!comp_param[idx].done) {
1283             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1284         }
1285     }
1286     qemu_mutex_unlock(&comp_done_lock);
1287
1288     for (idx = 0; idx < thread_count; idx++) {
1289         qemu_mutex_lock(&comp_param[idx].mutex);
1290         if (!comp_param[idx].quit) {
1291             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1292             /*
1293              * it's safe to fetch zero_page without holding comp_done_lock
1294              * as there is no further request submitted to the thread,
1295              * i.e, the thread should be waiting for a request at this point.
1296              */
1297             update_compress_thread_counts(&comp_param[idx], len);
1298         }
1299         qemu_mutex_unlock(&comp_param[idx].mutex);
1300     }
1301 }
1302
1303 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1304                                        ram_addr_t offset)
1305 {
1306     param->block = block;
1307     param->offset = offset;
1308 }
1309
1310 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1311                                            ram_addr_t offset)
1312 {
1313     int idx, thread_count, bytes_xmit = -1, pages = -1;
1314     bool wait = migrate_compress_wait_thread();
1315
1316     thread_count = migrate_compress_threads();
1317     qemu_mutex_lock(&comp_done_lock);
1318 retry:
1319     for (idx = 0; idx < thread_count; idx++) {
1320         if (comp_param[idx].done) {
1321             comp_param[idx].done = false;
1322             bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1323             qemu_mutex_lock(&comp_param[idx].mutex);
1324             set_compress_params(&comp_param[idx], block, offset);
1325             qemu_cond_signal(&comp_param[idx].cond);
1326             qemu_mutex_unlock(&comp_param[idx].mutex);
1327             pages = 1;
1328             update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1329             break;
1330         }
1331     }
1332
1333     /*
1334      * wait for the free thread if the user specifies 'compress-wait-thread',
1335      * otherwise we will post the page out in the main thread as normal page.
1336      */
1337     if (pages < 0 && wait) {
1338         qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1339         goto retry;
1340     }
1341     qemu_mutex_unlock(&comp_done_lock);
1342
1343     return pages;
1344 }
1345
1346 /**
1347  * find_dirty_block: find the next dirty page and update any state
1348  * associated with the search process.
1349  *
1350  * Returns true if a page is found
1351  *
1352  * @rs: current RAM state
1353  * @pss: data about the state of the current dirty page scan
1354  * @again: set to false if the search has scanned the whole of RAM
1355  */
1356 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1357 {
1358     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1359     if (pss->complete_round && pss->block == rs->last_seen_block &&
1360         pss->page >= rs->last_page) {
1361         /*
1362          * We've been once around the RAM and haven't found anything.
1363          * Give up.
1364          */
1365         *again = false;
1366         return false;
1367     }
1368     if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1369         >= pss->block->used_length) {
1370         /* Didn't find anything in this RAM Block */
1371         pss->page = 0;
1372         pss->block = QLIST_NEXT_RCU(pss->block, next);
1373         if (!pss->block) {
1374             /*
1375              * If memory migration starts over, we will meet a dirtied page
1376              * which may still exists in compression threads's ring, so we
1377              * should flush the compressed data to make sure the new page
1378              * is not overwritten by the old one in the destination.
1379              *
1380              * Also If xbzrle is on, stop using the data compression at this
1381              * point. In theory, xbzrle can do better than compression.
1382              */
1383             flush_compressed_data(rs);
1384
1385             /* Hit the end of the list */
1386             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1387             /* Flag that we've looped */
1388             pss->complete_round = true;
1389             rs->ram_bulk_stage = false;
1390         }
1391         /* Didn't find anything this time, but try again on the new block */
1392         *again = true;
1393         return false;
1394     } else {
1395         /* Can go around again, but... */
1396         *again = true;
1397         /* We've found something so probably don't need to */
1398         return true;
1399     }
1400 }
1401
1402 /**
1403  * unqueue_page: gets a page of the queue
1404  *
1405  * Helper for 'get_queued_page' - gets a page off the queue
1406  *
1407  * Returns the block of the page (or NULL if none available)
1408  *
1409  * @rs: current RAM state
1410  * @offset: used to return the offset within the RAMBlock
1411  */
1412 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1413 {
1414     RAMBlock *block = NULL;
1415
1416     if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1417         return NULL;
1418     }
1419
1420     QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1421     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1422         struct RAMSrcPageRequest *entry =
1423                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1424         block = entry->rb;
1425         *offset = entry->offset;
1426
1427         if (entry->len > TARGET_PAGE_SIZE) {
1428             entry->len -= TARGET_PAGE_SIZE;
1429             entry->offset += TARGET_PAGE_SIZE;
1430         } else {
1431             memory_region_unref(block->mr);
1432             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1433             g_free(entry);
1434             migration_consume_urgent_request();
1435         }
1436     }
1437
1438     return block;
1439 }
1440
1441 #if defined(__linux__)
1442 /**
1443  * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1444  *   is found, return RAM block pointer and page offset
1445  *
1446  * Returns pointer to the RAMBlock containing faulting page,
1447  *   NULL if no write faults are pending
1448  *
1449  * @rs: current RAM state
1450  * @offset: page offset from the beginning of the block
1451  */
1452 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1453 {
1454     struct uffd_msg uffd_msg;
1455     void *page_address;
1456     RAMBlock *block;
1457     int res;
1458
1459     if (!migrate_background_snapshot()) {
1460         return NULL;
1461     }
1462
1463     res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1464     if (res <= 0) {
1465         return NULL;
1466     }
1467
1468     page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1469     block = qemu_ram_block_from_host(page_address, false, offset);
1470     assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1471     return block;
1472 }
1473
1474 /**
1475  * ram_save_release_protection: release UFFD write protection after
1476  *   a range of pages has been saved
1477  *
1478  * @rs: current RAM state
1479  * @pss: page-search-status structure
1480  * @start_page: index of the first page in the range relative to pss->block
1481  *
1482  * Returns 0 on success, negative value in case of an error
1483 */
1484 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1485         unsigned long start_page)
1486 {
1487     int res = 0;
1488
1489     /* Check if page is from UFFD-managed region. */
1490     if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1491         void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1492         uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1493
1494         /* Flush async buffers before un-protect. */
1495         qemu_fflush(rs->f);
1496         /* Un-protect memory range. */
1497         res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1498                 false, false);
1499     }
1500
1501     return res;
1502 }
1503
1504 /* ram_write_tracking_available: check if kernel supports required UFFD features
1505  *
1506  * Returns true if supports, false otherwise
1507  */
1508 bool ram_write_tracking_available(void)
1509 {
1510     uint64_t uffd_features;
1511     int res;
1512
1513     res = uffd_query_features(&uffd_features);
1514     return (res == 0 &&
1515             (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1516 }
1517
1518 /* ram_write_tracking_compatible: check if guest configuration is
1519  *   compatible with 'write-tracking'
1520  *
1521  * Returns true if compatible, false otherwise
1522  */
1523 bool ram_write_tracking_compatible(void)
1524 {
1525     const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1526     int uffd_fd;
1527     RAMBlock *block;
1528     bool ret = false;
1529
1530     /* Open UFFD file descriptor */
1531     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1532     if (uffd_fd < 0) {
1533         return false;
1534     }
1535
1536     RCU_READ_LOCK_GUARD();
1537
1538     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1539         uint64_t uffd_ioctls;
1540
1541         /* Nothing to do with read-only and MMIO-writable regions */
1542         if (block->mr->readonly || block->mr->rom_device) {
1543             continue;
1544         }
1545         /* Try to register block memory via UFFD-IO to track writes */
1546         if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1547                 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1548             goto out;
1549         }
1550         if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1551             goto out;
1552         }
1553     }
1554     ret = true;
1555
1556 out:
1557     uffd_close_fd(uffd_fd);
1558     return ret;
1559 }
1560
1561 /*
1562  * ram_block_populate_pages: populate memory in the RAM block by reading
1563  *   an integer from the beginning of each page.
1564  *
1565  * Since it's solely used for userfault_fd WP feature, here we just
1566  *   hardcode page size to qemu_real_host_page_size.
1567  *
1568  * @block: RAM block to populate
1569  */
1570 static void ram_block_populate_pages(RAMBlock *block)
1571 {
1572     char *ptr = (char *) block->host;
1573
1574     for (ram_addr_t offset = 0; offset < block->used_length;
1575             offset += qemu_real_host_page_size) {
1576         char tmp = *(ptr + offset);
1577
1578         /* Don't optimize the read out */
1579         asm volatile("" : "+r" (tmp));
1580     }
1581 }
1582
1583 /*
1584  * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1585  */
1586 void ram_write_tracking_prepare(void)
1587 {
1588     RAMBlock *block;
1589
1590     RCU_READ_LOCK_GUARD();
1591
1592     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1593         /* Nothing to do with read-only and MMIO-writable regions */
1594         if (block->mr->readonly || block->mr->rom_device) {
1595             continue;
1596         }
1597
1598         /*
1599          * Populate pages of the RAM block before enabling userfault_fd
1600          * write protection.
1601          *
1602          * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1603          * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1604          * pages with pte_none() entries in page table.
1605          */
1606         ram_block_populate_pages(block);
1607     }
1608 }
1609
1610 /*
1611  * ram_write_tracking_start: start UFFD-WP memory tracking
1612  *
1613  * Returns 0 for success or negative value in case of error
1614  */
1615 int ram_write_tracking_start(void)
1616 {
1617     int uffd_fd;
1618     RAMState *rs = ram_state;
1619     RAMBlock *block;
1620
1621     /* Open UFFD file descriptor */
1622     uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1623     if (uffd_fd < 0) {
1624         return uffd_fd;
1625     }
1626     rs->uffdio_fd = uffd_fd;
1627
1628     RCU_READ_LOCK_GUARD();
1629
1630     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1631         /* Nothing to do with read-only and MMIO-writable regions */
1632         if (block->mr->readonly || block->mr->rom_device) {
1633             continue;
1634         }
1635
1636         /* Register block memory with UFFD to track writes */
1637         if (uffd_register_memory(rs->uffdio_fd, block->host,
1638                 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1639             goto fail;
1640         }
1641         /* Apply UFFD write protection to the block memory range */
1642         if (uffd_change_protection(rs->uffdio_fd, block->host,
1643                 block->max_length, true, false)) {
1644             goto fail;
1645         }
1646         block->flags |= RAM_UF_WRITEPROTECT;
1647         memory_region_ref(block->mr);
1648
1649         trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1650                 block->host, block->max_length);
1651     }
1652
1653     return 0;
1654
1655 fail:
1656     error_report("ram_write_tracking_start() failed: restoring initial memory state");
1657
1658     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1659         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1660             continue;
1661         }
1662         /*
1663          * In case some memory block failed to be write-protected
1664          * remove protection and unregister all succeeded RAM blocks
1665          */
1666         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1667                 false, false);
1668         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1669         /* Cleanup flags and remove reference */
1670         block->flags &= ~RAM_UF_WRITEPROTECT;
1671         memory_region_unref(block->mr);
1672     }
1673
1674     uffd_close_fd(uffd_fd);
1675     rs->uffdio_fd = -1;
1676     return -1;
1677 }
1678
1679 /**
1680  * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1681  */
1682 void ram_write_tracking_stop(void)
1683 {
1684     RAMState *rs = ram_state;
1685     RAMBlock *block;
1686
1687     RCU_READ_LOCK_GUARD();
1688
1689     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1690         if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1691             continue;
1692         }
1693         /* Remove protection and unregister all affected RAM blocks */
1694         uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1695                 false, false);
1696         uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1697
1698         trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1699                 block->host, block->max_length);
1700
1701         /* Cleanup flags and remove reference */
1702         block->flags &= ~RAM_UF_WRITEPROTECT;
1703         memory_region_unref(block->mr);
1704     }
1705
1706     /* Finally close UFFD file descriptor */
1707     uffd_close_fd(rs->uffdio_fd);
1708     rs->uffdio_fd = -1;
1709 }
1710
1711 #else
1712 /* No target OS support, stubs just fail or ignore */
1713
1714 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1715 {
1716     (void) rs;
1717     (void) offset;
1718
1719     return NULL;
1720 }
1721
1722 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1723         unsigned long start_page)
1724 {
1725     (void) rs;
1726     (void) pss;
1727     (void) start_page;
1728
1729     return 0;
1730 }
1731
1732 bool ram_write_tracking_available(void)
1733 {
1734     return false;
1735 }
1736
1737 bool ram_write_tracking_compatible(void)
1738 {
1739     assert(0);
1740     return false;
1741 }
1742
1743 int ram_write_tracking_start(void)
1744 {
1745     assert(0);
1746     return -1;
1747 }
1748
1749 void ram_write_tracking_stop(void)
1750 {
1751     assert(0);
1752 }
1753 #endif /* defined(__linux__) */
1754
1755 /**
1756  * get_queued_page: unqueue a page from the postcopy requests
1757  *
1758  * Skips pages that are already sent (!dirty)
1759  *
1760  * Returns true if a queued page is found
1761  *
1762  * @rs: current RAM state
1763  * @pss: data about the state of the current dirty page scan
1764  */
1765 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1766 {
1767     RAMBlock  *block;
1768     ram_addr_t offset;
1769     bool dirty;
1770
1771     do {
1772         block = unqueue_page(rs, &offset);
1773         /*
1774          * We're sending this page, and since it's postcopy nothing else
1775          * will dirty it, and we must make sure it doesn't get sent again
1776          * even if this queue request was received after the background
1777          * search already sent it.
1778          */
1779         if (block) {
1780             unsigned long page;
1781
1782             page = offset >> TARGET_PAGE_BITS;
1783             dirty = test_bit(page, block->bmap);
1784             if (!dirty) {
1785                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1786                                                 page);
1787             } else {
1788                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1789             }
1790         }
1791
1792     } while (block && !dirty);
1793
1794     if (!block) {
1795         /*
1796          * Poll write faults too if background snapshot is enabled; that's
1797          * when we have vcpus got blocked by the write protected pages.
1798          */
1799         block = poll_fault_page(rs, &offset);
1800     }
1801
1802     if (block) {
1803         /*
1804          * As soon as we start servicing pages out of order, then we have
1805          * to kill the bulk stage, since the bulk stage assumes
1806          * in (migration_bitmap_find_and_reset_dirty) that every page is
1807          * dirty, that's no longer true.
1808          */
1809         rs->ram_bulk_stage = false;
1810
1811         /*
1812          * We want the background search to continue from the queued page
1813          * since the guest is likely to want other pages near to the page
1814          * it just requested.
1815          */
1816         pss->block = block;
1817         pss->page = offset >> TARGET_PAGE_BITS;
1818
1819         /*
1820          * This unqueued page would break the "one round" check, even is
1821          * really rare.
1822          */
1823         pss->complete_round = false;
1824     }
1825
1826     return !!block;
1827 }
1828
1829 /**
1830  * migration_page_queue_free: drop any remaining pages in the ram
1831  * request queue
1832  *
1833  * It should be empty at the end anyway, but in error cases there may
1834  * be some left.  in case that there is any page left, we drop it.
1835  *
1836  */
1837 static void migration_page_queue_free(RAMState *rs)
1838 {
1839     struct RAMSrcPageRequest *mspr, *next_mspr;
1840     /* This queue generally should be empty - but in the case of a failed
1841      * migration might have some droppings in.
1842      */
1843     RCU_READ_LOCK_GUARD();
1844     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1845         memory_region_unref(mspr->rb->mr);
1846         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1847         g_free(mspr);
1848     }
1849 }
1850
1851 /**
1852  * ram_save_queue_pages: queue the page for transmission
1853  *
1854  * A request from postcopy destination for example.
1855  *
1856  * Returns zero on success or negative on error
1857  *
1858  * @rbname: Name of the RAMBLock of the request. NULL means the
1859  *          same that last one.
1860  * @start: starting address from the start of the RAMBlock
1861  * @len: length (in bytes) to send
1862  */
1863 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1864 {
1865     RAMBlock *ramblock;
1866     RAMState *rs = ram_state;
1867
1868     ram_counters.postcopy_requests++;
1869     RCU_READ_LOCK_GUARD();
1870
1871     if (!rbname) {
1872         /* Reuse last RAMBlock */
1873         ramblock = rs->last_req_rb;
1874
1875         if (!ramblock) {
1876             /*
1877              * Shouldn't happen, we can't reuse the last RAMBlock if
1878              * it's the 1st request.
1879              */
1880             error_report("ram_save_queue_pages no previous block");
1881             return -1;
1882         }
1883     } else {
1884         ramblock = qemu_ram_block_by_name(rbname);
1885
1886         if (!ramblock) {
1887             /* We shouldn't be asked for a non-existent RAMBlock */
1888             error_report("ram_save_queue_pages no block '%s'", rbname);
1889             return -1;
1890         }
1891         rs->last_req_rb = ramblock;
1892     }
1893     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1894     if (start + len > ramblock->used_length) {
1895         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1896                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1897                      __func__, start, len, ramblock->used_length);
1898         return -1;
1899     }
1900
1901     struct RAMSrcPageRequest *new_entry =
1902         g_malloc0(sizeof(struct RAMSrcPageRequest));
1903     new_entry->rb = ramblock;
1904     new_entry->offset = start;
1905     new_entry->len = len;
1906
1907     memory_region_ref(ramblock->mr);
1908     qemu_mutex_lock(&rs->src_page_req_mutex);
1909     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1910     migration_make_urgent_request();
1911     qemu_mutex_unlock(&rs->src_page_req_mutex);
1912
1913     return 0;
1914 }
1915
1916 static bool save_page_use_compression(RAMState *rs)
1917 {
1918     if (!migrate_use_compression()) {
1919         return false;
1920     }
1921
1922     /*
1923      * If xbzrle is on, stop using the data compression after first
1924      * round of migration even if compression is enabled. In theory,
1925      * xbzrle can do better than compression.
1926      */
1927     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1928         return true;
1929     }
1930
1931     return false;
1932 }
1933
1934 /*
1935  * try to compress the page before posting it out, return true if the page
1936  * has been properly handled by compression, otherwise needs other
1937  * paths to handle it
1938  */
1939 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1940 {
1941     if (!save_page_use_compression(rs)) {
1942         return false;
1943     }
1944
1945     /*
1946      * When starting the process of a new block, the first page of
1947      * the block should be sent out before other pages in the same
1948      * block, and all the pages in last block should have been sent
1949      * out, keeping this order is important, because the 'cont' flag
1950      * is used to avoid resending the block name.
1951      *
1952      * We post the fist page as normal page as compression will take
1953      * much CPU resource.
1954      */
1955     if (block != rs->last_sent_block) {
1956         flush_compressed_data(rs);
1957         return false;
1958     }
1959
1960     if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1961         return true;
1962     }
1963
1964     compression_counters.busy++;
1965     return false;
1966 }
1967
1968 /**
1969  * ram_save_target_page: save one target page
1970  *
1971  * Returns the number of pages written
1972  *
1973  * @rs: current RAM state
1974  * @pss: data about the page we want to send
1975  * @last_stage: if we are at the completion stage
1976  */
1977 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1978                                 bool last_stage)
1979 {
1980     RAMBlock *block = pss->block;
1981     ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1982     int res;
1983
1984     if (control_save_page(rs, block, offset, &res)) {
1985         return res;
1986     }
1987
1988     if (save_compress_page(rs, block, offset)) {
1989         return 1;
1990     }
1991
1992     res = save_zero_page(rs, block, offset);
1993     if (res > 0) {
1994         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1995          * page would be stale
1996          */
1997         if (!save_page_use_compression(rs)) {
1998             XBZRLE_cache_lock();
1999             xbzrle_cache_zero_page(rs, block->offset + offset);
2000             XBZRLE_cache_unlock();
2001         }
2002         ram_release_pages(block->idstr, offset, res);
2003         return res;
2004     }
2005
2006     /*
2007      * Do not use multifd for:
2008      * 1. Compression as the first page in the new block should be posted out
2009      *    before sending the compressed page
2010      * 2. In postcopy as one whole host page should be placed
2011      */
2012     if (!save_page_use_compression(rs) && migrate_use_multifd()
2013         && !migration_in_postcopy()) {
2014         return ram_save_multifd_page(rs, block, offset);
2015     }
2016
2017     return ram_save_page(rs, pss, last_stage);
2018 }
2019
2020 /**
2021  * ram_save_host_page: save a whole host page
2022  *
2023  * Starting at *offset send pages up to the end of the current host
2024  * page. It's valid for the initial offset to point into the middle of
2025  * a host page in which case the remainder of the hostpage is sent.
2026  * Only dirty target pages are sent. Note that the host page size may
2027  * be a huge page for this block.
2028  * The saving stops at the boundary of the used_length of the block
2029  * if the RAMBlock isn't a multiple of the host page size.
2030  *
2031  * Returns the number of pages written or negative on error
2032  *
2033  * @rs: current RAM state
2034  * @ms: current migration state
2035  * @pss: data about the page we want to send
2036  * @last_stage: if we are at the completion stage
2037  */
2038 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2039                               bool last_stage)
2040 {
2041     int tmppages, pages = 0;
2042     size_t pagesize_bits =
2043         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2044     unsigned long start_page = pss->page;
2045     int res;
2046
2047     if (ramblock_is_ignored(pss->block)) {
2048         error_report("block %s should not be migrated !", pss->block->idstr);
2049         return 0;
2050     }
2051
2052     do {
2053         /* Check the pages is dirty and if it is send it */
2054         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2055             pss->page++;
2056             continue;
2057         }
2058
2059         tmppages = ram_save_target_page(rs, pss, last_stage);
2060         if (tmppages < 0) {
2061             return tmppages;
2062         }
2063
2064         pages += tmppages;
2065         pss->page++;
2066         /* Allow rate limiting to happen in the middle of huge pages */
2067         migration_rate_limit();
2068     } while ((pss->page & (pagesize_bits - 1)) &&
2069              offset_in_ramblock(pss->block,
2070                                 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2071     /* The offset we leave with is the last one we looked at */
2072     pss->page--;
2073
2074     res = ram_save_release_protection(rs, pss, start_page);
2075     return (res < 0 ? res : pages);
2076 }
2077
2078 /**
2079  * ram_find_and_save_block: finds a dirty page and sends it to f
2080  *
2081  * Called within an RCU critical section.
2082  *
2083  * Returns the number of pages written where zero means no dirty pages,
2084  * or negative on error
2085  *
2086  * @rs: current RAM state
2087  * @last_stage: if we are at the completion stage
2088  *
2089  * On systems where host-page-size > target-page-size it will send all the
2090  * pages in a host page that are dirty.
2091  */
2092
2093 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2094 {
2095     PageSearchStatus pss;
2096     int pages = 0;
2097     bool again, found;
2098
2099     /* No dirty page as there is zero RAM */
2100     if (!ram_bytes_total()) {
2101         return pages;
2102     }
2103
2104     pss.block = rs->last_seen_block;
2105     pss.page = rs->last_page;
2106     pss.complete_round = false;
2107
2108     if (!pss.block) {
2109         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2110     }
2111
2112     do {
2113         again = true;
2114         found = get_queued_page(rs, &pss);
2115
2116         if (!found) {
2117             /* priority queue empty, so just search for something dirty */
2118             found = find_dirty_block(rs, &pss, &again);
2119         }
2120
2121         if (found) {
2122             pages = ram_save_host_page(rs, &pss, last_stage);
2123         }
2124     } while (!pages && again);
2125
2126     rs->last_seen_block = pss.block;
2127     rs->last_page = pss.page;
2128
2129     return pages;
2130 }
2131
2132 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2133 {
2134     uint64_t pages = size / TARGET_PAGE_SIZE;
2135
2136     if (zero) {
2137         ram_counters.duplicate += pages;
2138     } else {
2139         ram_counters.normal += pages;
2140         ram_counters.transferred += size;
2141         qemu_update_position(f, size);
2142     }
2143 }
2144
2145 static uint64_t ram_bytes_total_common(bool count_ignored)
2146 {
2147     RAMBlock *block;
2148     uint64_t total = 0;
2149
2150     RCU_READ_LOCK_GUARD();
2151
2152     if (count_ignored) {
2153         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2154             total += block->used_length;
2155         }
2156     } else {
2157         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2158             total += block->used_length;
2159         }
2160     }
2161     return total;
2162 }
2163
2164 uint64_t ram_bytes_total(void)
2165 {
2166     return ram_bytes_total_common(false);
2167 }
2168
2169 static void xbzrle_load_setup(void)
2170 {
2171     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2172 }
2173
2174 static void xbzrle_load_cleanup(void)
2175 {
2176     g_free(XBZRLE.decoded_buf);
2177     XBZRLE.decoded_buf = NULL;
2178 }
2179
2180 static void ram_state_cleanup(RAMState **rsp)
2181 {
2182     if (*rsp) {
2183         migration_page_queue_free(*rsp);
2184         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2185         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2186         g_free(*rsp);
2187         *rsp = NULL;
2188     }
2189 }
2190
2191 static void xbzrle_cleanup(void)
2192 {
2193     XBZRLE_cache_lock();
2194     if (XBZRLE.cache) {
2195         cache_fini(XBZRLE.cache);
2196         g_free(XBZRLE.encoded_buf);
2197         g_free(XBZRLE.current_buf);
2198         g_free(XBZRLE.zero_target_page);
2199         XBZRLE.cache = NULL;
2200         XBZRLE.encoded_buf = NULL;
2201         XBZRLE.current_buf = NULL;
2202         XBZRLE.zero_target_page = NULL;
2203     }
2204     XBZRLE_cache_unlock();
2205 }
2206
2207 static void ram_save_cleanup(void *opaque)
2208 {
2209     RAMState **rsp = opaque;
2210     RAMBlock *block;
2211
2212     /* We don't use dirty log with background snapshots */
2213     if (!migrate_background_snapshot()) {
2214         /* caller have hold iothread lock or is in a bh, so there is
2215          * no writing race against the migration bitmap
2216          */
2217         memory_global_dirty_log_stop();
2218     }
2219
2220     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2221         g_free(block->clear_bmap);
2222         block->clear_bmap = NULL;
2223         g_free(block->bmap);
2224         block->bmap = NULL;
2225     }
2226
2227     xbzrle_cleanup();
2228     compress_threads_save_cleanup();
2229     ram_state_cleanup(rsp);
2230 }
2231
2232 static void ram_state_reset(RAMState *rs)
2233 {
2234     rs->last_seen_block = NULL;
2235     rs->last_sent_block = NULL;
2236     rs->last_page = 0;
2237     rs->last_version = ram_list.version;
2238     rs->ram_bulk_stage = true;
2239     rs->fpo_enabled = false;
2240 }
2241
2242 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2243
2244 /*
2245  * 'expected' is the value you expect the bitmap mostly to be full
2246  * of; it won't bother printing lines that are all this value.
2247  * If 'todump' is null the migration bitmap is dumped.
2248  */
2249 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2250                            unsigned long pages)
2251 {
2252     int64_t cur;
2253     int64_t linelen = 128;
2254     char linebuf[129];
2255
2256     for (cur = 0; cur < pages; cur += linelen) {
2257         int64_t curb;
2258         bool found = false;
2259         /*
2260          * Last line; catch the case where the line length
2261          * is longer than remaining ram
2262          */
2263         if (cur + linelen > pages) {
2264             linelen = pages - cur;
2265         }
2266         for (curb = 0; curb < linelen; curb++) {
2267             bool thisbit = test_bit(cur + curb, todump);
2268             linebuf[curb] = thisbit ? '1' : '.';
2269             found = found || (thisbit != expected);
2270         }
2271         if (found) {
2272             linebuf[curb] = '\0';
2273             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
2274         }
2275     }
2276 }
2277
2278 /* **** functions for postcopy ***** */
2279
2280 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2281 {
2282     struct RAMBlock *block;
2283
2284     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2285         unsigned long *bitmap = block->bmap;
2286         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2287         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2288
2289         while (run_start < range) {
2290             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2291             ram_discard_range(block->idstr,
2292                               ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2293                               ((ram_addr_t)(run_end - run_start))
2294                                 << TARGET_PAGE_BITS);
2295             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2296         }
2297     }
2298 }
2299
2300 /**
2301  * postcopy_send_discard_bm_ram: discard a RAMBlock
2302  *
2303  * Returns zero on success
2304  *
2305  * Callback from postcopy_each_ram_send_discard for each RAMBlock
2306  *
2307  * @ms: current migration state
2308  * @block: RAMBlock to discard
2309  */
2310 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2311 {
2312     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2313     unsigned long current;
2314     unsigned long *bitmap = block->bmap;
2315
2316     for (current = 0; current < end; ) {
2317         unsigned long one = find_next_bit(bitmap, end, current);
2318         unsigned long zero, discard_length;
2319
2320         if (one >= end) {
2321             break;
2322         }
2323
2324         zero = find_next_zero_bit(bitmap, end, one + 1);
2325
2326         if (zero >= end) {
2327             discard_length = end - one;
2328         } else {
2329             discard_length = zero - one;
2330         }
2331         postcopy_discard_send_range(ms, one, discard_length);
2332         current = one + discard_length;
2333     }
2334
2335     return 0;
2336 }
2337
2338 /**
2339  * postcopy_each_ram_send_discard: discard all RAMBlocks
2340  *
2341  * Returns 0 for success or negative for error
2342  *
2343  * Utility for the outgoing postcopy code.
2344  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
2345  *   passing it bitmap indexes and name.
2346  * (qemu_ram_foreach_block ends up passing unscaled lengths
2347  *  which would mean postcopy code would have to deal with target page)
2348  *
2349  * @ms: current migration state
2350  */
2351 static int postcopy_each_ram_send_discard(MigrationState *ms)
2352 {
2353     struct RAMBlock *block;
2354     int ret;
2355
2356     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2357         postcopy_discard_send_init(ms, block->idstr);
2358
2359         /*
2360          * Postcopy sends chunks of bitmap over the wire, but it
2361          * just needs indexes at this point, avoids it having
2362          * target page specific code.
2363          */
2364         ret = postcopy_send_discard_bm_ram(ms, block);
2365         postcopy_discard_send_finish(ms);
2366         if (ret) {
2367             return ret;
2368         }
2369     }
2370
2371     return 0;
2372 }
2373
2374 /**
2375  * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2376  *
2377  * Helper for postcopy_chunk_hostpages; it's called twice to
2378  * canonicalize the two bitmaps, that are similar, but one is
2379  * inverted.
2380  *
2381  * Postcopy requires that all target pages in a hostpage are dirty or
2382  * clean, not a mix.  This function canonicalizes the bitmaps.
2383  *
2384  * @ms: current migration state
2385  * @block: block that contains the page we want to canonicalize
2386  */
2387 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2388 {
2389     RAMState *rs = ram_state;
2390     unsigned long *bitmap = block->bmap;
2391     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2392     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2393     unsigned long run_start;
2394
2395     if (block->page_size == TARGET_PAGE_SIZE) {
2396         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2397         return;
2398     }
2399
2400     /* Find a dirty page */
2401     run_start = find_next_bit(bitmap, pages, 0);
2402
2403     while (run_start < pages) {
2404
2405         /*
2406          * If the start of this run of pages is in the middle of a host
2407          * page, then we need to fixup this host page.
2408          */
2409         if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2410             /* Find the end of this run */
2411             run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2412             /*
2413              * If the end isn't at the start of a host page, then the
2414              * run doesn't finish at the end of a host page
2415              * and we need to discard.
2416              */
2417         }
2418
2419         if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2420             unsigned long page;
2421             unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2422                                                              host_ratio);
2423             run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2424
2425             /* Clean up the bitmap */
2426             for (page = fixup_start_addr;
2427                  page < fixup_start_addr + host_ratio; page++) {
2428                 /*
2429                  * Remark them as dirty, updating the count for any pages
2430                  * that weren't previously dirty.
2431                  */
2432                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2433             }
2434         }
2435
2436         /* Find the next dirty page for the next iteration */
2437         run_start = find_next_bit(bitmap, pages, run_start);
2438     }
2439 }
2440
2441 /**
2442  * postcopy_chunk_hostpages: discard any partially sent host page
2443  *
2444  * Utility for the outgoing postcopy code.
2445  *
2446  * Discard any partially sent host-page size chunks, mark any partially
2447  * dirty host-page size chunks as all dirty.  In this case the host-page
2448  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2449  *
2450  * Returns zero on success
2451  *
2452  * @ms: current migration state
2453  * @block: block we want to work with
2454  */
2455 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2456 {
2457     postcopy_discard_send_init(ms, block->idstr);
2458
2459     /*
2460      * Ensure that all partially dirty host pages are made fully dirty.
2461      */
2462     postcopy_chunk_hostpages_pass(ms, block);
2463
2464     postcopy_discard_send_finish(ms);
2465     return 0;
2466 }
2467
2468 /**
2469  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2470  *
2471  * Returns zero on success
2472  *
2473  * Transmit the set of pages to be discarded after precopy to the target
2474  * these are pages that:
2475  *     a) Have been previously transmitted but are now dirty again
2476  *     b) Pages that have never been transmitted, this ensures that
2477  *        any pages on the destination that have been mapped by background
2478  *        tasks get discarded (transparent huge pages is the specific concern)
2479  * Hopefully this is pretty sparse
2480  *
2481  * @ms: current migration state
2482  */
2483 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2484 {
2485     RAMState *rs = ram_state;
2486     RAMBlock *block;
2487     int ret;
2488
2489     RCU_READ_LOCK_GUARD();
2490
2491     /* This should be our last sync, the src is now paused */
2492     migration_bitmap_sync(rs);
2493
2494     /* Easiest way to make sure we don't resume in the middle of a host-page */
2495     rs->last_seen_block = NULL;
2496     rs->last_sent_block = NULL;
2497     rs->last_page = 0;
2498
2499     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2500         /* Deal with TPS != HPS and huge pages */
2501         ret = postcopy_chunk_hostpages(ms, block);
2502         if (ret) {
2503             return ret;
2504         }
2505
2506 #ifdef DEBUG_POSTCOPY
2507         ram_debug_dump_bitmap(block->bmap, true,
2508                               block->used_length >> TARGET_PAGE_BITS);
2509 #endif
2510     }
2511     trace_ram_postcopy_send_discard_bitmap();
2512
2513     return postcopy_each_ram_send_discard(ms);
2514 }
2515
2516 /**
2517  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2518  *
2519  * Returns zero on success
2520  *
2521  * @rbname: name of the RAMBlock of the request. NULL means the
2522  *          same that last one.
2523  * @start: RAMBlock starting page
2524  * @length: RAMBlock size
2525  */
2526 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2527 {
2528     trace_ram_discard_range(rbname, start, length);
2529
2530     RCU_READ_LOCK_GUARD();
2531     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2532
2533     if (!rb) {
2534         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2535         return -1;
2536     }
2537
2538     /*
2539      * On source VM, we don't need to update the received bitmap since
2540      * we don't even have one.
2541      */
2542     if (rb->receivedmap) {
2543         bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2544                      length >> qemu_target_page_bits());
2545     }
2546
2547     return ram_block_discard_range(rb, start, length);
2548 }
2549
2550 /*
2551  * For every allocation, we will try not to crash the VM if the
2552  * allocation failed.
2553  */
2554 static int xbzrle_init(void)
2555 {
2556     Error *local_err = NULL;
2557
2558     if (!migrate_use_xbzrle()) {
2559         return 0;
2560     }
2561
2562     XBZRLE_cache_lock();
2563
2564     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2565     if (!XBZRLE.zero_target_page) {
2566         error_report("%s: Error allocating zero page", __func__);
2567         goto err_out;
2568     }
2569
2570     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2571                               TARGET_PAGE_SIZE, &local_err);
2572     if (!XBZRLE.cache) {
2573         error_report_err(local_err);
2574         goto free_zero_page;
2575     }
2576
2577     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2578     if (!XBZRLE.encoded_buf) {
2579         error_report("%s: Error allocating encoded_buf", __func__);
2580         goto free_cache;
2581     }
2582
2583     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2584     if (!XBZRLE.current_buf) {
2585         error_report("%s: Error allocating current_buf", __func__);
2586         goto free_encoded_buf;
2587     }
2588
2589     /* We are all good */
2590     XBZRLE_cache_unlock();
2591     return 0;
2592
2593 free_encoded_buf:
2594     g_free(XBZRLE.encoded_buf);
2595     XBZRLE.encoded_buf = NULL;
2596 free_cache:
2597     cache_fini(XBZRLE.cache);
2598     XBZRLE.cache = NULL;
2599 free_zero_page:
2600     g_free(XBZRLE.zero_target_page);
2601     XBZRLE.zero_target_page = NULL;
2602 err_out:
2603     XBZRLE_cache_unlock();
2604     return -ENOMEM;
2605 }
2606
2607 static int ram_state_init(RAMState **rsp)
2608 {
2609     *rsp = g_try_new0(RAMState, 1);
2610
2611     if (!*rsp) {
2612         error_report("%s: Init ramstate fail", __func__);
2613         return -1;
2614     }
2615
2616     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2617     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2618     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2619
2620     /*
2621      * Count the total number of pages used by ram blocks not including any
2622      * gaps due to alignment or unplugs.
2623      * This must match with the initial values of dirty bitmap.
2624      */
2625     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2626     ram_state_reset(*rsp);
2627
2628     return 0;
2629 }
2630
2631 static void ram_list_init_bitmaps(void)
2632 {
2633     MigrationState *ms = migrate_get_current();
2634     RAMBlock *block;
2635     unsigned long pages;
2636     uint8_t shift;
2637
2638     /* Skip setting bitmap if there is no RAM */
2639     if (ram_bytes_total()) {
2640         shift = ms->clear_bitmap_shift;
2641         if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2642             error_report("clear_bitmap_shift (%u) too big, using "
2643                          "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2644             shift = CLEAR_BITMAP_SHIFT_MAX;
2645         } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2646             error_report("clear_bitmap_shift (%u) too small, using "
2647                          "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2648             shift = CLEAR_BITMAP_SHIFT_MIN;
2649         }
2650
2651         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2652             pages = block->max_length >> TARGET_PAGE_BITS;
2653             /*
2654              * The initial dirty bitmap for migration must be set with all
2655              * ones to make sure we'll migrate every guest RAM page to
2656              * destination.
2657              * Here we set RAMBlock.bmap all to 1 because when rebegin a
2658              * new migration after a failed migration, ram_list.
2659              * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2660              * guest memory.
2661              */
2662             block->bmap = bitmap_new(pages);
2663             bitmap_set(block->bmap, 0, pages);
2664             block->clear_bmap_shift = shift;
2665             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2666         }
2667     }
2668 }
2669
2670 static void ram_init_bitmaps(RAMState *rs)
2671 {
2672     /* For memory_global_dirty_log_start below.  */
2673     qemu_mutex_lock_iothread();
2674     qemu_mutex_lock_ramlist();
2675
2676     WITH_RCU_READ_LOCK_GUARD() {
2677         ram_list_init_bitmaps();
2678         /* We don't use dirty log with background snapshots */
2679         if (!migrate_background_snapshot()) {
2680             memory_global_dirty_log_start();
2681             migration_bitmap_sync_precopy(rs);
2682         }
2683     }
2684     qemu_mutex_unlock_ramlist();
2685     qemu_mutex_unlock_iothread();
2686 }
2687
2688 static int ram_init_all(RAMState **rsp)
2689 {
2690     if (ram_state_init(rsp)) {
2691         return -1;
2692     }
2693
2694     if (xbzrle_init()) {
2695         ram_state_cleanup(rsp);
2696         return -1;
2697     }
2698
2699     ram_init_bitmaps(*rsp);
2700
2701     return 0;
2702 }
2703
2704 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2705 {
2706     RAMBlock *block;
2707     uint64_t pages = 0;
2708
2709     /*
2710      * Postcopy is not using xbzrle/compression, so no need for that.
2711      * Also, since source are already halted, we don't need to care
2712      * about dirty page logging as well.
2713      */
2714
2715     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2716         pages += bitmap_count_one(block->bmap,
2717                                   block->used_length >> TARGET_PAGE_BITS);
2718     }
2719
2720     /* This may not be aligned with current bitmaps. Recalculate. */
2721     rs->migration_dirty_pages = pages;
2722
2723     rs->last_seen_block = NULL;
2724     rs->last_sent_block = NULL;
2725     rs->last_page = 0;
2726     rs->last_version = ram_list.version;
2727     /*
2728      * Disable the bulk stage, otherwise we'll resend the whole RAM no
2729      * matter what we have sent.
2730      */
2731     rs->ram_bulk_stage = false;
2732
2733     /* Update RAMState cache of output QEMUFile */
2734     rs->f = out;
2735
2736     trace_ram_state_resume_prepare(pages);
2737 }
2738
2739 /*
2740  * This function clears bits of the free pages reported by the caller from the
2741  * migration dirty bitmap. @addr is the host address corresponding to the
2742  * start of the continuous guest free pages, and @len is the total bytes of
2743  * those pages.
2744  */
2745 void qemu_guest_free_page_hint(void *addr, size_t len)
2746 {
2747     RAMBlock *block;
2748     ram_addr_t offset;
2749     size_t used_len, start, npages;
2750     MigrationState *s = migrate_get_current();
2751
2752     /* This function is currently expected to be used during live migration */
2753     if (!migration_is_setup_or_active(s->state)) {
2754         return;
2755     }
2756
2757     for (; len > 0; len -= used_len, addr += used_len) {
2758         block = qemu_ram_block_from_host(addr, false, &offset);
2759         if (unlikely(!block || offset >= block->used_length)) {
2760             /*
2761              * The implementation might not support RAMBlock resize during
2762              * live migration, but it could happen in theory with future
2763              * updates. So we add a check here to capture that case.
2764              */
2765             error_report_once("%s unexpected error", __func__);
2766             return;
2767         }
2768
2769         if (len <= block->used_length - offset) {
2770             used_len = len;
2771         } else {
2772             used_len = block->used_length - offset;
2773         }
2774
2775         start = offset >> TARGET_PAGE_BITS;
2776         npages = used_len >> TARGET_PAGE_BITS;
2777
2778         qemu_mutex_lock(&ram_state->bitmap_mutex);
2779         ram_state->migration_dirty_pages -=
2780                       bitmap_count_one_with_offset(block->bmap, start, npages);
2781         bitmap_clear(block->bmap, start, npages);
2782         qemu_mutex_unlock(&ram_state->bitmap_mutex);
2783     }
2784 }
2785
2786 /*
2787  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2788  * long-running RCU critical section.  When rcu-reclaims in the code
2789  * start to become numerous it will be necessary to reduce the
2790  * granularity of these critical sections.
2791  */
2792
2793 /**
2794  * ram_save_setup: Setup RAM for migration
2795  *
2796  * Returns zero to indicate success and negative for error
2797  *
2798  * @f: QEMUFile where to send the data
2799  * @opaque: RAMState pointer
2800  */
2801 static int ram_save_setup(QEMUFile *f, void *opaque)
2802 {
2803     RAMState **rsp = opaque;
2804     RAMBlock *block;
2805
2806     if (compress_threads_save_setup()) {
2807         return -1;
2808     }
2809
2810     /* migration has already setup the bitmap, reuse it. */
2811     if (!migration_in_colo_state()) {
2812         if (ram_init_all(rsp) != 0) {
2813             compress_threads_save_cleanup();
2814             return -1;
2815         }
2816     }
2817     (*rsp)->f = f;
2818
2819     WITH_RCU_READ_LOCK_GUARD() {
2820         qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2821
2822         RAMBLOCK_FOREACH_MIGRATABLE(block) {
2823             qemu_put_byte(f, strlen(block->idstr));
2824             qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2825             qemu_put_be64(f, block->used_length);
2826             if (migrate_postcopy_ram() && block->page_size !=
2827                                           qemu_host_page_size) {
2828                 qemu_put_be64(f, block->page_size);
2829             }
2830             if (migrate_ignore_shared()) {
2831                 qemu_put_be64(f, block->mr->addr);
2832             }
2833         }
2834     }
2835
2836     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2837     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2838
2839     multifd_send_sync_main(f);
2840     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2841     qemu_fflush(f);
2842
2843     return 0;
2844 }
2845
2846 /**
2847  * ram_save_iterate: iterative stage for migration
2848  *
2849  * Returns zero to indicate success and negative for error
2850  *
2851  * @f: QEMUFile where to send the data
2852  * @opaque: RAMState pointer
2853  */
2854 static int ram_save_iterate(QEMUFile *f, void *opaque)
2855 {
2856     RAMState **temp = opaque;
2857     RAMState *rs = *temp;
2858     int ret = 0;
2859     int i;
2860     int64_t t0;
2861     int done = 0;
2862
2863     if (blk_mig_bulk_active()) {
2864         /* Avoid transferring ram during bulk phase of block migration as
2865          * the bulk phase will usually take a long time and transferring
2866          * ram updates during that time is pointless. */
2867         goto out;
2868     }
2869
2870     WITH_RCU_READ_LOCK_GUARD() {
2871         if (ram_list.version != rs->last_version) {
2872             ram_state_reset(rs);
2873         }
2874
2875         /* Read version before ram_list.blocks */
2876         smp_rmb();
2877
2878         ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2879
2880         t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2881         i = 0;
2882         while ((ret = qemu_file_rate_limit(f)) == 0 ||
2883                 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2884             int pages;
2885
2886             if (qemu_file_get_error(f)) {
2887                 break;
2888             }
2889
2890             pages = ram_find_and_save_block(rs, false);
2891             /* no more pages to sent */
2892             if (pages == 0) {
2893                 done = 1;
2894                 break;
2895             }
2896
2897             if (pages < 0) {
2898                 qemu_file_set_error(f, pages);
2899                 break;
2900             }
2901
2902             rs->target_page_count += pages;
2903
2904             /*
2905              * During postcopy, it is necessary to make sure one whole host
2906              * page is sent in one chunk.
2907              */
2908             if (migrate_postcopy_ram()) {
2909                 flush_compressed_data(rs);
2910             }
2911
2912             /*
2913              * we want to check in the 1st loop, just in case it was the 1st
2914              * time and we had to sync the dirty bitmap.
2915              * qemu_clock_get_ns() is a bit expensive, so we only check each
2916              * some iterations
2917              */
2918             if ((i & 63) == 0) {
2919                 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2920                               1000000;
2921                 if (t1 > MAX_WAIT) {
2922                     trace_ram_save_iterate_big_wait(t1, i);
2923                     break;
2924                 }
2925             }
2926             i++;
2927         }
2928     }
2929
2930     /*
2931      * Must occur before EOS (or any QEMUFile operation)
2932      * because of RDMA protocol.
2933      */
2934     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2935
2936 out:
2937     if (ret >= 0
2938         && migration_is_setup_or_active(migrate_get_current()->state)) {
2939         multifd_send_sync_main(rs->f);
2940         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2941         qemu_fflush(f);
2942         ram_counters.transferred += 8;
2943
2944         ret = qemu_file_get_error(f);
2945     }
2946     if (ret < 0) {
2947         return ret;
2948     }
2949
2950     return done;
2951 }
2952
2953 /**
2954  * ram_save_complete: function called to send the remaining amount of ram
2955  *
2956  * Returns zero to indicate success or negative on error
2957  *
2958  * Called with iothread lock
2959  *
2960  * @f: QEMUFile where to send the data
2961  * @opaque: RAMState pointer
2962  */
2963 static int ram_save_complete(QEMUFile *f, void *opaque)
2964 {
2965     RAMState **temp = opaque;
2966     RAMState *rs = *temp;
2967     int ret = 0;
2968
2969     WITH_RCU_READ_LOCK_GUARD() {
2970         if (!migration_in_postcopy()) {
2971             migration_bitmap_sync_precopy(rs);
2972         }
2973
2974         ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2975
2976         /* try transferring iterative blocks of memory */
2977
2978         /* flush all remaining blocks regardless of rate limiting */
2979         while (true) {
2980             int pages;
2981
2982             pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2983             /* no more blocks to sent */
2984             if (pages == 0) {
2985                 break;
2986             }
2987             if (pages < 0) {
2988                 ret = pages;
2989                 break;
2990             }
2991         }
2992
2993         flush_compressed_data(rs);
2994         ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2995     }
2996
2997     if (ret >= 0) {
2998         multifd_send_sync_main(rs->f);
2999         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3000         qemu_fflush(f);
3001     }
3002
3003     return ret;
3004 }
3005
3006 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3007                              uint64_t *res_precopy_only,
3008                              uint64_t *res_compatible,
3009                              uint64_t *res_postcopy_only)
3010 {
3011     RAMState **temp = opaque;
3012     RAMState *rs = *temp;
3013     uint64_t remaining_size;
3014
3015     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3016
3017     if (!migration_in_postcopy() &&
3018         remaining_size < max_size) {
3019         qemu_mutex_lock_iothread();
3020         WITH_RCU_READ_LOCK_GUARD() {
3021             migration_bitmap_sync_precopy(rs);
3022         }
3023         qemu_mutex_unlock_iothread();
3024         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3025     }
3026
3027     if (migrate_postcopy_ram()) {
3028         /* We can do postcopy, and all the data is postcopiable */
3029         *res_compatible += remaining_size;
3030     } else {
3031         *res_precopy_only += remaining_size;
3032     }
3033 }
3034
3035 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3036 {
3037     unsigned int xh_len;
3038     int xh_flags;
3039     uint8_t *loaded_data;
3040
3041     /* extract RLE header */
3042     xh_flags = qemu_get_byte(f);
3043     xh_len = qemu_get_be16(f);
3044
3045     if (xh_flags != ENCODING_FLAG_XBZRLE) {
3046         error_report("Failed to load XBZRLE page - wrong compression!");
3047         return -1;
3048     }
3049
3050     if (xh_len > TARGET_PAGE_SIZE) {
3051         error_report("Failed to load XBZRLE page - len overflow!");
3052         return -1;
3053     }
3054     loaded_data = XBZRLE.decoded_buf;
3055     /* load data and decode */
3056     /* it can change loaded_data to point to an internal buffer */
3057     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3058
3059     /* decode RLE */
3060     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3061                              TARGET_PAGE_SIZE) == -1) {
3062         error_report("Failed to load XBZRLE page - decode error!");
3063         return -1;
3064     }
3065
3066     return 0;
3067 }
3068
3069 /**
3070  * ram_block_from_stream: read a RAMBlock id from the migration stream
3071  *
3072  * Must be called from within a rcu critical section.
3073  *
3074  * Returns a pointer from within the RCU-protected ram_list.
3075  *
3076  * @f: QEMUFile where to read the data from
3077  * @flags: Page flags (mostly to see if it's a continuation of previous block)
3078  */
3079 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3080 {
3081     static RAMBlock *block;
3082     char id[256];
3083     uint8_t len;
3084
3085     if (flags & RAM_SAVE_FLAG_CONTINUE) {
3086         if (!block) {
3087             error_report("Ack, bad migration stream!");
3088             return NULL;
3089         }
3090         return block;
3091     }
3092
3093     len = qemu_get_byte(f);
3094     qemu_get_buffer(f, (uint8_t *)id, len);
3095     id[len] = 0;
3096
3097     block = qemu_ram_block_by_name(id);
3098     if (!block) {
3099         error_report("Can't find block %s", id);
3100         return NULL;
3101     }
3102
3103     if (ramblock_is_ignored(block)) {
3104         error_report("block %s should not be migrated !", id);
3105         return NULL;
3106     }
3107
3108     return block;
3109 }
3110
3111 static inline void *host_from_ram_block_offset(RAMBlock *block,
3112                                                ram_addr_t offset)
3113 {
3114     if (!offset_in_ramblock(block, offset)) {
3115         return NULL;
3116     }
3117
3118     return block->host + offset;
3119 }
3120
3121 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3122                              ram_addr_t offset, bool record_bitmap)
3123 {
3124     if (!offset_in_ramblock(block, offset)) {
3125         return NULL;
3126     }
3127     if (!block->colo_cache) {
3128         error_report("%s: colo_cache is NULL in block :%s",
3129                      __func__, block->idstr);
3130         return NULL;
3131     }
3132
3133     /*
3134     * During colo checkpoint, we need bitmap of these migrated pages.
3135     * It help us to decide which pages in ram cache should be flushed
3136     * into VM's RAM later.
3137     */
3138     if (record_bitmap &&
3139         !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3140         ram_state->migration_dirty_pages++;
3141     }
3142     return block->colo_cache + offset;
3143 }
3144
3145 /**
3146  * ram_handle_compressed: handle the zero page case
3147  *
3148  * If a page (or a whole RDMA chunk) has been
3149  * determined to be zero, then zap it.
3150  *
3151  * @host: host address for the zero page
3152  * @ch: what the page is filled from.  We only support zero
3153  * @size: size of the zero page
3154  */
3155 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3156 {
3157     if (ch != 0 || !is_zero_range(host, size)) {
3158         memset(host, ch, size);
3159     }
3160 }
3161
3162 /* return the size after decompression, or negative value on error */
3163 static int
3164 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3165                      const uint8_t *source, size_t source_len)
3166 {
3167     int err;
3168
3169     err = inflateReset(stream);
3170     if (err != Z_OK) {
3171         return -1;
3172     }
3173
3174     stream->avail_in = source_len;
3175     stream->next_in = (uint8_t *)source;
3176     stream->avail_out = dest_len;
3177     stream->next_out = dest;
3178
3179     err = inflate(stream, Z_NO_FLUSH);
3180     if (err != Z_STREAM_END) {
3181         return -1;
3182     }
3183
3184     return stream->total_out;
3185 }
3186
3187 static void *do_data_decompress(void *opaque)
3188 {
3189     DecompressParam *param = opaque;
3190     unsigned long pagesize;
3191     uint8_t *des;
3192     int len, ret;
3193
3194     qemu_mutex_lock(&param->mutex);
3195     while (!param->quit) {
3196         if (param->des) {
3197             des = param->des;
3198             len = param->len;
3199             param->des = 0;
3200             qemu_mutex_unlock(&param->mutex);
3201
3202             pagesize = TARGET_PAGE_SIZE;
3203
3204             ret = qemu_uncompress_data(&param->stream, des, pagesize,
3205                                        param->compbuf, len);
3206             if (ret < 0 && migrate_get_current()->decompress_error_check) {
3207                 error_report("decompress data failed");
3208                 qemu_file_set_error(decomp_file, ret);
3209             }
3210
3211             qemu_mutex_lock(&decomp_done_lock);
3212             param->done = true;
3213             qemu_cond_signal(&decomp_done_cond);
3214             qemu_mutex_unlock(&decomp_done_lock);
3215
3216             qemu_mutex_lock(&param->mutex);
3217         } else {
3218             qemu_cond_wait(&param->cond, &param->mutex);
3219         }
3220     }
3221     qemu_mutex_unlock(&param->mutex);
3222
3223     return NULL;
3224 }
3225
3226 static int wait_for_decompress_done(void)
3227 {
3228     int idx, thread_count;
3229
3230     if (!migrate_use_compression()) {
3231         return 0;
3232     }
3233
3234     thread_count = migrate_decompress_threads();
3235     qemu_mutex_lock(&decomp_done_lock);
3236     for (idx = 0; idx < thread_count; idx++) {
3237         while (!decomp_param[idx].done) {
3238             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3239         }
3240     }
3241     qemu_mutex_unlock(&decomp_done_lock);
3242     return qemu_file_get_error(decomp_file);
3243 }
3244
3245 static void compress_threads_load_cleanup(void)
3246 {
3247     int i, thread_count;
3248
3249     if (!migrate_use_compression()) {
3250         return;
3251     }
3252     thread_count = migrate_decompress_threads();
3253     for (i = 0; i < thread_count; i++) {
3254         /*
3255          * we use it as a indicator which shows if the thread is
3256          * properly init'd or not
3257          */
3258         if (!decomp_param[i].compbuf) {
3259             break;
3260         }
3261
3262         qemu_mutex_lock(&decomp_param[i].mutex);
3263         decomp_param[i].quit = true;
3264         qemu_cond_signal(&decomp_param[i].cond);
3265         qemu_mutex_unlock(&decomp_param[i].mutex);
3266     }
3267     for (i = 0; i < thread_count; i++) {
3268         if (!decomp_param[i].compbuf) {
3269             break;
3270         }
3271
3272         qemu_thread_join(decompress_threads + i);
3273         qemu_mutex_destroy(&decomp_param[i].mutex);
3274         qemu_cond_destroy(&decomp_param[i].cond);
3275         inflateEnd(&decomp_param[i].stream);
3276         g_free(decomp_param[i].compbuf);
3277         decomp_param[i].compbuf = NULL;
3278     }
3279     g_free(decompress_threads);
3280     g_free(decomp_param);
3281     decompress_threads = NULL;
3282     decomp_param = NULL;
3283     decomp_file = NULL;
3284 }
3285
3286 static int compress_threads_load_setup(QEMUFile *f)
3287 {
3288     int i, thread_count;
3289
3290     if (!migrate_use_compression()) {
3291         return 0;
3292     }
3293
3294     thread_count = migrate_decompress_threads();
3295     decompress_threads = g_new0(QemuThread, thread_count);
3296     decomp_param = g_new0(DecompressParam, thread_count);
3297     qemu_mutex_init(&decomp_done_lock);
3298     qemu_cond_init(&decomp_done_cond);
3299     decomp_file = f;
3300     for (i = 0; i < thread_count; i++) {
3301         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3302             goto exit;
3303         }
3304
3305         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3306         qemu_mutex_init(&decomp_param[i].mutex);
3307         qemu_cond_init(&decomp_param[i].cond);
3308         decomp_param[i].done = true;
3309         decomp_param[i].quit = false;
3310         qemu_thread_create(decompress_threads + i, "decompress",
3311                            do_data_decompress, decomp_param + i,
3312                            QEMU_THREAD_JOINABLE);
3313     }
3314     return 0;
3315 exit:
3316     compress_threads_load_cleanup();
3317     return -1;
3318 }
3319
3320 static void decompress_data_with_multi_threads(QEMUFile *f,
3321                                                void *host, int len)
3322 {
3323     int idx, thread_count;
3324
3325     thread_count = migrate_decompress_threads();
3326     QEMU_LOCK_GUARD(&decomp_done_lock);
3327     while (true) {
3328         for (idx = 0; idx < thread_count; idx++) {
3329             if (decomp_param[idx].done) {
3330                 decomp_param[idx].done = false;
3331                 qemu_mutex_lock(&decomp_param[idx].mutex);
3332                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3333                 decomp_param[idx].des = host;
3334                 decomp_param[idx].len = len;
3335                 qemu_cond_signal(&decomp_param[idx].cond);
3336                 qemu_mutex_unlock(&decomp_param[idx].mutex);
3337                 break;
3338             }
3339         }
3340         if (idx < thread_count) {
3341             break;
3342         } else {
3343             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3344         }
3345     }
3346 }
3347
3348  /*
3349   * we must set ram_bulk_stage to false, otherwise in
3350   * migation_bitmap_find_dirty the bitmap will be unused and
3351   * all the pages in ram cache wil be flushed to the ram of
3352   * secondary VM.
3353   */
3354 static void colo_init_ram_state(void)
3355 {
3356     ram_state_init(&ram_state);
3357     ram_state->ram_bulk_stage = false;
3358 }
3359
3360 /*
3361  * colo cache: this is for secondary VM, we cache the whole
3362  * memory of the secondary VM, it is need to hold the global lock
3363  * to call this helper.
3364  */
3365 int colo_init_ram_cache(void)
3366 {
3367     RAMBlock *block;
3368
3369     WITH_RCU_READ_LOCK_GUARD() {
3370         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3371             block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3372                                                     NULL,
3373                                                     false);
3374             if (!block->colo_cache) {
3375                 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3376                              "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3377                              block->used_length);
3378                 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3379                     if (block->colo_cache) {
3380                         qemu_anon_ram_free(block->colo_cache, block->used_length);
3381                         block->colo_cache = NULL;
3382                     }
3383                 }
3384                 return -errno;
3385             }
3386         }
3387     }
3388
3389     /*
3390     * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3391     * with to decide which page in cache should be flushed into SVM's RAM. Here
3392     * we use the same name 'ram_bitmap' as for migration.
3393     */
3394     if (ram_bytes_total()) {
3395         RAMBlock *block;
3396
3397         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3398             unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3399             block->bmap = bitmap_new(pages);
3400         }
3401     }
3402
3403     colo_init_ram_state();
3404     return 0;
3405 }
3406
3407 /* TODO: duplicated with ram_init_bitmaps */
3408 void colo_incoming_start_dirty_log(void)
3409 {
3410     RAMBlock *block = NULL;
3411     /* For memory_global_dirty_log_start below. */
3412     qemu_mutex_lock_iothread();
3413     qemu_mutex_lock_ramlist();
3414
3415     memory_global_dirty_log_sync();
3416     WITH_RCU_READ_LOCK_GUARD() {
3417         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3418             ramblock_sync_dirty_bitmap(ram_state, block);
3419             /* Discard this dirty bitmap record */
3420             bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3421         }
3422         memory_global_dirty_log_start();
3423     }
3424     ram_state->migration_dirty_pages = 0;
3425     qemu_mutex_unlock_ramlist();
3426     qemu_mutex_unlock_iothread();
3427 }
3428
3429 /* It is need to hold the global lock to call this helper */
3430 void colo_release_ram_cache(void)
3431 {
3432     RAMBlock *block;
3433
3434     memory_global_dirty_log_stop();
3435     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3436         g_free(block->bmap);
3437         block->bmap = NULL;
3438     }
3439
3440     WITH_RCU_READ_LOCK_GUARD() {
3441         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3442             if (block->colo_cache) {
3443                 qemu_anon_ram_free(block->colo_cache, block->used_length);
3444                 block->colo_cache = NULL;
3445             }
3446         }
3447     }
3448     ram_state_cleanup(&ram_state);
3449 }
3450
3451 /**
3452  * ram_load_setup: Setup RAM for migration incoming side
3453  *
3454  * Returns zero to indicate success and negative for error
3455  *
3456  * @f: QEMUFile where to receive the data
3457  * @opaque: RAMState pointer
3458  */
3459 static int ram_load_setup(QEMUFile *f, void *opaque)
3460 {
3461     if (compress_threads_load_setup(f)) {
3462         return -1;
3463     }
3464
3465     xbzrle_load_setup();
3466     ramblock_recv_map_init();
3467
3468     return 0;
3469 }
3470
3471 static int ram_load_cleanup(void *opaque)
3472 {
3473     RAMBlock *rb;
3474
3475     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3476         qemu_ram_block_writeback(rb);
3477     }
3478
3479     xbzrle_load_cleanup();
3480     compress_threads_load_cleanup();
3481
3482     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3483         g_free(rb->receivedmap);
3484         rb->receivedmap = NULL;
3485     }
3486
3487     return 0;
3488 }
3489
3490 /**
3491  * ram_postcopy_incoming_init: allocate postcopy data structures
3492  *
3493  * Returns 0 for success and negative if there was one error
3494  *
3495  * @mis: current migration incoming state
3496  *
3497  * Allocate data structures etc needed by incoming migration with
3498  * postcopy-ram. postcopy-ram's similarly names
3499  * postcopy_ram_incoming_init does the work.
3500  */
3501 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3502 {
3503     return postcopy_ram_incoming_init(mis);
3504 }
3505
3506 /**
3507  * ram_load_postcopy: load a page in postcopy case
3508  *
3509  * Returns 0 for success or -errno in case of error
3510  *
3511  * Called in postcopy mode by ram_load().
3512  * rcu_read_lock is taken prior to this being called.
3513  *
3514  * @f: QEMUFile where to send the data
3515  */
3516 static int ram_load_postcopy(QEMUFile *f)
3517 {
3518     int flags = 0, ret = 0;
3519     bool place_needed = false;
3520     bool matches_target_page_size = false;
3521     MigrationIncomingState *mis = migration_incoming_get_current();
3522     /* Temporary page that is later 'placed' */
3523     void *postcopy_host_page = mis->postcopy_tmp_page;
3524     void *this_host = NULL;
3525     bool all_zero = true;
3526     int target_pages = 0;
3527
3528     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3529         ram_addr_t addr;
3530         void *host = NULL;
3531         void *page_buffer = NULL;
3532         void *place_source = NULL;
3533         RAMBlock *block = NULL;
3534         uint8_t ch;
3535         int len;
3536
3537         addr = qemu_get_be64(f);
3538
3539         /*
3540          * If qemu file error, we should stop here, and then "addr"
3541          * may be invalid
3542          */
3543         ret = qemu_file_get_error(f);
3544         if (ret) {
3545             break;
3546         }
3547
3548         flags = addr & ~TARGET_PAGE_MASK;
3549         addr &= TARGET_PAGE_MASK;
3550
3551         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3552         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3553                      RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3554             block = ram_block_from_stream(f, flags);
3555
3556             host = host_from_ram_block_offset(block, addr);
3557             if (!host) {
3558                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3559                 ret = -EINVAL;
3560                 break;
3561             }
3562             target_pages++;
3563             matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3564             /*
3565              * Postcopy requires that we place whole host pages atomically;
3566              * these may be huge pages for RAMBlocks that are backed by
3567              * hugetlbfs.
3568              * To make it atomic, the data is read into a temporary page
3569              * that's moved into place later.
3570              * The migration protocol uses,  possibly smaller, target-pages
3571              * however the source ensures it always sends all the components
3572              * of a host page in one chunk.
3573              */
3574             page_buffer = postcopy_host_page +
3575                           ((uintptr_t)host & (block->page_size - 1));
3576             if (target_pages == 1) {
3577                 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3578                                                     block->page_size);
3579             } else {
3580                 /* not the 1st TP within the HP */
3581                 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3582                     (uintptr_t)this_host) {
3583                     error_report("Non-same host page %p/%p",
3584                                   host, this_host);
3585                     ret = -EINVAL;
3586                     break;
3587                 }
3588             }
3589
3590             /*
3591              * If it's the last part of a host page then we place the host
3592              * page
3593              */
3594             if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3595                 place_needed = true;
3596             }
3597             place_source = postcopy_host_page;
3598         }
3599
3600         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3601         case RAM_SAVE_FLAG_ZERO:
3602             ch = qemu_get_byte(f);
3603             /*
3604              * Can skip to set page_buffer when
3605              * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3606              */
3607             if (ch || !matches_target_page_size) {
3608                 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3609             }
3610             if (ch) {
3611                 all_zero = false;
3612             }
3613             break;
3614
3615         case RAM_SAVE_FLAG_PAGE:
3616             all_zero = false;
3617             if (!matches_target_page_size) {
3618                 /* For huge pages, we always use temporary buffer */
3619                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3620             } else {
3621                 /*
3622                  * For small pages that matches target page size, we
3623                  * avoid the qemu_file copy.  Instead we directly use
3624                  * the buffer of QEMUFile to place the page.  Note: we
3625                  * cannot do any QEMUFile operation before using that
3626                  * buffer to make sure the buffer is valid when
3627                  * placing the page.
3628                  */
3629                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3630                                          TARGET_PAGE_SIZE);
3631             }
3632             break;
3633         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3634             all_zero = false;
3635             len = qemu_get_be32(f);
3636             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3637                 error_report("Invalid compressed data length: %d", len);
3638                 ret = -EINVAL;
3639                 break;
3640             }
3641             decompress_data_with_multi_threads(f, page_buffer, len);
3642             break;
3643
3644         case RAM_SAVE_FLAG_EOS:
3645             /* normal exit */
3646             multifd_recv_sync_main();
3647             break;
3648         default:
3649             error_report("Unknown combination of migration flags: 0x%x"
3650                          " (postcopy mode)", flags);
3651             ret = -EINVAL;
3652             break;
3653         }
3654
3655         /* Got the whole host page, wait for decompress before placing. */
3656         if (place_needed) {
3657             ret |= wait_for_decompress_done();
3658         }
3659
3660         /* Detect for any possible file errors */
3661         if (!ret && qemu_file_get_error(f)) {
3662             ret = qemu_file_get_error(f);
3663         }
3664
3665         if (!ret && place_needed) {
3666             /* This gets called at the last target page in the host page */
3667             void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3668                                                        block->page_size);
3669
3670             if (all_zero) {
3671                 ret = postcopy_place_page_zero(mis, place_dest,
3672                                                block);
3673             } else {
3674                 ret = postcopy_place_page(mis, place_dest,
3675                                           place_source, block);
3676             }
3677             place_needed = false;
3678             target_pages = 0;
3679             /* Assume we have a zero page until we detect something different */
3680             all_zero = true;
3681         }
3682     }
3683
3684     return ret;
3685 }
3686
3687 static bool postcopy_is_advised(void)
3688 {
3689     PostcopyState ps = postcopy_state_get();
3690     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3691 }
3692
3693 static bool postcopy_is_running(void)
3694 {
3695     PostcopyState ps = postcopy_state_get();
3696     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3697 }
3698
3699 /*
3700  * Flush content of RAM cache into SVM's memory.
3701  * Only flush the pages that be dirtied by PVM or SVM or both.
3702  */
3703 void colo_flush_ram_cache(void)
3704 {
3705     RAMBlock *block = NULL;
3706     void *dst_host;
3707     void *src_host;
3708     unsigned long offset = 0;
3709
3710     memory_global_dirty_log_sync();
3711     WITH_RCU_READ_LOCK_GUARD() {
3712         RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3713             ramblock_sync_dirty_bitmap(ram_state, block);
3714         }
3715     }
3716
3717     trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3718     WITH_RCU_READ_LOCK_GUARD() {
3719         block = QLIST_FIRST_RCU(&ram_list.blocks);
3720
3721         while (block) {
3722             offset = migration_bitmap_find_dirty(ram_state, block, offset);
3723
3724             if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3725                 >= block->used_length) {
3726                 offset = 0;
3727                 block = QLIST_NEXT_RCU(block, next);
3728             } else {
3729                 migration_bitmap_clear_dirty(ram_state, block, offset);
3730                 dst_host = block->host
3731                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3732                 src_host = block->colo_cache
3733                          + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3734                 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3735             }
3736         }
3737     }
3738     trace_colo_flush_ram_cache_end();
3739 }
3740
3741 /**
3742  * ram_load_precopy: load pages in precopy case
3743  *
3744  * Returns 0 for success or -errno in case of error
3745  *
3746  * Called in precopy mode by ram_load().
3747  * rcu_read_lock is taken prior to this being called.
3748  *
3749  * @f: QEMUFile where to send the data
3750  */
3751 static int ram_load_precopy(QEMUFile *f)
3752 {
3753     int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3754     /* ADVISE is earlier, it shows the source has the postcopy capability on */
3755     bool postcopy_advised = postcopy_is_advised();
3756     if (!migrate_use_compression()) {
3757         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3758     }
3759
3760     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3761         ram_addr_t addr, total_ram_bytes;
3762         void *host = NULL, *host_bak = NULL;
3763         uint8_t ch;
3764
3765         /*
3766          * Yield periodically to let main loop run, but an iteration of
3767          * the main loop is expensive, so do it each some iterations
3768          */
3769         if ((i & 32767) == 0 && qemu_in_coroutine()) {
3770             aio_co_schedule(qemu_get_current_aio_context(),
3771                             qemu_coroutine_self());
3772             qemu_coroutine_yield();
3773         }
3774         i++;
3775
3776         addr = qemu_get_be64(f);
3777         flags = addr & ~TARGET_PAGE_MASK;
3778         addr &= TARGET_PAGE_MASK;
3779
3780         if (flags & invalid_flags) {
3781             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3782                 error_report("Received an unexpected compressed page");
3783             }
3784
3785             ret = -EINVAL;
3786             break;
3787         }
3788
3789         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3790                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3791             RAMBlock *block = ram_block_from_stream(f, flags);
3792
3793             host = host_from_ram_block_offset(block, addr);
3794             /*
3795              * After going into COLO stage, we should not load the page
3796              * into SVM's memory directly, we put them into colo_cache firstly.
3797              * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3798              * Previously, we copied all these memory in preparing stage of COLO
3799              * while we need to stop VM, which is a time-consuming process.
3800              * Here we optimize it by a trick, back-up every page while in
3801              * migration process while COLO is enabled, though it affects the
3802              * speed of the migration, but it obviously reduce the downtime of
3803              * back-up all SVM'S memory in COLO preparing stage.
3804              */
3805             if (migration_incoming_colo_enabled()) {
3806                 if (migration_incoming_in_colo_state()) {
3807                     /* In COLO stage, put all pages into cache temporarily */
3808                     host = colo_cache_from_block_offset(block, addr, true);
3809                 } else {
3810                    /*
3811                     * In migration stage but before COLO stage,
3812                     * Put all pages into both cache and SVM's memory.
3813                     */
3814                     host_bak = colo_cache_from_block_offset(block, addr, false);
3815                 }
3816             }
3817             if (!host) {
3818                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3819                 ret = -EINVAL;
3820                 break;
3821             }
3822             if (!migration_incoming_in_colo_state()) {
3823                 ramblock_recv_bitmap_set(block, host);
3824             }
3825
3826             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3827         }
3828
3829         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3830         case RAM_SAVE_FLAG_MEM_SIZE:
3831             /* Synchronize RAM block list */
3832             total_ram_bytes = addr;
3833             while (!ret && total_ram_bytes) {
3834                 RAMBlock *block;
3835                 char id[256];
3836                 ram_addr_t length;
3837
3838                 len = qemu_get_byte(f);
3839                 qemu_get_buffer(f, (uint8_t *)id, len);
3840                 id[len] = 0;
3841                 length = qemu_get_be64(f);
3842
3843                 block = qemu_ram_block_by_name(id);
3844                 if (block && !qemu_ram_is_migratable(block)) {
3845                     error_report("block %s should not be migrated !", id);
3846                     ret = -EINVAL;
3847                 } else if (block) {
3848                     if (length != block->used_length) {
3849                         Error *local_err = NULL;
3850
3851                         ret = qemu_ram_resize(block, length,
3852                                               &local_err);
3853                         if (local_err) {
3854                             error_report_err(local_err);
3855                         }
3856                     }
3857                     /* For postcopy we need to check hugepage sizes match */
3858                     if (postcopy_advised && migrate_postcopy_ram() &&
3859                         block->page_size != qemu_host_page_size) {
3860                         uint64_t remote_page_size = qemu_get_be64(f);
3861                         if (remote_page_size != block->page_size) {
3862                             error_report("Mismatched RAM page size %s "
3863                                          "(local) %zd != %" PRId64,
3864                                          id, block->page_size,
3865                                          remote_page_size);
3866                             ret = -EINVAL;
3867                         }
3868                     }
3869                     if (migrate_ignore_shared()) {
3870                         hwaddr addr = qemu_get_be64(f);
3871                         if (ramblock_is_ignored(block) &&
3872                             block->mr->addr != addr) {
3873                             error_report("Mismatched GPAs for block %s "
3874                                          "%" PRId64 "!= %" PRId64,
3875                                          id, (uint64_t)addr,
3876                                          (uint64_t)block->mr->addr);
3877                             ret = -EINVAL;
3878                         }
3879                     }
3880                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3881                                           block->idstr);
3882                 } else {
3883                     error_report("Unknown ramblock \"%s\", cannot "
3884                                  "accept migration", id);
3885                     ret = -EINVAL;
3886                 }
3887
3888                 total_ram_bytes -= length;
3889             }
3890             break;
3891
3892         case RAM_SAVE_FLAG_ZERO:
3893             ch = qemu_get_byte(f);
3894             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3895             break;
3896
3897         case RAM_SAVE_FLAG_PAGE:
3898             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3899             break;
3900
3901         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3902             len = qemu_get_be32(f);
3903             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3904                 error_report("Invalid compressed data length: %d", len);
3905                 ret = -EINVAL;
3906                 break;
3907             }
3908             decompress_data_with_multi_threads(f, host, len);
3909             break;
3910
3911         case RAM_SAVE_FLAG_XBZRLE:
3912             if (load_xbzrle(f, addr, host) < 0) {
3913                 error_report("Failed to decompress XBZRLE page at "
3914                              RAM_ADDR_FMT, addr);
3915                 ret = -EINVAL;
3916                 break;
3917             }
3918             break;
3919         case RAM_SAVE_FLAG_EOS:
3920             /* normal exit */
3921             multifd_recv_sync_main();
3922             break;
3923         default:
3924             if (flags & RAM_SAVE_FLAG_HOOK) {
3925                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3926             } else {
3927                 error_report("Unknown combination of migration flags: 0x%x",
3928                              flags);
3929                 ret = -EINVAL;
3930             }
3931         }
3932         if (!ret) {
3933             ret = qemu_file_get_error(f);
3934         }
3935         if (!ret && host_bak) {
3936             memcpy(host_bak, host, TARGET_PAGE_SIZE);
3937         }
3938     }
3939
3940     ret |= wait_for_decompress_done();
3941     return ret;
3942 }
3943
3944 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3945 {
3946     int ret = 0;
3947     static uint64_t seq_iter;
3948     /*
3949      * If system is running in postcopy mode, page inserts to host memory must
3950      * be atomic
3951      */
3952     bool postcopy_running = postcopy_is_running();
3953
3954     seq_iter++;
3955
3956     if (version_id != 4) {
3957         return -EINVAL;
3958     }
3959
3960     /*
3961      * This RCU critical section can be very long running.
3962      * When RCU reclaims in the code start to become numerous,
3963      * it will be necessary to reduce the granularity of this
3964      * critical section.
3965      */
3966     WITH_RCU_READ_LOCK_GUARD() {
3967         if (postcopy_running) {
3968             ret = ram_load_postcopy(f);
3969         } else {
3970             ret = ram_load_precopy(f);
3971         }
3972     }
3973     trace_ram_load_complete(ret, seq_iter);
3974
3975     return ret;
3976 }
3977
3978 static bool ram_has_postcopy(void *opaque)
3979 {
3980     RAMBlock *rb;
3981     RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3982         if (ramblock_is_pmem(rb)) {
3983             info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3984                          "is not supported now!", rb->idstr, rb->host);
3985             return false;
3986         }
3987     }
3988
3989     return migrate_postcopy_ram();
3990 }
3991
3992 /* Sync all the dirty bitmap with destination VM.  */
3993 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3994 {
3995     RAMBlock *block;
3996     QEMUFile *file = s->to_dst_file;
3997     int ramblock_count = 0;
3998
3999     trace_ram_dirty_bitmap_sync_start();
4000
4001     RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4002         qemu_savevm_send_recv_bitmap(file, block->idstr);
4003         trace_ram_dirty_bitmap_request(block->idstr);
4004         ramblock_count++;
4005     }
4006
4007     trace_ram_dirty_bitmap_sync_wait();
4008
4009     /* Wait until all the ramblocks' dirty bitmap synced */
4010     while (ramblock_count--) {
4011         qemu_sem_wait(&s->rp_state.rp_sem);
4012     }
4013
4014     trace_ram_dirty_bitmap_sync_complete();
4015
4016     return 0;
4017 }
4018
4019 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4020 {
4021     qemu_sem_post(&s->rp_state.rp_sem);
4022 }
4023
4024 /*
4025  * Read the received bitmap, revert it as the initial dirty bitmap.
4026  * This is only used when the postcopy migration is paused but wants
4027  * to resume from a middle point.
4028  */
4029 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4030 {
4031     int ret = -EINVAL;
4032     QEMUFile *file = s->rp_state.from_dst_file;
4033     unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4034     uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4035     uint64_t size, end_mark;
4036
4037     trace_ram_dirty_bitmap_reload_begin(block->idstr);
4038
4039     if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4040         error_report("%s: incorrect state %s", __func__,
4041                      MigrationStatus_str(s->state));
4042         return -EINVAL;
4043     }
4044
4045     /*
4046      * Note: see comments in ramblock_recv_bitmap_send() on why we
4047      * need the endianness conversion, and the paddings.
4048      */
4049     local_size = ROUND_UP(local_size, 8);
4050
4051     /* Add paddings */
4052     le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4053
4054     size = qemu_get_be64(file);
4055
4056     /* The size of the bitmap should match with our ramblock */
4057     if (size != local_size) {
4058         error_report("%s: ramblock '%s' bitmap size mismatch "
4059                      "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4060                      block->idstr, size, local_size);
4061         ret = -EINVAL;
4062         goto out;
4063     }
4064
4065     size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4066     end_mark = qemu_get_be64(file);
4067
4068     ret = qemu_file_get_error(file);
4069     if (ret || size != local_size) {
4070         error_report("%s: read bitmap failed for ramblock '%s': %d"
4071                      " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4072                      __func__, block->idstr, ret, local_size, size);
4073         ret = -EIO;
4074         goto out;
4075     }
4076
4077     if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4078         error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4079                      __func__, block->idstr, end_mark);
4080         ret = -EINVAL;
4081         goto out;
4082     }
4083
4084     /*
4085      * Endianness conversion. We are during postcopy (though paused).
4086      * The dirty bitmap won't change. We can directly modify it.
4087      */
4088     bitmap_from_le(block->bmap, le_bitmap, nbits);
4089
4090     /*
4091      * What we received is "received bitmap". Revert it as the initial
4092      * dirty bitmap for this ramblock.
4093      */
4094     bitmap_complement(block->bmap, block->bmap, nbits);
4095
4096     trace_ram_dirty_bitmap_reload_complete(block->idstr);
4097
4098     /*
4099      * We succeeded to sync bitmap for current ramblock. If this is
4100      * the last one to sync, we need to notify the main send thread.
4101      */
4102     ram_dirty_bitmap_reload_notify(s);
4103
4104     ret = 0;
4105 out:
4106     g_free(le_bitmap);
4107     return ret;
4108 }
4109
4110 static int ram_resume_prepare(MigrationState *s, void *opaque)
4111 {
4112     RAMState *rs = *(RAMState **)opaque;
4113     int ret;
4114
4115     ret = ram_dirty_bitmap_sync_all(s, rs);
4116     if (ret) {
4117         return ret;
4118     }
4119
4120     ram_state_resume_prepare(rs, s->to_dst_file);
4121
4122     return 0;
4123 }
4124
4125 static SaveVMHandlers savevm_ram_handlers = {
4126     .save_setup = ram_save_setup,
4127     .save_live_iterate = ram_save_iterate,
4128     .save_live_complete_postcopy = ram_save_complete,
4129     .save_live_complete_precopy = ram_save_complete,
4130     .has_postcopy = ram_has_postcopy,
4131     .save_live_pending = ram_save_pending,
4132     .load_state = ram_load,
4133     .save_cleanup = ram_save_cleanup,
4134     .load_setup = ram_load_setup,
4135     .load_cleanup = ram_load_cleanup,
4136     .resume_prepare = ram_resume_prepare,
4137 };
4138
4139 void ram_mig_init(void)
4140 {
4141     qemu_mutex_init(&XBZRLE.lock);
4142     register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4143 }