multifd: Make zstd use iov's
[qemu/armbru.git] / migration / ram.c
blob e9dcd3ca4ed84506ddbe1b45fa7752b65c22d9ba
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains the XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from migrate_params_apply in the main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock.
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp with the reason if the check fails
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
131 PageCache *new_cache;
132 int64_t ret = 0;
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
146 XBZRLE_cache_lock();
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
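/*
 * Example of the truncation check above (hypothetical host): where size_t
 * is 32 bits, a requested cache of 6 GiB does not survive the cast to
 * size_t, so new_size != (size_t)new_size and the resize is rejected
 * with "exceeding address space".
 */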
163 bool ramblock_is_ignored(RAMBlock *block)
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 RAMBlock *block;
174 int ret = 0;
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
184 return ret;
187 static void ramblock_recv_map_init(void)
189 RAMBlock *rb;
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 if success with sent bytes, or <0 if error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit beforehand.
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 * Always use little endian when sending the bitmap. This is
251 * required so that it works even when source and destination VMs
252 * do not use the same endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
265 size = ROUND_UP(size, 8);
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
276 g_free(le_bitmap);
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
282 return size + sizeof(size);
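/*
 * Sketch of the resulting stream layout, as implied by the code above
 * (numbers are illustrative):
 *
 *   be64  size              bitmap size in bytes, rounded up to 8
 *   bytes le_bitmap[size]   receivedmap in little-endian bit order
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdef)
 *
 * E.g. a block of 100 target pages needs DIV_ROUND_UP(100, 8) = 13
 * bitmap bytes, padded by ROUND_UP(13, 8) to 16 bytes on the wire.
 */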
286 * An outstanding page request, on the source, having been received
287 * and queued
289 struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 /* State of RAM for migration */
298 struct RAMState {
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
309 /* last ram version we have seen */
310 uint32_t last_version;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
328 /* Are we on the last stage of migration */
329 bool last_stage;
330 /* compression statistics since the beginning of the period */
331 /* amount of count that no free thread to compress data */
332 uint64_t compress_thread_busy_prev;
333 /* amount bytes after compression */
334 uint64_t compressed_size_prev;
335 /* amount of compressed pages */
336 uint64_t compress_pages_prev;
338 /* total handled target pages at the beginning of period */
339 uint64_t target_page_count_prev;
340 /* total handled target pages since start */
341 uint64_t target_page_count;
342 /* number of dirty bits in the bitmap */
343 uint64_t migration_dirty_pages;
344 /* Protects modification of the bitmap and migration dirty pages */
345 QemuMutex bitmap_mutex;
346 /* The RAMBlock used in the last src_page_requests */
347 RAMBlock *last_req_rb;
348 /* Queue of outstanding page requests from the destination */
349 QemuMutex src_page_req_mutex;
350 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 typedef struct RAMState RAMState;
354 static RAMState *ram_state;
356 static NotifierWithReturnList precopy_notifier_list;
358 void precopy_infrastructure_init(void)
360 notifier_with_return_list_init(&precopy_notifier_list);
363 void precopy_add_notifier(NotifierWithReturn *n)
365 notifier_with_return_list_add(&precopy_notifier_list, n);
368 void precopy_remove_notifier(NotifierWithReturn *n)
370 notifier_with_return_remove(n);
373 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 PrecopyNotifyData pnd;
376 pnd.reason = reason;
377 pnd.errp = errp;
379 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 uint64_t ram_bytes_remaining(void)
384 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
388 MigrationStats ram_counters;
390 /* used by the search for pages to send */
391 struct PageSearchStatus {
392 /* Current block being searched */
393 RAMBlock *block;
394 /* Current page to search from */
395 unsigned long page;
396 /* Set once we wrap around */
397 bool complete_round;
399 typedef struct PageSearchStatus PageSearchStatus;
401 CompressionStats compression_counters;
403 struct CompressParam {
404 bool done;
405 bool quit;
406 bool zero_page;
407 QEMUFile *file;
408 QemuMutex mutex;
409 QemuCond cond;
410 RAMBlock *block;
411 ram_addr_t offset;
413 /* internally used fields */
414 z_stream stream;
415 uint8_t *originbuf;
417 typedef struct CompressParam CompressParam;
419 struct DecompressParam {
420 bool done;
421 bool quit;
422 QemuMutex mutex;
423 QemuCond cond;
424 void *des;
425 uint8_t *compbuf;
426 int len;
427 z_stream stream;
429 typedef struct DecompressParam DecompressParam;
431 static CompressParam *comp_param;
432 static QemuThread *compress_threads;
433 /* comp_done_cond is used to wake up the migration thread when
434 * one of the compression threads has finished the compression.
435 * comp_done_lock is used to co-work with comp_done_cond.
437 static QemuMutex comp_done_lock;
438 static QemuCond comp_done_cond;
439 /* The empty QEMUFileOps will be used by file in CompressParam */
440 static const QEMUFileOps empty_ops = { };
442 static QEMUFile *decomp_file;
443 static DecompressParam *decomp_param;
444 static QemuThread *decompress_threads;
445 static QemuMutex decomp_done_lock;
446 static QemuCond decomp_done_cond;
448 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
449 ram_addr_t offset, uint8_t *source_buf);
451 static void *do_data_compress(void *opaque)
453 CompressParam *param = opaque;
454 RAMBlock *block;
455 ram_addr_t offset;
456 bool zero_page;
458 qemu_mutex_lock(&param->mutex);
459 while (!param->quit) {
460 if (param->block) {
461 block = param->block;
462 offset = param->offset;
463 param->block = NULL;
464 qemu_mutex_unlock(&param->mutex);
466 zero_page = do_compress_ram_page(param->file, &param->stream,
467 block, offset, param->originbuf);
469 qemu_mutex_lock(&comp_done_lock);
470 param->done = true;
471 param->zero_page = zero_page;
472 qemu_cond_signal(&comp_done_cond);
473 qemu_mutex_unlock(&comp_done_lock);
475 qemu_mutex_lock(&param->mutex);
476 } else {
477 qemu_cond_wait(&param->cond, &param->mutex);
480 qemu_mutex_unlock(&param->mutex);
482 return NULL;
485 static void compress_threads_save_cleanup(void)
487 int i, thread_count;
489 if (!migrate_use_compression() || !comp_param) {
490 return;
493 thread_count = migrate_compress_threads();
494 for (i = 0; i < thread_count; i++) {
496 * we use it as an indicator of whether the thread is
497 * properly initialized or not
499 if (!comp_param[i].file) {
500 break;
503 qemu_mutex_lock(&comp_param[i].mutex);
504 comp_param[i].quit = true;
505 qemu_cond_signal(&comp_param[i].cond);
506 qemu_mutex_unlock(&comp_param[i].mutex);
508 qemu_thread_join(compress_threads + i);
509 qemu_mutex_destroy(&comp_param[i].mutex);
510 qemu_cond_destroy(&comp_param[i].cond);
511 deflateEnd(&comp_param[i].stream);
512 g_free(comp_param[i].originbuf);
513 qemu_fclose(comp_param[i].file);
514 comp_param[i].file = NULL;
516 qemu_mutex_destroy(&comp_done_lock);
517 qemu_cond_destroy(&comp_done_cond);
518 g_free(compress_threads);
519 g_free(comp_param);
520 compress_threads = NULL;
521 comp_param = NULL;
524 static int compress_threads_save_setup(void)
526 int i, thread_count;
528 if (!migrate_use_compression()) {
529 return 0;
531 thread_count = migrate_compress_threads();
532 compress_threads = g_new0(QemuThread, thread_count);
533 comp_param = g_new0(CompressParam, thread_count);
534 qemu_cond_init(&comp_done_cond);
535 qemu_mutex_init(&comp_done_lock);
536 for (i = 0; i < thread_count; i++) {
537 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
538 if (!comp_param[i].originbuf) {
539 goto exit;
542 if (deflateInit(&comp_param[i].stream,
543 migrate_compress_level()) != Z_OK) {
544 g_free(comp_param[i].originbuf);
545 goto exit;
548 /* comp_param[i].file is just used as a dummy buffer to save data,
549 * set its ops to empty.
551 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
552 comp_param[i].done = true;
553 comp_param[i].quit = false;
554 qemu_mutex_init(&comp_param[i].mutex);
555 qemu_cond_init(&comp_param[i].cond);
556 qemu_thread_create(compress_threads + i, "compress",
557 do_data_compress, comp_param + i,
558 QEMU_THREAD_JOINABLE);
560 return 0;
562 exit:
563 compress_threads_save_cleanup();
564 return -1;
568 * save_page_header: write page header to wire
570 * If this is the 1st block, it also writes the block identification
572 * Returns the number of bytes written
574 * @f: QEMUFile where to send the data
575 * @block: block that contains the page we want to send
576 * @offset: offset inside the block for the page
577 * in the lower bits, it contains flags
579 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
580 ram_addr_t offset)
582 size_t size, len;
584 if (block == rs->last_sent_block) {
585 offset |= RAM_SAVE_FLAG_CONTINUE;
587 qemu_put_be64(f, offset);
588 size = 8;
590 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
591 len = strlen(block->idstr);
592 qemu_put_byte(f, len);
593 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
594 size += 1 + len;
595 rs->last_sent_block = block;
597 return size;
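/*
 * Resulting header layout on the wire, following the calls above
 * (an illustrative sketch, not an additional format definition):
 *
 *   be64  offset | flags    RAM_SAVE_FLAG_CONTINUE is set when the block
 *                           is the same as the last one sent
 *   u8    len               only if CONTINUE is not set
 *   bytes idstr[len]        only if CONTINUE is not set
 *
 * So the returned size is either 8, or 8 + 1 + strlen(block->idstr).
 */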
601 * mig_throttle_guest_down: throttle down the guest
603 * Reduce the amount of guest CPU execution to hopefully slow down memory
604 * writes. If guest dirty memory rate is reduced below the rate at
605 * which we can transfer pages to the destination then we should be
606 * able to complete migration. Some workloads dirty memory way too
607 * fast and will not effectively converge, even with auto-converge.
609 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
610 uint64_t bytes_dirty_threshold)
612 MigrationState *s = migrate_get_current();
613 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
614 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
615 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
616 int pct_max = s->parameters.max_cpu_throttle;
618 uint64_t throttle_now = cpu_throttle_get_percentage();
619 uint64_t cpu_now, cpu_ideal, throttle_inc;
621 /* We have not started throttling yet. Let's start it. */
622 if (!cpu_throttle_active()) {
623 cpu_throttle_set(pct_initial);
624 } else {
625 /* Throttling already on, just increase the rate */
626 if (!pct_tailslow) {
627 throttle_inc = pct_increment;
628 } else {
629 /* Compute the ideal CPU percentage used by the guest, which would
630 * make the dirty rate match the dirty rate threshold. */
631 cpu_now = 100 - throttle_now;
632 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
633 bytes_dirty_period);
634 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
636 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
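/*
 * Worked example for the tail-slow path above (numbers are hypothetical):
 * with throttle_now = 20% the guest currently gets cpu_now = 80%.
 * If bytes_dirty_threshold / bytes_dirty_period = 0.5, the ideal guest
 * share is cpu_ideal = 80 * 0.5 = 40%, so throttle_inc =
 * MIN(80 - 40, pct_increment) and the new throttle becomes
 * MIN(20 + throttle_inc, pct_max).
 */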
640 void mig_throttle_counter_reset(void)
642 RAMState *rs = ram_state;
644 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
645 rs->num_dirty_pages_period = 0;
646 rs->bytes_xfer_prev = ram_counters.transferred;
650 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
652 * @rs: current RAM state
653 * @current_addr: address for the zero page
655 * Update the xbzrle cache to reflect a page that's been sent as all 0.
656 * The important thing is that a stale (not-yet-0'd) page be replaced
657 * by the new data.
658 * As a bonus, if the page wasn't in the cache it gets added so that
659 * when a small write is made into the 0'd page it gets XBZRLE sent.
661 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
663 if (!rs->xbzrle_enabled) {
664 return;
667 /* We don't care if this fails to allocate a new cache page
668 * as long as it updated an old one */
669 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
670 ram_counters.dirty_sync_count);
673 #define ENCODING_FLAG_XBZRLE 0x1
676 * save_xbzrle_page: compress and send current page
678 * Returns: 1 means that we wrote the page
679 * 0 means that page is identical to the one already sent
680 * -1 means that xbzrle would be longer than normal
682 * @rs: current RAM state
683 * @current_data: pointer to the address of the page contents
684 * @current_addr: addr of the page
685 * @block: block that contains the page we want to send
686 * @offset: offset inside the block for the page
688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
689 ram_addr_t current_addr, RAMBlock *block,
690 ram_addr_t offset)
692 int encoded_len = 0, bytes_xbzrle;
693 uint8_t *prev_cached_page;
695 if (!cache_is_cached(XBZRLE.cache, current_addr,
696 ram_counters.dirty_sync_count)) {
697 xbzrle_counters.cache_miss++;
698 if (!rs->last_stage) {
699 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
700 ram_counters.dirty_sync_count) == -1) {
701 return -1;
702 } else {
703 /* update *current_data when the page has been
704 inserted into cache */
705 *current_data = get_cached_data(XBZRLE.cache, current_addr);
708 return -1;
712 * Reaching here means the page has hit the xbzrle cache, no matter what
713 * encoding result it is (normal encoding, overflow or skipping the page),
714 * count the page as encoded. This is used to calculate the encoding rate.
716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
717 * 2nd page turns out to be skipped (i.e. no new bytes written to the
718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
719 * skipped page included. In this way, the encoding rate can tell if the
720 * guest page is good for xbzrle encoding.
722 xbzrle_counters.pages++;
723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
725 /* save current buffer into memory */
726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
728 /* XBZRLE encoding (if there is no overflow) */
729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
731 TARGET_PAGE_SIZE);
734 * Update the cache contents, so that it corresponds to the data
735 * sent, in all cases except where we skip the page.
737 if (!rs->last_stage && encoded_len != 0) {
738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
740 * In the case where we couldn't compress, ensure that the caller
741 * sends the data from the cache, since the guest might have
742 * changed the RAM since we copied it.
744 *current_data = prev_cached_page;
747 if (encoded_len == 0) {
748 trace_save_xbzrle_page_skipping();
749 return 0;
750 } else if (encoded_len == -1) {
751 trace_save_xbzrle_page_overflow();
752 xbzrle_counters.overflow++;
753 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
754 return -1;
757 /* Send XBZRLE based compressed page */
758 bytes_xbzrle = save_page_header(rs, rs->f, block,
759 offset | RAM_SAVE_FLAG_XBZRLE);
760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
761 qemu_put_be16(rs->f, encoded_len);
762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
763 bytes_xbzrle += encoded_len + 1 + 2;
765 * Like compressed_size (please see update_compress_thread_counts),
766 * the xbzrle encoded bytes don't count the 8 byte header with
767 * RAM_SAVE_FLAG_CONTINUE.
769 xbzrle_counters.bytes += bytes_xbzrle - 8;
770 ram_counters.transferred += bytes_xbzrle;
772 return 1;
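/*
 * Wire layout of an XBZRLE page as emitted above (illustrative sketch):
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)  8 bytes (+ idstr)
 *   u8    ENCODING_FLAG_XBZRLE                            1 byte
 *   be16  encoded_len                                     2 bytes
 *   bytes encoded_buf[encoded_len]
 *
 * Hence bytes_xbzrle = header + 1 + 2 + encoded_len, while
 * xbzrle_counters.bytes deliberately excludes the 8 byte header.
 */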
776 * migration_bitmap_find_dirty: find the next dirty page from start
778 * Returns the page offset within memory region of the start of a dirty page
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
791 if (ramblock_is_ignored(rb)) {
792 return size;
795 return find_next_bit(bitmap, size, start);
798 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
799 unsigned long page)
801 uint8_t shift;
802 hwaddr size, start;
804 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
805 return;
808 shift = rb->clear_bmap_shift;
810 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It
811 * can make things easier sometimes since the start address
812 * of the small chunk will always be aligned to 64 pages, so the
813 * bitmap will always be aligned to unsigned long. We should
814 * even be able to remove this restriction but I'm simply
815 * keeping it.
817 assert(shift >= 6);
819 size = 1ULL << (TARGET_PAGE_BITS + shift);
820 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
821 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
822 memory_region_clear_dirty_bitmap(rb->mr, start, size);
825 static void
826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
827 unsigned long start,
828 unsigned long npages)
830 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
831 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
832 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
835 * Clear pages from start to start + npages - 1, so the end boundary is
836 * exclusive.
838 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
839 migration_clear_memory_region_dirty_bitmap(rb, i);
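/*
 * Illustration with made-up numbers: if clear_bmap_shift is 18, each
 * chunk covers 1 << 18 pages. Clearing start = 300000, npages = 10
 * aligns down to chunk_start = 262144 and up to chunk_end = 524288,
 * i.e. exactly one chunk is cleared via the helper above.
 */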
844 * colo_bitmap_find_dirty: find contiguous dirty pages from start
846 * Returns the page offset within memory region of the start of the contiguous
847 * dirty pages
849 * @rs: current RAM state
850 * @rb: RAMBlock where to search for dirty pages
851 * @start: page where we start the search
852 * @num: the number of contiguous dirty pages
854 static inline
855 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
856 unsigned long start, unsigned long *num)
858 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
859 unsigned long *bitmap = rb->bmap;
860 unsigned long first, next;
862 *num = 0;
864 if (ramblock_is_ignored(rb)) {
865 return size;
868 first = find_next_bit(bitmap, size, start);
869 if (first >= size) {
870 return first;
872 next = find_next_zero_bit(bitmap, size, first + 1);
873 assert(next >= first);
874 *num = next - first;
875 return first;
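/*
 * Example (hypothetical bitmap): with dirty bits set for pages 5, 6 and 7
 * and start = 2, find_next_bit() returns first = 5 and
 * find_next_zero_bit() returns 8, so *num = 3 contiguous dirty pages.
 */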
878 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
879 RAMBlock *rb,
880 unsigned long page)
882 bool ret;
885 * Clear dirty bitmap if needed. This _must_ be called before we
886 * send any of the pages in the chunk, because we need to make sure
887 * we can capture further page content changes when we sync the dirty
888 * log the next time. So as long as we are going to send any of
889 * the pages in the chunk, we clear the remote dirty bitmap for all.
890 * Clearing it earlier won't be a problem, but doing it too late will.
892 migration_clear_memory_region_dirty_bitmap(rb, page);
894 ret = test_and_clear_bit(page, rb->bmap);
895 if (ret) {
896 rs->migration_dirty_pages--;
899 return ret;
902 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
903 void *opaque)
905 const hwaddr offset = section->offset_within_region;
906 const hwaddr size = int128_get64(section->size);
907 const unsigned long start = offset >> TARGET_PAGE_BITS;
908 const unsigned long npages = size >> TARGET_PAGE_BITS;
909 RAMBlock *rb = section->mr->ram_block;
910 uint64_t *cleared_bits = opaque;
913 * We don't grab ram_state->bitmap_mutex because we expect to run
914 * only when starting migration or during postcopy recovery where
915 * we don't have concurrent access.
917 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
918 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
920 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
921 bitmap_clear(rb->bmap, start, npages);
925 * Exclude all dirty pages from migration that fall into a discarded range as
926 * managed by a RamDiscardManager responsible for the mapped memory region of
927 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
929 * Discarded pages ("logically unplugged") have undefined content and must
930 * not get migrated, because even reading these pages for migration might
931 * result in undesired behavior.
933 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
935 * Note: The result is only stable while migrating (precopy/postcopy).
937 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
939 uint64_t cleared_bits = 0;
941 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
942 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
943 MemoryRegionSection section = {
944 .mr = rb->mr,
945 .offset_within_region = 0,
946 .size = int128_make64(qemu_ram_get_used_length(rb)),
949 ram_discard_manager_replay_discarded(rdm, &section,
950 dirty_bitmap_clear_section,
951 &cleared_bits);
953 return cleared_bits;
957 * Check if a host-page aligned page falls into a discarded range as managed by
958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
960 * Note: The result is only stable while migrating (precopy/postcopy).
962 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
964 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
965 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
966 MemoryRegionSection section = {
967 .mr = rb->mr,
968 .offset_within_region = start,
969 .size = int128_make64(qemu_ram_pagesize(rb)),
972 return !ram_discard_manager_is_populated(rdm, &section);
974 return false;
977 /* Called with RCU critical section */
978 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
980 uint64_t new_dirty_pages =
981 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
983 rs->migration_dirty_pages += new_dirty_pages;
984 rs->num_dirty_pages_period += new_dirty_pages;
988 * ram_pagesize_summary: calculate all the pagesizes of a VM
990 * Returns a summary bitmap of the page sizes of all RAMBlocks
992 * For VMs with just normal pages this is equivalent to the host page
993 * size. If it's got some huge pages then it's the OR of all the
994 * different page sizes.
996 uint64_t ram_pagesize_summary(void)
998 RAMBlock *block;
999 uint64_t summary = 0;
1001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1002 summary |= block->page_size;
1005 return summary;
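/*
 * Example (hypothetical guest): ordinary 4 KiB RAMBlocks plus one
 * 2 MiB hugepage-backed block yield summary = 0x1000 | 0x200000
 * = 0x201000.
 */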
1008 uint64_t ram_get_total_transferred_pages(void)
1010 return ram_counters.normal + ram_counters.duplicate +
1011 compression_counters.pages + xbzrle_counters.pages;
1014 static void migration_update_rates(RAMState *rs, int64_t end_time)
1016 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1017 double compressed_size;
1019 /* calculate period counters */
1020 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1021 / (end_time - rs->time_last_bitmap_sync);
1023 if (!page_count) {
1024 return;
1027 if (migrate_use_xbzrle()) {
1028 double encoded_size, unencoded_size;
1030 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1031 rs->xbzrle_cache_miss_prev) / page_count;
1032 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1033 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1034 TARGET_PAGE_SIZE;
1035 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1036 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1037 xbzrle_counters.encoding_rate = 0;
1038 } else {
1039 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1041 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1042 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1045 if (migrate_use_compression()) {
1046 compression_counters.busy_rate = (double)(compression_counters.busy -
1047 rs->compress_thread_busy_prev) / page_count;
1048 rs->compress_thread_busy_prev = compression_counters.busy;
1050 compressed_size = compression_counters.compressed_size -
1051 rs->compressed_size_prev;
1052 if (compressed_size) {
1053 double uncompressed_size = (compression_counters.pages -
1054 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1056 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1057 compression_counters.compression_rate =
1058 uncompressed_size / compressed_size;
1060 rs->compress_pages_prev = compression_counters.pages;
1061 rs->compressed_size_prev = compression_counters.compressed_size;
1066 static void migration_trigger_throttle(RAMState *rs)
1068 MigrationState *s = migrate_get_current();
1069 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1071 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1072 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1073 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1075 /* During block migration the auto-converge logic incorrectly detects
1076 * that ram migration makes no progress. Avoid this by disabling the
1077 * throttling logic during the bulk phase of block migration. */
1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1079 /* The following detection logic can be refined later. For now:
1080 Check to see if the ratio between dirtied bytes and the approx.
1081 amount of bytes that just got transferred since the last time
1082 we were in this routine reaches the threshold. If that happens
1083 twice, start or increase throttling. */
1085 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1086 (++rs->dirty_rate_high_cnt >= 2)) {
1087 trace_migration_throttle();
1088 rs->dirty_rate_high_cnt = 0;
1089 mig_throttle_guest_down(bytes_dirty_period,
1090 bytes_dirty_threshold);
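/*
 * Worked example (hypothetical numbers): with throttle_trigger_threshold
 * = 50 and 1 GiB transferred during the period, bytes_dirty_threshold is
 * 512 MiB. If the guest dirtied more than that in the same period for
 * two consecutive syncs, mig_throttle_guest_down() is invoked.
 */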
1095 static void migration_bitmap_sync(RAMState *rs)
1097 RAMBlock *block;
1098 int64_t end_time;
1100 ram_counters.dirty_sync_count++;
1102 if (!rs->time_last_bitmap_sync) {
1103 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1106 trace_migration_bitmap_sync_start();
1107 memory_global_dirty_log_sync();
1109 qemu_mutex_lock(&rs->bitmap_mutex);
1110 WITH_RCU_READ_LOCK_GUARD() {
1111 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1112 ramblock_sync_dirty_bitmap(rs, block);
1114 ram_counters.remaining = ram_bytes_remaining();
1116 qemu_mutex_unlock(&rs->bitmap_mutex);
1118 memory_global_after_dirty_log_sync();
1119 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1121 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123 /* more than 1 second = 1000 milliseconds */
1124 if (end_time > rs->time_last_bitmap_sync + 1000) {
1125 migration_trigger_throttle(rs);
1127 migration_update_rates(rs, end_time);
1129 rs->target_page_count_prev = rs->target_page_count;
1131 /* reset period counters */
1132 rs->time_last_bitmap_sync = end_time;
1133 rs->num_dirty_pages_period = 0;
1134 rs->bytes_xfer_prev = ram_counters.transferred;
1136 if (migrate_use_events()) {
1137 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1141 static void migration_bitmap_sync_precopy(RAMState *rs)
1143 Error *local_err = NULL;
1146 * The current notifier usage is just an optimization to migration, so we
1147 * don't stop the normal migration process in the error case.
1149 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1150 error_report_err(local_err);
1151 local_err = NULL;
1154 migration_bitmap_sync(rs);
1156 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1157 error_report_err(local_err);
1161 static void ram_release_page(const char *rbname, uint64_t offset)
1163 if (!migrate_release_ram() || !migration_in_postcopy()) {
1164 return;
1167 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1171 * save_zero_page_to_file: send the zero page to the file
1173 * Returns the size of data written to the file, 0 means the page is not
1174 * a zero page
1176 * @rs: current RAM state
1177 * @file: the file where the data is saved
1178 * @block: block that contains the page we want to send
1179 * @offset: offset inside the block for the page
1181 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1182 RAMBlock *block, ram_addr_t offset)
1184 uint8_t *p = block->host + offset;
1185 int len = 0;
1187 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1188 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1189 qemu_put_byte(file, 0);
1190 len += 1;
1191 ram_release_page(block->idstr, offset);
1193 return len;
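/*
 * A zero page therefore occupies only the page header plus a single
 * zero byte on the wire (sketch of the calls above):
 *
 *   be64  offset | RAM_SAVE_FLAG_ZERO   (+ idstr on the first page of a block)
 *   u8    0
 */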
1197 * save_zero_page: send the zero page to the stream
1199 * Returns the number of pages written.
1201 * @rs: current RAM state
1202 * @block: block that contains the page we want to send
1203 * @offset: offset inside the block for the page
1205 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1207 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1209 if (len) {
1210 ram_counters.duplicate++;
1211 ram_counters.transferred += len;
1212 return 1;
1214 return -1;
1218 * @pages: the number of pages written by the control path,
1219 * < 0 - error
1220 * > 0 - number of pages written
1222 * Return true if the page has been saved, otherwise false is returned.
1224 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1225 int *pages)
1227 uint64_t bytes_xmit = 0;
1228 int ret;
1230 *pages = -1;
1231 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1232 &bytes_xmit);
1233 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1234 return false;
1237 if (bytes_xmit) {
1238 ram_counters.transferred += bytes_xmit;
1239 *pages = 1;
1242 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1243 return true;
1246 if (bytes_xmit > 0) {
1247 ram_counters.normal++;
1248 } else if (bytes_xmit == 0) {
1249 ram_counters.duplicate++;
1252 return true;
1256 * directly send the page to the stream
1258 * Returns the number of pages written.
1260 * @rs: current RAM state
1261 * @block: block that contains the page we want to send
1262 * @offset: offset inside the block for the page
1263 * @buf: the page to be sent
1264 * @async: send the page asynchronously
1266 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1267 uint8_t *buf, bool async)
1269 ram_counters.transferred += save_page_header(rs, rs->f, block,
1270 offset | RAM_SAVE_FLAG_PAGE);
1271 if (async) {
1272 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1273 migrate_release_ram() &
1274 migration_in_postcopy());
1275 } else {
1276 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1278 ram_counters.transferred += TARGET_PAGE_SIZE;
1279 ram_counters.normal++;
1280 return 1;
1284 * ram_save_page: send the given page to the stream
1286 * Returns the number of pages written.
1287 * < 0 - error
1288 * >=0 - Number of pages written - this might legally be 0
1289 * if xbzrle noticed the page was the same.
1291 * @rs: current RAM state
1292 * @block: block that contains the page we want to send
1293 * @offset: offset inside the block for the page
1295 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1297 int pages = -1;
1298 uint8_t *p;
1299 bool send_async = true;
1300 RAMBlock *block = pss->block;
1301 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1302 ram_addr_t current_addr = block->offset + offset;
1304 p = block->host + offset;
1305 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1307 XBZRLE_cache_lock();
1308 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1309 pages = save_xbzrle_page(rs, &p, current_addr, block,
1310 offset);
1311 if (!rs->last_stage) {
1312 /* Can't send this cached data async, since the cache page
1313 * might get updated before it gets to the wire
1315 send_async = false;
1319 /* XBZRLE overflow or normal page */
1320 if (pages == -1) {
1321 pages = save_normal_page(rs, block, offset, p, send_async);
1324 XBZRLE_cache_unlock();
1326 return pages;
1329 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1330 ram_addr_t offset)
1332 if (multifd_queue_page(rs->f, block, offset) < 0) {
1333 return -1;
1335 ram_counters.normal++;
1337 return 1;
1340 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1341 ram_addr_t offset, uint8_t *source_buf)
1343 RAMState *rs = ram_state;
1344 uint8_t *p = block->host + offset;
1345 int ret;
1347 if (save_zero_page_to_file(rs, f, block, offset)) {
1348 return true;
1351 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1354 * copy it to an internal buffer to avoid it being modified by the VM,
1355 * so that we can catch errors during compression and
1356 * decompression
1358 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1359 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1360 if (ret < 0) {
1361 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1362 error_report("compressed data failed!");
1364 return false;
1367 static void
1368 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1370 ram_counters.transferred += bytes_xmit;
1372 if (param->zero_page) {
1373 ram_counters.duplicate++;
1374 return;
1377 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1378 compression_counters.compressed_size += bytes_xmit - 8;
1379 compression_counters.pages++;
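/*
 * Accounting sketch: bytes_xmit covers the 8 byte page header (with
 * RAM_SAVE_FLAG_CONTINUE) plus the zlib output, so compressed_size only
 * accumulates bytes_xmit - 8, i.e. the compressed payload itself.
 */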
1382 static bool save_page_use_compression(RAMState *rs);
1384 static void flush_compressed_data(RAMState *rs)
1386 int idx, len, thread_count;
1388 if (!save_page_use_compression(rs)) {
1389 return;
1391 thread_count = migrate_compress_threads();
1393 qemu_mutex_lock(&comp_done_lock);
1394 for (idx = 0; idx < thread_count; idx++) {
1395 while (!comp_param[idx].done) {
1396 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1399 qemu_mutex_unlock(&comp_done_lock);
1401 for (idx = 0; idx < thread_count; idx++) {
1402 qemu_mutex_lock(&comp_param[idx].mutex);
1403 if (!comp_param[idx].quit) {
1404 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1406 * it's safe to fetch zero_page without holding comp_done_lock
1407 * as there is no further request submitted to the thread,
1408 * i.e., the thread should be waiting for a request at this point.
1410 update_compress_thread_counts(&comp_param[idx], len);
1412 qemu_mutex_unlock(&comp_param[idx].mutex);
1416 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1417 ram_addr_t offset)
1419 param->block = block;
1420 param->offset = offset;
1423 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1424 ram_addr_t offset)
1426 int idx, thread_count, bytes_xmit = -1, pages = -1;
1427 bool wait = migrate_compress_wait_thread();
1429 thread_count = migrate_compress_threads();
1430 qemu_mutex_lock(&comp_done_lock);
1431 retry:
1432 for (idx = 0; idx < thread_count; idx++) {
1433 if (comp_param[idx].done) {
1434 comp_param[idx].done = false;
1435 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1436 qemu_mutex_lock(&comp_param[idx].mutex);
1437 set_compress_params(&comp_param[idx], block, offset);
1438 qemu_cond_signal(&comp_param[idx].cond);
1439 qemu_mutex_unlock(&comp_param[idx].mutex);
1440 pages = 1;
1441 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1442 break;
1447 * wait for the free thread if the user specifies 'compress-wait-thread',
1448 * otherwise we will post the page out in the main thread as a normal page.
1450 if (pages < 0 && wait) {
1451 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1452 goto retry;
1454 qemu_mutex_unlock(&comp_done_lock);
1456 return pages;
1460 * find_dirty_block: find the next dirty page and update any state
1461 * associated with the search process.
1463 * Returns true if a page is found
1465 * @rs: current RAM state
1466 * @pss: data about the state of the current dirty page scan
1467 * @again: set to false if the search has scanned the whole of RAM
1469 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1471 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1472 if (pss->complete_round && pss->block == rs->last_seen_block &&
1473 pss->page >= rs->last_page) {
1475 * We've been once around the RAM and haven't found anything.
1476 * Give up.
1478 *again = false;
1479 return false;
1481 if (!offset_in_ramblock(pss->block,
1482 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1483 /* Didn't find anything in this RAM Block */
1484 pss->page = 0;
1485 pss->block = QLIST_NEXT_RCU(pss->block, next);
1486 if (!pss->block) {
1488 * If memory migration starts over, we will meet a dirtied page
1489 * which may still exist in the compression threads' ring, so we
1490 * should flush the compressed data to make sure the new page
1491 * is not overwritten by the old one in the destination.
1493 * Also, if xbzrle is on, stop using the data compression at this
1494 * point. In theory, xbzrle can do better than compression.
1496 flush_compressed_data(rs);
1498 /* Hit the end of the list */
1499 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1500 /* Flag that we've looped */
1501 pss->complete_round = true;
1502 /* After the first round, enable XBZRLE. */
1503 if (migrate_use_xbzrle()) {
1504 rs->xbzrle_enabled = true;
1507 /* Didn't find anything this time, but try again on the new block */
1508 *again = true;
1509 return false;
1510 } else {
1511 /* Can go around again, but... */
1512 *again = true;
1513 /* We've found something so probably don't need to */
1514 return true;
1519 * unqueue_page: gets a page off the queue
1521 * Helper for 'get_queued_page' - gets a page off the queue
1523 * Returns the block of the page (or NULL if none available)
1525 * @rs: current RAM state
1526 * @offset: used to return the offset within the RAMBlock
1528 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1530 RAMBlock *block = NULL;
1532 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1533 return NULL;
1536 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1537 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1538 struct RAMSrcPageRequest *entry =
1539 QSIMPLEQ_FIRST(&rs->src_page_requests);
1540 block = entry->rb;
1541 *offset = entry->offset;
1543 if (entry->len > TARGET_PAGE_SIZE) {
1544 entry->len -= TARGET_PAGE_SIZE;
1545 entry->offset += TARGET_PAGE_SIZE;
1546 } else {
1547 memory_region_unref(block->mr);
1548 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1549 g_free(entry);
1550 migration_consume_urgent_request();
1554 return block;
1557 #if defined(__linux__)
1559 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1560 * is found, return RAM block pointer and page offset
1562 * Returns pointer to the RAMBlock containing faulting page,
1563 * NULL if no write faults are pending
1565 * @rs: current RAM state
1566 * @offset: page offset from the beginning of the block
1568 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1570 struct uffd_msg uffd_msg;
1571 void *page_address;
1572 RAMBlock *block;
1573 int res;
1575 if (!migrate_background_snapshot()) {
1576 return NULL;
1579 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1580 if (res <= 0) {
1581 return NULL;
1584 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1585 block = qemu_ram_block_from_host(page_address, false, offset);
1586 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1587 return block;
1591 * ram_save_release_protection: release UFFD write protection after
1592 * a range of pages has been saved
1594 * @rs: current RAM state
1595 * @pss: page-search-status structure
1596 * @start_page: index of the first page in the range relative to pss->block
1598 * Returns 0 on success, negative value in case of an error
1600 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1601 unsigned long start_page)
1603 int res = 0;
1605 /* Check if page is from UFFD-managed region. */
1606 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1607 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1608 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1610 /* Flush async buffers before un-protect. */
1611 qemu_fflush(rs->f);
1612 /* Un-protect memory range. */
1613 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1614 false, false);
1617 return res;
1620 /* ram_write_tracking_available: check if kernel supports required UFFD features
1622 * Returns true if supports, false otherwise
1624 bool ram_write_tracking_available(void)
1626 uint64_t uffd_features;
1627 int res;
1629 res = uffd_query_features(&uffd_features);
1630 return (res == 0 &&
1631 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1634 /* ram_write_tracking_compatible: check if guest configuration is
1635 * compatible with 'write-tracking'
1637 * Returns true if compatible, false otherwise
1639 bool ram_write_tracking_compatible(void)
1641 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1642 int uffd_fd;
1643 RAMBlock *block;
1644 bool ret = false;
1646 /* Open UFFD file descriptor */
1647 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1648 if (uffd_fd < 0) {
1649 return false;
1652 RCU_READ_LOCK_GUARD();
1654 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1655 uint64_t uffd_ioctls;
1657 /* Nothing to do with read-only and MMIO-writable regions */
1658 if (block->mr->readonly || block->mr->rom_device) {
1659 continue;
1661 /* Try to register block memory via UFFD-IO to track writes */
1662 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1663 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1664 goto out;
1666 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1667 goto out;
1670 ret = true;
1672 out:
1673 uffd_close_fd(uffd_fd);
1674 return ret;
1677 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1678 ram_addr_t size)
1681 * We read one byte of each page; this will preallocate page tables if
1682 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1683 * where no page was populated yet. This might require adaptation when
1684 * supporting other mappings, like shmem.
1686 for (; offset < size; offset += block->page_size) {
1687 char tmp = *((char *)block->host + offset);
1689 /* Don't optimize the read out */
1690 asm volatile("" : "+r" (tmp));
1694 static inline int populate_read_section(MemoryRegionSection *section,
1695 void *opaque)
1697 const hwaddr size = int128_get64(section->size);
1698 hwaddr offset = section->offset_within_region;
1699 RAMBlock *block = section->mr->ram_block;
1701 populate_read_range(block, offset, size);
1702 return 0;
1706 * ram_block_populate_read: preallocate page tables and populate pages in the
1707 * RAM block by reading a byte of each page.
1709 * Since it's solely used for the userfault_fd WP feature, here we just
1710 * hardcode the page size to qemu_real_host_page_size.
1712 * @block: RAM block to populate
1714 static void ram_block_populate_read(RAMBlock *rb)
1717 * Skip populating all pages that fall into a discarded range as managed by
1718 * a RamDiscardManager responsible for the mapped memory region of the
1719 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1720 * must not get populated automatically. We don't have to track
1721 * modifications via userfaultfd WP reliably, because these pages will
1722 * not be part of the migration stream either way -- see
1723 * ramblock_dirty_bitmap_exclude_discarded_pages().
1725 * Note: The result is only stable while migrating (precopy/postcopy).
1727 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1728 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1729 MemoryRegionSection section = {
1730 .mr = rb->mr,
1731 .offset_within_region = 0,
1732 .size = rb->mr->size,
1735 ram_discard_manager_replay_populated(rdm, &section,
1736 populate_read_section, NULL);
1737 } else {
1738 populate_read_range(rb, 0, rb->used_length);
1743 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1745 void ram_write_tracking_prepare(void)
1747 RAMBlock *block;
1749 RCU_READ_LOCK_GUARD();
1751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1752 /* Nothing to do with read-only and MMIO-writable regions */
1753 if (block->mr->readonly || block->mr->rom_device) {
1754 continue;
1758 * Populate pages of the RAM block before enabling userfault_fd
1759 * write protection.
1761 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1762 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1763 * pages with pte_none() entries in page table.
1765 ram_block_populate_read(block);
1770 * ram_write_tracking_start: start UFFD-WP memory tracking
1772 * Returns 0 for success or negative value in case of error
1774 int ram_write_tracking_start(void)
1776 int uffd_fd;
1777 RAMState *rs = ram_state;
1778 RAMBlock *block;
1780 /* Open UFFD file descriptor */
1781 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1782 if (uffd_fd < 0) {
1783 return uffd_fd;
1785 rs->uffdio_fd = uffd_fd;
1787 RCU_READ_LOCK_GUARD();
1789 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1790 /* Nothing to do with read-only and MMIO-writable regions */
1791 if (block->mr->readonly || block->mr->rom_device) {
1792 continue;
1795 /* Register block memory with UFFD to track writes */
1796 if (uffd_register_memory(rs->uffdio_fd, block->host,
1797 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1798 goto fail;
1800 /* Apply UFFD write protection to the block memory range */
1801 if (uffd_change_protection(rs->uffdio_fd, block->host,
1802 block->max_length, true, false)) {
1803 goto fail;
1805 block->flags |= RAM_UF_WRITEPROTECT;
1806 memory_region_ref(block->mr);
1808 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1809 block->host, block->max_length);
1812 return 0;
1814 fail:
1815 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1817 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1818 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1819 continue;
1822 * In case some memory block failed to be write-protected,
1823 * remove protection and unregister all RAM blocks that succeeded
1825 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1826 false, false);
1827 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1828 /* Cleanup flags and remove reference */
1829 block->flags &= ~RAM_UF_WRITEPROTECT;
1830 memory_region_unref(block->mr);
1833 uffd_close_fd(uffd_fd);
1834 rs->uffdio_fd = -1;
1835 return -1;
1839 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1841 void ram_write_tracking_stop(void)
1843 RAMState *rs = ram_state;
1844 RAMBlock *block;
1846 RCU_READ_LOCK_GUARD();
1848 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1850 continue;
1852 /* Remove protection and unregister all affected RAM blocks */
1853 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1854 false, false);
1855 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1857 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1858 block->host, block->max_length);
1860 /* Cleanup flags and remove reference */
1861 block->flags &= ~RAM_UF_WRITEPROTECT;
1862 memory_region_unref(block->mr);
1865 /* Finally close UFFD file descriptor */
1866 uffd_close_fd(rs->uffdio_fd);
1867 rs->uffdio_fd = -1;
1870 #else
1871 /* No target OS support, stubs just fail or ignore */
1873 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1875 (void) rs;
1876 (void) offset;
1878 return NULL;
1881 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1882 unsigned long start_page)
1884 (void) rs;
1885 (void) pss;
1886 (void) start_page;
1888 return 0;
1891 bool ram_write_tracking_available(void)
1893 return false;
1896 bool ram_write_tracking_compatible(void)
1898 assert(0);
1899 return false;
1902 int ram_write_tracking_start(void)
1904 assert(0);
1905 return -1;
1908 void ram_write_tracking_stop(void)
1910 assert(0);
1912 #endif /* defined(__linux__) */
1915 * get_queued_page: unqueue a page from the postcopy requests
1917 * Skips pages that are already sent (!dirty)
1919 * Returns true if a queued page is found
1921 * @rs: current RAM state
1922 * @pss: data about the state of the current dirty page scan
1924 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1926 RAMBlock *block;
1927 ram_addr_t offset;
1928 bool dirty;
1930 do {
1931 block = unqueue_page(rs, &offset);
1933 * We're sending this page, and since it's postcopy nothing else
1934 * will dirty it, and we must make sure it doesn't get sent again
1935 * even if this queue request was received after the background
1936 * search already sent it.
1938 if (block) {
1939 unsigned long page;
1941 page = offset >> TARGET_PAGE_BITS;
1942 dirty = test_bit(page, block->bmap);
1943 if (!dirty) {
1944 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1945 page);
1946 } else {
1947 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1951 } while (block && !dirty);
1953 if (!block) {
1955 * Poll write faults too if background snapshot is enabled; that's
1956 * when vCPUs may get blocked by write-protected pages.
1958 block = poll_fault_page(rs, &offset);
1961 if (block) {
1963 * We want the background search to continue from the queued page
1964 * since the guest is likely to want other pages near to the page
1965 * it just requested.
1967 pss->block = block;
1968 pss->page = offset >> TARGET_PAGE_BITS;
1971 * This unqueued page would break the "one round" check, even if
1972 * it is really rare.
1974 pss->complete_round = false;
1977 return !!block;
1981 * migration_page_queue_free: drop any remaining pages in the ram
1982 * request queue
1984 * It should be empty at the end anyway, but in error cases there may
1985 * be some left. If any page is left, we drop it.
1988 static void migration_page_queue_free(RAMState *rs)
1990 struct RAMSrcPageRequest *mspr, *next_mspr;
1991 /* This queue generally should be empty - but in the case of a failed
1992 * migration might have some droppings in.
1994 RCU_READ_LOCK_GUARD();
1995 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1996 memory_region_unref(mspr->rb->mr);
1997 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1998 g_free(mspr);
2003 * ram_save_queue_pages: queue the page for transmission
2005 * A request from the postcopy destination, for example.
2007 * Returns zero on success or negative on error
2009 * @rbname: Name of the RAMBlock of the request. NULL means the
2010 * same as the last one.
2011 * @start: starting address from the start of the RAMBlock
2012 * @len: length (in bytes) to send
2014 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2016 RAMBlock *ramblock;
2017 RAMState *rs = ram_state;
2019 ram_counters.postcopy_requests++;
2020 RCU_READ_LOCK_GUARD();
2022 if (!rbname) {
2023 /* Reuse last RAMBlock */
2024 ramblock = rs->last_req_rb;
2026 if (!ramblock) {
2028 * Shouldn't happen, we can't reuse the last RAMBlock if
2029 * it's the 1st request.
2031 error_report("ram_save_queue_pages no previous block");
2032 return -1;
2034 } else {
2035 ramblock = qemu_ram_block_by_name(rbname);
2037 if (!ramblock) {
2038 /* We shouldn't be asked for a non-existent RAMBlock */
2039 error_report("ram_save_queue_pages no block '%s'", rbname);
2040 return -1;
2042 rs->last_req_rb = ramblock;
2044 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2045 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2046 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2047 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2048 __func__, start, len, ramblock->used_length);
2049 return -1;
2052 struct RAMSrcPageRequest *new_entry =
2053 g_malloc0(sizeof(struct RAMSrcPageRequest));
2054 new_entry->rb = ramblock;
2055 new_entry->offset = start;
2056 new_entry->len = len;
2058 memory_region_ref(ramblock->mr);
2059 qemu_mutex_lock(&rs->src_page_req_mutex);
2060 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2061 migration_make_urgent_request();
2062 qemu_mutex_unlock(&rs->src_page_req_mutex);
2064 return 0;
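/*
 * Usage sketch only (hypothetical values, error handling trimmed): roughly
 * how a postcopy page request for one faulting target page of a block named
 * "pc.ram" could be queued; 'fault_addr' is an illustrative name, not a
 * variable defined in this file.
 */
#if 0
    if (ram_save_queue_pages("pc.ram", fault_addr & TARGET_PAGE_MASK,
                             TARGET_PAGE_SIZE)) {
        /* unknown block or out-of-range request */
    }
#endif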
2067 static bool save_page_use_compression(RAMState *rs)
2069 if (!migrate_use_compression()) {
2070 return false;
2074 * If xbzrle is enabled (e.g., after first round of migration), stop
2075 * using the data compression. In theory, xbzrle can do better than
2076 * compression.
2078 if (rs->xbzrle_enabled) {
2079 return false;
2082 return true;
2086 * try to compress the page before posting it out, return true if the page
2087 * has been properly handled by compression, otherwise needs other
2088 * paths to handle it
2090 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2092 if (!save_page_use_compression(rs)) {
2093 return false;
2097 * When starting the process of a new block, the first page of
2098 * the block should be sent out before other pages in the same
2099 * block, and all the pages in the last block should have been sent
2100 * out. Keeping this order is important, because the 'cont' flag
2101 * is used to avoid resending the block name.
2103 * We post the first page as a normal page, as compression will take
2104 * much CPU resource.
2106 if (block != rs->last_sent_block) {
2107 flush_compressed_data(rs);
2108 return false;
2111 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2112 return true;
2115 compression_counters.busy++;
2116 return false;
2120 * ram_save_target_page: save one target page
2122 * Returns the number of pages written
2124 * @rs: current RAM state
2125 * @pss: data about the page we want to send
2127 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2129 RAMBlock *block = pss->block;
2130 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2131 int res;
2133 if (control_save_page(rs, block, offset, &res)) {
2134 return res;
2137 if (save_compress_page(rs, block, offset)) {
2138 return 1;
2141 res = save_zero_page(rs, block, offset);
2142 if (res > 0) {
2143 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2144 * page would be stale
2146 if (!save_page_use_compression(rs)) {
2147 XBZRLE_cache_lock();
2148 xbzrle_cache_zero_page(rs, block->offset + offset);
2149 XBZRLE_cache_unlock();
2151 return res;
2155 * Do not use multifd for:
2156 * 1. Compression, as the first page in a new block should be posted out
2157 * before sending the compressed page
2158 * 2. Postcopy, as one whole host page should be placed
2160 if (!save_page_use_compression(rs) && migrate_use_multifd()
2161 && !migration_in_postcopy()) {
2162 return ram_save_multifd_page(rs, block, offset);
2165 return ram_save_page(rs, pss);
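/*
 * Summary of the dispatch order above: the RDMA control path is tried
 * first, then the compression threads, then the zero-page shortcut, then
 * multifd (when the compression path is not in use and we are not in
 * postcopy), and finally the plain/xbzrle path in ram_save_page().
 */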
2169 * ram_save_host_page: save a whole host page
2171 * Starting at *offset send pages up to the end of the current host
2172 * page. It's valid for the initial offset to point into the middle of
2173 * a host page in which case the remainder of the hostpage is sent.
2174 * Only dirty target pages are sent. Note that the host page size may
2175 * be a huge page for this block.
2176 * The saving stops at the boundary of the used_length of the block
2177 * if the RAMBlock isn't a multiple of the host page size.
2179 * Returns the number of pages written or negative on error
2181 * @rs: current RAM state
2182 * @ms: current migration state
2183 * @pss: data about the page we want to send
2185 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2187 int tmppages, pages = 0;
2188 size_t pagesize_bits =
2189 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2190 unsigned long hostpage_boundary =
2191 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2192 unsigned long start_page = pss->page;
2193 int res;
2195 if (ramblock_is_ignored(pss->block)) {
2196 error_report("block %s should not be migrated !", pss->block->idstr);
2197 return 0;
2200 do {
2201 /* Check if the page is dirty and, if it is, send it */
2202 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2203 tmppages = ram_save_target_page(rs, pss);
2204 if (tmppages < 0) {
2205 return tmppages;
2208 pages += tmppages;
2210 * Allow rate limiting to happen in the middle of huge pages if
2211 * something is sent in the current iteration.
2213 if (pagesize_bits > 1 && tmppages > 0) {
2214 migration_rate_limit();
2217 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2218 } while ((pss->page < hostpage_boundary) &&
2219 offset_in_ramblock(pss->block,
2220 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2221 /* The offset we leave with is the min boundary of host page and block */
2222 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2224 res = ram_save_release_protection(rs, pss, start_page);
2225 return (res < 0 ? res : pages);
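/*
 * Worked example for the boundary arithmetic above, assuming 4KiB target
 * pages and a 2MiB hugepage-backed block: pagesize_bits = 2MiB / 4KiB = 512,
 * so a scan starting at pss->page = 700 gets
 * hostpage_boundary = QEMU_ALIGN_UP(701, 512) = 1024, i.e. it stops at the
 * end of the second huge page of the block.
 */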
2229 * ram_find_and_save_block: finds a dirty page and sends it to f
2231 * Called within an RCU critical section.
2233 * Returns the number of pages written where zero means no dirty pages,
2234 * or negative on error
2236 * @rs: current RAM state
2238 * On systems where host-page-size > target-page-size it will send all the
2239 * pages in a host page that are dirty.
2241 static int ram_find_and_save_block(RAMState *rs)
2243 PageSearchStatus pss;
2244 int pages = 0;
2245 bool again, found;
2247 /* No dirty page as there is zero RAM */
2248 if (!ram_bytes_total()) {
2249 return pages;
2252 pss.block = rs->last_seen_block;
2253 pss.page = rs->last_page;
2254 pss.complete_round = false;
2256 if (!pss.block) {
2257 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2260 do {
2261 again = true;
2262 found = get_queued_page(rs, &pss);
2264 if (!found) {
2265 /* priority queue empty, so just search for something dirty */
2266 found = find_dirty_block(rs, &pss, &again);
2269 if (found) {
2270 pages = ram_save_host_page(rs, &pss);
2272 } while (!pages && again);
2274 rs->last_seen_block = pss.block;
2275 rs->last_page = pss.page;
2277 return pages;
2280 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2282 uint64_t pages = size / TARGET_PAGE_SIZE;
2284 if (zero) {
2285 ram_counters.duplicate += pages;
2286 } else {
2287 ram_counters.normal += pages;
2288 ram_counters.transferred += size;
2289 qemu_update_position(f, size);
2293 static uint64_t ram_bytes_total_common(bool count_ignored)
2295 RAMBlock *block;
2296 uint64_t total = 0;
2298 RCU_READ_LOCK_GUARD();
2300 if (count_ignored) {
2301 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2302 total += block->used_length;
2304 } else {
2305 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2306 total += block->used_length;
2309 return total;
2312 uint64_t ram_bytes_total(void)
2314 return ram_bytes_total_common(false);
2317 static void xbzrle_load_setup(void)
2319 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2322 static void xbzrle_load_cleanup(void)
2324 g_free(XBZRLE.decoded_buf);
2325 XBZRLE.decoded_buf = NULL;
2328 static void ram_state_cleanup(RAMState **rsp)
2330 if (*rsp) {
2331 migration_page_queue_free(*rsp);
2332 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2333 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2334 g_free(*rsp);
2335 *rsp = NULL;
2339 static void xbzrle_cleanup(void)
2341 XBZRLE_cache_lock();
2342 if (XBZRLE.cache) {
2343 cache_fini(XBZRLE.cache);
2344 g_free(XBZRLE.encoded_buf);
2345 g_free(XBZRLE.current_buf);
2346 g_free(XBZRLE.zero_target_page);
2347 XBZRLE.cache = NULL;
2348 XBZRLE.encoded_buf = NULL;
2349 XBZRLE.current_buf = NULL;
2350 XBZRLE.zero_target_page = NULL;
2352 XBZRLE_cache_unlock();
2355 static void ram_save_cleanup(void *opaque)
2357 RAMState **rsp = opaque;
2358 RAMBlock *block;
2360 /* We don't use dirty log with background snapshots */
2361 if (!migrate_background_snapshot()) {
2362 /* The caller holds the iothread lock or is in a BH, so there is
2363 * no writing race against the migration bitmap
2365 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2367 * do not stop dirty log without starting it, since
2368 * memory_global_dirty_log_stop will assert that
2369 * memory_global_dirty_log_start/stop are used in pairs
2371 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2375 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2376 g_free(block->clear_bmap);
2377 block->clear_bmap = NULL;
2378 g_free(block->bmap);
2379 block->bmap = NULL;
2382 xbzrle_cleanup();
2383 compress_threads_save_cleanup();
2384 ram_state_cleanup(rsp);
2387 static void ram_state_reset(RAMState *rs)
2389 rs->last_seen_block = NULL;
2390 rs->last_sent_block = NULL;
2391 rs->last_page = 0;
2392 rs->last_version = ram_list.version;
2393 rs->xbzrle_enabled = false;
2396 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2399 * 'expected' is the value you expect the bitmap mostly to be full
2400 * of; it won't bother printing lines that are all this value.
2401 * If 'todump' is null the migration bitmap is dumped.
2403 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2404 unsigned long pages)
2406 int64_t cur;
2407 int64_t linelen = 128;
2408 char linebuf[129];
2410 for (cur = 0; cur < pages; cur += linelen) {
2411 int64_t curb;
2412 bool found = false;
2414 * Last line; catch the case where the line length
2415 * is longer than remaining ram
2417 if (cur + linelen > pages) {
2418 linelen = pages - cur;
2420 for (curb = 0; curb < linelen; curb++) {
2421 bool thisbit = test_bit(cur + curb, todump);
2422 linebuf[curb] = thisbit ? '1' : '.';
2423 found = found || (thisbit != expected);
2425 if (found) {
2426 linebuf[curb] = '\0';
2427 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
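/*
 * Example of the dump format produced above (with expected == false): a
 * line is printed only when at least one bit differs from 'expected', e.g.
 *
 *   0x00000080 : ....1111111111..................
 *
 * where '1' is a set bit, '.' a clear one, and the hex value is the index
 * of the first page on that line.
 */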
2432 /* **** functions for postcopy ***** */
2434 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2436 struct RAMBlock *block;
2438 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2439 unsigned long *bitmap = block->bmap;
2440 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2441 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2443 while (run_start < range) {
2444 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2445 ram_discard_range(block->idstr,
2446 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2447 ((ram_addr_t)(run_end - run_start))
2448 << TARGET_PAGE_BITS);
2449 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2455 * postcopy_send_discard_bm_ram: discard a RAMBlock
2457 * Returns zero on success
2459 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2461 * @ms: current migration state
2462 * @block: RAMBlock to discard
2464 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2466 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2467 unsigned long current;
2468 unsigned long *bitmap = block->bmap;
2470 for (current = 0; current < end; ) {
2471 unsigned long one = find_next_bit(bitmap, end, current);
2472 unsigned long zero, discard_length;
2474 if (one >= end) {
2475 break;
2478 zero = find_next_zero_bit(bitmap, end, one + 1);
2480 if (zero >= end) {
2481 discard_length = end - one;
2482 } else {
2483 discard_length = zero - one;
2485 postcopy_discard_send_range(ms, one, discard_length);
2486 current = one + discard_length;
2489 return 0;
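/*
 * Example of the run extraction above: for a bitmap where only pages 2-5
 * are dirty, the loop finds one = 2 and zero = 6, and emits a single
 * postcopy_discard_send_range(ms, 2, 4) covering the whole run before
 * continuing the scan at page 6.
 */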
2493 * postcopy_each_ram_send_discard: discard all RAMBlocks
2495 * Returns 0 for success or negative for error
2497 * Utility for the outgoing postcopy code.
2498 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2499 * passing it bitmap indexes and name.
2500 * (qemu_ram_foreach_block ends up passing unscaled lengths
2501 * which would mean postcopy code would have to deal with target page)
2503 * @ms: current migration state
2505 static int postcopy_each_ram_send_discard(MigrationState *ms)
2507 struct RAMBlock *block;
2508 int ret;
2510 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2511 postcopy_discard_send_init(ms, block->idstr);
2514 * Postcopy sends chunks of bitmap over the wire, but it
2515 * just needs indexes at this point, which avoids it having
2516 * target-page-specific code.
2518 ret = postcopy_send_discard_bm_ram(ms, block);
2519 postcopy_discard_send_finish(ms);
2520 if (ret) {
2521 return ret;
2525 return 0;
2529 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2531 * Helper for postcopy_chunk_hostpages; it's called twice to
2532 * canonicalize the two bitmaps, which are similar, but one is
2533 * inverted.
2535 * Postcopy requires that all target pages in a hostpage are dirty or
2536 * clean, not a mix. This function canonicalizes the bitmaps.
2538 * @ms: current migration state
2539 * @block: block that contains the page we want to canonicalize
2541 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2543 RAMState *rs = ram_state;
2544 unsigned long *bitmap = block->bmap;
2545 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2546 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2547 unsigned long run_start;
2549 if (block->page_size == TARGET_PAGE_SIZE) {
2550 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2551 return;
2554 /* Find a dirty page */
2555 run_start = find_next_bit(bitmap, pages, 0);
2557 while (run_start < pages) {
2560 * If the start of this run of pages is in the middle of a host
2561 * page, then we need to fixup this host page.
2563 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2564 /* Find the end of this run */
2565 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2567 * If the end isn't at the start of a host page, then the
2568 * run doesn't finish at the end of a host page
2569 * and we need to discard.
2573 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2574 unsigned long page;
2575 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2576 host_ratio);
2577 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2579 /* Clean up the bitmap */
2580 for (page = fixup_start_addr;
2581 page < fixup_start_addr + host_ratio; page++) {
2583 * Remark them as dirty, updating the count for any pages
2584 * that weren't previously dirty.
2586 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2590 /* Find the next dirty page for the next iteration */
2591 run_start = find_next_bit(bitmap, pages, run_start);
2596 * postcopy_chunk_hostpages: discard any partially sent host page
2598 * Utility for the outgoing postcopy code.
2600 * Discard any partially sent host-page size chunks, mark any partially
2601 * dirty host-page size chunks as all dirty. In this case the host-page
2602 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2604 * Returns zero on success
2606 * @ms: current migration state
2607 * @block: block we want to work with
2609 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2611 postcopy_discard_send_init(ms, block->idstr);
2614 * Ensure that all partially dirty host pages are made fully dirty.
2616 postcopy_chunk_hostpages_pass(ms, block);
2618 postcopy_discard_send_finish(ms);
2619 return 0;
2623 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2625 * Returns zero on success
2627 * Transmit the set of pages to be discarded after precopy to the target.
2628 * These are pages that:
2629 * a) Have been previously transmitted but are now dirty again
2630 * b) Pages that have never been transmitted, this ensures that
2631 * any pages on the destination that have been mapped by background
2632 * tasks get discarded (transparent huge pages is the specific concern)
2633 * Hopefully this is pretty sparse
2635 * @ms: current migration state
2637 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2639 RAMState *rs = ram_state;
2640 RAMBlock *block;
2641 int ret;
2643 RCU_READ_LOCK_GUARD();
2645 /* This should be our last sync, the src is now paused */
2646 migration_bitmap_sync(rs);
2648 /* Easiest way to make sure we don't resume in the middle of a host-page */
2649 rs->last_seen_block = NULL;
2650 rs->last_sent_block = NULL;
2651 rs->last_page = 0;
2653 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2654 /* Deal with TPS != HPS and huge pages */
2655 ret = postcopy_chunk_hostpages(ms, block);
2656 if (ret) {
2657 return ret;
2660 #ifdef DEBUG_POSTCOPY
2661 ram_debug_dump_bitmap(block->bmap, true,
2662 block->used_length >> TARGET_PAGE_BITS);
2663 #endif
2665 trace_ram_postcopy_send_discard_bitmap();
2667 return postcopy_each_ram_send_discard(ms);
2671 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2673 * Returns zero on success
2675 * @rbname: name of the RAMBlock of the request. NULL means the
2676 * same as the last one.
2677 * @start: RAMBlock starting page
2678 * @length: RAMBlock size
2680 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2682 trace_ram_discard_range(rbname, start, length);
2684 RCU_READ_LOCK_GUARD();
2685 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2687 if (!rb) {
2688 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2689 return -1;
2693 * On source VM, we don't need to update the received bitmap since
2694 * we don't even have one.
2696 if (rb->receivedmap) {
2697 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2698 length >> qemu_target_page_bits());
2701 return ram_block_discard_range(rb, start, length);
2705 * For every allocation, we will try not to crash the VM if the
2706 * allocation fails.
2708 static int xbzrle_init(void)
2710 Error *local_err = NULL;
2712 if (!migrate_use_xbzrle()) {
2713 return 0;
2716 XBZRLE_cache_lock();
2718 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2719 if (!XBZRLE.zero_target_page) {
2720 error_report("%s: Error allocating zero page", __func__);
2721 goto err_out;
2724 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2725 TARGET_PAGE_SIZE, &local_err);
2726 if (!XBZRLE.cache) {
2727 error_report_err(local_err);
2728 goto free_zero_page;
2731 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2732 if (!XBZRLE.encoded_buf) {
2733 error_report("%s: Error allocating encoded_buf", __func__);
2734 goto free_cache;
2737 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2738 if (!XBZRLE.current_buf) {
2739 error_report("%s: Error allocating current_buf", __func__);
2740 goto free_encoded_buf;
2743 /* We are all good */
2744 XBZRLE_cache_unlock();
2745 return 0;
2747 free_encoded_buf:
2748 g_free(XBZRLE.encoded_buf);
2749 XBZRLE.encoded_buf = NULL;
2750 free_cache:
2751 cache_fini(XBZRLE.cache);
2752 XBZRLE.cache = NULL;
2753 free_zero_page:
2754 g_free(XBZRLE.zero_target_page);
2755 XBZRLE.zero_target_page = NULL;
2756 err_out:
2757 XBZRLE_cache_unlock();
2758 return -ENOMEM;
2761 static int ram_state_init(RAMState **rsp)
2763 *rsp = g_try_new0(RAMState, 1);
2765 if (!*rsp) {
2766 error_report("%s: Init ramstate fail", __func__);
2767 return -1;
2770 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2771 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2772 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2775 * Count the total number of pages used by ram blocks not including any
2776 * gaps due to alignment or unplugs.
2777 * This must match with the initial values of dirty bitmap.
2779 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2780 ram_state_reset(*rsp);
2782 return 0;
2785 static void ram_list_init_bitmaps(void)
2787 MigrationState *ms = migrate_get_current();
2788 RAMBlock *block;
2789 unsigned long pages;
2790 uint8_t shift;
2792 /* Skip setting bitmap if there is no RAM */
2793 if (ram_bytes_total()) {
2794 shift = ms->clear_bitmap_shift;
2795 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2796 error_report("clear_bitmap_shift (%u) too big, using "
2797 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2798 shift = CLEAR_BITMAP_SHIFT_MAX;
2799 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2800 error_report("clear_bitmap_shift (%u) too small, using "
2801 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2802 shift = CLEAR_BITMAP_SHIFT_MIN;
2805 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2806 pages = block->max_length >> TARGET_PAGE_BITS;
2808 * The initial dirty bitmap for migration must be set with all
2809 * ones to make sure we'll migrate every guest RAM page to
2810 * destination.
2811 * Here we set RAMBlock.bmap all to 1 because when restarting a
2812 * new migration after a failed one, ram_list.
2813 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2814 * guest memory.
2816 block->bmap = bitmap_new(pages);
2817 bitmap_set(block->bmap, 0, pages);
2818 block->clear_bmap_shift = shift;
2819 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2824 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2826 unsigned long pages;
2827 RAMBlock *rb;
2829 RCU_READ_LOCK_GUARD();
2831 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2832 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2833 rs->migration_dirty_pages -= pages;
2837 static void ram_init_bitmaps(RAMState *rs)
2839 /* For memory_global_dirty_log_start below. */
2840 qemu_mutex_lock_iothread();
2841 qemu_mutex_lock_ramlist();
2843 WITH_RCU_READ_LOCK_GUARD() {
2844 ram_list_init_bitmaps();
2845 /* We don't use dirty log with background snapshots */
2846 if (!migrate_background_snapshot()) {
2847 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2848 migration_bitmap_sync_precopy(rs);
2851 qemu_mutex_unlock_ramlist();
2852 qemu_mutex_unlock_iothread();
2855 * After an eventual first bitmap sync, fixup the initial bitmap
2856 * containing all 1s to exclude any discarded pages from migration.
2858 migration_bitmap_clear_discarded_pages(rs);
2861 static int ram_init_all(RAMState **rsp)
2863 if (ram_state_init(rsp)) {
2864 return -1;
2867 if (xbzrle_init()) {
2868 ram_state_cleanup(rsp);
2869 return -1;
2872 ram_init_bitmaps(*rsp);
2874 return 0;
2877 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2879 RAMBlock *block;
2880 uint64_t pages = 0;
2883 * Postcopy is not using xbzrle/compression, so no need for that.
2884 * Also, since the source is already halted, we don't need to care
2885 * about dirty page logging either.
2888 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2889 pages += bitmap_count_one(block->bmap,
2890 block->used_length >> TARGET_PAGE_BITS);
2893 /* This may not be aligned with current bitmaps. Recalculate. */
2894 rs->migration_dirty_pages = pages;
2896 ram_state_reset(rs);
2898 /* Update RAMState cache of output QEMUFile */
2899 rs->f = out;
2901 trace_ram_state_resume_prepare(pages);
2905 * This function clears bits of the free pages reported by the caller from the
2906 * migration dirty bitmap. @addr is the host address corresponding to the
2907 * start of the continuous guest free pages, and @len is the total bytes of
2908 * those pages.
2910 void qemu_guest_free_page_hint(void *addr, size_t len)
2912 RAMBlock *block;
2913 ram_addr_t offset;
2914 size_t used_len, start, npages;
2915 MigrationState *s = migrate_get_current();
2917 /* This function is currently expected to be used during live migration */
2918 if (!migration_is_setup_or_active(s->state)) {
2919 return;
2922 for (; len > 0; len -= used_len, addr += used_len) {
2923 block = qemu_ram_block_from_host(addr, false, &offset);
2924 if (unlikely(!block || offset >= block->used_length)) {
2926 * The implementation might not support RAMBlock resize during
2927 * live migration, but it could happen in theory with future
2928 * updates. So we add a check here to capture that case.
2930 error_report_once("%s unexpected error", __func__);
2931 return;
2934 if (len <= block->used_length - offset) {
2935 used_len = len;
2936 } else {
2937 used_len = block->used_length - offset;
2940 start = offset >> TARGET_PAGE_BITS;
2941 npages = used_len >> TARGET_PAGE_BITS;
2943 qemu_mutex_lock(&ram_state->bitmap_mutex);
2945 * The skipped free pages are equivalent to having been sent from clear_bmap's
2946 * perspective, so clear the bits from the memory region bitmap which
2947 * are initially set. Otherwise those skipped pages will be sent in
2948 * the next round after syncing from the memory region bitmap.
2950 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2951 ram_state->migration_dirty_pages -=
2952 bitmap_count_one_with_offset(block->bmap, start, npages);
2953 bitmap_clear(block->bmap, start, npages);
2954 qemu_mutex_unlock(&ram_state->bitmap_mutex);
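/*
 * Usage sketch only (the real caller is the virtio-balloon free page hint
 * path): 'free_range_host_addr' is an illustrative name, not something
 * defined in this file.
 */
#if 0
    qemu_guest_free_page_hint(free_range_host_addr, 64 * TARGET_PAGE_SIZE);
#endif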
2959 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2960 * long-running RCU critical section. When RCU reclaims in the code
2961 * start to become numerous it will be necessary to reduce the
2962 * granularity of these critical sections.
2966 * ram_save_setup: Setup RAM for migration
2968 * Returns zero to indicate success and negative for error
2970 * @f: QEMUFile where to send the data
2971 * @opaque: RAMState pointer
2973 static int ram_save_setup(QEMUFile *f, void *opaque)
2975 RAMState **rsp = opaque;
2976 RAMBlock *block;
2978 if (compress_threads_save_setup()) {
2979 return -1;
2982 /* migration has already setup the bitmap, reuse it. */
2983 if (!migration_in_colo_state()) {
2984 if (ram_init_all(rsp) != 0) {
2985 compress_threads_save_cleanup();
2986 return -1;
2989 (*rsp)->f = f;
2991 WITH_RCU_READ_LOCK_GUARD() {
2992 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2994 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2995 qemu_put_byte(f, strlen(block->idstr));
2996 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2997 qemu_put_be64(f, block->used_length);
2998 if (migrate_postcopy_ram() && block->page_size !=
2999 qemu_host_page_size) {
3000 qemu_put_be64(f, block->page_size);
3002 if (migrate_ignore_shared()) {
3003 qemu_put_be64(f, block->mr->addr);
3008 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3009 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3011 multifd_send_sync_main(f);
3012 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3013 qemu_fflush(f);
3015 return 0;
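/*
 * Rough shape of what ram_save_setup() has just put on the wire (optional
 * fields depend on the negotiated capabilities):
 *
 *   be64  total ram bytes | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *     u8    strlen(idstr)
 *     bytes idstr
 *     be64  used_length
 *     be64  page_size      (only with postcopy-ram and non-host page size)
 *     be64  mr->addr       (only with ignore-shared)
 *   be64  RAM_SAVE_FLAG_EOS
 */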
3019 * ram_save_iterate: iterative stage for migration
3021 * Returns zero to indicate success and negative for error
3023 * @f: QEMUFile where to send the data
3024 * @opaque: RAMState pointer
3026 static int ram_save_iterate(QEMUFile *f, void *opaque)
3028 RAMState **temp = opaque;
3029 RAMState *rs = *temp;
3030 int ret = 0;
3031 int i;
3032 int64_t t0;
3033 int done = 0;
3035 if (blk_mig_bulk_active()) {
3036 /* Avoid transferring ram during bulk phase of block migration as
3037 * the bulk phase will usually take a long time and transferring
3038 * ram updates during that time is pointless. */
3039 goto out;
3043 * We'll hold this lock for a while, but it's okay for two reasons.
3044 * Firstly, the only possible other thread to take it is the one that calls
3045 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3046 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3047 * guarantees that we'll at least release it on a regular basis.
3049 qemu_mutex_lock(&rs->bitmap_mutex);
3050 WITH_RCU_READ_LOCK_GUARD() {
3051 if (ram_list.version != rs->last_version) {
3052 ram_state_reset(rs);
3055 /* Read version before ram_list.blocks */
3056 smp_rmb();
3058 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3060 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3061 i = 0;
3062 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3063 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3064 int pages;
3066 if (qemu_file_get_error(f)) {
3067 break;
3070 pages = ram_find_and_save_block(rs);
3071 /* no more pages to send */
3072 if (pages == 0) {
3073 done = 1;
3074 break;
3077 if (pages < 0) {
3078 qemu_file_set_error(f, pages);
3079 break;
3082 rs->target_page_count += pages;
3085 * During postcopy, it is necessary to make sure one whole host
3086 * page is sent in one chunk.
3088 if (migrate_postcopy_ram()) {
3089 flush_compressed_data(rs);
3093 * we want to check in the 1st loop, just in case it was the 1st
3094 * time and we had to sync the dirty bitmap.
3095 * qemu_clock_get_ns() is a bit expensive, so we only check every
3096 * few iterations
3098 if ((i & 63) == 0) {
3099 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3100 1000000;
3101 if (t1 > MAX_WAIT) {
3102 trace_ram_save_iterate_big_wait(t1, i);
3103 break;
3106 i++;
3109 qemu_mutex_unlock(&rs->bitmap_mutex);
3112 * Must occur before EOS (or any QEMUFile operation)
3113 * because of RDMA protocol.
3115 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3117 out:
3118 if (ret >= 0
3119 && migration_is_setup_or_active(migrate_get_current()->state)) {
3120 multifd_send_sync_main(rs->f);
3121 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3122 qemu_fflush(f);
3123 ram_counters.transferred += 8;
3125 ret = qemu_file_get_error(f);
3127 if (ret < 0) {
3128 return ret;
3131 return done;
3135 * ram_save_complete: function called to send the remaining amount of ram
3137 * Returns zero to indicate success or negative on error
3139 * Called with the iothread lock held
3141 * @f: QEMUFile where to send the data
3142 * @opaque: RAMState pointer
3144 static int ram_save_complete(QEMUFile *f, void *opaque)
3146 RAMState **temp = opaque;
3147 RAMState *rs = *temp;
3148 int ret = 0;
3150 rs->last_stage = !migration_in_colo_state();
3152 WITH_RCU_READ_LOCK_GUARD() {
3153 if (!migration_in_postcopy()) {
3154 migration_bitmap_sync_precopy(rs);
3157 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3159 /* try transferring iterative blocks of memory */
3161 /* flush all remaining blocks regardless of rate limiting */
3162 while (true) {
3163 int pages;
3165 pages = ram_find_and_save_block(rs);
3167 /* no more blocks to send */
3167 if (pages == 0) {
3168 break;
3170 if (pages < 0) {
3171 ret = pages;
3172 break;
3176 flush_compressed_data(rs);
3177 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3180 if (ret >= 0) {
3181 multifd_send_sync_main(rs->f);
3182 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3183 qemu_fflush(f);
3186 return ret;
3189 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3190 uint64_t *res_precopy_only,
3191 uint64_t *res_compatible,
3192 uint64_t *res_postcopy_only)
3194 RAMState **temp = opaque;
3195 RAMState *rs = *temp;
3196 uint64_t remaining_size;
3198 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3200 if (!migration_in_postcopy() &&
3201 remaining_size < max_size) {
3202 qemu_mutex_lock_iothread();
3203 WITH_RCU_READ_LOCK_GUARD() {
3204 migration_bitmap_sync_precopy(rs);
3206 qemu_mutex_unlock_iothread();
3207 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3210 if (migrate_postcopy_ram()) {
3211 /* We can do postcopy, and all the data is postcopiable */
3212 *res_compatible += remaining_size;
3213 } else {
3214 *res_precopy_only += remaining_size;
3218 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3220 unsigned int xh_len;
3221 int xh_flags;
3222 uint8_t *loaded_data;
3224 /* extract RLE header */
3225 xh_flags = qemu_get_byte(f);
3226 xh_len = qemu_get_be16(f);
3228 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3229 error_report("Failed to load XBZRLE page - wrong compression!");
3230 return -1;
3233 if (xh_len > TARGET_PAGE_SIZE) {
3234 error_report("Failed to load XBZRLE page - len overflow!");
3235 return -1;
3237 loaded_data = XBZRLE.decoded_buf;
3238 /* load data and decode */
3239 /* it can change loaded_data to point to an internal buffer */
3240 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3242 /* decode RLE */
3243 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3244 TARGET_PAGE_SIZE) == -1) {
3245 error_report("Failed to load XBZRLE page - decode error!");
3246 return -1;
3249 return 0;
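/*
 * On-the-wire layout consumed by load_xbzrle() above, following the page
 * header:
 *
 *   u8    xh_flags   -- must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len     -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE delta, decoded against the current
 *         contents of 'host'
 */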
3253 * ram_block_from_stream: read a RAMBlock id from the migration stream
3255 * Must be called from within an RCU critical section.
3257 * Returns a pointer from within the RCU-protected ram_list.
3259 * @f: QEMUFile where to read the data from
3260 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3262 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3264 static RAMBlock *block;
3265 char id[256];
3266 uint8_t len;
3268 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3269 if (!block) {
3270 error_report("Ack, bad migration stream!");
3271 return NULL;
3273 return block;
3276 len = qemu_get_byte(f);
3277 qemu_get_buffer(f, (uint8_t *)id, len);
3278 id[len] = 0;
3280 block = qemu_ram_block_by_name(id);
3281 if (!block) {
3282 error_report("Can't find block %s", id);
3283 return NULL;
3286 if (ramblock_is_ignored(block)) {
3287 error_report("block %s should not be migrated !", id);
3288 return NULL;
3291 return block;
3294 static inline void *host_from_ram_block_offset(RAMBlock *block,
3295 ram_addr_t offset)
3297 if (!offset_in_ramblock(block, offset)) {
3298 return NULL;
3301 return block->host + offset;
3304 static void *host_page_from_ram_block_offset(RAMBlock *block,
3305 ram_addr_t offset)
3307 /* Note: Explicitly no check against offset_in_ramblock(). */
3308 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3309 block->page_size);
3312 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3313 ram_addr_t offset)
3315 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3318 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3319 ram_addr_t offset, bool record_bitmap)
3321 if (!offset_in_ramblock(block, offset)) {
3322 return NULL;
3324 if (!block->colo_cache) {
3325 error_report("%s: colo_cache is NULL in block :%s",
3326 __func__, block->idstr);
3327 return NULL;
3331 * During colo checkpoint, we need bitmap of these migrated pages.
3332 * It help us to decide which pages in ram cache should be flushed
3333 * into VM's RAM later.
3335 if (record_bitmap &&
3336 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3337 ram_state->migration_dirty_pages++;
3339 return block->colo_cache + offset;
3343 * ram_handle_compressed: handle the zero page case
3345 * If a page (or a whole RDMA chunk) has been
3346 * determined to be zero, then zap it.
3348 * @host: host address for the zero page
3349 * @ch: what the page is filled with; we only support zero
3350 * @size: size of the zero page
3352 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3354 if (ch != 0 || !buffer_is_zero(host, size)) {
3355 memset(host, ch, size);
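/*
 * Note on the check above: for ch == 0 the memset is skipped whenever the
 * page already reads as zero, which avoids touching (and thus allocating or
 * dirtying) pages the destination never wrote to.
 */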
3359 /* return the size after decompression, or negative value on error */
3360 static int
3361 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3362 const uint8_t *source, size_t source_len)
3364 int err;
3366 err = inflateReset(stream);
3367 if (err != Z_OK) {
3368 return -1;
3371 stream->avail_in = source_len;
3372 stream->next_in = (uint8_t *)source;
3373 stream->avail_out = dest_len;
3374 stream->next_out = dest;
3376 err = inflate(stream, Z_NO_FLUSH);
3377 if (err != Z_STREAM_END) {
3378 return -1;
3381 return stream->total_out;
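/*
 * The z_stream passed in above is set up once per decompress thread with
 * inflateInit() in compress_threads_load_setup() and recycled here with
 * inflateReset() for every page, so no per-page zlib allocation is needed.
 */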
3384 static void *do_data_decompress(void *opaque)
3386 DecompressParam *param = opaque;
3387 unsigned long pagesize;
3388 uint8_t *des;
3389 int len, ret;
3391 qemu_mutex_lock(&param->mutex);
3392 while (!param->quit) {
3393 if (param->des) {
3394 des = param->des;
3395 len = param->len;
3396 param->des = 0;
3397 qemu_mutex_unlock(&param->mutex);
3399 pagesize = TARGET_PAGE_SIZE;
3401 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3402 param->compbuf, len);
3403 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3404 error_report("decompress data failed");
3405 qemu_file_set_error(decomp_file, ret);
3408 qemu_mutex_lock(&decomp_done_lock);
3409 param->done = true;
3410 qemu_cond_signal(&decomp_done_cond);
3411 qemu_mutex_unlock(&decomp_done_lock);
3413 qemu_mutex_lock(&param->mutex);
3414 } else {
3415 qemu_cond_wait(&param->cond, &param->mutex);
3418 qemu_mutex_unlock(&param->mutex);
3420 return NULL;
3423 static int wait_for_decompress_done(void)
3425 int idx, thread_count;
3427 if (!migrate_use_compression()) {
3428 return 0;
3431 thread_count = migrate_decompress_threads();
3432 qemu_mutex_lock(&decomp_done_lock);
3433 for (idx = 0; idx < thread_count; idx++) {
3434 while (!decomp_param[idx].done) {
3435 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3438 qemu_mutex_unlock(&decomp_done_lock);
3439 return qemu_file_get_error(decomp_file);
3442 static void compress_threads_load_cleanup(void)
3444 int i, thread_count;
3446 if (!migrate_use_compression()) {
3447 return;
3449 thread_count = migrate_decompress_threads();
3450 for (i = 0; i < thread_count; i++) {
3452 * we use it as an indicator of whether the thread is
3453 * properly initialized or not
3455 if (!decomp_param[i].compbuf) {
3456 break;
3459 qemu_mutex_lock(&decomp_param[i].mutex);
3460 decomp_param[i].quit = true;
3461 qemu_cond_signal(&decomp_param[i].cond);
3462 qemu_mutex_unlock(&decomp_param[i].mutex);
3464 for (i = 0; i < thread_count; i++) {
3465 if (!decomp_param[i].compbuf) {
3466 break;
3469 qemu_thread_join(decompress_threads + i);
3470 qemu_mutex_destroy(&decomp_param[i].mutex);
3471 qemu_cond_destroy(&decomp_param[i].cond);
3472 inflateEnd(&decomp_param[i].stream);
3473 g_free(decomp_param[i].compbuf);
3474 decomp_param[i].compbuf = NULL;
3476 g_free(decompress_threads);
3477 g_free(decomp_param);
3478 decompress_threads = NULL;
3479 decomp_param = NULL;
3480 decomp_file = NULL;
3483 static int compress_threads_load_setup(QEMUFile *f)
3485 int i, thread_count;
3487 if (!migrate_use_compression()) {
3488 return 0;
3491 thread_count = migrate_decompress_threads();
3492 decompress_threads = g_new0(QemuThread, thread_count);
3493 decomp_param = g_new0(DecompressParam, thread_count);
3494 qemu_mutex_init(&decomp_done_lock);
3495 qemu_cond_init(&decomp_done_cond);
3496 decomp_file = f;
3497 for (i = 0; i < thread_count; i++) {
3498 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3499 goto exit;
3502 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3503 qemu_mutex_init(&decomp_param[i].mutex);
3504 qemu_cond_init(&decomp_param[i].cond);
3505 decomp_param[i].done = true;
3506 decomp_param[i].quit = false;
3507 qemu_thread_create(decompress_threads + i, "decompress",
3508 do_data_decompress, decomp_param + i,
3509 QEMU_THREAD_JOINABLE);
3511 return 0;
3512 exit:
3513 compress_threads_load_cleanup();
3514 return -1;
3517 static void decompress_data_with_multi_threads(QEMUFile *f,
3518 void *host, int len)
3520 int idx, thread_count;
3522 thread_count = migrate_decompress_threads();
3523 QEMU_LOCK_GUARD(&decomp_done_lock);
3524 while (true) {
3525 for (idx = 0; idx < thread_count; idx++) {
3526 if (decomp_param[idx].done) {
3527 decomp_param[idx].done = false;
3528 qemu_mutex_lock(&decomp_param[idx].mutex);
3529 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3530 decomp_param[idx].des = host;
3531 decomp_param[idx].len = len;
3532 qemu_cond_signal(&decomp_param[idx].cond);
3533 qemu_mutex_unlock(&decomp_param[idx].mutex);
3534 break;
3537 if (idx < thread_count) {
3538 break;
3539 } else {
3540 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
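/*
 * Hand-off protocol used above: 'done' is protected by decomp_done_lock and
 * flips to false here and back to true in do_data_decompress(); the
 * destination buffer and length are handed over under the per-thread mutex
 * and cleared by the worker once consumed.
 */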
3545 static void colo_init_ram_state(void)
3547 ram_state_init(&ram_state);
3551 * colo cache: this is for the secondary VM; we cache the whole
3552 * memory of the secondary VM. The global lock must be held
3553 * to call this helper.
3555 int colo_init_ram_cache(void)
3557 RAMBlock *block;
3559 WITH_RCU_READ_LOCK_GUARD() {
3560 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3561 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3562 NULL, false, false);
3563 if (!block->colo_cache) {
3564 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3565 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3566 block->used_length);
3567 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3568 if (block->colo_cache) {
3569 qemu_anon_ram_free(block->colo_cache, block->used_length);
3570 block->colo_cache = NULL;
3573 return -errno;
3575 if (!machine_dump_guest_core(current_machine)) {
3576 qemu_madvise(block->colo_cache, block->used_length,
3577 QEMU_MADV_DONTDUMP);
3583 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3584 * to decide which pages in the cache should be flushed into the SVM's RAM.
3585 * Here we use the same name 'ram_bitmap' as for migration.
3587 if (ram_bytes_total()) {
3588 RAMBlock *block;
3590 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3591 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3592 block->bmap = bitmap_new(pages);
3596 colo_init_ram_state();
3597 return 0;
3600 /* TODO: duplicated with ram_init_bitmaps */
3601 void colo_incoming_start_dirty_log(void)
3603 RAMBlock *block = NULL;
3604 /* For memory_global_dirty_log_start below. */
3605 qemu_mutex_lock_iothread();
3606 qemu_mutex_lock_ramlist();
3608 memory_global_dirty_log_sync();
3609 WITH_RCU_READ_LOCK_GUARD() {
3610 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3611 ramblock_sync_dirty_bitmap(ram_state, block);
3612 /* Discard this dirty bitmap record */
3613 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3615 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3617 ram_state->migration_dirty_pages = 0;
3618 qemu_mutex_unlock_ramlist();
3619 qemu_mutex_unlock_iothread();
3622 /* The global lock must be held to call this helper */
3623 void colo_release_ram_cache(void)
3625 RAMBlock *block;
3627 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3628 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3629 g_free(block->bmap);
3630 block->bmap = NULL;
3633 WITH_RCU_READ_LOCK_GUARD() {
3634 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3635 if (block->colo_cache) {
3636 qemu_anon_ram_free(block->colo_cache, block->used_length);
3637 block->colo_cache = NULL;
3641 ram_state_cleanup(&ram_state);
3645 * ram_load_setup: Setup RAM for migration incoming side
3647 * Returns zero to indicate success and negative for error
3649 * @f: QEMUFile where to receive the data
3650 * @opaque: RAMState pointer
3652 static int ram_load_setup(QEMUFile *f, void *opaque)
3654 if (compress_threads_load_setup(f)) {
3655 return -1;
3658 xbzrle_load_setup();
3659 ramblock_recv_map_init();
3661 return 0;
3664 static int ram_load_cleanup(void *opaque)
3666 RAMBlock *rb;
3668 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3669 qemu_ram_block_writeback(rb);
3672 xbzrle_load_cleanup();
3673 compress_threads_load_cleanup();
3675 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3676 g_free(rb->receivedmap);
3677 rb->receivedmap = NULL;
3680 return 0;
3684 * ram_postcopy_incoming_init: allocate postcopy data structures
3686 * Returns 0 for success and negative if there was an error
3688 * @mis: current migration incoming state
3690 * Allocate data structures etc needed by incoming migration with
3691 * postcopy-ram. postcopy-ram's similarly named
3692 * postcopy_ram_incoming_init does the work.
3694 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3696 return postcopy_ram_incoming_init(mis);
3700 * ram_load_postcopy: load a page in postcopy case
3702 * Returns 0 for success or -errno in case of error
3704 * Called in postcopy mode by ram_load().
3705 * rcu_read_lock is taken prior to this being called.
3707 * @f: QEMUFile where to send the data
3709 static int ram_load_postcopy(QEMUFile *f)
3711 int flags = 0, ret = 0;
3712 bool place_needed = false;
3713 bool matches_target_page_size = false;
3714 MigrationIncomingState *mis = migration_incoming_get_current();
3715 /* Temporary page that is later 'placed' */
3716 void *postcopy_host_page = mis->postcopy_tmp_page;
3717 void *host_page = NULL;
3718 bool all_zero = true;
3719 int target_pages = 0;
3721 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3722 ram_addr_t addr;
3723 void *page_buffer = NULL;
3724 void *place_source = NULL;
3725 RAMBlock *block = NULL;
3726 uint8_t ch;
3727 int len;
3729 addr = qemu_get_be64(f);
3732 * If qemu file error, we should stop here, and then "addr"
3733 * may be invalid
3735 ret = qemu_file_get_error(f);
3736 if (ret) {
3737 break;
3740 flags = addr & ~TARGET_PAGE_MASK;
3741 addr &= TARGET_PAGE_MASK;
3743 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3744 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3745 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3746 block = ram_block_from_stream(f, flags);
3747 if (!block) {
3748 ret = -EINVAL;
3749 break;
3753 * Relying on used_length is racy and can result in false positives.
3754 * We might place pages beyond used_length in case RAM was shrunk
3755 * while in postcopy, which is fine - trying to place via
3756 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3758 if (!block->host || addr >= block->postcopy_length) {
3759 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3760 ret = -EINVAL;
3761 break;
3763 target_pages++;
3764 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3766 * Postcopy requires that we place whole host pages atomically;
3767 * these may be huge pages for RAMBlocks that are backed by
3768 * hugetlbfs.
3769 * To make it atomic, the data is read into a temporary page
3770 * that's moved into place later.
3771 * The migration protocol uses possibly smaller target pages;
3772 * however, the source ensures it always sends all the components
3773 * of a host page in one chunk.
3775 page_buffer = postcopy_host_page +
3776 host_page_offset_from_ram_block_offset(block, addr);
3777 /* If all TP are zero then we can optimise the place */
3778 if (target_pages == 1) {
3779 host_page = host_page_from_ram_block_offset(block, addr);
3780 } else if (host_page != host_page_from_ram_block_offset(block,
3781 addr)) {
3782 /* not the 1st TP within the HP */
3783 error_report("Non-same host page %p/%p", host_page,
3784 host_page_from_ram_block_offset(block, addr));
3785 ret = -EINVAL;
3786 break;
3790 * If it's the last part of a host page then we place the host
3791 * page
3793 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3794 place_needed = true;
3796 place_source = postcopy_host_page;
3799 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3800 case RAM_SAVE_FLAG_ZERO:
3801 ch = qemu_get_byte(f);
3803 * We can skip setting page_buffer when
3804 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3806 if (ch || !matches_target_page_size) {
3807 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3809 if (ch) {
3810 all_zero = false;
3812 break;
3814 case RAM_SAVE_FLAG_PAGE:
3815 all_zero = false;
3816 if (!matches_target_page_size) {
3817 /* For huge pages, we always use temporary buffer */
3818 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3819 } else {
3821 * For small pages that match the target page size, we
3822 * avoid the qemu_file copy. Instead we directly use
3823 * the buffer of QEMUFile to place the page. Note: we
3824 * cannot do any QEMUFile operation before using that
3825 * buffer to make sure the buffer is valid when
3826 * placing the page.
3828 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3829 TARGET_PAGE_SIZE);
3831 break;
3832 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3833 all_zero = false;
3834 len = qemu_get_be32(f);
3835 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3836 error_report("Invalid compressed data length: %d", len);
3837 ret = -EINVAL;
3838 break;
3840 decompress_data_with_multi_threads(f, page_buffer, len);
3841 break;
3843 case RAM_SAVE_FLAG_EOS:
3844 /* normal exit */
3845 multifd_recv_sync_main();
3846 break;
3847 default:
3848 error_report("Unknown combination of migration flags: 0x%x"
3849 " (postcopy mode)", flags);
3850 ret = -EINVAL;
3851 break;
3854 /* Got the whole host page, wait for decompress before placing. */
3855 if (place_needed) {
3856 ret |= wait_for_decompress_done();
3859 /* Detect for any possible file errors */
3860 if (!ret && qemu_file_get_error(f)) {
3861 ret = qemu_file_get_error(f);
3864 if (!ret && place_needed) {
3865 if (all_zero) {
3866 ret = postcopy_place_page_zero(mis, host_page, block);
3867 } else {
3868 ret = postcopy_place_page(mis, host_page, place_source,
3869 block);
3871 place_needed = false;
3872 target_pages = 0;
3873 /* Assume we have a zero page until we detect something different */
3874 all_zero = true;
3878 return ret;
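/*
 * Worked example for the placement logic above: with a 2MiB hugepage-backed
 * block and 4KiB target pages, block->page_size / TARGET_PAGE_SIZE == 512,
 * so the temporary host page is only placed (copied or zero-placed) into
 * the guest after all 512 target pages of that host page have arrived.
 */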
3881 static bool postcopy_is_advised(void)
3883 PostcopyState ps = postcopy_state_get();
3884 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3887 static bool postcopy_is_running(void)
3889 PostcopyState ps = postcopy_state_get();
3890 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3894 * Flush content of RAM cache into SVM's memory.
3895 * Only flush the pages that are dirtied by the PVM, the SVM, or both.
3897 void colo_flush_ram_cache(void)
3899 RAMBlock *block = NULL;
3900 void *dst_host;
3901 void *src_host;
3902 unsigned long offset = 0;
3904 memory_global_dirty_log_sync();
3905 WITH_RCU_READ_LOCK_GUARD() {
3906 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3907 ramblock_sync_dirty_bitmap(ram_state, block);
3911 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3912 WITH_RCU_READ_LOCK_GUARD() {
3913 block = QLIST_FIRST_RCU(&ram_list.blocks);
3915 while (block) {
3916 unsigned long num = 0;
3918 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3919 if (!offset_in_ramblock(block,
3920 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3921 offset = 0;
3922 num = 0;
3923 block = QLIST_NEXT_RCU(block, next);
3924 } else {
3925 unsigned long i = 0;
3927 for (i = 0; i < num; i++) {
3928 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3930 dst_host = block->host
3931 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3932 src_host = block->colo_cache
3933 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3934 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3935 offset += num;
3939 trace_colo_flush_ram_cache_end();
3943 * ram_load_precopy: load pages in precopy case
3945 * Returns 0 for success or -errno in case of error
3947 * Called in precopy mode by ram_load().
3948 * rcu_read_lock is taken prior to this being called.
3950 * @f: QEMUFile where to send the data
3952 static int ram_load_precopy(QEMUFile *f)
3954 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3955 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3956 bool postcopy_advised = postcopy_is_advised();
3957 if (!migrate_use_compression()) {
3958 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3961 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3962 ram_addr_t addr, total_ram_bytes;
3963 void *host = NULL, *host_bak = NULL;
3964 uint8_t ch;
3967 * Yield periodically to let main loop run, but an iteration of
3968 * the main loop is expensive, so only do it every few iterations
3970 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3971 aio_co_schedule(qemu_get_current_aio_context(),
3972 qemu_coroutine_self());
3973 qemu_coroutine_yield();
3975 i++;
3977 addr = qemu_get_be64(f);
3978 flags = addr & ~TARGET_PAGE_MASK;
3979 addr &= TARGET_PAGE_MASK;
3981 if (flags & invalid_flags) {
3982 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3983 error_report("Received an unexpected compressed page");
3986 ret = -EINVAL;
3987 break;
3990 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3991 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3992 RAMBlock *block = ram_block_from_stream(f, flags);
3994 host = host_from_ram_block_offset(block, addr);
3996 * After going into the COLO stage, we should not load pages
3997 * into the SVM's memory directly; we put them into colo_cache first.
3998 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3999 * Previously, we copied all this memory in the COLO preparation stage,
4000 * which required stopping the VM, a time-consuming process.
4001 * Here we optimize it by backing up every page during the migration
4002 * process while COLO is enabled. Although this affects migration
4003 * speed, it clearly reduces the downtime compared to backing up
4004 * all of the SVM's memory in the COLO preparation stage.
4006 if (migration_incoming_colo_enabled()) {
4007 if (migration_incoming_in_colo_state()) {
4008 /* In COLO stage, put all pages into cache temporarily */
4009 host = colo_cache_from_block_offset(block, addr, true);
4010 } else {
4012 * In migration stage but before COLO stage,
4013 * Put all pages into both cache and SVM's memory.
4015 host_bak = colo_cache_from_block_offset(block, addr, false);
4018 if (!host) {
4019 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4020 ret = -EINVAL;
4021 break;
4023 if (!migration_incoming_in_colo_state()) {
4024 ramblock_recv_bitmap_set(block, host);
4027 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4030 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4031 case RAM_SAVE_FLAG_MEM_SIZE:
4032 /* Synchronize RAM block list */
4033 total_ram_bytes = addr;
4034 while (!ret && total_ram_bytes) {
4035 RAMBlock *block;
4036 char id[256];
4037 ram_addr_t length;
4039 len = qemu_get_byte(f);
4040 qemu_get_buffer(f, (uint8_t *)id, len);
4041 id[len] = 0;
4042 length = qemu_get_be64(f);
4044 block = qemu_ram_block_by_name(id);
4045 if (block && !qemu_ram_is_migratable(block)) {
4046 error_report("block %s should not be migrated !", id);
4047 ret = -EINVAL;
4048 } else if (block) {
4049 if (length != block->used_length) {
4050 Error *local_err = NULL;
4052 ret = qemu_ram_resize(block, length,
4053 &local_err);
4054 if (local_err) {
4055 error_report_err(local_err);
4058 /* For postcopy we need to check hugepage sizes match */
4059 if (postcopy_advised && migrate_postcopy_ram() &&
4060 block->page_size != qemu_host_page_size) {
4061 uint64_t remote_page_size = qemu_get_be64(f);
4062 if (remote_page_size != block->page_size) {
4063 error_report("Mismatched RAM page size %s "
4064 "(local) %zd != %" PRId64,
4065 id, block->page_size,
4066 remote_page_size);
4067 ret = -EINVAL;
4070 if (migrate_ignore_shared()) {
4071 hwaddr addr = qemu_get_be64(f);
4072 if (ramblock_is_ignored(block) &&
4073 block->mr->addr != addr) {
4074 error_report("Mismatched GPAs for block %s "
4075 "%" PRId64 "!= %" PRId64,
4076 id, (uint64_t)addr,
4077 (uint64_t)block->mr->addr);
4078 ret = -EINVAL;
4081 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4082 block->idstr);
4083 } else {
4084 error_report("Unknown ramblock \"%s\", cannot "
4085 "accept migration", id);
4086 ret = -EINVAL;
4089 total_ram_bytes -= length;
4091 break;
4093 case RAM_SAVE_FLAG_ZERO:
4094 ch = qemu_get_byte(f);
4095 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4096 break;
4098 case RAM_SAVE_FLAG_PAGE:
4099 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4100 break;
4102 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4103 len = qemu_get_be32(f);
4104 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4105 error_report("Invalid compressed data length: %d", len);
4106 ret = -EINVAL;
4107 break;
4109 decompress_data_with_multi_threads(f, host, len);
4110 break;
4112 case RAM_SAVE_FLAG_XBZRLE:
4113 if (load_xbzrle(f, addr, host) < 0) {
4114 error_report("Failed to decompress XBZRLE page at "
4115 RAM_ADDR_FMT, addr);
4116 ret = -EINVAL;
4117 break;
4119 break;
4120 case RAM_SAVE_FLAG_EOS:
4121 /* normal exit */
4122 multifd_recv_sync_main();
4123 break;
4124 default:
4125 if (flags & RAM_SAVE_FLAG_HOOK) {
4126 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4127 } else {
4128 error_report("Unknown combination of migration flags: 0x%x",
4129 flags);
4130 ret = -EINVAL;
4133 if (!ret) {
4134 ret = qemu_file_get_error(f);
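/*
 * When COLO is enabled but not yet in the COLO stage, the page was loaded
 * straight into the SVM's memory (host); mirror it into the colo_cache
 * backup (host_bak) so the cache stays consistent with the SVM.
 */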
4136 if (!ret && host_bak) {
4137 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4141 ret |= wait_for_decompress_done();
4142 return ret;
4145 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4147 int ret = 0;
4148 static uint64_t seq_iter;
4150 * If the system is running in postcopy mode, page inserts into host
4151 * memory must be atomic.
4153 bool postcopy_running = postcopy_is_running();
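/*
 * "Atomic" because during postcopy the guest is already running on the
 * destination, so a page has to become visible in a single step (on Linux
 * it is typically placed via userfaultfd); that is why the separate
 * ram_load_postcopy() path below exists.
 */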
4155 seq_iter++;
4157 if (version_id != 4) {
4158 return -EINVAL;
4162 * This RCU critical section can be very long-running.
4163 * Once RCU reclaims in the code become numerous, it will be
4164 * necessary to reduce the granularity of this critical
4165 * section.
4167 WITH_RCU_READ_LOCK_GUARD() {
4168 if (postcopy_running) {
4169 ret = ram_load_postcopy(f);
4170 } else {
4171 ret = ram_load_precopy(f);
4174 trace_ram_load_complete(ret, seq_iter);
4176 return ret;
4179 static bool ram_has_postcopy(void *opaque)
4181 RAMBlock *rb;
4182 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4183 if (ramblock_is_pmem(rb)) {
4184 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4185 "is not supported now!", rb->idstr, rb->host);
4186 return false;
4190 return migrate_postcopy_ram();
4193 /* Sync all the dirty bitmaps with the destination VM. */
4194 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4196 RAMBlock *block;
4197 QEMUFile *file = s->to_dst_file;
4198 int ramblock_count = 0;
4200 trace_ram_dirty_bitmap_sync_start();
4202 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4203 qemu_savevm_send_recv_bitmap(file, block->idstr);
4204 trace_ram_dirty_bitmap_request(block->idstr);
4205 ramblock_count++;
4208 trace_ram_dirty_bitmap_sync_wait();
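/*
 * Each qemu_savevm_send_recv_bitmap() above is answered by the destination;
 * the return-path thread processes each reply in ram_dirty_bitmap_reload(),
 * which posts rp_sem once per ramblock (see
 * ram_dirty_bitmap_reload_notify() below).  That is what the loop below
 * waits for.
 */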
4210 /* Wait until all the ramblocks' dirty bitmaps are synced */
4211 while (ramblock_count--) {
4212 qemu_sem_wait(&s->rp_state.rp_sem);
4215 trace_ram_dirty_bitmap_sync_complete();
4217 return 0;
4220 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4222 qemu_sem_post(&s->rp_state.rp_sem);
4226 * Read the received bitmap and invert it to form the initial dirty bitmap.
4227 * This is only used when a postcopy migration is paused and we want
4228 * to resume it from the point where it stopped.
4230 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4232 int ret = -EINVAL;
4233 /* from_dst_file is always valid because we're within rp_thread */
4234 QEMUFile *file = s->rp_state.from_dst_file;
4235 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4236 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4237 uint64_t size, end_mark;
4239 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4241 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4242 error_report("%s: incorrect state %s", __func__,
4243 MigrationStatus_str(s->state));
4244 return -EINVAL;
4248 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4249 * need the endianness conversion and the padding.
4251 local_size = ROUND_UP(local_size, 8);
4253 /* Add padding */
4254 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
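/*
 * Worked example of the sizing above (illustrative, 4 KiB target pages):
 * a 2 GiB ramblock has nbits = 524288, so local_size =
 * DIV_ROUND_UP(524288, 8) = 65536 bytes; a 4100-page block would give
 * 513 bytes, rounded up to 520 so the little-endian bitmap is always
 * transferred in whole 64-bit words, with bitmap_new() allocating one
 * extra long of slack for that padding.
 */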
4256 size = qemu_get_be64(file);
4258 /* The size of the bitmap should match that of our ramblock */
4259 if (size != local_size) {
4260 error_report("%s: ramblock '%s' bitmap size mismatch "
4261 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4262 block->idstr, size, local_size);
4263 ret = -EINVAL;
4264 goto out;
4267 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4268 end_mark = qemu_get_be64(file);
4270 ret = qemu_file_get_error(file);
4271 if (ret || size != local_size) {
4272 error_report("%s: read bitmap failed for ramblock '%s': %d"
4273 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4274 __func__, block->idstr, ret, local_size, size);
4275 ret = -EIO;
4276 goto out;
4279 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4280 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4281 __func__, block->idstr, end_mark);
4282 ret = -EINVAL;
4283 goto out;
4287 * Endianness conversion.  We are in the middle of postcopy (though
4288 * paused), so the dirty bitmap won't change and we can modify it directly.
4290 bitmap_from_le(block->bmap, le_bitmap, nbits);
4293 * What we received is the "received bitmap".  Invert it to form the
4294 * initial dirty bitmap for this ramblock.
4296 bitmap_complement(block->bmap, block->bmap, nbits);
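/*
 * Illustration: if the destination received pages 0 and 1 of a 4-page
 * block (received bitmap 0b0011), the complement 0b1100 marks pages 2 and
 * 3 dirty, so only the pages that never arrived get re-sent after the
 * postcopy recovery.
 */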
4298 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4299 ramblock_dirty_bitmap_clear_discarded_pages(block);
4301 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4302 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4305 * We succeeded in syncing the bitmap for the current ramblock.  If this
4306 * is the last one to sync, we need to notify the main send thread.
4308 ram_dirty_bitmap_reload_notify(s);
4310 ret = 0;
4311 out:
4312 g_free(le_bitmap);
4313 return ret;
4316 static int ram_resume_prepare(MigrationState *s, void *opaque)
4318 RAMState *rs = *(RAMState **)opaque;
4319 int ret;
4321 ret = ram_dirty_bitmap_sync_all(s, rs);
4322 if (ret) {
4323 return ret;
4326 ram_state_resume_prepare(rs, s->to_dst_file);
4328 return 0;
4331 static SaveVMHandlers savevm_ram_handlers = {
4332 .save_setup = ram_save_setup,
4333 .save_live_iterate = ram_save_iterate,
4334 .save_live_complete_postcopy = ram_save_complete,
4335 .save_live_complete_precopy = ram_save_complete,
4336 .has_postcopy = ram_has_postcopy,
4337 .save_live_pending = ram_save_pending,
4338 .load_state = ram_load,
4339 .save_cleanup = ram_save_cleanup,
4340 .load_setup = ram_load_setup,
4341 .load_cleanup = ram_load_cleanup,
4342 .resume_prepare = ram_resume_prepare,
4345 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4346 size_t old_size, size_t new_size)
4348 PostcopyState ps = postcopy_state_get();
4349 ram_addr_t offset;
4350 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4351 Error *err = NULL;
4353 if (ramblock_is_ignored(rb)) {
4354 return;
4357 if (!migration_is_idle()) {
4359 * Precopy code on the source cannot deal with the size of RAM blocks
4360 * changing at random points in time - especially after sending the
4361 * RAM block sizes in the migration stream, they must no longer change.
4362 * Abort and indicate a proper reason.
4364 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4365 migration_cancel(err);
4366 error_free(err);
4369 switch (ps) {
4370 case POSTCOPY_INCOMING_ADVISE:
4372 * Update what ram_postcopy_incoming_init()->init_range() does at the
4373 * time postcopy was advised. Syncing RAM blocks with the source will
4374 * result in RAM resizes.
4376 if (old_size < new_size) {
4377 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4378 error_report("RAM block '%s' discard of resized RAM failed",
4379 rb->idstr);
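/*
 * Track the grown size in postcopy_length so the incoming postcopy code
 * operates on the resized block rather than the size it saw when postcopy
 * was advised (see the comment above about init_range()).
 */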
4382 rb->postcopy_length = new_size;
4383 break;
4384 case POSTCOPY_INCOMING_NONE:
4385 case POSTCOPY_INCOMING_RUNNING:
4386 case POSTCOPY_INCOMING_END:
4388 * Once our guest is running, postcopy no longer cares about
4389 * resizes.  When growing, the new memory was not available on the
4390 * source, so no handler is needed.
4392 break;
4393 default:
4394 error_report("RAM block '%s' resized during postcopy state: %d",
4395 rb->idstr, ps);
4396 exit(-1);
4400 static RAMBlockNotifier ram_mig_ram_notifier = {
4401 .ram_block_resized = ram_mig_ram_block_resized,
4404 void ram_mig_init(void)
4406 qemu_mutex_init(&XBZRLE.lock);
4407 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4408 ram_block_notifier_add(&ram_mig_ram_notifier);
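/*
 * Note: the section version 4 registered with register_savevm_live() here
 * must match the check in ram_load() above; an incoming "ram" stream with
 * any other version is rejected with -EINVAL.
 */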