migrate/ram: remove "ram_bulk_stage" and "fpo_enabled"
[qemu/ar7.git] / migration / ram.c
blob bee2756cd391e1fd87fe7244bc2efbcd3f6a385d
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
63 /***********************************************************/
64 /* ram save/restore */
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
67 * worked for pages that were filled with the same char. We switched
68 * it to only search for the zero value. And to avoid confusion with
69 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
72 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO 0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE 0x08
76 #define RAM_SAVE_FLAG_EOS 0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE 0x40
79 /* 0x80 is reserved in migration.h start with 0x100 next */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
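/*
 * These flag bits are OR-ed into the low bits of the page-aligned offset
 * that save_page_header() below puts on the wire, so a single be64 value
 * carries both the page offset and its flags.
 */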
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
84 return buffer_is_zero(p, size);
87 XBZRLECacheStats xbzrle_counters;
89 /* this struct contains the XBZRLE cache and a static page
90 used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
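/*
 * Accesses to the cache and the scratch buffers above are serialized with
 * XBZRLE_cache_lock()/XBZRLE_cache_unlock() below; the helpers only take
 * the mutex when xbzrle is in use for this migration.
 */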
105 static void XBZRLE_cache_lock(void)
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
112 static void XBZRLE_cache_unlock(void)
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
120 * xbzrle_cache_resize: resize the xbzrle cache
122 * This function is called from migrate_params_apply in the main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock.
127 * Returns 0 for success or -1 for error
129 * @new_size: new cache size
130 * @errp: set *errp if the check failed, with reason
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
134 PageCache *new_cache;
135 int64_t ret = 0;
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
149 XBZRLE_cache_lock();
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
166 bool ramblock_is_ignored(RAMBlock *block)
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 #undef RAMBLOCK_FOREACH
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
176 RAMBlock *block;
177 int ret = 0;
179 RCU_READ_LOCK_GUARD();
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
187 return ret;
190 static void ramblock_recv_map_init(void)
192 RAMBlock *rb;
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
229 * Returns >0 if success with sent bytes, or <0 if error.
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
243 nbits = block->used_length >> TARGET_PAGE_BITS;
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see below
248 * comment). So extend it a bit beforehand.
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253 * Always use little endian when sending the bitmap. This is
254 * required so that it still works when source and destination VMs
255 * are not using the same endianness. (Note: big endian won't work.)
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
268 size = ROUND_UP(size, 8);
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
273 * Mark as an end, in case the middle part is screwed up due to
274 * some "mysterious" reason.
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
279 g_free(le_bitmap);
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
285 return size + sizeof(size);
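/*
 * Illustrative sketch of the stream layout produced above:
 *
 *   be64  size                 - bitmap size in bytes, rounded up to 8
 *   u8    bitmap[size]         - receivedmap in little-endian bit order
 *   be64  0x0123456789abcdef   - RAMBLOCK_RECV_BITMAP_ENDING marker
 */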
289 * An outstanding page request, on the source, having been received
290 * and queued
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have dirty too many pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
332 /* compression statistics since the beginning of the period */
333 /* number of times there was no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* amount of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* number of compressed pages */
338 uint64_t compress_pages_prev;
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 typedef struct RAMState RAMState;
356 static RAMState *ram_state;
358 static NotifierWithReturnList precopy_notifier_list;
360 void precopy_infrastructure_init(void)
362 notifier_with_return_list_init(&precopy_notifier_list);
365 void precopy_add_notifier(NotifierWithReturn *n)
367 notifier_with_return_list_add(&precopy_notifier_list, n);
370 void precopy_remove_notifier(NotifierWithReturn *n)
372 notifier_with_return_remove(n);
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 uint64_t ram_bytes_remaining(void)
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
390 MigrationStats ram_counters;
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
401 typedef struct PageSearchStatus PageSearchStatus;
403 CompressionStats compression_counters;
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
419 typedef struct CompressParam CompressParam;
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
431 typedef struct DecompressParam DecompressParam;
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437 * comp_done_lock is used together with comp_done_cond.
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
453 static void *do_data_compress(void *opaque)
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
482 qemu_mutex_unlock(&param->mutex);
484 return NULL;
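/*
 * Hand-off protocol used above: the migration thread publishes work by
 * setting param->block/offset under param->mutex and signalling param->cond;
 * the worker clears param->block, compresses the page into param->file, and
 * then publishes done/zero_page under comp_done_lock, waking comp_done_cond.
 */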
487 static void compress_threads_save_cleanup(void)
489 int i, thread_count;
491 if (!migrate_use_compression() || !comp_param) {
492 return;
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
498 * we use it as an indicator which shows if the thread is
499 * properly init'd or not
501 if (!comp_param[i].file) {
502 break;
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
526 static int compress_threads_save_setup(void)
528 int i, thread_count;
530 if (!migrate_use_compression()) {
531 return 0;
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
562 return 0;
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
570 * save_page_header: write page header to wire
572 * If this is the 1st block, it also writes the block identification
574 * Returns the number of bytes written
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page
579 * in the lower bits, it contains flags
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
584 size_t size, len;
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
589 qemu_put_be64(f, offset);
590 size = 8;
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
599 return size;
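/*
 * Resulting wire header, roughly:
 *
 *   be64  offset | flags            - always present
 *   u8    len, idstr[len]           - only when RAM_SAVE_FLAG_CONTINUE is
 *                                     not set, i.e. for a new block
 */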
603 * mig_throttle_guest_down: throttle down the guest
605 * Reduce amount of guest cpu execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by Guest, which may
632 * make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
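/*
 * Worked example for the tailslow path above: if the guest is currently
 * throttled at 20% (cpu_now = 80) and the dirty threshold is half the dirty
 * rate, then cpu_ideal = 80 * 0.5 = 40, so the throttle grows by
 * MIN(80 - 40, cpu_throttle_increment), capped at max_cpu_throttle.
 */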
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
656 if (!rs->xbzrle_enabled) {
657 return;
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
666 #define ENCODING_FLAG_XBZRLE 0x1
669 * save_xbzrle_page: compress and send current page
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
702 return -1;
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
738 *current_data = prev_cached_page;
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
766 return 1;
770 * migration_bitmap_find_dirty: find the next dirty page from start
772 * Returns the page offset within memory region of the start of a dirty page
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
785 if (ramblock_is_ignored(rb)) {
786 return size;
789 return find_next_bit(bitmap, size, start);
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
796 bool ret;
798 QEMU_LOCK_GUARD(&rs->bitmap_mutex);
801 * Clear dirty bitmap if needed. This _must_ be called before we
802 * send any of the pages in the chunk because we need to make sure
803 * we can capture further page content changes when we sync dirty
804 * log the next time. So as long as we are going to send any of
805 * the pages in the chunk we clear the remote dirty bitmap for all.
806 * Clearing it earlier won't be a problem, but clearing it too late will.
808 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
809 uint8_t shift = rb->clear_bmap_shift;
810 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
811 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
814 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
815 * can make things easier sometimes since then the start address
816 * of the small chunk will always be aligned to 64 pages, so the
817 * bitmap will always be aligned to unsigned long. We should
818 * even be able to remove this restriction but I'm simply
819 * keeping it.
821 assert(shift >= 6);
822 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
823 memory_region_clear_dirty_bitmap(rb->mr, start, size);
826 ret = test_and_clear_bit(page, rb->bmap);
828 if (ret) {
829 rs->migration_dirty_pages--;
832 return ret;
835 /* Called with RCU critical section */
836 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
838 uint64_t new_dirty_pages =
839 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
841 rs->migration_dirty_pages += new_dirty_pages;
842 rs->num_dirty_pages_period += new_dirty_pages;
846 * ram_pagesize_summary: calculate all the pagesizes of a VM
848 * Returns a summary bitmap of the page sizes of all RAMBlocks
850 * For VMs with just normal pages this is equivalent to the host page
851 * size. If it's got some huge pages then it's the OR of all the
852 * different page sizes.
854 uint64_t ram_pagesize_summary(void)
856 RAMBlock *block;
857 uint64_t summary = 0;
859 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
860 summary |= block->page_size;
863 return summary;
866 uint64_t ram_get_total_transferred_pages(void)
868 return ram_counters.normal + ram_counters.duplicate +
869 compression_counters.pages + xbzrle_counters.pages;
872 static void migration_update_rates(RAMState *rs, int64_t end_time)
874 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
875 double compressed_size;
877 /* calculate period counters */
878 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
879 / (end_time - rs->time_last_bitmap_sync);
881 if (!page_count) {
882 return;
885 if (migrate_use_xbzrle()) {
886 double encoded_size, unencoded_size;
888 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
889 rs->xbzrle_cache_miss_prev) / page_count;
890 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
891 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
892 TARGET_PAGE_SIZE;
893 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
894 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
895 xbzrle_counters.encoding_rate = 0;
896 } else {
897 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
899 rs->xbzrle_pages_prev = xbzrle_counters.pages;
900 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
903 if (migrate_use_compression()) {
904 compression_counters.busy_rate = (double)(compression_counters.busy -
905 rs->compress_thread_busy_prev) / page_count;
906 rs->compress_thread_busy_prev = compression_counters.busy;
908 compressed_size = compression_counters.compressed_size -
909 rs->compressed_size_prev;
910 if (compressed_size) {
911 double uncompressed_size = (compression_counters.pages -
912 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
914 /* Compression-Ratio = Uncompressed-size / Compressed-size */
915 compression_counters.compression_rate =
916 uncompressed_size / compressed_size;
918 rs->compress_pages_prev = compression_counters.pages;
919 rs->compressed_size_prev = compression_counters.compressed_size;
924 static void migration_trigger_throttle(RAMState *rs)
926 MigrationState *s = migrate_get_current();
927 uint64_t threshold = s->parameters.throttle_trigger_threshold;
929 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
930 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
931 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
933 /* During block migration the auto-converge logic incorrectly detects
934 * that ram migration makes no progress. Avoid this by disabling the
935 * throttling logic during the bulk phase of block migration. */
936 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
937 /* The following detection logic can be refined later. For now:
938 Check to see if the ratio between dirtied bytes and the approx.
939 amount of bytes that just got transferred since the last time
940 we were in this routine reaches the threshold. If that happens
941 twice, start or increase throttling. */
943 if ((bytes_dirty_period > bytes_dirty_threshold) &&
944 (++rs->dirty_rate_high_cnt >= 2)) {
945 trace_migration_throttle();
946 rs->dirty_rate_high_cnt = 0;
947 mig_throttle_guest_down(bytes_dirty_period,
948 bytes_dirty_threshold);
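/*
 * Example: with throttle_trigger_threshold at 50, throttling starts (or is
 * increased) once the bytes dirtied in a sync period exceed 50% of the bytes
 * transferred in that period for the second time.
 */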
953 static void migration_bitmap_sync(RAMState *rs)
955 RAMBlock *block;
956 int64_t end_time;
958 ram_counters.dirty_sync_count++;
960 if (!rs->time_last_bitmap_sync) {
961 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
964 trace_migration_bitmap_sync_start();
965 memory_global_dirty_log_sync();
967 qemu_mutex_lock(&rs->bitmap_mutex);
968 WITH_RCU_READ_LOCK_GUARD() {
969 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
970 ramblock_sync_dirty_bitmap(rs, block);
972 ram_counters.remaining = ram_bytes_remaining();
974 qemu_mutex_unlock(&rs->bitmap_mutex);
976 memory_global_after_dirty_log_sync();
977 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
979 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
981 /* more than 1 second = 1000 milliseconds */
982 if (end_time > rs->time_last_bitmap_sync + 1000) {
983 migration_trigger_throttle(rs);
985 migration_update_rates(rs, end_time);
987 rs->target_page_count_prev = rs->target_page_count;
989 /* reset period counters */
990 rs->time_last_bitmap_sync = end_time;
991 rs->num_dirty_pages_period = 0;
992 rs->bytes_xfer_prev = ram_counters.transferred;
994 if (migrate_use_events()) {
995 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
999 static void migration_bitmap_sync_precopy(RAMState *rs)
1001 Error *local_err = NULL;
1004 * The current notifier usage is just an optimization for migration, so we
1005 * don't stop the normal migration process in the error case.
1007 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1008 error_report_err(local_err);
1009 local_err = NULL;
1012 migration_bitmap_sync(rs);
1014 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1015 error_report_err(local_err);
1020 * save_zero_page_to_file: send the zero page to the file
1022 * Returns the size of data written to the file, 0 means the page is not
1023 * a zero page
1025 * @rs: current RAM state
1026 * @file: the file where the data is saved
1027 * @block: block that contains the page we want to send
1028 * @offset: offset inside the block for the page
1030 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1031 RAMBlock *block, ram_addr_t offset)
1033 uint8_t *p = block->host + offset;
1034 int len = 0;
1036 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1037 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1038 qemu_put_byte(file, 0);
1039 len += 1;
1041 return len;
1045 * save_zero_page: send the zero page to the stream
1047 * Returns the number of pages written.
1049 * @rs: current RAM state
1050 * @block: block that contains the page we want to send
1051 * @offset: offset inside the block for the page
1053 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1055 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1057 if (len) {
1058 ram_counters.duplicate++;
1059 ram_counters.transferred += len;
1060 return 1;
1062 return -1;
1065 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1067 if (!migrate_release_ram() || !migration_in_postcopy()) {
1068 return;
1071 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1075 * @pages: the number of pages written by the control path,
1076 * < 0 - error
1077 * > 0 - number of pages written
1079 * Return true if the page has been saved, otherwise false is returned.
1081 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1082 int *pages)
1084 uint64_t bytes_xmit = 0;
1085 int ret;
1087 *pages = -1;
1088 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1089 &bytes_xmit);
1090 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1091 return false;
1094 if (bytes_xmit) {
1095 ram_counters.transferred += bytes_xmit;
1096 *pages = 1;
1099 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1100 return true;
1103 if (bytes_xmit > 0) {
1104 ram_counters.normal++;
1105 } else if (bytes_xmit == 0) {
1106 ram_counters.duplicate++;
1109 return true;
1113 * directly send the page to the stream
1115 * Returns the number of pages written.
1117 * @rs: current RAM state
1118 * @block: block that contains the page we want to send
1119 * @offset: offset inside the block for the page
1120 * @buf: the page to be sent
1121 * @async: send the page asynchronously
1123 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1124 uint8_t *buf, bool async)
1126 ram_counters.transferred += save_page_header(rs, rs->f, block,
1127 offset | RAM_SAVE_FLAG_PAGE);
1128 if (async) {
1129 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1130 migrate_release_ram() &
1131 migration_in_postcopy());
1132 } else {
1133 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1135 ram_counters.transferred += TARGET_PAGE_SIZE;
1136 ram_counters.normal++;
1137 return 1;
1141 * ram_save_page: send the given page to the stream
1143 * Returns the number of pages written.
1144 * < 0 - error
1145 * >=0 - Number of pages written - this might legally be 0
1146 * if xbzrle noticed the page was the same.
1148 * @rs: current RAM state
1149 * @block: block that contains the page we want to send
1150 * @offset: offset inside the block for the page
1151 * @last_stage: if we are at the completion stage
1153 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1155 int pages = -1;
1156 uint8_t *p;
1157 bool send_async = true;
1158 RAMBlock *block = pss->block;
1159 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1160 ram_addr_t current_addr = block->offset + offset;
1162 p = block->host + offset;
1163 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1165 XBZRLE_cache_lock();
1166 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1167 pages = save_xbzrle_page(rs, &p, current_addr, block,
1168 offset, last_stage);
1169 if (!last_stage) {
1170 /* Can't send this cached data async, since the cache page
1171 * might get updated before it gets to the wire
1173 send_async = false;
1177 /* XBZRLE overflow or normal page */
1178 if (pages == -1) {
1179 pages = save_normal_page(rs, block, offset, p, send_async);
1182 XBZRLE_cache_unlock();
1184 return pages;
1187 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1188 ram_addr_t offset)
1190 if (multifd_queue_page(rs->f, block, offset) < 0) {
1191 return -1;
1193 ram_counters.normal++;
1195 return 1;
1198 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1199 ram_addr_t offset, uint8_t *source_buf)
1201 RAMState *rs = ram_state;
1202 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1203 bool zero_page = false;
1204 int ret;
1206 if (save_zero_page_to_file(rs, f, block, offset)) {
1207 zero_page = true;
1208 goto exit;
1211 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1214 * copy it to an internal buffer to avoid it being modified by the VM
1215 * so that we can catch any error during compression and
1216 * decompression
1218 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1219 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1220 if (ret < 0) {
1221 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1222 error_report("compressed data failed!");
1223 return false;
1226 exit:
1227 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1228 return zero_page;
1231 static void
1232 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1234 ram_counters.transferred += bytes_xmit;
1236 if (param->zero_page) {
1237 ram_counters.duplicate++;
1238 return;
1241 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1242 compression_counters.compressed_size += bytes_xmit - 8;
1243 compression_counters.pages++;
1246 static bool save_page_use_compression(RAMState *rs);
1248 static void flush_compressed_data(RAMState *rs)
1250 int idx, len, thread_count;
1252 if (!save_page_use_compression(rs)) {
1253 return;
1255 thread_count = migrate_compress_threads();
1257 qemu_mutex_lock(&comp_done_lock);
1258 for (idx = 0; idx < thread_count; idx++) {
1259 while (!comp_param[idx].done) {
1260 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1263 qemu_mutex_unlock(&comp_done_lock);
1265 for (idx = 0; idx < thread_count; idx++) {
1266 qemu_mutex_lock(&comp_param[idx].mutex);
1267 if (!comp_param[idx].quit) {
1268 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1270 * it's safe to fetch zero_page without holding comp_done_lock
1271 * as there is no further request submitted to the thread,
1272 * i.e., the thread should be waiting for a request at this point.
1274 update_compress_thread_counts(&comp_param[idx], len);
1276 qemu_mutex_unlock(&comp_param[idx].mutex);
1280 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1281 ram_addr_t offset)
1283 param->block = block;
1284 param->offset = offset;
1287 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1288 ram_addr_t offset)
1290 int idx, thread_count, bytes_xmit = -1, pages = -1;
1291 bool wait = migrate_compress_wait_thread();
1293 thread_count = migrate_compress_threads();
1294 qemu_mutex_lock(&comp_done_lock);
1295 retry:
1296 for (idx = 0; idx < thread_count; idx++) {
1297 if (comp_param[idx].done) {
1298 comp_param[idx].done = false;
1299 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1300 qemu_mutex_lock(&comp_param[idx].mutex);
1301 set_compress_params(&comp_param[idx], block, offset);
1302 qemu_cond_signal(&comp_param[idx].cond);
1303 qemu_mutex_unlock(&comp_param[idx].mutex);
1304 pages = 1;
1305 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1306 break;
1311 * wait for a free thread if the user specifies 'compress-wait-thread',
1312 * otherwise we will post the page out in the main thread as a normal page.
1314 if (pages < 0 && wait) {
1315 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1316 goto retry;
1318 qemu_mutex_unlock(&comp_done_lock);
1320 return pages;
1324 * find_dirty_block: find the next dirty page and update any state
1325 * associated with the search process.
1327 * Returns true if a page is found
1329 * @rs: current RAM state
1330 * @pss: data about the state of the current dirty page scan
1331 * @again: set to false if the search has scanned the whole of RAM
1333 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1335 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1336 if (pss->complete_round && pss->block == rs->last_seen_block &&
1337 pss->page >= rs->last_page) {
1339 * We've been once around the RAM and haven't found anything.
1340 * Give up.
1342 *again = false;
1343 return false;
1345 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1346 >= pss->block->used_length) {
1347 /* Didn't find anything in this RAM Block */
1348 pss->page = 0;
1349 pss->block = QLIST_NEXT_RCU(pss->block, next);
1350 if (!pss->block) {
1352 * If memory migration starts over, we will meet a dirtied page
1353 * which may still exist in the compression threads' ring, so we
1354 * should flush the compressed data to make sure the new page
1355 * is not overwritten by the old one in the destination.
1357 * Also, if xbzrle is on, stop using the data compression at this
1358 * point. In theory, xbzrle can do better than compression.
1360 flush_compressed_data(rs);
1362 /* Hit the end of the list */
1363 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1364 /* Flag that we've looped */
1365 pss->complete_round = true;
1366 /* After the first round, enable XBZRLE. */
1367 if (migrate_use_xbzrle()) {
1368 rs->xbzrle_enabled = true;
1371 /* Didn't find anything this time, but try again on the new block */
1372 *again = true;
1373 return false;
1374 } else {
1375 /* Can go around again, but... */
1376 *again = true;
1377 /* We've found something so probably don't need to */
1378 return true;
1383 * unqueue_page: gets a page off the queue
1385 * Helper for 'get_queued_page' - gets a page off the queue
1387 * Returns the block of the page (or NULL if none available)
1389 * @rs: current RAM state
1390 * @offset: used to return the offset within the RAMBlock
1392 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1394 RAMBlock *block = NULL;
1396 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1397 return NULL;
1400 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1401 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1402 struct RAMSrcPageRequest *entry =
1403 QSIMPLEQ_FIRST(&rs->src_page_requests);
1404 block = entry->rb;
1405 *offset = entry->offset;
1407 if (entry->len > TARGET_PAGE_SIZE) {
1408 entry->len -= TARGET_PAGE_SIZE;
1409 entry->offset += TARGET_PAGE_SIZE;
1410 } else {
1411 memory_region_unref(block->mr);
1412 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1413 g_free(entry);
1414 migration_consume_urgent_request();
1418 return block;
1421 #if defined(__linux__)
1423 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1424 * is found, return RAM block pointer and page offset
1426 * Returns pointer to the RAMBlock containing faulting page,
1427 * NULL if no write faults are pending
1429 * @rs: current RAM state
1430 * @offset: page offset from the beginning of the block
1432 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1434 struct uffd_msg uffd_msg;
1435 void *page_address;
1436 RAMBlock *block;
1437 int res;
1439 if (!migrate_background_snapshot()) {
1440 return NULL;
1443 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1444 if (res <= 0) {
1445 return NULL;
1448 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1449 block = qemu_ram_block_from_host(page_address, false, offset);
1450 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1451 return block;
1455 * ram_save_release_protection: release UFFD write protection after
1456 * a range of pages has been saved
1458 * @rs: current RAM state
1459 * @pss: page-search-status structure
1460 * @start_page: index of the first page in the range relative to pss->block
1462 * Returns 0 on success, negative value in case of an error
1464 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1465 unsigned long start_page)
1467 int res = 0;
1469 /* Check if page is from UFFD-managed region. */
1470 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1471 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1472 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1474 /* Flush async buffers before un-protect. */
1475 qemu_fflush(rs->f);
1476 /* Un-protect memory range. */
1477 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1478 false, false);
1481 return res;
1484 /* ram_write_tracking_available: check if kernel supports required UFFD features
1486 * Returns true if supports, false otherwise
1488 bool ram_write_tracking_available(void)
1490 uint64_t uffd_features;
1491 int res;
1493 res = uffd_query_features(&uffd_features);
1494 return (res == 0 &&
1495 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1498 /* ram_write_tracking_compatible: check if guest configuration is
1499 * compatible with 'write-tracking'
1501 * Returns true if compatible, false otherwise
1503 bool ram_write_tracking_compatible(void)
1505 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1506 int uffd_fd;
1507 RAMBlock *block;
1508 bool ret = false;
1510 /* Open UFFD file descriptor */
1511 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1512 if (uffd_fd < 0) {
1513 return false;
1516 RCU_READ_LOCK_GUARD();
1518 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1519 uint64_t uffd_ioctls;
1521 /* Nothing to do with read-only and MMIO-writable regions */
1522 if (block->mr->readonly || block->mr->rom_device) {
1523 continue;
1525 /* Try to register block memory via UFFD-IO to track writes */
1526 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1527 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1528 goto out;
1530 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1531 goto out;
1534 ret = true;
1536 out:
1537 uffd_close_fd(uffd_fd);
1538 return ret;
1542 * ram_block_populate_pages: populate memory in the RAM block by reading
1543 * an integer from the beginning of each page.
1545 * Since it's solely used for the userfault_fd WP feature, here we just
1546 * hardcode the page size to qemu_real_host_page_size.
1548 * @block: RAM block to populate
1550 static void ram_block_populate_pages(RAMBlock *block)
1552 char *ptr = (char *) block->host;
1554 for (ram_addr_t offset = 0; offset < block->used_length;
1555 offset += qemu_real_host_page_size) {
1556 char tmp = *(ptr + offset);
1558 /* Don't optimize the read out */
1559 asm volatile("" : "+r" (tmp));
1564 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1566 void ram_write_tracking_prepare(void)
1568 RAMBlock *block;
1570 RCU_READ_LOCK_GUARD();
1572 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1573 /* Nothing to do with read-only and MMIO-writable regions */
1574 if (block->mr->readonly || block->mr->rom_device) {
1575 continue;
1579 * Populate pages of the RAM block before enabling userfault_fd
1580 * write protection.
1582 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1583 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1584 * pages with pte_none() entries in page table.
1586 ram_block_populate_pages(block);
1591 * ram_write_tracking_start: start UFFD-WP memory tracking
1593 * Returns 0 for success or negative value in case of error
1595 int ram_write_tracking_start(void)
1597 int uffd_fd;
1598 RAMState *rs = ram_state;
1599 RAMBlock *block;
1601 /* Open UFFD file descriptor */
1602 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1603 if (uffd_fd < 0) {
1604 return uffd_fd;
1606 rs->uffdio_fd = uffd_fd;
1608 RCU_READ_LOCK_GUARD();
1610 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1611 /* Nothing to do with read-only and MMIO-writable regions */
1612 if (block->mr->readonly || block->mr->rom_device) {
1613 continue;
1616 /* Register block memory with UFFD to track writes */
1617 if (uffd_register_memory(rs->uffdio_fd, block->host,
1618 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1619 goto fail;
1621 /* Apply UFFD write protection to the block memory range */
1622 if (uffd_change_protection(rs->uffdio_fd, block->host,
1623 block->max_length, true, false)) {
1624 goto fail;
1626 block->flags |= RAM_UF_WRITEPROTECT;
1627 memory_region_ref(block->mr);
1629 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1630 block->host, block->max_length);
1633 return 0;
1635 fail:
1636 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1638 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1640 continue;
1643 * In case some memory block failed to be write-protected
1644 * remove protection and unregister all succeeded RAM blocks
1646 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1647 false, false);
1648 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1649 /* Cleanup flags and remove reference */
1650 block->flags &= ~RAM_UF_WRITEPROTECT;
1651 memory_region_unref(block->mr);
1654 uffd_close_fd(uffd_fd);
1655 rs->uffdio_fd = -1;
1656 return -1;
1660 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1662 void ram_write_tracking_stop(void)
1664 RAMState *rs = ram_state;
1665 RAMBlock *block;
1667 RCU_READ_LOCK_GUARD();
1669 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1670 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1671 continue;
1673 /* Remove protection and unregister all affected RAM blocks */
1674 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675 false, false);
1676 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1678 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1679 block->host, block->max_length);
1681 /* Cleanup flags and remove reference */
1682 block->flags &= ~RAM_UF_WRITEPROTECT;
1683 memory_region_unref(block->mr);
1686 /* Finally close UFFD file descriptor */
1687 uffd_close_fd(rs->uffdio_fd);
1688 rs->uffdio_fd = -1;
1691 #else
1692 /* No target OS support, stubs just fail or ignore */
1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1696 (void) rs;
1697 (void) offset;
1699 return NULL;
1702 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1703 unsigned long start_page)
1705 (void) rs;
1706 (void) pss;
1707 (void) start_page;
1709 return 0;
1712 bool ram_write_tracking_available(void)
1714 return false;
1717 bool ram_write_tracking_compatible(void)
1719 assert(0);
1720 return false;
1723 int ram_write_tracking_start(void)
1725 assert(0);
1726 return -1;
1729 void ram_write_tracking_stop(void)
1731 assert(0);
1733 #endif /* defined(__linux__) */
1736 * get_queued_page: unqueue a page from the postcopy requests
1738 * Skips pages that are already sent (!dirty)
1740 * Returns true if a queued page is found
1742 * @rs: current RAM state
1743 * @pss: data about the state of the current dirty page scan
1745 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1747 RAMBlock *block;
1748 ram_addr_t offset;
1749 bool dirty;
1751 do {
1752 block = unqueue_page(rs, &offset);
1754 * We're sending this page, and since it's postcopy nothing else
1755 * will dirty it, and we must make sure it doesn't get sent again
1756 * even if this queue request was received after the background
1757 * search already sent it.
1759 if (block) {
1760 unsigned long page;
1762 page = offset >> TARGET_PAGE_BITS;
1763 dirty = test_bit(page, block->bmap);
1764 if (!dirty) {
1765 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1766 page);
1767 } else {
1768 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1772 } while (block && !dirty);
1774 if (!block) {
1776 * Poll write faults too if background snapshot is enabled; that's
1777 * when vCPUs may get blocked by the write protected pages.
1779 block = poll_fault_page(rs, &offset);
1782 if (block) {
1784 * We want the background search to continue from the queued page
1785 * since the guest is likely to want other pages near to the page
1786 * it just requested.
1788 pss->block = block;
1789 pss->page = offset >> TARGET_PAGE_BITS;
1792 * This unqueued page would break the "one round" check, even if
1793 * it is really rare.
1795 pss->complete_round = false;
1798 return !!block;
1802 * migration_page_queue_free: drop any remaining pages in the ram
1803 * request queue
1805 * It should be empty at the end anyway, but in error cases there may
1806 * be some left. In case any pages are left, we drop them.
1809 static void migration_page_queue_free(RAMState *rs)
1811 struct RAMSrcPageRequest *mspr, *next_mspr;
1812 /* This queue generally should be empty - but in the case of a failed
1813 * migration it might have some leftover entries.
1815 RCU_READ_LOCK_GUARD();
1816 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1817 memory_region_unref(mspr->rb->mr);
1818 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1819 g_free(mspr);
1824 * ram_save_queue_pages: queue the page for transmission
1826 * A request from postcopy destination for example.
1828 * Returns zero on success or negative on error
1830 * @rbname: Name of the RAMBlock of the request. NULL means the
1831 * same as the last one.
1832 * @start: starting address from the start of the RAMBlock
1833 * @len: length (in bytes) to send
1835 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1837 RAMBlock *ramblock;
1838 RAMState *rs = ram_state;
1840 ram_counters.postcopy_requests++;
1841 RCU_READ_LOCK_GUARD();
1843 if (!rbname) {
1844 /* Reuse last RAMBlock */
1845 ramblock = rs->last_req_rb;
1847 if (!ramblock) {
1849 * Shouldn't happen, we can't reuse the last RAMBlock if
1850 * it's the 1st request.
1852 error_report("ram_save_queue_pages no previous block");
1853 return -1;
1855 } else {
1856 ramblock = qemu_ram_block_by_name(rbname);
1858 if (!ramblock) {
1859 /* We shouldn't be asked for a non-existent RAMBlock */
1860 error_report("ram_save_queue_pages no block '%s'", rbname);
1861 return -1;
1863 rs->last_req_rb = ramblock;
1865 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1866 if (start + len > ramblock->used_length) {
1867 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1868 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1869 __func__, start, len, ramblock->used_length);
1870 return -1;
1873 struct RAMSrcPageRequest *new_entry =
1874 g_malloc0(sizeof(struct RAMSrcPageRequest));
1875 new_entry->rb = ramblock;
1876 new_entry->offset = start;
1877 new_entry->len = len;
1879 memory_region_ref(ramblock->mr);
1880 qemu_mutex_lock(&rs->src_page_req_mutex);
1881 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1882 migration_make_urgent_request();
1883 qemu_mutex_unlock(&rs->src_page_req_mutex);
1885 return 0;
1888 static bool save_page_use_compression(RAMState *rs)
1890 if (!migrate_use_compression()) {
1891 return false;
1895 * If xbzrle is enabled (e.g., after first round of migration), stop
1896 * using the data compression. In theory, xbzrle can do better than
1897 * compression.
1899 if (rs->xbzrle_enabled) {
1900 return false;
1903 return true;
1907 * try to compress the page before posting it out; return true if the page
1908 * has been properly handled by compression, otherwise it needs other
1909 * paths to handle it
1911 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1913 if (!save_page_use_compression(rs)) {
1914 return false;
1918 * When starting the process of a new block, the first page of
1919 * the block should be sent out before other pages in the same
1920 * block, and all the pages in the last block should have been sent
1921 * out. Keeping this order is important, because the 'cont' flag
1922 * is used to avoid resending the block name.
1924 * We post the first page as a normal page as compression will take
1925 * much CPU resource.
1927 if (block != rs->last_sent_block) {
1928 flush_compressed_data(rs);
1929 return false;
1932 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1933 return true;
1936 compression_counters.busy++;
1937 return false;
1941 * ram_save_target_page: save one target page
1943 * Returns the number of pages written
1945 * @rs: current RAM state
1946 * @pss: data about the page we want to send
1947 * @last_stage: if we are at the completion stage
1949 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1950 bool last_stage)
1952 RAMBlock *block = pss->block;
1953 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1954 int res;
1956 if (control_save_page(rs, block, offset, &res)) {
1957 return res;
1960 if (save_compress_page(rs, block, offset)) {
1961 return 1;
1964 res = save_zero_page(rs, block, offset);
1965 if (res > 0) {
1966 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1967 * page would be stale
1969 if (!save_page_use_compression(rs)) {
1970 XBZRLE_cache_lock();
1971 xbzrle_cache_zero_page(rs, block->offset + offset);
1972 XBZRLE_cache_unlock();
1974 ram_release_pages(block->idstr, offset, res);
1975 return res;
1979 * Do not use multifd for:
1980 * 1. Compression as the first page in the new block should be posted out
1981 * before sending the compressed page
1982 * 2. In postcopy as one whole host page should be placed
1984 if (!save_page_use_compression(rs) && migrate_use_multifd()
1985 && !migration_in_postcopy()) {
1986 return ram_save_multifd_page(rs, block, offset);
1989 return ram_save_page(rs, pss, last_stage);
1993 * ram_save_host_page: save a whole host page
1995 * Starting at *offset send pages up to the end of the current host
1996 * page. It's valid for the initial offset to point into the middle of
1997 * a host page in which case the remainder of the host page is sent.
1998 * Only dirty target pages are sent. Note that the host page size may
1999 * be a huge page for this block.
2000 * The saving stops at the boundary of the used_length of the block
2001 * if the RAMBlock isn't a multiple of the host page size.
2003 * Returns the number of pages written or negative on error
2005 * @rs: current RAM state
2007 * @pss: data about the page we want to send
2008 * @last_stage: if we are at the completion stage
2010 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2011 bool last_stage)
2013 int tmppages, pages = 0;
2014 size_t pagesize_bits =
2015 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2016 unsigned long start_page = pss->page;
2017 int res;
2019 if (ramblock_is_ignored(pss->block)) {
2020 error_report("block %s should not be migrated !", pss->block->idstr);
2021 return 0;
2024 do {
2025 /* Check whether the page is dirty and, if it is, send it */
2026 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2027 pss->page++;
2028 continue;
2031 tmppages = ram_save_target_page(rs, pss, last_stage);
2032 if (tmppages < 0) {
2033 return tmppages;
2036 pages += tmppages;
2037 pss->page++;
2038 /* Allow rate limiting to happen in the middle of huge pages */
2039 migration_rate_limit();
2040 } while ((pss->page & (pagesize_bits - 1)) &&
2041 offset_in_ramblock(pss->block,
2042 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2043 /* The offset we leave with is the last one we looked at */
2044 pss->page--;
2046 res = ram_save_release_protection(rs, pss, start_page);
2047 return (res < 0 ? res : pages);
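/*
 * Worked example (assuming 4 KiB target pages): for a RAMBlock backed by
 * 2 MiB hugetlbfs pages, pagesize_bits is 2 MiB / 4 KiB = 512, so a single
 * call can send up to 512 dirty target pages before it stops at the host
 * page boundary (or at the block's used_length, whichever comes first).
 */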
2051 * ram_find_and_save_block: finds a dirty page and sends it to f
2053 * Called within an RCU critical section.
2055 * Returns the number of pages written where zero means no dirty pages,
2056 * or negative on error
2058 * @rs: current RAM state
2059 * @last_stage: if we are at the completion stage
2061 * On systems where host-page-size > target-page-size it will send all the
2062 * pages in a host page that are dirty.
2065 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2067 PageSearchStatus pss;
2068 int pages = 0;
2069 bool again, found;
2071 /* No dirty page as there is zero RAM */
2072 if (!ram_bytes_total()) {
2073 return pages;
2076 pss.block = rs->last_seen_block;
2077 pss.page = rs->last_page;
2078 pss.complete_round = false;
2080 if (!pss.block) {
2081 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2084 do {
2085 again = true;
2086 found = get_queued_page(rs, &pss);
2088 if (!found) {
2089 /* priority queue empty, so just search for something dirty */
2090 found = find_dirty_block(rs, &pss, &again);
2093 if (found) {
2094 pages = ram_save_host_page(rs, &pss, last_stage);
2096 } while (!pages && again);
2098 rs->last_seen_block = pss.block;
2099 rs->last_page = pss.page;
2101 return pages;
2104 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2106 uint64_t pages = size / TARGET_PAGE_SIZE;
2108 if (zero) {
2109 ram_counters.duplicate += pages;
2110 } else {
2111 ram_counters.normal += pages;
2112 ram_counters.transferred += size;
2113 qemu_update_position(f, size);
2117 static uint64_t ram_bytes_total_common(bool count_ignored)
2119 RAMBlock *block;
2120 uint64_t total = 0;
2122 RCU_READ_LOCK_GUARD();
2124 if (count_ignored) {
2125 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2126 total += block->used_length;
2128 } else {
2129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2130 total += block->used_length;
2133 return total;
2136 uint64_t ram_bytes_total(void)
2138 return ram_bytes_total_common(false);
2141 static void xbzrle_load_setup(void)
2143 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 static void xbzrle_load_cleanup(void)
2148 g_free(XBZRLE.decoded_buf);
2149 XBZRLE.decoded_buf = NULL;
2152 static void ram_state_cleanup(RAMState **rsp)
2154 if (*rsp) {
2155 migration_page_queue_free(*rsp);
2156 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2157 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2158 g_free(*rsp);
2159 *rsp = NULL;
2163 static void xbzrle_cleanup(void)
2165 XBZRLE_cache_lock();
2166 if (XBZRLE.cache) {
2167 cache_fini(XBZRLE.cache);
2168 g_free(XBZRLE.encoded_buf);
2169 g_free(XBZRLE.current_buf);
2170 g_free(XBZRLE.zero_target_page);
2171 XBZRLE.cache = NULL;
2172 XBZRLE.encoded_buf = NULL;
2173 XBZRLE.current_buf = NULL;
2174 XBZRLE.zero_target_page = NULL;
2176 XBZRLE_cache_unlock();
2179 static void ram_save_cleanup(void *opaque)
2181 RAMState **rsp = opaque;
2182 RAMBlock *block;
2184 /* We don't use dirty log with background snapshots */
2185 if (!migrate_background_snapshot()) {
2186 /* the caller must hold the iothread lock or be in a bh, so there is
2187 * no write race against the migration bitmap
2189 memory_global_dirty_log_stop();
2192 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2193 g_free(block->clear_bmap);
2194 block->clear_bmap = NULL;
2195 g_free(block->bmap);
2196 block->bmap = NULL;
2199 xbzrle_cleanup();
2200 compress_threads_save_cleanup();
2201 ram_state_cleanup(rsp);
2204 static void ram_state_reset(RAMState *rs)
2206 rs->last_seen_block = NULL;
2207 rs->last_sent_block = NULL;
2208 rs->last_page = 0;
2209 rs->last_version = ram_list.version;
2210 rs->xbzrle_enabled = false;
2213 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2216 * 'expected' is the value you expect the bitmap mostly to be full
2217 * of; it won't bother printing lines that are all this value.
2218 * If 'todump' is null the migration bitmap is dumped.
2220 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2221 unsigned long pages)
2223 int64_t cur;
2224 int64_t linelen = 128;
2225 char linebuf[129];
2227 for (cur = 0; cur < pages; cur += linelen) {
2228 int64_t curb;
2229 bool found = false;
2231 * Last line; catch the case where the line length
2232 * is longer than remaining ram
2234 if (cur + linelen > pages) {
2235 linelen = pages - cur;
2237 for (curb = 0; curb < linelen; curb++) {
2238 bool thisbit = test_bit(cur + curb, todump);
2239 linebuf[curb] = thisbit ? '1' : '.';
2240 found = found || (thisbit != expected);
2242 if (found) {
2243 linebuf[curb] = '\0';
2244 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
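/*
 * Example output line (the leading hex number is the index of the first
 * page on the line; '1' marks a set bit, '.' a clear one):
 *   0x00000080 : 11.........1....
 * Lines whose bits all equal 'expected' are not printed.
 */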
2249 /* **** functions for postcopy ***** */
2251 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2253 struct RAMBlock *block;
2255 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2256 unsigned long *bitmap = block->bmap;
2257 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2258 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2260 while (run_start < range) {
2261 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2262 ram_discard_range(block->idstr,
2263 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2264 ((ram_addr_t)(run_end - run_start))
2265 << TARGET_PAGE_BITS);
2266 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2272 * postcopy_send_discard_bm_ram: discard a RAMBlock
2274 * Returns zero on success
2276 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2278 * @ms: current migration state
2279 * @block: RAMBlock to discard
2281 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2283 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2284 unsigned long current;
2285 unsigned long *bitmap = block->bmap;
2287 for (current = 0; current < end; ) {
2288 unsigned long one = find_next_bit(bitmap, end, current);
2289 unsigned long zero, discard_length;
2291 if (one >= end) {
2292 break;
2295 zero = find_next_zero_bit(bitmap, end, one + 1);
2297 if (zero >= end) {
2298 discard_length = end - one;
2299 } else {
2300 discard_length = zero - one;
2302 postcopy_discard_send_range(ms, one, discard_length);
2303 current = one + discard_length;
2306 return 0;
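/*
 * Example: for a dirty bitmap that looks like ...0111100..., the run of set
 * bits starting at page index 'one' with length 'discard_length' results in
 * a single postcopy_discard_send_range(ms, one, discard_length) call; the
 * loop then continues scanning from the end of that run.
 */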
2310 * postcopy_each_ram_send_discard: discard all RAMBlocks
2312 * Returns 0 for success or negative for error
2314 * Utility for the outgoing postcopy code.
2315 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2316 * passing it bitmap indexes and name.
2317 * (qemu_ram_foreach_block ends up passing unscaled lengths
2318 * which would mean postcopy code would have to deal with target page)
2320 * @ms: current migration state
2322 static int postcopy_each_ram_send_discard(MigrationState *ms)
2324 struct RAMBlock *block;
2325 int ret;
2327 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2328 postcopy_discard_send_init(ms, block->idstr);
2331 * Postcopy sends chunks of bitmap over the wire, but it
2332 * just needs indexes at this point, avoids it having
2333 * target page specific code.
2335 ret = postcopy_send_discard_bm_ram(ms, block);
2336 postcopy_discard_send_finish(ms);
2337 if (ret) {
2338 return ret;
2342 return 0;
2346 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2348 * Helper for postcopy_chunk_hostpages; it's called twice to
2349 * canonicalize the two bitmaps, which are similar, but one is
2350 * inverted.
2352 * Postcopy requires that all target pages in a hostpage are dirty or
2353 * clean, not a mix. This function canonicalizes the bitmaps.
2355 * @ms: current migration state
2356 * @block: block that contains the page we want to canonicalize
2358 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2360 RAMState *rs = ram_state;
2361 unsigned long *bitmap = block->bmap;
2362 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2363 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2364 unsigned long run_start;
2366 if (block->page_size == TARGET_PAGE_SIZE) {
2367 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2368 return;
2371 /* Find a dirty page */
2372 run_start = find_next_bit(bitmap, pages, 0);
2374 while (run_start < pages) {
2377 * If the start of this run of pages is in the middle of a host
2378 * page, then we need to fixup this host page.
2380 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2381 /* Find the end of this run */
2382 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2384 * If the end isn't at the start of a host page, then the
2385 * run doesn't finish at the end of a host page
2386 * and we need to discard.
2390 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2391 unsigned long page;
2392 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2393 host_ratio);
2394 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2396 /* Clean up the bitmap */
2397 for (page = fixup_start_addr;
2398 page < fixup_start_addr + host_ratio; page++) {
2400 * Remark them as dirty, updating the count for any pages
2401 * that weren't previously dirty.
2403 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2407 /* Find the next dirty page for the next iteration */
2408 run_start = find_next_bit(bitmap, pages, run_start);
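/*
 * Example (assuming 4 KiB target pages on a 2 MiB hugepage block, so
 * host_ratio == 512): if a dirty run begins or ends partway through a
 * hugepage, every target page of that hugepage is re-marked dirty above,
 * so host pages end up either fully dirty or fully clean.
 */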
2413 * postcopy_chunk_hostpages: discard any partially sent host page
2415 * Utility for the outgoing postcopy code.
2417 * Discard any partially sent host-page size chunks, mark any partially
2418 * dirty host-page size chunks as all dirty. In this case the host-page
2419 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2421 * Returns zero on success
2423 * @ms: current migration state
2424 * @block: block we want to work with
2426 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2428 postcopy_discard_send_init(ms, block->idstr);
2431 * Ensure that all partially dirty host pages are made fully dirty.
2433 postcopy_chunk_hostpages_pass(ms, block);
2435 postcopy_discard_send_finish(ms);
2436 return 0;
2440 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2442 * Returns zero on success
2444 * Transmit the set of pages to be discarded after precopy to the target;
2445 * these are pages that:
2446 * a) Have been previously transmitted but are now dirty again
2447 * b) Have never been transmitted; this ensures that
2448 * any pages on the destination that have been mapped by background
2449 * tasks get discarded (transparent huge pages are the specific concern)
2450 * Hopefully this is pretty sparse
2452 * @ms: current migration state
2454 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2456 RAMState *rs = ram_state;
2457 RAMBlock *block;
2458 int ret;
2460 RCU_READ_LOCK_GUARD();
2462 /* This should be our last sync, the src is now paused */
2463 migration_bitmap_sync(rs);
2465 /* Easiest way to make sure we don't resume in the middle of a host-page */
2466 rs->last_seen_block = NULL;
2467 rs->last_sent_block = NULL;
2468 rs->last_page = 0;
2470 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2471 /* Deal with TPS != HPS and huge pages */
2472 ret = postcopy_chunk_hostpages(ms, block);
2473 if (ret) {
2474 return ret;
2477 #ifdef DEBUG_POSTCOPY
2478 ram_debug_dump_bitmap(block->bmap, true,
2479 block->used_length >> TARGET_PAGE_BITS);
2480 #endif
2482 trace_ram_postcopy_send_discard_bitmap();
2484 return postcopy_each_ram_send_discard(ms);
2488 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2490 * Returns zero on success
2492 * @rbname: name of the RAMBlock of the request. NULL means the
2493 * same as the last one.
2494 * @start: RAMBlock starting page
2495 * @length: RAMBlock size
2497 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2499 trace_ram_discard_range(rbname, start, length);
2501 RCU_READ_LOCK_GUARD();
2502 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2504 if (!rb) {
2505 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2506 return -1;
2510 * On source VM, we don't need to update the received bitmap since
2511 * we don't even have one.
2513 if (rb->receivedmap) {
2514 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2515 length >> qemu_target_page_bits());
2518 return ram_block_discard_range(rb, start, length);
2522 * For every allocation, we will try not to crash the VM if the
2523 * allocation fails.
2525 static int xbzrle_init(void)
2527 Error *local_err = NULL;
2529 if (!migrate_use_xbzrle()) {
2530 return 0;
2533 XBZRLE_cache_lock();
2535 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2536 if (!XBZRLE.zero_target_page) {
2537 error_report("%s: Error allocating zero page", __func__);
2538 goto err_out;
2541 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2542 TARGET_PAGE_SIZE, &local_err);
2543 if (!XBZRLE.cache) {
2544 error_report_err(local_err);
2545 goto free_zero_page;
2548 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2549 if (!XBZRLE.encoded_buf) {
2550 error_report("%s: Error allocating encoded_buf", __func__);
2551 goto free_cache;
2554 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2555 if (!XBZRLE.current_buf) {
2556 error_report("%s: Error allocating current_buf", __func__);
2557 goto free_encoded_buf;
2560 /* We are all good */
2561 XBZRLE_cache_unlock();
2562 return 0;
2564 free_encoded_buf:
2565 g_free(XBZRLE.encoded_buf);
2566 XBZRLE.encoded_buf = NULL;
2567 free_cache:
2568 cache_fini(XBZRLE.cache);
2569 XBZRLE.cache = NULL;
2570 free_zero_page:
2571 g_free(XBZRLE.zero_target_page);
2572 XBZRLE.zero_target_page = NULL;
2573 err_out:
2574 XBZRLE_cache_unlock();
2575 return -ENOMEM;
2578 static int ram_state_init(RAMState **rsp)
2580 *rsp = g_try_new0(RAMState, 1);
2582 if (!*rsp) {
2583 error_report("%s: Init ramstate fail", __func__);
2584 return -1;
2587 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2588 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2589 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2592 * Count the total number of pages used by ram blocks not including any
2593 * gaps due to alignment or unplugs.
2594 * This must match the initial values of the dirty bitmap.
2596 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2597 ram_state_reset(*rsp);
2599 return 0;
2602 static void ram_list_init_bitmaps(void)
2604 MigrationState *ms = migrate_get_current();
2605 RAMBlock *block;
2606 unsigned long pages;
2607 uint8_t shift;
2609 /* Skip setting bitmap if there is no RAM */
2610 if (ram_bytes_total()) {
2611 shift = ms->clear_bitmap_shift;
2612 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2613 error_report("clear_bitmap_shift (%u) too big, using "
2614 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2615 shift = CLEAR_BITMAP_SHIFT_MAX;
2616 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2617 error_report("clear_bitmap_shift (%u) too small, using "
2618 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2619 shift = CLEAR_BITMAP_SHIFT_MIN;
2622 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2623 pages = block->max_length >> TARGET_PAGE_BITS;
2625 * The initial dirty bitmap for migration must be set with all
2626 * ones to make sure we'll migrate every guest RAM page to the
2627 * destination.
2628 * Here we set RAMBlock.bmap all to 1 because when we restart
2629 * migration after a failed migration, ram_list.
2630 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2631 * guest memory.
2633 block->bmap = bitmap_new(pages);
2634 bitmap_set(block->bmap, 0, pages);
2635 block->clear_bmap_shift = shift;
2636 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
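/*
 * Sizing example: with 4 KiB target pages and clear_bmap_shift = 18, one
 * clear_bmap bit covers 2^18 target pages, i.e. 1 GiB of guest RAM, so
 * dirty-log clearing can be batched in 1 GiB chunks.
 */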
2641 static void ram_init_bitmaps(RAMState *rs)
2643 /* For memory_global_dirty_log_start below. */
2644 qemu_mutex_lock_iothread();
2645 qemu_mutex_lock_ramlist();
2647 WITH_RCU_READ_LOCK_GUARD() {
2648 ram_list_init_bitmaps();
2649 /* We don't use dirty log with background snapshots */
2650 if (!migrate_background_snapshot()) {
2651 memory_global_dirty_log_start();
2652 migration_bitmap_sync_precopy(rs);
2655 qemu_mutex_unlock_ramlist();
2656 qemu_mutex_unlock_iothread();
2659 static int ram_init_all(RAMState **rsp)
2661 if (ram_state_init(rsp)) {
2662 return -1;
2665 if (xbzrle_init()) {
2666 ram_state_cleanup(rsp);
2667 return -1;
2670 ram_init_bitmaps(*rsp);
2672 return 0;
2675 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2677 RAMBlock *block;
2678 uint64_t pages = 0;
2681 * Postcopy is not using xbzrle/compression, so no need for that.
2682 * Also, since the source is already halted, we don't need to care
2683 * about dirty page logging either.
2686 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2687 pages += bitmap_count_one(block->bmap,
2688 block->used_length >> TARGET_PAGE_BITS);
2691 /* This may not be aligned with current bitmaps. Recalculate. */
2692 rs->migration_dirty_pages = pages;
2694 ram_state_reset(rs);
2696 /* Update RAMState cache of output QEMUFile */
2697 rs->f = out;
2699 trace_ram_state_resume_prepare(pages);
2703 * This function clears bits of the free pages reported by the caller from the
2704 * migration dirty bitmap. @addr is the host address corresponding to the
2705 * start of the contiguous guest free pages, and @len is the total bytes of
2706 * those pages.
2708 void qemu_guest_free_page_hint(void *addr, size_t len)
2710 RAMBlock *block;
2711 ram_addr_t offset;
2712 size_t used_len, start, npages;
2713 MigrationState *s = migrate_get_current();
2715 /* This function is currently expected to be used during live migration */
2716 if (!migration_is_setup_or_active(s->state)) {
2717 return;
2720 for (; len > 0; len -= used_len, addr += used_len) {
2721 block = qemu_ram_block_from_host(addr, false, &offset);
2722 if (unlikely(!block || offset >= block->used_length)) {
2724 * The implementation might not support RAMBlock resize during
2725 * live migration, but it could happen in theory with future
2726 * updates. So we add a check here to capture that case.
2728 error_report_once("%s unexpected error", __func__);
2729 return;
2732 if (len <= block->used_length - offset) {
2733 used_len = len;
2734 } else {
2735 used_len = block->used_length - offset;
2738 start = offset >> TARGET_PAGE_BITS;
2739 npages = used_len >> TARGET_PAGE_BITS;
2741 qemu_mutex_lock(&ram_state->bitmap_mutex);
2742 ram_state->migration_dirty_pages -=
2743 bitmap_count_one_with_offset(block->bmap, start, npages);
2744 bitmap_clear(block->bmap, start, npages);
2745 qemu_mutex_unlock(&ram_state->bitmap_mutex);
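/*
 * Usage sketch (hypothetical caller): a free-page-hinting device such as
 * virtio-balloon can report a run of pages the guest considers free, e.g.
 *
 *     qemu_guest_free_page_hint(host_addr, 32 * TARGET_PAGE_SIZE);
 *
 * which clears 32 bits in the block's dirty bitmap so those pages are
 * skipped by the current migration round.
 */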
2750 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2751 * a long-running RCU critical section. When RCU reclaims in the code
2752 * start to become numerous, it will be necessary to reduce the
2753 * granularity of these critical sections.
2757 * ram_save_setup: Setup RAM for migration
2759 * Returns zero to indicate success and negative for error
2761 * @f: QEMUFile where to send the data
2762 * @opaque: RAMState pointer
2764 static int ram_save_setup(QEMUFile *f, void *opaque)
2766 RAMState **rsp = opaque;
2767 RAMBlock *block;
2769 if (compress_threads_save_setup()) {
2770 return -1;
2773 /* migration has already setup the bitmap, reuse it. */
2774 if (!migration_in_colo_state()) {
2775 if (ram_init_all(rsp) != 0) {
2776 compress_threads_save_cleanup();
2777 return -1;
2780 (*rsp)->f = f;
2782 WITH_RCU_READ_LOCK_GUARD() {
2783 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2785 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2786 qemu_put_byte(f, strlen(block->idstr));
2787 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2788 qemu_put_be64(f, block->used_length);
2789 if (migrate_postcopy_ram() && block->page_size !=
2790 qemu_host_page_size) {
2791 qemu_put_be64(f, block->page_size);
2793 if (migrate_ignore_shared()) {
2794 qemu_put_be64(f, block->mr->addr);
2799 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2800 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2802 multifd_send_sync_main(f);
2803 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2804 qemu_fflush(f);
2806 return 0;
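/*
 * Sketch of the setup-stage stream emitted above:
 *   be64  total ram bytes, OR'ed with RAM_SAVE_FLAG_MEM_SIZE
 *   per migratable block:
 *     u8    strlen(idstr)
 *     bytes idstr (not NUL-terminated)
 *     be64  used_length
 *     be64  page_size   (only with postcopy-ram and non-host page size)
 *     be64  mr->addr    (only with ignore-shared)
 *   be64  RAM_SAVE_FLAG_EOS
 */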
2810 * ram_save_iterate: iterative stage for migration
2812 * Returns zero to indicate success and negative for error
2814 * @f: QEMUFile where to send the data
2815 * @opaque: RAMState pointer
2817 static int ram_save_iterate(QEMUFile *f, void *opaque)
2819 RAMState **temp = opaque;
2820 RAMState *rs = *temp;
2821 int ret = 0;
2822 int i;
2823 int64_t t0;
2824 int done = 0;
2826 if (blk_mig_bulk_active()) {
2827 /* Avoid transferring ram during the bulk phase of block migration as
2828 * the bulk phase will usually take a long time and transferring
2829 * ram updates during that time is pointless. */
2830 goto out;
2833 WITH_RCU_READ_LOCK_GUARD() {
2834 if (ram_list.version != rs->last_version) {
2835 ram_state_reset(rs);
2838 /* Read version before ram_list.blocks */
2839 smp_rmb();
2841 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2843 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2844 i = 0;
2845 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2846 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2847 int pages;
2849 if (qemu_file_get_error(f)) {
2850 break;
2853 pages = ram_find_and_save_block(rs, false);
2854 /* no more pages to send */
2855 if (pages == 0) {
2856 done = 1;
2857 break;
2860 if (pages < 0) {
2861 qemu_file_set_error(f, pages);
2862 break;
2865 rs->target_page_count += pages;
2868 * During postcopy, it is necessary to make sure one whole host
2869 * page is sent in one chunk.
2871 if (migrate_postcopy_ram()) {
2872 flush_compressed_data(rs);
2876 * We want to check in the 1st loop, just in case it was the 1st
2877 * time and we had to sync the dirty bitmap.
2878 * qemu_clock_get_ns() is a bit expensive, so we only check once
2879 * every few iterations
2881 if ((i & 63) == 0) {
2882 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2883 1000000;
2884 if (t1 > MAX_WAIT) {
2885 trace_ram_save_iterate_big_wait(t1, i);
2886 break;
2889 i++;
2894 * Must occur before EOS (or any QEMUFile operation)
2895 * because of RDMA protocol.
2897 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2899 out:
2900 if (ret >= 0
2901 && migration_is_setup_or_active(migrate_get_current()->state)) {
2902 multifd_send_sync_main(rs->f);
2903 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2904 qemu_fflush(f);
2905 ram_counters.transferred += 8;
2907 ret = qemu_file_get_error(f);
2909 if (ret < 0) {
2910 return ret;
2913 return done;
2917 * ram_save_complete: function called to send the remaining amount of ram
2919 * Returns zero to indicate success or negative on error
2921 * Called with iothread lock
2923 * @f: QEMUFile where to send the data
2924 * @opaque: RAMState pointer
2926 static int ram_save_complete(QEMUFile *f, void *opaque)
2928 RAMState **temp = opaque;
2929 RAMState *rs = *temp;
2930 int ret = 0;
2932 WITH_RCU_READ_LOCK_GUARD() {
2933 if (!migration_in_postcopy()) {
2934 migration_bitmap_sync_precopy(rs);
2937 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2939 /* try transferring iterative blocks of memory */
2941 /* flush all remaining blocks regardless of rate limiting */
2942 while (true) {
2943 int pages;
2945 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2946 /* no more blocks to send */
2947 if (pages == 0) {
2948 break;
2950 if (pages < 0) {
2951 ret = pages;
2952 break;
2956 flush_compressed_data(rs);
2957 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2960 if (ret >= 0) {
2961 multifd_send_sync_main(rs->f);
2962 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2963 qemu_fflush(f);
2966 return ret;
2969 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2970 uint64_t *res_precopy_only,
2971 uint64_t *res_compatible,
2972 uint64_t *res_postcopy_only)
2974 RAMState **temp = opaque;
2975 RAMState *rs = *temp;
2976 uint64_t remaining_size;
2978 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2980 if (!migration_in_postcopy() &&
2981 remaining_size < max_size) {
2982 qemu_mutex_lock_iothread();
2983 WITH_RCU_READ_LOCK_GUARD() {
2984 migration_bitmap_sync_precopy(rs);
2986 qemu_mutex_unlock_iothread();
2987 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2990 if (migrate_postcopy_ram()) {
2991 /* We can do postcopy, and all the data is postcopiable */
2992 *res_compatible += remaining_size;
2993 } else {
2994 *res_precopy_only += remaining_size;
2998 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3000 unsigned int xh_len;
3001 int xh_flags;
3002 uint8_t *loaded_data;
3004 /* extract RLE header */
3005 xh_flags = qemu_get_byte(f);
3006 xh_len = qemu_get_be16(f);
3008 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3009 error_report("Failed to load XBZRLE page - wrong compression!");
3010 return -1;
3013 if (xh_len > TARGET_PAGE_SIZE) {
3014 error_report("Failed to load XBZRLE page - len overflow!");
3015 return -1;
3017 loaded_data = XBZRLE.decoded_buf;
3018 /* load data and decode */
3019 /* it can change loaded_data to point to an internal buffer */
3020 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3022 /* decode RLE */
3023 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3024 TARGET_PAGE_SIZE) == -1) {
3025 error_report("Failed to load XBZRLE page - decode error!");
3026 return -1;
3029 return 0;
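/*
 * Wire format consumed above (sketch):
 *   u8    xh_flags - must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len   - encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE data, decoded over the existing
 *         contents of 'host'
 */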
3033 * ram_block_from_stream: read a RAMBlock id from the migration stream
3035 * Must be called from within a rcu critical section.
3037 * Returns a pointer from within the RCU-protected ram_list.
3039 * @f: QEMUFile where to read the data from
3040 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3042 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3044 static RAMBlock *block;
3045 char id[256];
3046 uint8_t len;
3048 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3049 if (!block) {
3050 error_report("Ack, bad migration stream!");
3051 return NULL;
3053 return block;
3056 len = qemu_get_byte(f);
3057 qemu_get_buffer(f, (uint8_t *)id, len);
3058 id[len] = 0;
3060 block = qemu_ram_block_by_name(id);
3061 if (!block) {
3062 error_report("Can't find block %s", id);
3063 return NULL;
3066 if (ramblock_is_ignored(block)) {
3067 error_report("block %s should not be migrated !", id);
3068 return NULL;
3071 return block;
3074 static inline void *host_from_ram_block_offset(RAMBlock *block,
3075 ram_addr_t offset)
3077 if (!offset_in_ramblock(block, offset)) {
3078 return NULL;
3081 return block->host + offset;
3084 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3085 ram_addr_t offset, bool record_bitmap)
3087 if (!offset_in_ramblock(block, offset)) {
3088 return NULL;
3090 if (!block->colo_cache) {
3091 error_report("%s: colo_cache is NULL in block :%s",
3092 __func__, block->idstr);
3093 return NULL;
3097 * During a colo checkpoint, we need a bitmap of these migrated pages.
3098 * It helps us decide which pages in the ram cache should be flushed
3099 * into the VM's RAM later.
3101 if (record_bitmap &&
3102 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3103 ram_state->migration_dirty_pages++;
3105 return block->colo_cache + offset;
3109 * ram_handle_compressed: handle the zero page case
3111 * If a page (or a whole RDMA chunk) has been
3112 * determined to be zero, then zap it.
3114 * @host: host address for the zero page
3115 * @ch: what the page is filled from. We only support zero
3116 * @size: size of the zero page
3118 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3120 if (ch != 0 || !is_zero_range(host, size)) {
3121 memset(host, ch, size);
3125 /* return the size after decompression, or negative value on error */
3126 static int
3127 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3128 const uint8_t *source, size_t source_len)
3130 int err;
3132 err = inflateReset(stream);
3133 if (err != Z_OK) {
3134 return -1;
3137 stream->avail_in = source_len;
3138 stream->next_in = (uint8_t *)source;
3139 stream->avail_out = dest_len;
3140 stream->next_out = dest;
3142 err = inflate(stream, Z_NO_FLUSH);
3143 if (err != Z_STREAM_END) {
3144 return -1;
3147 return stream->total_out;
3150 static void *do_data_decompress(void *opaque)
3152 DecompressParam *param = opaque;
3153 unsigned long pagesize;
3154 uint8_t *des;
3155 int len, ret;
3157 qemu_mutex_lock(&param->mutex);
3158 while (!param->quit) {
3159 if (param->des) {
3160 des = param->des;
3161 len = param->len;
3162 param->des = 0;
3163 qemu_mutex_unlock(&param->mutex);
3165 pagesize = TARGET_PAGE_SIZE;
3167 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3168 param->compbuf, len);
3169 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3170 error_report("decompress data failed");
3171 qemu_file_set_error(decomp_file, ret);
3174 qemu_mutex_lock(&decomp_done_lock);
3175 param->done = true;
3176 qemu_cond_signal(&decomp_done_cond);
3177 qemu_mutex_unlock(&decomp_done_lock);
3179 qemu_mutex_lock(&param->mutex);
3180 } else {
3181 qemu_cond_wait(&param->cond, &param->mutex);
3184 qemu_mutex_unlock(&param->mutex);
3186 return NULL;
3189 static int wait_for_decompress_done(void)
3191 int idx, thread_count;
3193 if (!migrate_use_compression()) {
3194 return 0;
3197 thread_count = migrate_decompress_threads();
3198 qemu_mutex_lock(&decomp_done_lock);
3199 for (idx = 0; idx < thread_count; idx++) {
3200 while (!decomp_param[idx].done) {
3201 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3204 qemu_mutex_unlock(&decomp_done_lock);
3205 return qemu_file_get_error(decomp_file);
3208 static void compress_threads_load_cleanup(void)
3210 int i, thread_count;
3212 if (!migrate_use_compression()) {
3213 return;
3215 thread_count = migrate_decompress_threads();
3216 for (i = 0; i < thread_count; i++) {
3218 * we use it as an indicator which shows whether the thread is
3219 * properly initialized or not
3221 if (!decomp_param[i].compbuf) {
3222 break;
3225 qemu_mutex_lock(&decomp_param[i].mutex);
3226 decomp_param[i].quit = true;
3227 qemu_cond_signal(&decomp_param[i].cond);
3228 qemu_mutex_unlock(&decomp_param[i].mutex);
3230 for (i = 0; i < thread_count; i++) {
3231 if (!decomp_param[i].compbuf) {
3232 break;
3235 qemu_thread_join(decompress_threads + i);
3236 qemu_mutex_destroy(&decomp_param[i].mutex);
3237 qemu_cond_destroy(&decomp_param[i].cond);
3238 inflateEnd(&decomp_param[i].stream);
3239 g_free(decomp_param[i].compbuf);
3240 decomp_param[i].compbuf = NULL;
3242 g_free(decompress_threads);
3243 g_free(decomp_param);
3244 decompress_threads = NULL;
3245 decomp_param = NULL;
3246 decomp_file = NULL;
3249 static int compress_threads_load_setup(QEMUFile *f)
3251 int i, thread_count;
3253 if (!migrate_use_compression()) {
3254 return 0;
3257 thread_count = migrate_decompress_threads();
3258 decompress_threads = g_new0(QemuThread, thread_count);
3259 decomp_param = g_new0(DecompressParam, thread_count);
3260 qemu_mutex_init(&decomp_done_lock);
3261 qemu_cond_init(&decomp_done_cond);
3262 decomp_file = f;
3263 for (i = 0; i < thread_count; i++) {
3264 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3265 goto exit;
3268 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3269 qemu_mutex_init(&decomp_param[i].mutex);
3270 qemu_cond_init(&decomp_param[i].cond);
3271 decomp_param[i].done = true;
3272 decomp_param[i].quit = false;
3273 qemu_thread_create(decompress_threads + i, "decompress",
3274 do_data_decompress, decomp_param + i,
3275 QEMU_THREAD_JOINABLE);
3277 return 0;
3278 exit:
3279 compress_threads_load_cleanup();
3280 return -1;
3283 static void decompress_data_with_multi_threads(QEMUFile *f,
3284 void *host, int len)
3286 int idx, thread_count;
3288 thread_count = migrate_decompress_threads();
3289 QEMU_LOCK_GUARD(&decomp_done_lock);
3290 while (true) {
3291 for (idx = 0; idx < thread_count; idx++) {
3292 if (decomp_param[idx].done) {
3293 decomp_param[idx].done = false;
3294 qemu_mutex_lock(&decomp_param[idx].mutex);
3295 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3296 decomp_param[idx].des = host;
3297 decomp_param[idx].len = len;
3298 qemu_cond_signal(&decomp_param[idx].cond);
3299 qemu_mutex_unlock(&decomp_param[idx].mutex);
3300 break;
3303 if (idx < thread_count) {
3304 break;
3305 } else {
3306 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3311 static void colo_init_ram_state(void)
3313 ram_state_init(&ram_state);
3317 * colo cache: this is for the secondary VM, we cache the whole
3318 * memory of the secondary VM; the global lock needs to be
3319 * held to call this helper.
3321 int colo_init_ram_cache(void)
3323 RAMBlock *block;
3325 WITH_RCU_READ_LOCK_GUARD() {
3326 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3327 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3328 NULL,
3329 false);
3330 if (!block->colo_cache) {
3331 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3332 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3333 block->used_length);
3334 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3335 if (block->colo_cache) {
3336 qemu_anon_ram_free(block->colo_cache, block->used_length);
3337 block->colo_cache = NULL;
3340 return -errno;
3346 * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3347 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3348 * we use the same name 'ram_bitmap' as for migration.
3350 if (ram_bytes_total()) {
3351 RAMBlock *block;
3353 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3354 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3355 block->bmap = bitmap_new(pages);
3359 colo_init_ram_state();
3360 return 0;
3363 /* TODO: duplicated with ram_init_bitmaps */
3364 void colo_incoming_start_dirty_log(void)
3366 RAMBlock *block = NULL;
3367 /* For memory_global_dirty_log_start below. */
3368 qemu_mutex_lock_iothread();
3369 qemu_mutex_lock_ramlist();
3371 memory_global_dirty_log_sync();
3372 WITH_RCU_READ_LOCK_GUARD() {
3373 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3374 ramblock_sync_dirty_bitmap(ram_state, block);
3375 /* Discard this dirty bitmap record */
3376 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3378 memory_global_dirty_log_start();
3380 ram_state->migration_dirty_pages = 0;
3381 qemu_mutex_unlock_ramlist();
3382 qemu_mutex_unlock_iothread();
3385 /* The global lock needs to be held to call this helper */
3386 void colo_release_ram_cache(void)
3388 RAMBlock *block;
3390 memory_global_dirty_log_stop();
3391 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3392 g_free(block->bmap);
3393 block->bmap = NULL;
3396 WITH_RCU_READ_LOCK_GUARD() {
3397 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3398 if (block->colo_cache) {
3399 qemu_anon_ram_free(block->colo_cache, block->used_length);
3400 block->colo_cache = NULL;
3404 ram_state_cleanup(&ram_state);
3408 * ram_load_setup: Setup RAM for migration incoming side
3410 * Returns zero to indicate success and negative for error
3412 * @f: QEMUFile where to receive the data
3413 * @opaque: RAMState pointer
3415 static int ram_load_setup(QEMUFile *f, void *opaque)
3417 if (compress_threads_load_setup(f)) {
3418 return -1;
3421 xbzrle_load_setup();
3422 ramblock_recv_map_init();
3424 return 0;
3427 static int ram_load_cleanup(void *opaque)
3429 RAMBlock *rb;
3431 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3432 qemu_ram_block_writeback(rb);
3435 xbzrle_load_cleanup();
3436 compress_threads_load_cleanup();
3438 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3439 g_free(rb->receivedmap);
3440 rb->receivedmap = NULL;
3443 return 0;
3447 * ram_postcopy_incoming_init: allocate postcopy data structures
3449 * Returns 0 for success and negative if there was one error
3451 * @mis: current migration incoming state
3453 * Allocate data structures etc needed by incoming migration with
3454 * postcopy-ram. postcopy-ram's similarly named
3455 * postcopy_ram_incoming_init does the work.
3457 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3459 return postcopy_ram_incoming_init(mis);
3463 * ram_load_postcopy: load a page in postcopy case
3465 * Returns 0 for success or -errno in case of error
3467 * Called in postcopy mode by ram_load().
3468 * rcu_read_lock is taken prior to this being called.
3470 * @f: QEMUFile where to send the data
3472 static int ram_load_postcopy(QEMUFile *f)
3474 int flags = 0, ret = 0;
3475 bool place_needed = false;
3476 bool matches_target_page_size = false;
3477 MigrationIncomingState *mis = migration_incoming_get_current();
3478 /* Temporary page that is later 'placed' */
3479 void *postcopy_host_page = mis->postcopy_tmp_page;
3480 void *this_host = NULL;
3481 bool all_zero = true;
3482 int target_pages = 0;
3484 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3485 ram_addr_t addr;
3486 void *host = NULL;
3487 void *page_buffer = NULL;
3488 void *place_source = NULL;
3489 RAMBlock *block = NULL;
3490 uint8_t ch;
3491 int len;
3493 addr = qemu_get_be64(f);
3496 * If there is a qemu file error, we should stop here, since "addr"
3497 * may then be invalid
3499 ret = qemu_file_get_error(f);
3500 if (ret) {
3501 break;
3504 flags = addr & ~TARGET_PAGE_MASK;
3505 addr &= TARGET_PAGE_MASK;
3507 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3508 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3509 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3510 block = ram_block_from_stream(f, flags);
3512 host = host_from_ram_block_offset(block, addr);
3513 if (!host) {
3514 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3515 ret = -EINVAL;
3516 break;
3518 target_pages++;
3519 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3521 * Postcopy requires that we place whole host pages atomically;
3522 * these may be huge pages for RAMBlocks that are backed by
3523 * hugetlbfs.
3524 * To make it atomic, the data is read into a temporary page
3525 * that's moved into place later.
3526 * The migration protocol uses, possibly smaller, target pages;
3527 * however the source ensures it always sends all the components
3528 * of a host page in one chunk.
3530 page_buffer = postcopy_host_page +
3531 ((uintptr_t)host & (block->page_size - 1));
3532 if (target_pages == 1) {
3533 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3534 block->page_size);
3535 } else {
3536 /* not the 1st TP within the HP */
3537 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3538 (uintptr_t)this_host) {
3539 error_report("Non-same host page %p/%p",
3540 host, this_host);
3541 ret = -EINVAL;
3542 break;
3547 * If it's the last part of a host page then we place the host
3548 * page
3550 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3551 place_needed = true;
3553 place_source = postcopy_host_page;
3556 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3557 case RAM_SAVE_FLAG_ZERO:
3558 ch = qemu_get_byte(f);
3560 * We can skip setting page_buffer when
3561 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3563 if (ch || !matches_target_page_size) {
3564 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3566 if (ch) {
3567 all_zero = false;
3569 break;
3571 case RAM_SAVE_FLAG_PAGE:
3572 all_zero = false;
3573 if (!matches_target_page_size) {
3574 /* For huge pages, we always use temporary buffer */
3575 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3576 } else {
3578 * For small pages that match the target page size, we
3579 * avoid the qemu_file copy. Instead we directly use
3580 * the buffer of QEMUFile to place the page. Note: we
3581 * cannot do any QEMUFile operation before using that
3582 * buffer to make sure the buffer is valid when
3583 * placing the page.
3585 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3586 TARGET_PAGE_SIZE);
3588 break;
3589 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3590 all_zero = false;
3591 len = qemu_get_be32(f);
3592 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3593 error_report("Invalid compressed data length: %d", len);
3594 ret = -EINVAL;
3595 break;
3597 decompress_data_with_multi_threads(f, page_buffer, len);
3598 break;
3600 case RAM_SAVE_FLAG_EOS:
3601 /* normal exit */
3602 multifd_recv_sync_main();
3603 break;
3604 default:
3605 error_report("Unknown combination of migration flags: 0x%x"
3606 " (postcopy mode)", flags);
3607 ret = -EINVAL;
3608 break;
3611 /* Got the whole host page, wait for decompress before placing. */
3612 if (place_needed) {
3613 ret |= wait_for_decompress_done();
3616 /* Detect for any possible file errors */
3617 if (!ret && qemu_file_get_error(f)) {
3618 ret = qemu_file_get_error(f);
3621 if (!ret && place_needed) {
3622 /* This gets called at the last target page in the host page */
3623 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3624 block->page_size);
3626 if (all_zero) {
3627 ret = postcopy_place_page_zero(mis, place_dest,
3628 block);
3629 } else {
3630 ret = postcopy_place_page(mis, place_dest,
3631 place_source, block);
3633 place_needed = false;
3634 target_pages = 0;
3635 /* Assume we have a zero page until we detect something different */
3636 all_zero = true;
3640 return ret;
3643 static bool postcopy_is_advised(void)
3645 PostcopyState ps = postcopy_state_get();
3646 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3649 static bool postcopy_is_running(void)
3651 PostcopyState ps = postcopy_state_get();
3652 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3656 * Flush the content of the RAM cache into the SVM's memory.
3657 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3659 void colo_flush_ram_cache(void)
3661 RAMBlock *block = NULL;
3662 void *dst_host;
3663 void *src_host;
3664 unsigned long offset = 0;
3666 memory_global_dirty_log_sync();
3667 WITH_RCU_READ_LOCK_GUARD() {
3668 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3669 ramblock_sync_dirty_bitmap(ram_state, block);
3673 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3674 WITH_RCU_READ_LOCK_GUARD() {
3675 block = QLIST_FIRST_RCU(&ram_list.blocks);
3677 while (block) {
3678 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3680 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3681 >= block->used_length) {
3682 offset = 0;
3683 block = QLIST_NEXT_RCU(block, next);
3684 } else {
3685 migration_bitmap_clear_dirty(ram_state, block, offset);
3686 dst_host = block->host
3687 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3688 src_host = block->colo_cache
3689 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3690 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3694 trace_colo_flush_ram_cache_end();
3698 * ram_load_precopy: load pages in precopy case
3700 * Returns 0 for success or -errno in case of error
3702 * Called in precopy mode by ram_load().
3703 * rcu_read_lock is taken prior to this being called.
3705 * @f: QEMUFile where to send the data
3707 static int ram_load_precopy(QEMUFile *f)
3709 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3710 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3711 bool postcopy_advised = postcopy_is_advised();
3712 if (!migrate_use_compression()) {
3713 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3716 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3717 ram_addr_t addr, total_ram_bytes;
3718 void *host = NULL, *host_bak = NULL;
3719 uint8_t ch;
3722 * Yield periodically to let the main loop run, but an iteration of
3723 * the main loop is expensive, so only do it every so many iterations
3725 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3726 aio_co_schedule(qemu_get_current_aio_context(),
3727 qemu_coroutine_self());
3728 qemu_coroutine_yield();
3730 i++;
3732 addr = qemu_get_be64(f);
3733 flags = addr & ~TARGET_PAGE_MASK;
3734 addr &= TARGET_PAGE_MASK;
3736 if (flags & invalid_flags) {
3737 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3738 error_report("Received an unexpected compressed page");
3741 ret = -EINVAL;
3742 break;
3745 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3746 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3747 RAMBlock *block = ram_block_from_stream(f, flags);
3749 host = host_from_ram_block_offset(block, addr);
3751 * After going into the COLO stage, we should not load the page
3752 * into the SVM's memory directly; we put it into colo_cache first.
3753 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
3754 * Previously, we copied all this memory in the COLO preparation stage,
3755 * during which we needed to stop the VM, which is a time-consuming process.
3756 * Here we optimize it with a trick: back up every page while in the
3757 * migration process while COLO is enabled. Although this affects the
3758 * speed of the migration, it obviously reduces the downtime compared to
3759 * backing up all of the SVM's memory in the COLO preparation stage.
3761 if (migration_incoming_colo_enabled()) {
3762 if (migration_incoming_in_colo_state()) {
3763 /* In COLO stage, put all pages into cache temporarily */
3764 host = colo_cache_from_block_offset(block, addr, true);
3765 } else {
3767 * In migration stage but before COLO stage,
3768 * Put all pages into both cache and SVM's memory.
3770 host_bak = colo_cache_from_block_offset(block, addr, false);
3773 if (!host) {
3774 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3775 ret = -EINVAL;
3776 break;
3778 if (!migration_incoming_in_colo_state()) {
3779 ramblock_recv_bitmap_set(block, host);
3782 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3785 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3786 case RAM_SAVE_FLAG_MEM_SIZE:
3787 /* Synchronize RAM block list */
3788 total_ram_bytes = addr;
3789 while (!ret && total_ram_bytes) {
3790 RAMBlock *block;
3791 char id[256];
3792 ram_addr_t length;
3794 len = qemu_get_byte(f);
3795 qemu_get_buffer(f, (uint8_t *)id, len);
3796 id[len] = 0;
3797 length = qemu_get_be64(f);
3799 block = qemu_ram_block_by_name(id);
3800 if (block && !qemu_ram_is_migratable(block)) {
3801 error_report("block %s should not be migrated !", id);
3802 ret = -EINVAL;
3803 } else if (block) {
3804 if (length != block->used_length) {
3805 Error *local_err = NULL;
3807 ret = qemu_ram_resize(block, length,
3808 &local_err);
3809 if (local_err) {
3810 error_report_err(local_err);
3813 /* For postcopy we need to check hugepage sizes match */
3814 if (postcopy_advised && migrate_postcopy_ram() &&
3815 block->page_size != qemu_host_page_size) {
3816 uint64_t remote_page_size = qemu_get_be64(f);
3817 if (remote_page_size != block->page_size) {
3818 error_report("Mismatched RAM page size %s "
3819 "(local) %zd != %" PRId64,
3820 id, block->page_size,
3821 remote_page_size);
3822 ret = -EINVAL;
3825 if (migrate_ignore_shared()) {
3826 hwaddr addr = qemu_get_be64(f);
3827 if (ramblock_is_ignored(block) &&
3828 block->mr->addr != addr) {
3829 error_report("Mismatched GPAs for block %s "
3830 "%" PRId64 "!= %" PRId64,
3831 id, (uint64_t)addr,
3832 (uint64_t)block->mr->addr);
3833 ret = -EINVAL;
3836 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3837 block->idstr);
3838 } else {
3839 error_report("Unknown ramblock \"%s\", cannot "
3840 "accept migration", id);
3841 ret = -EINVAL;
3844 total_ram_bytes -= length;
3846 break;
3848 case RAM_SAVE_FLAG_ZERO:
3849 ch = qemu_get_byte(f);
3850 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3851 break;
3853 case RAM_SAVE_FLAG_PAGE:
3854 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3855 break;
3857 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3858 len = qemu_get_be32(f);
3859 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3860 error_report("Invalid compressed data length: %d", len);
3861 ret = -EINVAL;
3862 break;
3864 decompress_data_with_multi_threads(f, host, len);
3865 break;
3867 case RAM_SAVE_FLAG_XBZRLE:
3868 if (load_xbzrle(f, addr, host) < 0) {
3869 error_report("Failed to decompress XBZRLE page at "
3870 RAM_ADDR_FMT, addr);
3871 ret = -EINVAL;
3872 break;
3874 break;
3875 case RAM_SAVE_FLAG_EOS:
3876 /* normal exit */
3877 multifd_recv_sync_main();
3878 break;
3879 default:
3880 if (flags & RAM_SAVE_FLAG_HOOK) {
3881 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3882 } else {
3883 error_report("Unknown combination of migration flags: 0x%x",
3884 flags);
3885 ret = -EINVAL;
3888 if (!ret) {
3889 ret = qemu_file_get_error(f);
3891 if (!ret && host_bak) {
3892 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3896 ret |= wait_for_decompress_done();
3897 return ret;
3900 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3902 int ret = 0;
3903 static uint64_t seq_iter;
3905 * If the system is running in postcopy mode, page inserts into host memory
3906 * must be atomic
3908 bool postcopy_running = postcopy_is_running();
3910 seq_iter++;
3912 if (version_id != 4) {
3913 return -EINVAL;
3917 * This RCU critical section can be very long running.
3918 * When RCU reclaims in the code start to become numerous,
3919 * it will be necessary to reduce the granularity of this
3920 * critical section.
3922 WITH_RCU_READ_LOCK_GUARD() {
3923 if (postcopy_running) {
3924 ret = ram_load_postcopy(f);
3925 } else {
3926 ret = ram_load_precopy(f);
3929 trace_ram_load_complete(ret, seq_iter);
3931 return ret;
3934 static bool ram_has_postcopy(void *opaque)
3936 RAMBlock *rb;
3937 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3938 if (ramblock_is_pmem(rb)) {
3939 info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
3940 "is not supported now!", rb->idstr, rb->host);
3941 return false;
3945 return migrate_postcopy_ram();
3948 /* Sync all the dirty bitmap with destination VM. */
3949 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3951 RAMBlock *block;
3952 QEMUFile *file = s->to_dst_file;
3953 int ramblock_count = 0;
3955 trace_ram_dirty_bitmap_sync_start();
3957 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3958 qemu_savevm_send_recv_bitmap(file, block->idstr);
3959 trace_ram_dirty_bitmap_request(block->idstr);
3960 ramblock_count++;
3963 trace_ram_dirty_bitmap_sync_wait();
3965 /* Wait until all the ramblocks' dirty bitmaps are synced */
3966 while (ramblock_count--) {
3967 qemu_sem_wait(&s->rp_state.rp_sem);
3970 trace_ram_dirty_bitmap_sync_complete();
3972 return 0;
3975 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3977 qemu_sem_post(&s->rp_state.rp_sem);
3981 * Read the received bitmap and invert it to form the initial dirty bitmap.
3982 * This is only used when postcopy migration is paused but wants
3983 * to resume from a middle point.
3985 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3987 int ret = -EINVAL;
3988 QEMUFile *file = s->rp_state.from_dst_file;
3989 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3990 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3991 uint64_t size, end_mark;
3993 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3995 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3996 error_report("%s: incorrect state %s", __func__,
3997 MigrationStatus_str(s->state));
3998 return -EINVAL;
4002 * Note: see comments in ramblock_recv_bitmap_send() on why we
4003 * need the endianness conversion, and the paddings.
4005 local_size = ROUND_UP(local_size, 8);
4007 /* Add paddings */
4008 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4010 size = qemu_get_be64(file);
4012 /* The size of the bitmap should match our ramblock */
4013 if (size != local_size) {
4014 error_report("%s: ramblock '%s' bitmap size mismatch "
4015 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4016 block->idstr, size, local_size);
4017 ret = -EINVAL;
4018 goto out;
4021 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4022 end_mark = qemu_get_be64(file);
4024 ret = qemu_file_get_error(file);
4025 if (ret || size != local_size) {
4026 error_report("%s: read bitmap failed for ramblock '%s': %d"
4027 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4028 __func__, block->idstr, ret, local_size, size);
4029 ret = -EIO;
4030 goto out;
4033 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4034 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4035 __func__, block->idstr, end_mark);
4036 ret = -EINVAL;
4037 goto out;
4041 * Endianness conversion. We are during postcopy (though paused).
4042 * The dirty bitmap won't change. We can directly modify it.
4044 bitmap_from_le(block->bmap, le_bitmap, nbits);
4047 * What we received is the "received bitmap". Invert it to use as the
4048 * initial dirty bitmap for this ramblock.
4050 bitmap_complement(block->bmap, block->bmap, nbits);
4052 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4055 * We succeeded in syncing the bitmap for the current ramblock. If this is
4056 * the last one to sync, we need to notify the main send thread.
4058 ram_dirty_bitmap_reload_notify(s);
4060 ret = 0;
4061 out:
4062 g_free(le_bitmap);
4063 return ret;
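/*
 * Wire format parsed above (sketch):
 *   be64  size of the bitmap in bytes (must equal our local_size)
 *   bytes 'size' bytes of little-endian bitmap, padded to 8 bytes
 *   be64  end mark, must be RAMBLOCK_RECV_BITMAP_ENDING
 * The received bitmap is then complemented so that already-received pages
 * become clean and everything else is dirty again for the resumed postcopy.
 */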
4066 static int ram_resume_prepare(MigrationState *s, void *opaque)
4068 RAMState *rs = *(RAMState **)opaque;
4069 int ret;
4071 ret = ram_dirty_bitmap_sync_all(s, rs);
4072 if (ret) {
4073 return ret;
4076 ram_state_resume_prepare(rs, s->to_dst_file);
4078 return 0;
4081 static SaveVMHandlers savevm_ram_handlers = {
4082 .save_setup = ram_save_setup,
4083 .save_live_iterate = ram_save_iterate,
4084 .save_live_complete_postcopy = ram_save_complete,
4085 .save_live_complete_precopy = ram_save_complete,
4086 .has_postcopy = ram_has_postcopy,
4087 .save_live_pending = ram_save_pending,
4088 .load_state = ram_load,
4089 .save_cleanup = ram_save_cleanup,
4090 .load_setup = ram_load_setup,
4091 .load_cleanup = ram_load_cleanup,
4092 .resume_prepare = ram_resume_prepare,
4095 void ram_mig_init(void)
4097 qemu_mutex_init(&XBZRLE.lock);
4098 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);