migration/ram.c

   1 /*
   2  * QEMU System Emulator
   3  *
   4  * Copyright (c) 2003-2008 Fabrice Bellard
   5  * Copyright (c) 2011-2015 Red Hat Inc
   6  *
   7  * Authors:
   8  *  Juan Quintela <quintela@redhat.com>
   9  *
  10  * Permission is hereby granted, free of charge, to any person obtaining a copy
  11  * of this software and associated documentation files (the "Software"), to deal
  12  * in the Software without restriction, including without limitation the rights
  13  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  14  * copies of the Software, and to permit persons to whom the Software is
  15  * furnished to do so, subject to the following conditions:
  16  *
  17  * The above copyright notice and this permission notice shall be included in
  18  * all copies or substantial portions of the Software.
  19  *
  20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  21  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  23  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  24  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  25  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  26  * THE SOFTWARE.
  27  */
  28
  29 #include "qemu/osdep.h"
  30 #include "cpu.h"
  31 #include <zlib.h>
  32 #include "qemu/cutils.h"
  33 #include "qemu/bitops.h"
  34 #include "qemu/bitmap.h"
  35 #include "qemu/main-loop.h"
  36 #include "xbzrle.h"
  37 #include "ram.h"
  38 #include "migration.h"
  39 #include "migration/register.h"
  40 #include "migration/misc.h"
  41 #include "qemu-file.h"
  42 #include "postcopy-ram.h"
  43 #include "migration/page_cache.h"
  44 #include "qemu/error-report.h"
  45 #include "qapi/error.h"
  46 #include "qapi/qapi-events-migration.h"
  47 #include "qapi/qmp/qerror.h"
  48 #include "trace.h"
  49 #include "exec/ram_addr.h"
  50 #include "exec/target_page.h"
  51 #include "qemu/rcu_queue.h"
  52 #include "migration/colo.h"
  53 #include "migration/block.h"
  54
  55 /***********************************************************/
  56 /* ram save/restore */
  57
  58 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
  59  * worked for pages that where filled with the same char.  We switched
  60  * it to only search for the zero value.  And to avoid confusion with
  61  * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
  62  */
  63
  64 #define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
  65 #define RAM_SAVE_FLAG_ZERO     0x02
  66 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
  67 #define RAM_SAVE_FLAG_PAGE     0x08
  68 #define RAM_SAVE_FLAG_EOS      0x10
  69 #define RAM_SAVE_FLAG_CONTINUE 0x20
  70 #define RAM_SAVE_FLAG_XBZRLE   0x40
  71 /* 0x80 is reserved in migration.h start with 0x100 next */
  72 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
  73
  74 static inline bool is_zero_range(uint8_t *p, uint64_t size)
  75 {
  76     return buffer_is_zero(p, size);
  77 }
  78
  79 XBZRLECacheStats xbzrle_counters;
  80
  81 /* struct contains XBZRLE cache and a static page
  82    used by the compression */
  83 static struct {
  84     /* buffer used for XBZRLE encoding */
  85     uint8_t *encoded_buf;
  86     /* buffer for storing page content */
  87     uint8_t *current_buf;
  88     /* Cache for XBZRLE, Protected by lock. */
  89     PageCache *cache;
  90     QemuMutex lock;
  91     /* it will store a page full of zeros */
  92     uint8_t *zero_target_page;
  93     /* buffer used for XBZRLE decoding */
  94     uint8_t *decoded_buf;
  95 } XBZRLE;
  96
  97 static void XBZRLE_cache_lock(void)
  98 {
  99     if (migrate_use_xbzrle())
 100         qemu_mutex_lock(&XBZRLE.lock);
 101 }
 102
 103 static void XBZRLE_cache_unlock(void)
 104 {
 105     if (migrate_use_xbzrle())
 106         qemu_mutex_unlock(&XBZRLE.lock);
 107 }
 108
 109 /**
 110  * xbzrle_cache_resize: resize the xbzrle cache
 111  *
 112  * This function is called from qmp_migrate_set_cache_size in main
 113  * thread, possibly while a migration is in progress.  A running
 114  * migration may be using the cache and might finish during this call,
 115  * hence changes to the cache are protected by XBZRLE.lock().
 116  *
 117  * Returns 0 for success or -1 for error
 118  *
 119  * @new_size: new cache size
 120  * @errp: set *errp if the check failed, with reason
 121  */
 122 int xbzrle_cache_resize(int64_t new_size, Error **errp)
 123 {
 124     PageCache *new_cache;
 125     int64_t ret = 0;
 126
 127     /* Check for truncation */
 128     if (new_size != (size_t)new_size) {
 129         error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
 130                    "exceeding address space");
 131         return -1;
 132     }
 133
 134     if (new_size == migrate_xbzrle_cache_size()) {
 135         /* nothing to do */
 136         return 0;
 137     }
 138
 139     XBZRLE_cache_lock();
 140
 141     if (XBZRLE.cache != NULL) {
 142         new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
 143         if (!new_cache) {
 144             ret = -1;
 145             goto out;
 146         }
 147
 148         cache_fini(XBZRLE.cache);
 149         XBZRLE.cache = new_cache;
 150     }
 151 out:
 152     XBZRLE_cache_unlock();
 153     return ret;
 154 }
 155
 156 static void ramblock_recv_map_init(void)
 157 {
 158     RAMBlock *rb;
 159
 160     RAMBLOCK_FOREACH(rb) {
 161         assert(!rb->receivedmap);
 162         rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
 163     }
 164 }
 165
 166 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
 167 {
 168     return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
 169                     rb->receivedmap);
 170 }
 171
 172 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
 173 {
 174     return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
 175 }
 176
 177 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
 178 {
 179     set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
 180 }
 181
 182 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
 183                                     size_t nr)
 184 {
 185     bitmap_set_atomic(rb->receivedmap,
 186                       ramblock_recv_bitmap_offset(host_addr, rb),
 187                       nr);
 188 }
 189
 190 /*
 191  * An outstanding page request, on the source, having been received
 192  * and queued
 193  */
 194 struct RAMSrcPageRequest {
 195     RAMBlock *rb;
 196     hwaddr    offset;
 197     hwaddr    len;
 198
 199     QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
 200 };
 201
 202 /* State of RAM for migration */
 203 struct RAMState {
 204     /* QEMUFile used for this migration */
 205     QEMUFile *f;
 206     /* Last block that we have visited searching for dirty pages */
 207     RAMBlock *last_seen_block;
 208     /* Last block from where we have sent data */
 209     RAMBlock *last_sent_block;
 210     /* Last dirty target page we have sent */
 211     ram_addr_t last_page;
 212     /* last ram version we have seen */
 213     uint32_t last_version;
 214     /* We are in the first round */
 215     bool ram_bulk_stage;
 216     /* How many times we have dirty too many pages */
 217     int dirty_rate_high_cnt;
 218     /* these variables are used for bitmap sync */
 219     /* last time we did a full bitmap_sync */
 220     int64_t time_last_bitmap_sync;
 221     /* bytes transferred at start_time */
 222     uint64_t bytes_xfer_prev;
 223     /* number of dirty pages since start_time */
 224     uint64_t num_dirty_pages_period;
 225     /* xbzrle misses since the beginning of the period */
 226     uint64_t xbzrle_cache_miss_prev;
 227     /* number of iterations at the beginning of period */
 228     uint64_t iterations_prev;
 229     /* Iterations since start */
 230     uint64_t iterations;
 231     /* number of dirty bits in the bitmap */
 232     uint64_t migration_dirty_pages;
 233     /* protects modification of the bitmap */
 234     QemuMutex bitmap_mutex;
 235     /* The RAMBlock used in the last src_page_requests */
 236     RAMBlock *last_req_rb;
 237     /* Queue of outstanding page requests from the destination */
 238     QemuMutex src_page_req_mutex;
 239     QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
 240 };
 241 typedef struct RAMState RAMState;
 242
 243 static RAMState *ram_state;
 244
 245 uint64_t ram_bytes_remaining(void)
 246 {
 247     return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
 248                        0;
 249 }
 250
 251 MigrationStats ram_counters;
 252
 253 /* used by the search for pages to send */
 254 struct PageSearchStatus {
 255     /* Current block being searched */
 256     RAMBlock    *block;
 257     /* Current page to search from */
 258     unsigned long page;
 259     /* Set once we wrap around */
 260     bool         complete_round;
 261 };
 262 typedef struct PageSearchStatus PageSearchStatus;
 263
 264 struct CompressParam {
 265     bool done;
 266     bool quit;
 267     QEMUFile *file;
 268     QemuMutex mutex;
 269     QemuCond cond;
 270     RAMBlock *block;
 271     ram_addr_t offset;
 272
 273     /* internally used fields */
 274     z_stream stream;
 275     uint8_t *originbuf;
 276 };
 277 typedef struct CompressParam CompressParam;
 278
 279 struct DecompressParam {
 280     bool done;
 281     bool quit;
 282     QemuMutex mutex;
 283     QemuCond cond;
 284     void *des;
 285     uint8_t *compbuf;
 286     int len;
 287     z_stream stream;
 288 };
 289 typedef struct DecompressParam DecompressParam;
 290
 291 static CompressParam *comp_param;
 292 static QemuThread *compress_threads;
 293 /* comp_done_cond is used to wake up the migration thread when
 294  * one of the compression threads has finished the compression.
 295  * comp_done_lock is used to co-work with comp_done_cond.
 296  */
 297 static QemuMutex comp_done_lock;
 298 static QemuCond comp_done_cond;
 299 /* The empty QEMUFileOps will be used by file in CompressParam */
 300 static const QEMUFileOps empty_ops = { };
 301
 302 static QEMUFile *decomp_file;
 303 static DecompressParam *decomp_param;
 304 static QemuThread *decompress_threads;
 305 static QemuMutex decomp_done_lock;
 306 static QemuCond decomp_done_cond;
 307
 308 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
 309                                 ram_addr_t offset, uint8_t *source_buf);
 310
 311 static void *do_data_compress(void *opaque)
 312 {
 313     CompressParam *param = opaque;
 314     RAMBlock *block;
 315     ram_addr_t offset;
 316
 317     qemu_mutex_lock(&param->mutex);
 318     while (!param->quit) {
 319         if (param->block) {
 320             block = param->block;
 321             offset = param->offset;
 322             param->block = NULL;
 323             qemu_mutex_unlock(&param->mutex);
 324
 325             do_compress_ram_page(param->file, &param->stream, block, offset,
 326                                  param->originbuf);
 327
 328             qemu_mutex_lock(&comp_done_lock);
 329             param->done = true;
 330             qemu_cond_signal(&comp_done_cond);
 331             qemu_mutex_unlock(&comp_done_lock);
 332
 333             qemu_mutex_lock(&param->mutex);
 334         } else {
 335             qemu_cond_wait(&param->cond, &param->mutex);
 336         }
 337     }
 338     qemu_mutex_unlock(&param->mutex);
 339
 340     return NULL;
 341 }
 342
 343 static inline void terminate_compression_threads(void)
 344 {
 345     int idx, thread_count;
 346
 347     thread_count = migrate_compress_threads();
 348
 349     for (idx = 0; idx < thread_count; idx++) {
 350         qemu_mutex_lock(&comp_param[idx].mutex);
 351         comp_param[idx].quit = true;
 352         qemu_cond_signal(&comp_param[idx].cond);
 353         qemu_mutex_unlock(&comp_param[idx].mutex);
 354     }
 355 }
 356
 357 static void compress_threads_save_cleanup(void)
 358 {
 359     int i, thread_count;
 360
 361     if (!migrate_use_compression()) {
 362         return;
 363     }
 364     terminate_compression_threads();
 365     thread_count = migrate_compress_threads();
 366     for (i = 0; i < thread_count; i++) {
 367         /*
 368          * we use it as a indicator which shows if the thread is
 369          * properly init'd or not
 370          */
 371         if (!comp_param[i].file) {
 372             break;
 373         }
 374         qemu_thread_join(compress_threads + i);
 375         qemu_mutex_destroy(&comp_param[i].mutex);
 376         qemu_cond_destroy(&comp_param[i].cond);
 377         deflateEnd(&comp_param[i].stream);
 378         g_free(comp_param[i].originbuf);
 379         qemu_fclose(comp_param[i].file);
 380         comp_param[i].file = NULL;
 381     }
 382     qemu_mutex_destroy(&comp_done_lock);
 383     qemu_cond_destroy(&comp_done_cond);
 384     g_free(compress_threads);
 385     g_free(comp_param);
 386     compress_threads = NULL;
 387     comp_param = NULL;
 388 }
 389
 390 static int compress_threads_save_setup(void)
 391 {
 392     int i, thread_count;
 393
 394     if (!migrate_use_compression()) {
 395         return 0;
 396     }
 397     thread_count = migrate_compress_threads();
 398     compress_threads = g_new0(QemuThread, thread_count);
 399     comp_param = g_new0(CompressParam, thread_count);
 400     qemu_cond_init(&comp_done_cond);
 401     qemu_mutex_init(&comp_done_lock);
 402     for (i = 0; i < thread_count; i++) {
 403         comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
 404         if (!comp_param[i].originbuf) {
 405             goto exit;
 406         }
 407
 408         if (deflateInit(&comp_param[i].stream,
 409                         migrate_compress_level()) != Z_OK) {
 410             g_free(comp_param[i].originbuf);
 411             goto exit;
 412         }
 413
 414         /* comp_param[i].file is just used as a dummy buffer to save data,
 415          * set its ops to empty.
 416          */
 417         comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
 418         comp_param[i].done = true;
 419         comp_param[i].quit = false;
 420         qemu_mutex_init(&comp_param[i].mutex);
 421         qemu_cond_init(&comp_param[i].cond);
 422         qemu_thread_create(compress_threads + i, "compress",
 423                            do_data_compress, comp_param + i,
 424                            QEMU_THREAD_JOINABLE);
 425     }
 426     return 0;
 427
 428 exit:
 429     compress_threads_save_cleanup();
 430     return -1;
 431 }
 432
 433 /* Multiple fd's */
 434
 435 struct MultiFDSendParams {
 436     uint8_t id;
 437     char *name;
 438     QemuThread thread;
 439     QemuSemaphore sem;
 440     QemuMutex mutex;
 441     bool quit;
 442 };
 443 typedef struct MultiFDSendParams MultiFDSendParams;
 444
 445 struct {
 446     MultiFDSendParams *params;
 447     /* number of created threads */
 448     int count;
 449 } *multifd_send_state;
 450
 451 static void terminate_multifd_send_threads(Error *errp)
 452 {
 453     int i;
 454
 455     for (i = 0; i < multifd_send_state->count; i++) {
 456         MultiFDSendParams *p = &multifd_send_state->params[i];
 457
 458         qemu_mutex_lock(&p->mutex);
 459         p->quit = true;
 460         qemu_sem_post(&p->sem);
 461         qemu_mutex_unlock(&p->mutex);
 462     }
 463 }
 464
 465 int multifd_save_cleanup(Error **errp)
 466 {
 467     int i;
 468     int ret = 0;
 469
 470     if (!migrate_use_multifd()) {
 471         return 0;
 472     }
 473     terminate_multifd_send_threads(NULL);
 474     for (i = 0; i < multifd_send_state->count; i++) {
 475         MultiFDSendParams *p = &multifd_send_state->params[i];
 476
 477         qemu_thread_join(&p->thread);
 478         qemu_mutex_destroy(&p->mutex);
 479         qemu_sem_destroy(&p->sem);
 480         g_free(p->name);
 481         p->name = NULL;
 482     }
 483     g_free(multifd_send_state->params);
 484     multifd_send_state->params = NULL;
 485     g_free(multifd_send_state);
 486     multifd_send_state = NULL;
 487     return ret;
 488 }
 489
 490 static void *multifd_send_thread(void *opaque)
 491 {
 492     MultiFDSendParams *p = opaque;
 493
 494     while (true) {
 495         qemu_mutex_lock(&p->mutex);
 496         if (p->quit) {
 497             qemu_mutex_unlock(&p->mutex);
 498             break;
 499         }
 500         qemu_mutex_unlock(&p->mutex);
 501         qemu_sem_wait(&p->sem);
 502     }
 503
 504     return NULL;
 505 }
 506
 507 int multifd_save_setup(void)
 508 {
 509     int thread_count;
 510     uint8_t i;
 511
 512     if (!migrate_use_multifd()) {
 513         return 0;
 514     }
 515     thread_count = migrate_multifd_channels();
 516     multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
 517     multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
 518     multifd_send_state->count = 0;
 519     for (i = 0; i < thread_count; i++) {
 520         MultiFDSendParams *p = &multifd_send_state->params[i];
 521
 522         qemu_mutex_init(&p->mutex);
 523         qemu_sem_init(&p->sem, 0);
 524         p->quit = false;
 525         p->id = i;
 526         p->name = g_strdup_printf("multifdsend_%d", i);
 527         qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
 528                            QEMU_THREAD_JOINABLE);
 529
 530         multifd_send_state->count++;
 531     }
 532     return 0;
 533 }
 534
 535 struct MultiFDRecvParams {
 536     uint8_t id;
 537     char *name;
 538     QemuThread thread;
 539     QemuSemaphore sem;
 540     QemuMutex mutex;
 541     bool quit;
 542 };
 543 typedef struct MultiFDRecvParams MultiFDRecvParams;
 544
 545 struct {
 546     MultiFDRecvParams *params;
 547     /* number of created threads */
 548     int count;
 549 } *multifd_recv_state;
 550
 551 static void terminate_multifd_recv_threads(Error *errp)
 552 {
 553     int i;
 554
 555     for (i = 0; i < multifd_recv_state->count; i++) {
 556         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 557
 558         qemu_mutex_lock(&p->mutex);
 559         p->quit = true;
 560         qemu_sem_post(&p->sem);
 561         qemu_mutex_unlock(&p->mutex);
 562     }
 563 }
 564
 565 int multifd_load_cleanup(Error **errp)
 566 {
 567     int i;
 568     int ret = 0;
 569
 570     if (!migrate_use_multifd()) {
 571         return 0;
 572     }
 573     terminate_multifd_recv_threads(NULL);
 574     for (i = 0; i < multifd_recv_state->count; i++) {
 575         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 576
 577         qemu_thread_join(&p->thread);
 578         qemu_mutex_destroy(&p->mutex);
 579         qemu_sem_destroy(&p->sem);
 580         g_free(p->name);
 581         p->name = NULL;
 582     }
 583     g_free(multifd_recv_state->params);
 584     multifd_recv_state->params = NULL;
 585     g_free(multifd_recv_state);
 586     multifd_recv_state = NULL;
 587
 588     return ret;
 589 }
 590
 591 static void *multifd_recv_thread(void *opaque)
 592 {
 593     MultiFDRecvParams *p = opaque;
 594
 595     while (true) {
 596         qemu_mutex_lock(&p->mutex);
 597         if (p->quit) {
 598             qemu_mutex_unlock(&p->mutex);
 599             break;
 600         }
 601         qemu_mutex_unlock(&p->mutex);
 602         qemu_sem_wait(&p->sem);
 603     }
 604
 605     return NULL;
 606 }
 607
 608 int multifd_load_setup(void)
 609 {
 610     int thread_count;
 611     uint8_t i;
 612
 613     if (!migrate_use_multifd()) {
 614         return 0;
 615     }
 616     thread_count = migrate_multifd_channels();
 617     multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
 618     multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
 619     multifd_recv_state->count = 0;
 620     for (i = 0; i < thread_count; i++) {
 621         MultiFDRecvParams *p = &multifd_recv_state->params[i];
 622
 623         qemu_mutex_init(&p->mutex);
 624         qemu_sem_init(&p->sem, 0);
 625         p->quit = false;
 626         p->id = i;
 627         p->name = g_strdup_printf("multifdrecv_%d", i);
 628         qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
 629                            QEMU_THREAD_JOINABLE);
 630         multifd_recv_state->count++;
 631     }
 632     return 0;
 633 }
 634
 635 /**
 636  * save_page_header: write page header to wire
 637  *
 638  * If this is the 1st block, it also writes the block identification
 639  *
 640  * Returns the number of bytes written
 641  *
 642  * @f: QEMUFile where to send the data
 643  * @block: block that contains the page we want to send
 644  * @offset: offset inside the block for the page
 645  *          in the lower bits, it contains flags
 646  */
 647 static size_t save_page_header(RAMState *rs, QEMUFile *f,  RAMBlock *block,
 648                                ram_addr_t offset)
 649 {
 650     size_t size, len;
 651
 652     if (block == rs->last_sent_block) {
 653         offset |= RAM_SAVE_FLAG_CONTINUE;
 654     }
 655     qemu_put_be64(f, offset);
 656     size = 8;
 657
 658     if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
 659         len = strlen(block->idstr);
 660         qemu_put_byte(f, len);
 661         qemu_put_buffer(f, (uint8_t *)block->idstr, len);
 662         size += 1 + len;
 663         rs->last_sent_block = block;
 664     }
 665     return size;
 666 }
 667
 668 /**
 669  * mig_throttle_guest_down: throotle down the guest
 670  *
 671  * Reduce amount of guest cpu execution to hopefully slow down memory
 672  * writes. If guest dirty memory rate is reduced below the rate at
 673  * which we can transfer pages to the destination then we should be
 674  * able to complete migration. Some workloads dirty memory way too
 675  * fast and will not effectively converge, even with auto-converge.
 676  */
 677 static void mig_throttle_guest_down(void)
 678 {
 679     MigrationState *s = migrate_get_current();
 680     uint64_t pct_initial = s->parameters.cpu_throttle_initial;
 681     uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
 682
 683     /* We have not started throttling yet. Let's start it. */
 684     if (!cpu_throttle_active()) {
 685         cpu_throttle_set(pct_initial);
 686     } else {
 687         /* Throttling already on, just increase the rate */
 688         cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
 689     }
 690 }
 691
 692 /**
 693  * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 694  *
 695  * @rs: current RAM state
 696  * @current_addr: address for the zero page
 697  *
 698  * Update the xbzrle cache to reflect a page that's been sent as all 0.
 699  * The important thing is that a stale (not-yet-0'd) page be replaced
 700  * by the new data.
 701  * As a bonus, if the page wasn't in the cache it gets added so that
 702  * when a small write is made into the 0'd page it gets XBZRLE sent.
 703  */
 704 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
 705 {
 706     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
 707         return;
 708     }
 709
 710     /* We don't care if this fails to allocate a new cache page
 711      * as long as it updated an old one */
 712     cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
 713                  ram_counters.dirty_sync_count);
 714 }
 715
 716 #define ENCODING_FLAG_XBZRLE 0x1
 717
 718 /**
 719  * save_xbzrle_page: compress and send current page
 720  *
 721  * Returns: 1 means that we wrote the page
 722  *          0 means that page is identical to the one already sent
 723  *          -1 means that xbzrle would be longer than normal
 724  *
 725  * @rs: current RAM state
 726  * @current_data: pointer to the address of the page contents
 727  * @current_addr: addr of the page
 728  * @block: block that contains the page we want to send
 729  * @offset: offset inside the block for the page
 730  * @last_stage: if we are at the completion stage
 731  */
 732 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
 733                             ram_addr_t current_addr, RAMBlock *block,
 734                             ram_addr_t offset, bool last_stage)
 735 {
 736     int encoded_len = 0, bytes_xbzrle;
 737     uint8_t *prev_cached_page;
 738
 739     if (!cache_is_cached(XBZRLE.cache, current_addr,
 740                          ram_counters.dirty_sync_count)) {
 741         xbzrle_counters.cache_miss++;
 742         if (!last_stage) {
 743             if (cache_insert(XBZRLE.cache, current_addr, *current_data,
 744                              ram_counters.dirty_sync_count) == -1) {
 745                 return -1;
 746             } else {
 747                 /* update *current_data when the page has been
 748                    inserted into cache */
 749                 *current_data = get_cached_data(XBZRLE.cache, current_addr);
 750             }
 751         }
 752         return -1;
 753     }
 754
 755     prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
 756
 757     /* save current buffer into memory */
 758     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 759
 760     /* XBZRLE encoding (if there is no overflow) */
 761     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
 762                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
 763                                        TARGET_PAGE_SIZE);
 764     if (encoded_len == 0) {
 765         trace_save_xbzrle_page_skipping();
 766         return 0;
 767     } else if (encoded_len == -1) {
 768         trace_save_xbzrle_page_overflow();
 769         xbzrle_counters.overflow++;
 770         /* update data in the cache */
 771         if (!last_stage) {
 772             memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
 773             *current_data = prev_cached_page;
 774         }
 775         return -1;
 776     }
 777
 778     /* we need to update the data in the cache, in order to get the same data */
 779     if (!last_stage) {
 780         memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
 781     }
 782
 783     /* Send XBZRLE based compressed page */
 784     bytes_xbzrle = save_page_header(rs, rs->f, block,
 785                                     offset | RAM_SAVE_FLAG_XBZRLE);
 786     qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
 787     qemu_put_be16(rs->f, encoded_len);
 788     qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
 789     bytes_xbzrle += encoded_len + 1 + 2;
 790     xbzrle_counters.pages++;
 791     xbzrle_counters.bytes += bytes_xbzrle;
 792     ram_counters.transferred += bytes_xbzrle;
 793
 794     return 1;
 795 }
 796
 797 /**
 798  * migration_bitmap_find_dirty: find the next dirty page from start
 799  *
 800  * Called with rcu_read_lock() to protect migration_bitmap
 801  *
 802  * Returns the byte offset within memory region of the start of a dirty page
 803  *
 804  * @rs: current RAM state
 805  * @rb: RAMBlock where to search for dirty pages
 806  * @start: page where we start the search
 807  */
 808 static inline
 809 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
 810                                           unsigned long start)
 811 {
 812     unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
 813     unsigned long *bitmap = rb->bmap;
 814     unsigned long next;
 815
 816     if (rs->ram_bulk_stage && start > 0) {
 817         next = start + 1;
 818     } else {
 819         next = find_next_bit(bitmap, size, start);
 820     }
 821
 822     return next;
 823 }
 824
 825 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
 826                                                 RAMBlock *rb,
 827                                                 unsigned long page)
 828 {
 829     bool ret;
 830
 831     ret = test_and_clear_bit(page, rb->bmap);
 832
 833     if (ret) {
 834         rs->migration_dirty_pages--;
 835     }
 836     return ret;
 837 }
 838
 839 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
 840                                         ram_addr_t start, ram_addr_t length)
 841 {
 842     rs->migration_dirty_pages +=
 843         cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
 844                                               &rs->num_dirty_pages_period);
 845 }
 846
 847 /**
 848  * ram_pagesize_summary: calculate all the pagesizes of a VM
 849  *
 850  * Returns a summary bitmap of the page sizes of all RAMBlocks
 851  *
 852  * For VMs with just normal pages this is equivalent to the host page
 853  * size. If it's got some huge pages then it's the OR of all the
 854  * different page sizes.
 855  */
 856 uint64_t ram_pagesize_summary(void)
 857 {
 858     RAMBlock *block;
 859     uint64_t summary = 0;
 860
 861     RAMBLOCK_FOREACH(block) {
 862         summary |= block->page_size;
 863     }
 864
 865     return summary;
 866 }
 867
 868 static void migration_bitmap_sync(RAMState *rs)
 869 {
 870     RAMBlock *block;
 871     int64_t end_time;
 872     uint64_t bytes_xfer_now;
 873
 874     ram_counters.dirty_sync_count++;
 875
 876     if (!rs->time_last_bitmap_sync) {
 877         rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 878     }
 879
 880     trace_migration_bitmap_sync_start();
 881     memory_global_dirty_log_sync();
 882
 883     qemu_mutex_lock(&rs->bitmap_mutex);
 884     rcu_read_lock();
 885     RAMBLOCK_FOREACH(block) {
 886         migration_bitmap_sync_range(rs, block, 0, block->used_length);
 887     }
 888     rcu_read_unlock();
 889     qemu_mutex_unlock(&rs->bitmap_mutex);
 890
 891     trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
 892
 893     end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 894
 895     /* more than 1 second = 1000 millisecons */
 896     if (end_time > rs->time_last_bitmap_sync + 1000) {
 897         /* calculate period counters */
 898         ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
 899             / (end_time - rs->time_last_bitmap_sync);
 900         bytes_xfer_now = ram_counters.transferred;
 901
 902         /* During block migration the auto-converge logic incorrectly detects
 903          * that ram migration makes no progress. Avoid this by disabling the
 904          * throttling logic during the bulk phase of block migration. */
 905         if (migrate_auto_converge() && !blk_mig_bulk_active()) {
 906             /* The following detection logic can be refined later. For now:
 907                Check to see if the dirtied bytes is 50% more than the approx.
 908                amount of bytes that just got transferred since the last time we
 909                were in this routine. If that happens twice, start or increase
 910                throttling */
 911
 912             if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
 913                    (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
 914                 (++rs->dirty_rate_high_cnt >= 2)) {
 915                     trace_migration_throttle();
 916                     rs->dirty_rate_high_cnt = 0;
 917                     mig_throttle_guest_down();
 918             }
 919         }
 920
 921         if (migrate_use_xbzrle()) {
 922             if (rs->iterations_prev != rs->iterations) {
 923                 xbzrle_counters.cache_miss_rate =
 924                    (double)(xbzrle_counters.cache_miss -
 925                             rs->xbzrle_cache_miss_prev) /
 926                    (rs->iterations - rs->iterations_prev);
 927             }
 928             rs->iterations_prev = rs->iterations;
 929             rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
 930         }
 931
 932         /* reset period counters */
 933         rs->time_last_bitmap_sync = end_time;
 934         rs->num_dirty_pages_period = 0;
 935         rs->bytes_xfer_prev = bytes_xfer_now;
 936     }
 937     if (migrate_use_events()) {
 938         qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
 939     }
 940 }
 941
 942 /**
 943  * save_zero_page: send the zero page to the stream
 944  *
 945  * Returns the number of pages written.
 946  *
 947  * @rs: current RAM state
 948  * @block: block that contains the page we want to send
 949  * @offset: offset inside the block for the page
 950  */
 951 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
 952 {
 953     uint8_t *p = block->host + offset;
 954     int pages = -1;
 955
 956     if (is_zero_range(p, TARGET_PAGE_SIZE)) {
 957         ram_counters.duplicate++;
 958         ram_counters.transferred +=
 959             save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
 960         qemu_put_byte(rs->f, 0);
 961         ram_counters.transferred += 1;
 962         pages = 1;
 963     }
 964
 965     return pages;
 966 }
 967
 968 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
 969 {
 970     if (!migrate_release_ram() || !migration_in_postcopy()) {
 971         return;
 972     }
 973
 974     ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
 975 }
 976
 977 /*
 978  * @pages: the number of pages written by the control path,
 979  *        < 0 - error
 980  *        > 0 - number of pages written
 981  *
 982  * Return true if the pages has been saved, otherwise false is returned.
 983  */
 984 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
 985                               int *pages)
 986 {
 987     uint64_t bytes_xmit = 0;
 988     int ret;
 989
 990     *pages = -1;
 991     ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
 992                                 &bytes_xmit);
 993     if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
 994         return false;
 995     }
 996
 997     if (bytes_xmit) {
 998         ram_counters.transferred += bytes_xmit;
 999         *pages = 1;
1000     }
1001
1002     if (ret == RAM_SAVE_CONTROL_DELAYED) {
1003         return true;
1004     }
1005
1006     if (bytes_xmit > 0) {
1007         ram_counters.normal++;
1008     } else if (bytes_xmit == 0) {
1009         ram_counters.duplicate++;
1010     }
1011
1012     return true;
1013 }
1014
1015 /*
1016  * directly send the page to the stream
1017  *
1018  * Returns the number of pages written.
1019  *
1020  * @rs: current RAM state
1021  * @block: block that contains the page we want to send
1022  * @offset: offset inside the block for the page
1023  * @buf: the page to be sent
1024  * @async: send to page asyncly
1025  */
1026 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1027                             uint8_t *buf, bool async)
1028 {
1029     ram_counters.transferred += save_page_header(rs, rs->f, block,
1030                                                  offset | RAM_SAVE_FLAG_PAGE);
1031     if (async) {
1032         qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1033                               migrate_release_ram() &
1034                               migration_in_postcopy());
1035     } else {
1036         qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1037     }
1038     ram_counters.transferred += TARGET_PAGE_SIZE;
1039     ram_counters.normal++;
1040     return 1;
1041 }
1042
1043 /**
1044  * ram_save_page: send the given page to the stream
1045  *
1046  * Returns the number of pages written.
1047  *          < 0 - error
1048  *          >=0 - Number of pages written - this might legally be 0
1049  *                if xbzrle noticed the page was the same.
1050  *
1051  * @rs: current RAM state
1052  * @block: block that contains the page we want to send
1053  * @offset: offset inside the block for the page
1054  * @last_stage: if we are at the completion stage
1055  */
1056 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1057 {
1058     int pages = -1;
1059     uint8_t *p;
1060     bool send_async = true;
1061     RAMBlock *block = pss->block;
1062     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1063     ram_addr_t current_addr = block->offset + offset;
1064
1065     p = block->host + offset;
1066     trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1067
1068     XBZRLE_cache_lock();
1069     if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1070         migrate_use_xbzrle()) {
1071         pages = save_xbzrle_page(rs, &p, current_addr, block,
1072                                  offset, last_stage);
1073         if (!last_stage) {
1074             /* Can't send this cached data async, since the cache page
1075              * might get updated before it gets to the wire
1076              */
1077             send_async = false;
1078         }
1079     }
1080
1081     /* XBZRLE overflow or normal page */
1082     if (pages == -1) {
1083         pages = save_normal_page(rs, block, offset, p, send_async);
1084     }
1085
1086     XBZRLE_cache_unlock();
1087
1088     return pages;
1089 }
1090
1091 static int do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1092                                 ram_addr_t offset, uint8_t *source_buf)
1093 {
1094     RAMState *rs = ram_state;
1095     int bytes_sent, blen;
1096     uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1097
1098     bytes_sent = save_page_header(rs, f, block, offset |
1099                                   RAM_SAVE_FLAG_COMPRESS_PAGE);
1100
1101     /*
1102      * copy it to a internal buffer to avoid it being modified by VM
1103      * so that we can catch up the error during compression and
1104      * decompression
1105      */
1106     memcpy(source_buf, p, TARGET_PAGE_SIZE);
1107     blen = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1108     if (blen < 0) {
1109         bytes_sent = 0;
1110         qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1111         error_report("compressed data failed!");
1112     } else {
1113         bytes_sent += blen;
1114         ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1115     }
1116
1117     return bytes_sent;
1118 }
1119
1120 static void flush_compressed_data(RAMState *rs)
1121 {
1122     int idx, len, thread_count;
1123
1124     if (!migrate_use_compression()) {
1125         return;
1126     }
1127     thread_count = migrate_compress_threads();
1128
1129     qemu_mutex_lock(&comp_done_lock);
1130     for (idx = 0; idx < thread_count; idx++) {
1131         while (!comp_param[idx].done) {
1132             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1133         }
1134     }
1135     qemu_mutex_unlock(&comp_done_lock);
1136
1137     for (idx = 0; idx < thread_count; idx++) {
1138         qemu_mutex_lock(&comp_param[idx].mutex);
1139         if (!comp_param[idx].quit) {
1140             len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1141             ram_counters.transferred += len;
1142         }
1143         qemu_mutex_unlock(&comp_param[idx].mutex);
1144     }
1145 }
1146
1147 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1148                                        ram_addr_t offset)
1149 {
1150     param->block = block;
1151     param->offset = offset;
1152 }
1153
1154 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1155                                            ram_addr_t offset)
1156 {
1157     int idx, thread_count, bytes_xmit = -1, pages = -1;
1158
1159     thread_count = migrate_compress_threads();
1160     qemu_mutex_lock(&comp_done_lock);
1161     while (true) {
1162         for (idx = 0; idx < thread_count; idx++) {
1163             if (comp_param[idx].done) {
1164                 comp_param[idx].done = false;
1165                 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1166                 qemu_mutex_lock(&comp_param[idx].mutex);
1167                 set_compress_params(&comp_param[idx], block, offset);
1168                 qemu_cond_signal(&comp_param[idx].cond);
1169                 qemu_mutex_unlock(&comp_param[idx].mutex);
1170                 pages = 1;
1171                 ram_counters.normal++;
1172                 ram_counters.transferred += bytes_xmit;
1173                 break;
1174             }
1175         }
1176         if (pages > 0) {
1177             break;
1178         } else {
1179             qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1180         }
1181     }
1182     qemu_mutex_unlock(&comp_done_lock);
1183
1184     return pages;
1185 }
1186
1187 /**
1188  * find_dirty_block: find the next dirty page and update any state
1189  * associated with the search process.
1190  *
1191  * Returns if a page is found
1192  *
1193  * @rs: current RAM state
1194  * @pss: data about the state of the current dirty page scan
1195  * @again: set to false if the search has scanned the whole of RAM
1196  */
1197 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1198 {
1199     pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1200     if (pss->complete_round && pss->block == rs->last_seen_block &&
1201         pss->page >= rs->last_page) {
1202         /*
1203          * We've been once around the RAM and haven't found anything.
1204          * Give up.
1205          */
1206         *again = false;
1207         return false;
1208     }
1209     if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1210         /* Didn't find anything in this RAM Block */
1211         pss->page = 0;
1212         pss->block = QLIST_NEXT_RCU(pss->block, next);
1213         if (!pss->block) {
1214             /* Hit the end of the list */
1215             pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1216             /* Flag that we've looped */
1217             pss->complete_round = true;
1218             rs->ram_bulk_stage = false;
1219             if (migrate_use_xbzrle()) {
1220                 /* If xbzrle is on, stop using the data compression at this
1221                  * point. In theory, xbzrle can do better than compression.
1222                  */
1223                 flush_compressed_data(rs);
1224             }
1225         }
1226         /* Didn't find anything this time, but try again on the new block */
1227         *again = true;
1228         return false;
1229     } else {
1230         /* Can go around again, but... */
1231         *again = true;
1232         /* We've found something so probably don't need to */
1233         return true;
1234     }
1235 }
1236
1237 /**
1238  * unqueue_page: gets a page of the queue
1239  *
1240  * Helper for 'get_queued_page' - gets a page off the queue
1241  *
1242  * Returns the block of the page (or NULL if none available)
1243  *
1244  * @rs: current RAM state
1245  * @offset: used to return the offset within the RAMBlock
1246  */
1247 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1248 {
1249     RAMBlock *block = NULL;
1250
1251     qemu_mutex_lock(&rs->src_page_req_mutex);
1252     if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1253         struct RAMSrcPageRequest *entry =
1254                                 QSIMPLEQ_FIRST(&rs->src_page_requests);
1255         block = entry->rb;
1256         *offset = entry->offset;
1257
1258         if (entry->len > TARGET_PAGE_SIZE) {
1259             entry->len -= TARGET_PAGE_SIZE;
1260             entry->offset += TARGET_PAGE_SIZE;
1261         } else {
1262             memory_region_unref(block->mr);
1263             QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1264             g_free(entry);
1265         }
1266     }
1267     qemu_mutex_unlock(&rs->src_page_req_mutex);
1268
1269     return block;
1270 }
1271
1272 /**
1273  * get_queued_page: unqueue a page from the postocpy requests
1274  *
1275  * Skips pages that are already sent (!dirty)
1276  *
1277  * Returns if a queued page is found
1278  *
1279  * @rs: current RAM state
1280  * @pss: data about the state of the current dirty page scan
1281  */
1282 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1283 {
1284     RAMBlock  *block;
1285     ram_addr_t offset;
1286     bool dirty;
1287
1288     do {
1289         block = unqueue_page(rs, &offset);
1290         /*
1291          * We're sending this page, and since it's postcopy nothing else
1292          * will dirty it, and we must make sure it doesn't get sent again
1293          * even if this queue request was received after the background
1294          * search already sent it.
1295          */
1296         if (block) {
1297             unsigned long page;
1298
1299             page = offset >> TARGET_PAGE_BITS;
1300             dirty = test_bit(page, block->bmap);
1301             if (!dirty) {
1302                 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1303                        page, test_bit(page, block->unsentmap));
1304             } else {
1305                 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1306             }
1307         }
1308
1309     } while (block && !dirty);
1310
1311     if (block) {
1312         /*
1313          * As soon as we start servicing pages out of order, then we have
1314          * to kill the bulk stage, since the bulk stage assumes
1315          * in (migration_bitmap_find_and_reset_dirty) that every page is
1316          * dirty, that's no longer true.
1317          */
1318         rs->ram_bulk_stage = false;
1319
1320         /*
1321          * We want the background search to continue from the queued page
1322          * since the guest is likely to want other pages near to the page
1323          * it just requested.
1324          */
1325         pss->block = block;
1326         pss->page = offset >> TARGET_PAGE_BITS;
1327     }
1328
1329     return !!block;
1330 }
1331
1332 /**
1333  * migration_page_queue_free: drop any remaining pages in the ram
1334  * request queue
1335  *
1336  * It should be empty at the end anyway, but in error cases there may
1337  * be some left.  in case that there is any page left, we drop it.
1338  *
1339  */
1340 static void migration_page_queue_free(RAMState *rs)
1341 {
1342     struct RAMSrcPageRequest *mspr, *next_mspr;
1343     /* This queue generally should be empty - but in the case of a failed
1344      * migration might have some droppings in.
1345      */
1346     rcu_read_lock();
1347     QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1348         memory_region_unref(mspr->rb->mr);
1349         QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1350         g_free(mspr);
1351     }
1352     rcu_read_unlock();
1353 }
1354
1355 /**
1356  * ram_save_queue_pages: queue the page for transmission
1357  *
1358  * A request from postcopy destination for example.
1359  *
1360  * Returns zero on success or negative on error
1361  *
1362  * @rbname: Name of the RAMBLock of the request. NULL means the
1363  *          same that last one.
1364  * @start: starting address from the start of the RAMBlock
1365  * @len: length (in bytes) to send
1366  */
1367 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1368 {
1369     RAMBlock *ramblock;
1370     RAMState *rs = ram_state;
1371
1372     ram_counters.postcopy_requests++;
1373     rcu_read_lock();
1374     if (!rbname) {
1375         /* Reuse last RAMBlock */
1376         ramblock = rs->last_req_rb;
1377
1378         if (!ramblock) {
1379             /*
1380              * Shouldn't happen, we can't reuse the last RAMBlock if
1381              * it's the 1st request.
1382              */
1383             error_report("ram_save_queue_pages no previous block");
1384             goto err;
1385         }
1386     } else {
1387         ramblock = qemu_ram_block_by_name(rbname);
1388
1389         if (!ramblock) {
1390             /* We shouldn't be asked for a non-existent RAMBlock */
1391             error_report("ram_save_queue_pages no block '%s'", rbname);
1392             goto err;
1393         }
1394         rs->last_req_rb = ramblock;
1395     }
1396     trace_ram_save_queue_pages(ramblock->idstr, start, len);
1397     if (start+len > ramblock->used_length) {
1398         error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1399                      RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1400                      __func__, start, len, ramblock->used_length);
1401         goto err;
1402     }
1403
1404     struct RAMSrcPageRequest *new_entry =
1405         g_malloc0(sizeof(struct RAMSrcPageRequest));
1406     new_entry->rb = ramblock;
1407     new_entry->offset = start;
1408     new_entry->len = len;
1409
1410     memory_region_ref(ramblock->mr);
1411     qemu_mutex_lock(&rs->src_page_req_mutex);
1412     QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1413     qemu_mutex_unlock(&rs->src_page_req_mutex);
1414     rcu_read_unlock();
1415
1416     return 0;
1417
1418 err:
1419     rcu_read_unlock();
1420     return -1;
1421 }
1422
1423 static bool save_page_use_compression(RAMState *rs)
1424 {
1425     if (!migrate_use_compression()) {
1426         return false;
1427     }
1428
1429     /*
1430      * If xbzrle is on, stop using the data compression after first
1431      * round of migration even if compression is enabled. In theory,
1432      * xbzrle can do better than compression.
1433      */
1434     if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1435         return true;
1436     }
1437
1438     return false;
1439 }
1440
1441 /**
1442  * ram_save_target_page: save one target page
1443  *
1444  * Returns the number of pages written
1445  *
1446  * @rs: current RAM state
1447  * @pss: data about the page we want to send
1448  * @last_stage: if we are at the completion stage
1449  */
1450 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1451                                 bool last_stage)
1452 {
1453     RAMBlock *block = pss->block;
1454     ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1455     int res;
1456
1457     if (control_save_page(rs, block, offset, &res)) {
1458         return res;
1459     }
1460
1461     /*
1462      * When starting the process of a new block, the first page of
1463      * the block should be sent out before other pages in the same
1464      * block, and all the pages in last block should have been sent
1465      * out, keeping this order is important, because the 'cont' flag
1466      * is used to avoid resending the block name.
1467      */
1468     if (block != rs->last_sent_block && save_page_use_compression(rs)) {
1469             flush_compressed_data(rs);
1470     }
1471
1472     res = save_zero_page(rs, block, offset);
1473     if (res > 0) {
1474         /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1475          * page would be stale
1476          */
1477         if (!save_page_use_compression(rs)) {
1478             XBZRLE_cache_lock();
1479             xbzrle_cache_zero_page(rs, block->offset + offset);
1480             XBZRLE_cache_unlock();
1481         }
1482         ram_release_pages(block->idstr, offset, res);
1483         return res;
1484     }
1485
1486     /*
1487      * Make sure the first page is sent out before other pages.
1488      *
1489      * we post it as normal page as compression will take much
1490      * CPU resource.
1491      */
1492     if (block == rs->last_sent_block && save_page_use_compression(rs)) {
1493         res = compress_page_with_multi_thread(rs, block, offset);
1494     }
1495
1496     return ram_save_page(rs, pss, last_stage);
1497 }
1498
1499 /**
1500  * ram_save_host_page: save a whole host page
1501  *
1502  * Starting at *offset send pages up to the end of the current host
1503  * page. It's valid for the initial offset to point into the middle of
1504  * a host page in which case the remainder of the hostpage is sent.
1505  * Only dirty target pages are sent. Note that the host page size may
1506  * be a huge page for this block.
1507  * The saving stops at the boundary of the used_length of the block
1508  * if the RAMBlock isn't a multiple of the host page size.
1509  *
1510  * Returns the number of pages written or negative on error
1511  *
1512  * @rs: current RAM state
1513  * @ms: current migration state
1514  * @pss: data about the page we want to send
1515  * @last_stage: if we are at the completion stage
1516  */
1517 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1518                               bool last_stage)
1519 {
1520     int tmppages, pages = 0;
1521     size_t pagesize_bits =
1522         qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1523
1524     do {
1525         /* Check the pages is dirty and if it is send it */
1526         if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1527             pss->page++;
1528             continue;
1529         }
1530
1531         tmppages = ram_save_target_page(rs, pss, last_stage);
1532         if (tmppages < 0) {
1533             return tmppages;
1534         }
1535
1536         pages += tmppages;
1537         if (pss->block->unsentmap) {
1538             clear_bit(pss->page, pss->block->unsentmap);
1539         }
1540
1541         pss->page++;
1542     } while ((pss->page & (pagesize_bits - 1)) &&
1543              offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1544
1545     /* The offset we leave with is the last one we looked at */
1546     pss->page--;
1547     return pages;
1548 }
1549
1550 /**
1551  * ram_find_and_save_block: finds a dirty page and sends it to f
1552  *
1553  * Called within an RCU critical section.
1554  *
1555  * Returns the number of pages written where zero means no dirty pages
1556  *
1557  * @rs: current RAM state
1558  * @last_stage: if we are at the completion stage
1559  *
1560  * On systems where host-page-size > target-page-size it will send all the
1561  * pages in a host page that are dirty.
1562  */
1563
1564 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1565 {
1566     PageSearchStatus pss;
1567     int pages = 0;
1568     bool again, found;
1569
1570     /* No dirty page as there is zero RAM */
1571     if (!ram_bytes_total()) {
1572         return pages;
1573     }
1574
1575     pss.block = rs->last_seen_block;
1576     pss.page = rs->last_page;
1577     pss.complete_round = false;
1578
1579     if (!pss.block) {
1580         pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1581     }
1582
1583     do {
1584         again = true;
1585         found = get_queued_page(rs, &pss);
1586
1587         if (!found) {
1588             /* priority queue empty, so just search for something dirty */
1589             found = find_dirty_block(rs, &pss, &again);
1590         }
1591
1592         if (found) {
1593             pages = ram_save_host_page(rs, &pss, last_stage);
1594         }
1595     } while (!pages && again);
1596
1597     rs->last_seen_block = pss.block;
1598     rs->last_page = pss.page;
1599
1600     return pages;
1601 }
1602
1603 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1604 {
1605     uint64_t pages = size / TARGET_PAGE_SIZE;
1606
1607     if (zero) {
1608         ram_counters.duplicate += pages;
1609     } else {
1610         ram_counters.normal += pages;
1611         ram_counters.transferred += size;
1612         qemu_update_position(f, size);
1613     }
1614 }
1615
1616 uint64_t ram_bytes_total(void)
1617 {
1618     RAMBlock *block;
1619     uint64_t total = 0;
1620
1621     rcu_read_lock();
1622     RAMBLOCK_FOREACH(block) {
1623         total += block->used_length;
1624     }
1625     rcu_read_unlock();
1626     return total;
1627 }
1628
1629 static void xbzrle_load_setup(void)
1630 {
1631     XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1632 }
1633
1634 static void xbzrle_load_cleanup(void)
1635 {
1636     g_free(XBZRLE.decoded_buf);
1637     XBZRLE.decoded_buf = NULL;
1638 }
1639
1640 static void ram_state_cleanup(RAMState **rsp)
1641 {
1642     if (*rsp) {
1643         migration_page_queue_free(*rsp);
1644         qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1645         qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1646         g_free(*rsp);
1647         *rsp = NULL;
1648     }
1649 }
1650
1651 static void xbzrle_cleanup(void)
1652 {
1653     XBZRLE_cache_lock();
1654     if (XBZRLE.cache) {
1655         cache_fini(XBZRLE.cache);
1656         g_free(XBZRLE.encoded_buf);
1657         g_free(XBZRLE.current_buf);
1658         g_free(XBZRLE.zero_target_page);
1659         XBZRLE.cache = NULL;
1660         XBZRLE.encoded_buf = NULL;
1661         XBZRLE.current_buf = NULL;
1662         XBZRLE.zero_target_page = NULL;
1663     }
1664     XBZRLE_cache_unlock();
1665 }
1666
1667 static void ram_save_cleanup(void *opaque)
1668 {
1669     RAMState **rsp = opaque;
1670     RAMBlock *block;
1671
1672     /* caller have hold iothread lock or is in a bh, so there is
1673      * no writing race against this migration_bitmap
1674      */
1675     memory_global_dirty_log_stop();
1676
1677     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1678         g_free(block->bmap);
1679         block->bmap = NULL;
1680         g_free(block->unsentmap);
1681         block->unsentmap = NULL;
1682     }
1683
1684     xbzrle_cleanup();
1685     compress_threads_save_cleanup();
1686     ram_state_cleanup(rsp);
1687 }
1688
1689 static void ram_state_reset(RAMState *rs)
1690 {
1691     rs->last_seen_block = NULL;
1692     rs->last_sent_block = NULL;
1693     rs->last_page = 0;
1694     rs->last_version = ram_list.version;
1695     rs->ram_bulk_stage = true;
1696 }
1697
1698 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1699
1700 /*
1701  * 'expected' is the value you expect the bitmap mostly to be full
1702  * of; it won't bother printing lines that are all this value.
1703  * If 'todump' is null the migration bitmap is dumped.
1704  */
1705 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1706                            unsigned long pages)
1707 {
1708     int64_t cur;
1709     int64_t linelen = 128;
1710     char linebuf[129];
1711
1712     for (cur = 0; cur < pages; cur += linelen) {
1713         int64_t curb;
1714         bool found = false;
1715         /*
1716          * Last line; catch the case where the line length
1717          * is longer than remaining ram
1718          */
1719         if (cur + linelen > pages) {
1720             linelen = pages - cur;
1721         }
1722         for (curb = 0; curb < linelen; curb++) {
1723             bool thisbit = test_bit(cur + curb, todump);
1724             linebuf[curb] = thisbit ? '1' : '.';
1725             found = found || (thisbit != expected);
1726         }
1727         if (found) {
1728             linebuf[curb] = '\0';
1729             fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
1730         }
1731     }
1732 }
1733
1734 /* **** functions for postcopy ***** */
1735
1736 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1737 {
1738     struct RAMBlock *block;
1739
1740     RAMBLOCK_FOREACH(block) {
1741         unsigned long *bitmap = block->bmap;
1742         unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1743         unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1744
1745         while (run_start < range) {
1746             unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1747             ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1748                               (run_end - run_start) << TARGET_PAGE_BITS);
1749             run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1750         }
1751     }
1752 }
1753
1754 /**
1755  * postcopy_send_discard_bm_ram: discard a RAMBlock
1756  *
1757  * Returns zero on success
1758  *
1759  * Callback from postcopy_each_ram_send_discard for each RAMBlock
1760  * Note: At this point the 'unsentmap' is the processed bitmap combined
1761  *       with the dirtymap; so a '1' means it's either dirty or unsent.
1762  *
1763  * @ms: current migration state
1764  * @pds: state for postcopy
1765  * @start: RAMBlock starting page
1766  * @length: RAMBlock size
1767  */
1768 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1769                                         PostcopyDiscardState *pds,
1770                                         RAMBlock *block)
1771 {
1772     unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1773     unsigned long current;
1774     unsigned long *unsentmap = block->unsentmap;
1775
1776     for (current = 0; current < end; ) {
1777         unsigned long one = find_next_bit(unsentmap, end, current);
1778
1779         if (one <= end) {
1780             unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1781             unsigned long discard_length;
1782
1783             if (zero >= end) {
1784                 discard_length = end - one;
1785             } else {
1786                 discard_length = zero - one;
1787             }
1788             if (discard_length) {
1789                 postcopy_discard_send_range(ms, pds, one, discard_length);
1790             }
1791             current = one + discard_length;
1792         } else {
1793             current = one;
1794         }
1795     }
1796
1797     return 0;
1798 }
1799
1800 /**
1801  * postcopy_each_ram_send_discard: discard all RAMBlocks
1802  *
1803  * Returns 0 for success or negative for error
1804  *
1805  * Utility for the outgoing postcopy code.
1806  *   Calls postcopy_send_discard_bm_ram for each RAMBlock
1807  *   passing it bitmap indexes and name.
1808  * (qemu_ram_foreach_block ends up passing unscaled lengths
1809  *  which would mean postcopy code would have to deal with target page)
1810  *
1811  * @ms: current migration state
1812  */
1813 static int postcopy_each_ram_send_discard(MigrationState *ms)
1814 {
1815     struct RAMBlock *block;
1816     int ret;
1817
1818     RAMBLOCK_FOREACH(block) {
1819         PostcopyDiscardState *pds =
1820             postcopy_discard_send_init(ms, block->idstr);
1821
1822         /*
1823          * Postcopy sends chunks of bitmap over the wire, but it
1824          * just needs indexes at this point, avoids it having
1825          * target page specific code.
1826          */
1827         ret = postcopy_send_discard_bm_ram(ms, pds, block);
1828         postcopy_discard_send_finish(ms, pds);
1829         if (ret) {
1830             return ret;
1831         }
1832     }
1833
1834     return 0;
1835 }
1836
1837 /**
1838  * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1839  *
1840  * Helper for postcopy_chunk_hostpages; it's called twice to
1841  * canonicalize the two bitmaps, that are similar, but one is
1842  * inverted.
1843  *
1844  * Postcopy requires that all target pages in a hostpage are dirty or
1845  * clean, not a mix.  This function canonicalizes the bitmaps.
1846  *
1847  * @ms: current migration state
1848  * @unsent_pass: if true we need to canonicalize partially unsent host pages
1849  *               otherwise we need to canonicalize partially dirty host pages
1850  * @block: block that contains the page we want to canonicalize
1851  * @pds: state for postcopy
1852  */
1853 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1854                                           RAMBlock *block,
1855                                           PostcopyDiscardState *pds)
1856 {
1857     RAMState *rs = ram_state;
1858     unsigned long *bitmap = block->bmap;
1859     unsigned long *unsentmap = block->unsentmap;
1860     unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1861     unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1862     unsigned long run_start;
1863
1864     if (block->page_size == TARGET_PAGE_SIZE) {
1865         /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1866         return;
1867     }
1868
1869     if (unsent_pass) {
1870         /* Find a sent page */
1871         run_start = find_next_zero_bit(unsentmap, pages, 0);
1872     } else {
1873         /* Find a dirty page */
1874         run_start = find_next_bit(bitmap, pages, 0);
1875     }
1876
1877     while (run_start < pages) {
1878         bool do_fixup = false;
1879         unsigned long fixup_start_addr;
1880         unsigned long host_offset;
1881
1882         /*
1883          * If the start of this run of pages is in the middle of a host
1884          * page, then we need to fixup this host page.
1885          */
1886         host_offset = run_start % host_ratio;
1887         if (host_offset) {
1888             do_fixup = true;
1889             run_start -= host_offset;
1890             fixup_start_addr = run_start;
1891             /* For the next pass */
1892             run_start = run_start + host_ratio;
1893         } else {
1894             /* Find the end of this run */
1895             unsigned long run_end;
1896             if (unsent_pass) {
1897                 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1898             } else {
1899                 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1900             }
1901             /*
1902              * If the end isn't at the start of a host page, then the
1903              * run doesn't finish at the end of a host page
1904              * and we need to discard.
1905              */
1906             host_offset = run_end % host_ratio;
1907             if (host_offset) {
1908                 do_fixup = true;
1909                 fixup_start_addr = run_end - host_offset;
1910                 /*
1911                  * This host page has gone, the next loop iteration starts
1912                  * from after the fixup
1913                  */
1914                 run_start = fixup_start_addr + host_ratio;
1915             } else {
1916                 /*
1917                  * No discards on this iteration, next loop starts from
1918                  * next sent/dirty page
1919                  */
1920                 run_start = run_end + 1;
1921             }
1922         }
1923
1924         if (do_fixup) {
1925             unsigned long page;
1926
1927             /* Tell the destination to discard this page */
1928             if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1929                 /* For the unsent_pass we:
1930                  *     discard partially sent pages
1931                  * For the !unsent_pass (dirty) we:
1932                  *     discard partially dirty pages that were sent
1933                  *     (any partially sent pages were already discarded
1934                  *     by the previous unsent_pass)
1935                  */
1936                 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1937                                             host_ratio);
1938             }
1939
1940             /* Clean up the bitmap */
1941             for (page = fixup_start_addr;
1942                  page < fixup_start_addr + host_ratio; page++) {
1943                 /* All pages in this host page are now not sent */
1944                 set_bit(page, unsentmap);
1945
1946                 /*
1947                  * Remark them as dirty, updating the count for any pages
1948                  * that weren't previously dirty.
1949                  */
1950                 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1951             }
1952         }
1953
1954         if (unsent_pass) {
1955             /* Find the next sent page for the next iteration */
1956             run_start = find_next_zero_bit(unsentmap, pages, run_start);
1957         } else {
1958             /* Find the next dirty page for the next iteration */
1959             run_start = find_next_bit(bitmap, pages, run_start);
1960         }
1961     }
1962 }
1963
1964 /**
1965  * postcopy_chuck_hostpages: discrad any partially sent host page
1966  *
1967  * Utility for the outgoing postcopy code.
1968  *
1969  * Discard any partially sent host-page size chunks, mark any partially
1970  * dirty host-page size chunks as all dirty.  In this case the host-page
1971  * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1972  *
1973  * Returns zero on success
1974  *
1975  * @ms: current migration state
1976  * @block: block we want to work with
1977  */
1978 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1979 {
1980     PostcopyDiscardState *pds =
1981         postcopy_discard_send_init(ms, block->idstr);
1982
1983     /* First pass: Discard all partially sent host pages */
1984     postcopy_chunk_hostpages_pass(ms, true, block, pds);
1985     /*
1986      * Second pass: Ensure that all partially dirty host pages are made
1987      * fully dirty.
1988      */
1989     postcopy_chunk_hostpages_pass(ms, false, block, pds);
1990
1991     postcopy_discard_send_finish(ms, pds);
1992     return 0;
1993 }
1994
1995 /**
1996  * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1997  *
1998  * Returns zero on success
1999  *
2000  * Transmit the set of pages to be discarded after precopy to the target
2001  * these are pages that:
2002  *     a) Have been previously transmitted but are now dirty again
2003  *     b) Pages that have never been transmitted, this ensures that
2004  *        any pages on the destination that have been mapped by background
2005  *        tasks get discarded (transparent huge pages is the specific concern)
2006  * Hopefully this is pretty sparse
2007  *
2008  * @ms: current migration state
2009  */
2010 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2011 {
2012     RAMState *rs = ram_state;
2013     RAMBlock *block;
2014     int ret;
2015
2016     rcu_read_lock();
2017
2018     /* This should be our last sync, the src is now paused */
2019     migration_bitmap_sync(rs);
2020
2021     /* Easiest way to make sure we don't resume in the middle of a host-page */
2022     rs->last_seen_block = NULL;
2023     rs->last_sent_block = NULL;
2024     rs->last_page = 0;
2025
2026     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2027         unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2028         unsigned long *bitmap = block->bmap;
2029         unsigned long *unsentmap = block->unsentmap;
2030
2031         if (!unsentmap) {
2032             /* We don't have a safe way to resize the sentmap, so
2033              * if the bitmap was resized it will be NULL at this
2034              * point.
2035              */
2036             error_report("migration ram resized during precopy phase");
2037             rcu_read_unlock();
2038             return -EINVAL;
2039         }
2040         /* Deal with TPS != HPS and huge pages */
2041         ret = postcopy_chunk_hostpages(ms, block);
2042         if (ret) {
2043             rcu_read_unlock();
2044             return ret;
2045         }
2046
2047         /*
2048          * Update the unsentmap to be unsentmap = unsentmap | dirty
2049          */
2050         bitmap_or(unsentmap, unsentmap, bitmap, pages);
2051 #ifdef DEBUG_POSTCOPY
2052         ram_debug_dump_bitmap(unsentmap, true, pages);
2053 #endif
2054     }
2055     trace_ram_postcopy_send_discard_bitmap();
2056
2057     ret = postcopy_each_ram_send_discard(ms);
2058     rcu_read_unlock();
2059
2060     return ret;
2061 }
2062
2063 /**
2064  * ram_discard_range: discard dirtied pages at the beginning of postcopy
2065  *
2066  * Returns zero on success
2067  *
2068  * @rbname: name of the RAMBlock of the request. NULL means the
2069  *          same that last one.
2070  * @start: RAMBlock starting page
2071  * @length: RAMBlock size
2072  */
2073 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2074 {
2075     int ret = -1;
2076
2077     trace_ram_discard_range(rbname, start, length);
2078
2079     rcu_read_lock();
2080     RAMBlock *rb = qemu_ram_block_by_name(rbname);
2081
2082     if (!rb) {
2083         error_report("ram_discard_range: Failed to find block '%s'", rbname);
2084         goto err;
2085     }
2086
2087     bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2088                  length >> qemu_target_page_bits());
2089     ret = ram_block_discard_range(rb, start, length);
2090
2091 err:
2092     rcu_read_unlock();
2093
2094     return ret;
2095 }
2096
2097 /*
2098  * For every allocation, we will try not to crash the VM if the
2099  * allocation failed.
2100  */
2101 static int xbzrle_init(void)
2102 {
2103     Error *local_err = NULL;
2104
2105     if (!migrate_use_xbzrle()) {
2106         return 0;
2107     }
2108
2109     XBZRLE_cache_lock();
2110
2111     XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2112     if (!XBZRLE.zero_target_page) {
2113         error_report("%s: Error allocating zero page", __func__);
2114         goto err_out;
2115     }
2116
2117     XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2118                               TARGET_PAGE_SIZE, &local_err);
2119     if (!XBZRLE.cache) {
2120         error_report_err(local_err);
2121         goto free_zero_page;
2122     }
2123
2124     XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2125     if (!XBZRLE.encoded_buf) {
2126         error_report("%s: Error allocating encoded_buf", __func__);
2127         goto free_cache;
2128     }
2129
2130     XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2131     if (!XBZRLE.current_buf) {
2132         error_report("%s: Error allocating current_buf", __func__);
2133         goto free_encoded_buf;
2134     }
2135
2136     /* We are all good */
2137     XBZRLE_cache_unlock();
2138     return 0;
2139
2140 free_encoded_buf:
2141     g_free(XBZRLE.encoded_buf);
2142     XBZRLE.encoded_buf = NULL;
2143 free_cache:
2144     cache_fini(XBZRLE.cache);
2145     XBZRLE.cache = NULL;
2146 free_zero_page:
2147     g_free(XBZRLE.zero_target_page);
2148     XBZRLE.zero_target_page = NULL;
2149 err_out:
2150     XBZRLE_cache_unlock();
2151     return -ENOMEM;
2152 }
2153
2154 static int ram_state_init(RAMState **rsp)
2155 {
2156     *rsp = g_try_new0(RAMState, 1);
2157
2158     if (!*rsp) {
2159         error_report("%s: Init ramstate fail", __func__);
2160         return -1;
2161     }
2162
2163     qemu_mutex_init(&(*rsp)->bitmap_mutex);
2164     qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2165     QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2166
2167     /*
2168      * Count the total number of pages used by ram blocks not including any
2169      * gaps due to alignment or unplugs.
2170      */
2171     (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2172
2173     ram_state_reset(*rsp);
2174
2175     return 0;
2176 }
2177
2178 static void ram_list_init_bitmaps(void)
2179 {
2180     RAMBlock *block;
2181     unsigned long pages;
2182
2183     /* Skip setting bitmap if there is no RAM */
2184     if (ram_bytes_total()) {
2185         QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2186             pages = block->max_length >> TARGET_PAGE_BITS;
2187             block->bmap = bitmap_new(pages);
2188             bitmap_set(block->bmap, 0, pages);
2189             if (migrate_postcopy_ram()) {
2190                 block->unsentmap = bitmap_new(pages);
2191                 bitmap_set(block->unsentmap, 0, pages);
2192             }
2193         }
2194     }
2195 }
2196
2197 static void ram_init_bitmaps(RAMState *rs)
2198 {
2199     /* For memory_global_dirty_log_start below.  */
2200     qemu_mutex_lock_iothread();
2201     qemu_mutex_lock_ramlist();
2202     rcu_read_lock();
2203
2204     ram_list_init_bitmaps();
2205     memory_global_dirty_log_start();
2206     migration_bitmap_sync(rs);
2207
2208     rcu_read_unlock();
2209     qemu_mutex_unlock_ramlist();
2210     qemu_mutex_unlock_iothread();
2211 }
2212
2213 static int ram_init_all(RAMState **rsp)
2214 {
2215     if (ram_state_init(rsp)) {
2216         return -1;
2217     }
2218
2219     if (xbzrle_init()) {
2220         ram_state_cleanup(rsp);
2221         return -1;
2222     }
2223
2224     ram_init_bitmaps(*rsp);
2225
2226     return 0;
2227 }
2228
2229 /*
2230  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2231  * long-running RCU critical section.  When rcu-reclaims in the code
2232  * start to become numerous it will be necessary to reduce the
2233  * granularity of these critical sections.
2234  */
2235
2236 /**
2237  * ram_save_setup: Setup RAM for migration
2238  *
2239  * Returns zero to indicate success and negative for error
2240  *
2241  * @f: QEMUFile where to send the data
2242  * @opaque: RAMState pointer
2243  */
2244 static int ram_save_setup(QEMUFile *f, void *opaque)
2245 {
2246     RAMState **rsp = opaque;
2247     RAMBlock *block;
2248
2249     if (compress_threads_save_setup()) {
2250         return -1;
2251     }
2252
2253     /* migration has already setup the bitmap, reuse it. */
2254     if (!migration_in_colo_state()) {
2255         if (ram_init_all(rsp) != 0) {
2256             compress_threads_save_cleanup();
2257             return -1;
2258         }
2259     }
2260     (*rsp)->f = f;
2261
2262     rcu_read_lock();
2263
2264     qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2265
2266     RAMBLOCK_FOREACH(block) {
2267         qemu_put_byte(f, strlen(block->idstr));
2268         qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2269         qemu_put_be64(f, block->used_length);
2270         if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2271             qemu_put_be64(f, block->page_size);
2272         }
2273     }
2274
2275     rcu_read_unlock();
2276
2277     ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2278     ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2279
2280     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2281
2282     return 0;
2283 }
2284
2285 /**
2286  * ram_save_iterate: iterative stage for migration
2287  *
2288  * Returns zero to indicate success and negative for error
2289  *
2290  * @f: QEMUFile where to send the data
2291  * @opaque: RAMState pointer
2292  */
2293 static int ram_save_iterate(QEMUFile *f, void *opaque)
2294 {
2295     RAMState **temp = opaque;
2296     RAMState *rs = *temp;
2297     int ret;
2298     int i;
2299     int64_t t0;
2300     int done = 0;
2301
2302     if (blk_mig_bulk_active()) {
2303         /* Avoid transferring ram during bulk phase of block migration as
2304          * the bulk phase will usually take a long time and transferring
2305          * ram updates during that time is pointless. */
2306         goto out;
2307     }
2308
2309     rcu_read_lock();
2310     if (ram_list.version != rs->last_version) {
2311         ram_state_reset(rs);
2312     }
2313
2314     /* Read version before ram_list.blocks */
2315     smp_rmb();
2316
2317     ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2318
2319     t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2320     i = 0;
2321     while ((ret = qemu_file_rate_limit(f)) == 0) {
2322         int pages;
2323
2324         pages = ram_find_and_save_block(rs, false);
2325         /* no more pages to sent */
2326         if (pages == 0) {
2327             done = 1;
2328             break;
2329         }
2330         rs->iterations++;
2331
2332         /* we want to check in the 1st loop, just in case it was the 1st time
2333            and we had to sync the dirty bitmap.
2334            qemu_get_clock_ns() is a bit expensive, so we only check each some
2335            iterations
2336         */
2337         if ((i & 63) == 0) {
2338             uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2339             if (t1 > MAX_WAIT) {
2340                 trace_ram_save_iterate_big_wait(t1, i);
2341                 break;
2342             }
2343         }
2344         i++;
2345     }
2346     flush_compressed_data(rs);
2347     rcu_read_unlock();
2348
2349     /*
2350      * Must occur before EOS (or any QEMUFile operation)
2351      * because of RDMA protocol.
2352      */
2353     ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2354
2355 out:
2356     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2357     ram_counters.transferred += 8;
2358
2359     ret = qemu_file_get_error(f);
2360     if (ret < 0) {
2361         return ret;
2362     }
2363
2364     return done;
2365 }
2366
2367 /**
2368  * ram_save_complete: function called to send the remaining amount of ram
2369  *
2370  * Returns zero to indicate success
2371  *
2372  * Called with iothread lock
2373  *
2374  * @f: QEMUFile where to send the data
2375  * @opaque: RAMState pointer
2376  */
2377 static int ram_save_complete(QEMUFile *f, void *opaque)
2378 {
2379     RAMState **temp = opaque;
2380     RAMState *rs = *temp;
2381
2382     rcu_read_lock();
2383
2384     if (!migration_in_postcopy()) {
2385         migration_bitmap_sync(rs);
2386     }
2387
2388     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2389
2390     /* try transferring iterative blocks of memory */
2391
2392     /* flush all remaining blocks regardless of rate limiting */
2393     while (true) {
2394         int pages;
2395
2396         pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2397         /* no more blocks to sent */
2398         if (pages == 0) {
2399             break;
2400         }
2401     }
2402
2403     flush_compressed_data(rs);
2404     ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2405
2406     rcu_read_unlock();
2407
2408     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2409
2410     return 0;
2411 }
2412
2413 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2414                              uint64_t *res_precopy_only,
2415                              uint64_t *res_compatible,
2416                              uint64_t *res_postcopy_only)
2417 {
2418     RAMState **temp = opaque;
2419     RAMState *rs = *temp;
2420     uint64_t remaining_size;
2421
2422     remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2423
2424     if (!migration_in_postcopy() &&
2425         remaining_size < max_size) {
2426         qemu_mutex_lock_iothread();
2427         rcu_read_lock();
2428         migration_bitmap_sync(rs);
2429         rcu_read_unlock();
2430         qemu_mutex_unlock_iothread();
2431         remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2432     }
2433
2434     if (migrate_postcopy_ram()) {
2435         /* We can do postcopy, and all the data is postcopiable */
2436         *res_compatible += remaining_size;
2437     } else {
2438         *res_precopy_only += remaining_size;
2439     }
2440 }
2441
2442 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2443 {
2444     unsigned int xh_len;
2445     int xh_flags;
2446     uint8_t *loaded_data;
2447
2448     /* extract RLE header */
2449     xh_flags = qemu_get_byte(f);
2450     xh_len = qemu_get_be16(f);
2451
2452     if (xh_flags != ENCODING_FLAG_XBZRLE) {
2453         error_report("Failed to load XBZRLE page - wrong compression!");
2454         return -1;
2455     }
2456
2457     if (xh_len > TARGET_PAGE_SIZE) {
2458         error_report("Failed to load XBZRLE page - len overflow!");
2459         return -1;
2460     }
2461     loaded_data = XBZRLE.decoded_buf;
2462     /* load data and decode */
2463     /* it can change loaded_data to point to an internal buffer */
2464     qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2465
2466     /* decode RLE */
2467     if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2468                              TARGET_PAGE_SIZE) == -1) {
2469         error_report("Failed to load XBZRLE page - decode error!");
2470         return -1;
2471     }
2472
2473     return 0;
2474 }
2475
2476 /**
2477  * ram_block_from_stream: read a RAMBlock id from the migration stream
2478  *
2479  * Must be called from within a rcu critical section.
2480  *
2481  * Returns a pointer from within the RCU-protected ram_list.
2482  *
2483  * @f: QEMUFile where to read the data from
2484  * @flags: Page flags (mostly to see if it's a continuation of previous block)
2485  */
2486 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2487 {
2488     static RAMBlock *block = NULL;
2489     char id[256];
2490     uint8_t len;
2491
2492     if (flags & RAM_SAVE_FLAG_CONTINUE) {
2493         if (!block) {
2494             error_report("Ack, bad migration stream!");
2495             return NULL;
2496         }
2497         return block;
2498     }
2499
2500     len = qemu_get_byte(f);
2501     qemu_get_buffer(f, (uint8_t *)id, len);
2502     id[len] = 0;
2503
2504     block = qemu_ram_block_by_name(id);
2505     if (!block) {
2506         error_report("Can't find block %s", id);
2507         return NULL;
2508     }
2509
2510     return block;
2511 }
2512
2513 static inline void *host_from_ram_block_offset(RAMBlock *block,
2514                                                ram_addr_t offset)
2515 {
2516     if (!offset_in_ramblock(block, offset)) {
2517         return NULL;
2518     }
2519
2520     return block->host + offset;
2521 }
2522
2523 /**
2524  * ram_handle_compressed: handle the zero page case
2525  *
2526  * If a page (or a whole RDMA chunk) has been
2527  * determined to be zero, then zap it.
2528  *
2529  * @host: host address for the zero page
2530  * @ch: what the page is filled from.  We only support zero
2531  * @size: size of the zero page
2532  */
2533 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2534 {
2535     if (ch != 0 || !is_zero_range(host, size)) {
2536         memset(host, ch, size);
2537     }
2538 }
2539
2540 /* return the size after decompression, or negative value on error */
2541 static int
2542 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2543                      const uint8_t *source, size_t source_len)
2544 {
2545     int err;
2546
2547     err = inflateReset(stream);
2548     if (err != Z_OK) {
2549         return -1;
2550     }
2551
2552     stream->avail_in = source_len;
2553     stream->next_in = (uint8_t *)source;
2554     stream->avail_out = dest_len;
2555     stream->next_out = dest;
2556
2557     err = inflate(stream, Z_NO_FLUSH);
2558     if (err != Z_STREAM_END) {
2559         return -1;
2560     }
2561
2562     return stream->total_out;
2563 }
2564
2565 static void *do_data_decompress(void *opaque)
2566 {
2567     DecompressParam *param = opaque;
2568     unsigned long pagesize;
2569     uint8_t *des;
2570     int len, ret;
2571
2572     qemu_mutex_lock(&param->mutex);
2573     while (!param->quit) {
2574         if (param->des) {
2575             des = param->des;
2576             len = param->len;
2577             param->des = 0;
2578             qemu_mutex_unlock(&param->mutex);
2579
2580             pagesize = TARGET_PAGE_SIZE;
2581
2582             ret = qemu_uncompress_data(&param->stream, des, pagesize,
2583                                        param->compbuf, len);
2584             if (ret < 0) {
2585                 error_report("decompress data failed");
2586                 qemu_file_set_error(decomp_file, ret);
2587             }
2588
2589             qemu_mutex_lock(&decomp_done_lock);
2590             param->done = true;
2591             qemu_cond_signal(&decomp_done_cond);
2592             qemu_mutex_unlock(&decomp_done_lock);
2593
2594             qemu_mutex_lock(&param->mutex);
2595         } else {
2596             qemu_cond_wait(&param->cond, &param->mutex);
2597         }
2598     }
2599     qemu_mutex_unlock(&param->mutex);
2600
2601     return NULL;
2602 }
2603
2604 static int wait_for_decompress_done(void)
2605 {
2606     int idx, thread_count;
2607
2608     if (!migrate_use_compression()) {
2609         return 0;
2610     }
2611
2612     thread_count = migrate_decompress_threads();
2613     qemu_mutex_lock(&decomp_done_lock);
2614     for (idx = 0; idx < thread_count; idx++) {
2615         while (!decomp_param[idx].done) {
2616             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2617         }
2618     }
2619     qemu_mutex_unlock(&decomp_done_lock);
2620     return qemu_file_get_error(decomp_file);
2621 }
2622
2623 static void compress_threads_load_cleanup(void)
2624 {
2625     int i, thread_count;
2626
2627     if (!migrate_use_compression()) {
2628         return;
2629     }
2630     thread_count = migrate_decompress_threads();
2631     for (i = 0; i < thread_count; i++) {
2632         /*
2633          * we use it as a indicator which shows if the thread is
2634          * properly init'd or not
2635          */
2636         if (!decomp_param[i].compbuf) {
2637             break;
2638         }
2639
2640         qemu_mutex_lock(&decomp_param[i].mutex);
2641         decomp_param[i].quit = true;
2642         qemu_cond_signal(&decomp_param[i].cond);
2643         qemu_mutex_unlock(&decomp_param[i].mutex);
2644     }
2645     for (i = 0; i < thread_count; i++) {
2646         if (!decomp_param[i].compbuf) {
2647             break;
2648         }
2649
2650         qemu_thread_join(decompress_threads + i);
2651         qemu_mutex_destroy(&decomp_param[i].mutex);
2652         qemu_cond_destroy(&decomp_param[i].cond);
2653         inflateEnd(&decomp_param[i].stream);
2654         g_free(decomp_param[i].compbuf);
2655         decomp_param[i].compbuf = NULL;
2656     }
2657     g_free(decompress_threads);
2658     g_free(decomp_param);
2659     decompress_threads = NULL;
2660     decomp_param = NULL;
2661     decomp_file = NULL;
2662 }
2663
2664 static int compress_threads_load_setup(QEMUFile *f)
2665 {
2666     int i, thread_count;
2667
2668     if (!migrate_use_compression()) {
2669         return 0;
2670     }
2671
2672     thread_count = migrate_decompress_threads();
2673     decompress_threads = g_new0(QemuThread, thread_count);
2674     decomp_param = g_new0(DecompressParam, thread_count);
2675     qemu_mutex_init(&decomp_done_lock);
2676     qemu_cond_init(&decomp_done_cond);
2677     decomp_file = f;
2678     for (i = 0; i < thread_count; i++) {
2679         if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2680             goto exit;
2681         }
2682
2683         decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2684         qemu_mutex_init(&decomp_param[i].mutex);
2685         qemu_cond_init(&decomp_param[i].cond);
2686         decomp_param[i].done = true;
2687         decomp_param[i].quit = false;
2688         qemu_thread_create(decompress_threads + i, "decompress",
2689                            do_data_decompress, decomp_param + i,
2690                            QEMU_THREAD_JOINABLE);
2691     }
2692     return 0;
2693 exit:
2694     compress_threads_load_cleanup();
2695     return -1;
2696 }
2697
2698 static void decompress_data_with_multi_threads(QEMUFile *f,
2699                                                void *host, int len)
2700 {
2701     int idx, thread_count;
2702
2703     thread_count = migrate_decompress_threads();
2704     qemu_mutex_lock(&decomp_done_lock);
2705     while (true) {
2706         for (idx = 0; idx < thread_count; idx++) {
2707             if (decomp_param[idx].done) {
2708                 decomp_param[idx].done = false;
2709                 qemu_mutex_lock(&decomp_param[idx].mutex);
2710                 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2711                 decomp_param[idx].des = host;
2712                 decomp_param[idx].len = len;
2713                 qemu_cond_signal(&decomp_param[idx].cond);
2714                 qemu_mutex_unlock(&decomp_param[idx].mutex);
2715                 break;
2716             }
2717         }
2718         if (idx < thread_count) {
2719             break;
2720         } else {
2721             qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2722         }
2723     }
2724     qemu_mutex_unlock(&decomp_done_lock);
2725 }
2726
2727 /**
2728  * ram_load_setup: Setup RAM for migration incoming side
2729  *
2730  * Returns zero to indicate success and negative for error
2731  *
2732  * @f: QEMUFile where to receive the data
2733  * @opaque: RAMState pointer
2734  */
2735 static int ram_load_setup(QEMUFile *f, void *opaque)
2736 {
2737     if (compress_threads_load_setup(f)) {
2738         return -1;
2739     }
2740
2741     xbzrle_load_setup();
2742     ramblock_recv_map_init();
2743     return 0;
2744 }
2745
2746 static int ram_load_cleanup(void *opaque)
2747 {
2748     RAMBlock *rb;
2749     xbzrle_load_cleanup();
2750     compress_threads_load_cleanup();
2751
2752     RAMBLOCK_FOREACH(rb) {
2753         g_free(rb->receivedmap);
2754         rb->receivedmap = NULL;
2755     }
2756     return 0;
2757 }
2758
2759 /**
2760  * ram_postcopy_incoming_init: allocate postcopy data structures
2761  *
2762  * Returns 0 for success and negative if there was one error
2763  *
2764  * @mis: current migration incoming state
2765  *
2766  * Allocate data structures etc needed by incoming migration with
2767  * postcopy-ram. postcopy-ram's similarly names
2768  * postcopy_ram_incoming_init does the work.
2769  */
2770 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2771 {
2772     unsigned long ram_pages = last_ram_page();
2773
2774     return postcopy_ram_incoming_init(mis, ram_pages);
2775 }
2776
2777 /**
2778  * ram_load_postcopy: load a page in postcopy case
2779  *
2780  * Returns 0 for success or -errno in case of error
2781  *
2782  * Called in postcopy mode by ram_load().
2783  * rcu_read_lock is taken prior to this being called.
2784  *
2785  * @f: QEMUFile where to send the data
2786  */
2787 static int ram_load_postcopy(QEMUFile *f)
2788 {
2789     int flags = 0, ret = 0;
2790     bool place_needed = false;
2791     bool matching_page_sizes = false;
2792     MigrationIncomingState *mis = migration_incoming_get_current();
2793     /* Temporary page that is later 'placed' */
2794     void *postcopy_host_page = postcopy_get_tmp_page(mis);
2795     void *last_host = NULL;
2796     bool all_zero = false;
2797
2798     while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2799         ram_addr_t addr;
2800         void *host = NULL;
2801         void *page_buffer = NULL;
2802         void *place_source = NULL;
2803         RAMBlock *block = NULL;
2804         uint8_t ch;
2805
2806         addr = qemu_get_be64(f);
2807
2808         /*
2809          * If qemu file error, we should stop here, and then "addr"
2810          * may be invalid
2811          */
2812         ret = qemu_file_get_error(f);
2813         if (ret) {
2814             break;
2815         }
2816
2817         flags = addr & ~TARGET_PAGE_MASK;
2818         addr &= TARGET_PAGE_MASK;
2819
2820         trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2821         place_needed = false;
2822         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2823             block = ram_block_from_stream(f, flags);
2824
2825             host = host_from_ram_block_offset(block, addr);
2826             if (!host) {
2827                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2828                 ret = -EINVAL;
2829                 break;
2830             }
2831             matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2832             /*
2833              * Postcopy requires that we place whole host pages atomically;
2834              * these may be huge pages for RAMBlocks that are backed by
2835              * hugetlbfs.
2836              * To make it atomic, the data is read into a temporary page
2837              * that's moved into place later.
2838              * The migration protocol uses,  possibly smaller, target-pages
2839              * however the source ensures it always sends all the components
2840              * of a host page in order.
2841              */
2842             page_buffer = postcopy_host_page +
2843                           ((uintptr_t)host & (block->page_size - 1));
2844             /* If all TP are zero then we can optimise the place */
2845             if (!((uintptr_t)host & (block->page_size - 1))) {
2846                 all_zero = true;
2847             } else {
2848                 /* not the 1st TP within the HP */
2849                 if (host != (last_host + TARGET_PAGE_SIZE)) {
2850                     error_report("Non-sequential target page %p/%p",
2851                                   host, last_host);
2852                     ret = -EINVAL;
2853                     break;
2854                 }
2855             }
2856
2857
2858             /*
2859              * If it's the last part of a host page then we place the host
2860              * page
2861              */
2862             place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2863                                      (block->page_size - 1)) == 0;
2864             place_source = postcopy_host_page;
2865         }
2866         last_host = host;
2867
2868         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2869         case RAM_SAVE_FLAG_ZERO:
2870             ch = qemu_get_byte(f);
2871             memset(page_buffer, ch, TARGET_PAGE_SIZE);
2872             if (ch) {
2873                 all_zero = false;
2874             }
2875             break;
2876
2877         case RAM_SAVE_FLAG_PAGE:
2878             all_zero = false;
2879             if (!place_needed || !matching_page_sizes) {
2880                 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2881             } else {
2882                 /* Avoids the qemu_file copy during postcopy, which is
2883                  * going to do a copy later; can only do it when we
2884                  * do this read in one go (matching page sizes)
2885                  */
2886                 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2887                                          TARGET_PAGE_SIZE);
2888             }
2889             break;
2890         case RAM_SAVE_FLAG_EOS:
2891             /* normal exit */
2892             break;
2893         default:
2894             error_report("Unknown combination of migration flags: %#x"
2895                          " (postcopy mode)", flags);
2896             ret = -EINVAL;
2897             break;
2898         }
2899
2900         /* Detect for any possible file errors */
2901         if (!ret && qemu_file_get_error(f)) {
2902             ret = qemu_file_get_error(f);
2903         }
2904
2905         if (!ret && place_needed) {
2906             /* This gets called at the last target page in the host page */
2907             void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2908
2909             if (all_zero) {
2910                 ret = postcopy_place_page_zero(mis, place_dest,
2911                                                block);
2912             } else {
2913                 ret = postcopy_place_page(mis, place_dest,
2914                                           place_source, block);
2915             }
2916         }
2917     }
2918
2919     return ret;
2920 }
2921
2922 static bool postcopy_is_advised(void)
2923 {
2924     PostcopyState ps = postcopy_state_get();
2925     return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
2926 }
2927
2928 static bool postcopy_is_running(void)
2929 {
2930     PostcopyState ps = postcopy_state_get();
2931     return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
2932 }
2933
2934 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2935 {
2936     int flags = 0, ret = 0, invalid_flags = 0;
2937     static uint64_t seq_iter;
2938     int len = 0;
2939     /*
2940      * If system is running in postcopy mode, page inserts to host memory must
2941      * be atomic
2942      */
2943     bool postcopy_running = postcopy_is_running();
2944     /* ADVISE is earlier, it shows the source has the postcopy capability on */
2945     bool postcopy_advised = postcopy_is_advised();
2946
2947     seq_iter++;
2948
2949     if (version_id != 4) {
2950         ret = -EINVAL;
2951     }
2952
2953     if (!migrate_use_compression()) {
2954         invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2955     }
2956     /* This RCU critical section can be very long running.
2957      * When RCU reclaims in the code start to become numerous,
2958      * it will be necessary to reduce the granularity of this
2959      * critical section.
2960      */
2961     rcu_read_lock();
2962
2963     if (postcopy_running) {
2964         ret = ram_load_postcopy(f);
2965     }
2966
2967     while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2968         ram_addr_t addr, total_ram_bytes;
2969         void *host = NULL;
2970         uint8_t ch;
2971
2972         addr = qemu_get_be64(f);
2973         flags = addr & ~TARGET_PAGE_MASK;
2974         addr &= TARGET_PAGE_MASK;
2975
2976         if (flags & invalid_flags) {
2977             if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2978                 error_report("Received an unexpected compressed page");
2979             }
2980
2981             ret = -EINVAL;
2982             break;
2983         }
2984
2985         if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2986                      RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2987             RAMBlock *block = ram_block_from_stream(f, flags);
2988
2989             host = host_from_ram_block_offset(block, addr);
2990             if (!host) {
2991                 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2992                 ret = -EINVAL;
2993                 break;
2994             }
2995             ramblock_recv_bitmap_set(block, host);
2996             trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2997         }
2998
2999         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3000         case RAM_SAVE_FLAG_MEM_SIZE:
3001             /* Synchronize RAM block list */
3002             total_ram_bytes = addr;
3003             while (!ret && total_ram_bytes) {
3004                 RAMBlock *block;
3005                 char id[256];
3006                 ram_addr_t length;
3007
3008                 len = qemu_get_byte(f);
3009                 qemu_get_buffer(f, (uint8_t *)id, len);
3010                 id[len] = 0;
3011                 length = qemu_get_be64(f);
3012
3013                 block = qemu_ram_block_by_name(id);
3014                 if (block) {
3015                     if (length != block->used_length) {
3016                         Error *local_err = NULL;
3017
3018                         ret = qemu_ram_resize(block, length,
3019                                               &local_err);
3020                         if (local_err) {
3021                             error_report_err(local_err);
3022                         }
3023                     }
3024                     /* For postcopy we need to check hugepage sizes match */
3025                     if (postcopy_advised &&
3026                         block->page_size != qemu_host_page_size) {
3027                         uint64_t remote_page_size = qemu_get_be64(f);
3028                         if (remote_page_size != block->page_size) {
3029                             error_report("Mismatched RAM page size %s "
3030                                          "(local) %zd != %" PRId64,
3031                                          id, block->page_size,
3032                                          remote_page_size);
3033                             ret = -EINVAL;
3034                         }
3035                     }
3036                     ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3037                                           block->idstr);
3038                 } else {
3039                     error_report("Unknown ramblock \"%s\", cannot "
3040                                  "accept migration", id);
3041                     ret = -EINVAL;
3042                 }
3043
3044                 total_ram_bytes -= length;
3045             }
3046             break;
3047
3048         case RAM_SAVE_FLAG_ZERO:
3049             ch = qemu_get_byte(f);
3050             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3051             break;
3052
3053         case RAM_SAVE_FLAG_PAGE:
3054             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3055             break;
3056
3057         case RAM_SAVE_FLAG_COMPRESS_PAGE:
3058             len = qemu_get_be32(f);
3059             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3060                 error_report("Invalid compressed data length: %d", len);
3061                 ret = -EINVAL;
3062                 break;
3063             }
3064             decompress_data_with_multi_threads(f, host, len);
3065             break;
3066
3067         case RAM_SAVE_FLAG_XBZRLE:
3068             if (load_xbzrle(f, addr, host) < 0) {
3069                 error_report("Failed to decompress XBZRLE page at "
3070                              RAM_ADDR_FMT, addr);
3071                 ret = -EINVAL;
3072                 break;
3073             }
3074             break;
3075         case RAM_SAVE_FLAG_EOS:
3076             /* normal exit */
3077             break;
3078         default:
3079             if (flags & RAM_SAVE_FLAG_HOOK) {
3080                 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3081             } else {
3082                 error_report("Unknown combination of migration flags: %#x",
3083                              flags);
3084                 ret = -EINVAL;
3085             }
3086         }
3087         if (!ret) {
3088             ret = qemu_file_get_error(f);
3089         }
3090     }
3091
3092     ret |= wait_for_decompress_done();
3093     rcu_read_unlock();
3094     trace_ram_load_complete(ret, seq_iter);
3095     return ret;
3096 }
3097
3098 static bool ram_has_postcopy(void *opaque)
3099 {
3100     return migrate_postcopy_ram();
3101 }
3102
3103 static SaveVMHandlers savevm_ram_handlers = {
3104     .save_setup = ram_save_setup,
3105     .save_live_iterate = ram_save_iterate,
3106     .save_live_complete_postcopy = ram_save_complete,
3107     .save_live_complete_precopy = ram_save_complete,
3108     .has_postcopy = ram_has_postcopy,
3109     .save_live_pending = ram_save_pending,
3110     .load_state = ram_load,
3111     .save_cleanup = ram_save_cleanup,
3112     .load_setup = ram_load_setup,
3113     .load_cleanup = ram_load_cleanup,
3114 };
3115
3116 void ram_mig_init(void)
3117 {
3118     qemu_mutex_init(&XBZRLE.lock);
3119     register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);
3120 }