/*
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include <sys/types.h>

#include "monitor/monitor.h"
#include "sysemu/sysemu.h"
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "hw/i386/pc.h"
#include "hw/pci/pci.h"
#include "hw/audio/audio.h"
#include "migration/migration.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/config-file.h"
#include "qemu/error-report.h"
#include "qmp-commands.h"
#include "exec/cpu-all.h"
#include "exec/ram_addr.h"
#include "qemu/host-utils.h"
#include "qemu/rcu_queue.h"
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
static bool mig_throttle_on;
static int dirty_rate_high_cnt;
static void check_guest_throttling(void);

static uint64_t bitmap_sync_count;
/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
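
/* Illustrative sketch, not part of the original code: the flags above are
 * carried in the low bits of the 64-bit address word written to the wire.
 * This works because page offsets are TARGET_PAGE_SIZE aligned, so the low
 * TARGET_PAGE_BITS bits are free. ram_load() below splits incoming words
 * the same way; the example_* names here are hypothetical. */
static inline uint64_t example_encode_addr(ram_addr_t offset, uint64_t flags)
{
    return offset | flags;              /* offset is page aligned */
}

static inline void example_decode_addr(uint64_t addr, ram_addr_t *offset,
                                       uint64_t *flags)
{
    *flags = addr & ~TARGET_PAGE_MASK;  /* low bits hold the flags */
    *offset = addr & TARGET_PAGE_MASK;  /* high bits hold the page address */
}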
static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_find_nonzero_offset(p, size) == size;
}
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}
/*
 * called from qmp_migrate_set_cache_size in main thread, possibly while
 * a migration is in progress.
 * A running migration may be using the cache and might finish during this
 * call, hence changes to the cache are protected by XBZRLE.lock().
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}
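
/* Illustrative usage, not in the original source: the effective size is
 * the request rounded down to a power of two, so a hypothetical caller
 * asking for 5 MiB of cache ends up with 4 MiB. */
static inline void example_cache_resize_usage(void)
{
    int64_t effective = xbzrle_cache_resize(5 * 1024 * 1024);
    /* effective == 4 * 1024 * 1024, i.e. pow2floor(5 MiB), unless the
     * reallocation failed and -1 was returned */
    (void)effective;
}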
/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    double xbzrle_cache_miss_rate;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;
static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}
uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}
/* This is the last block that we have visited searching for dirty pages */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static unsigned long *migration_bitmap;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;
struct CompressParam {
    bool start;
    bool done;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool start;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex *comp_done_lock;
static QemuCond *comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static bool quit_comp_thread;
static bool quit_decomp_thread;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static uint8_t *compressed_data_buf;
static int do_compress_ram_page(CompressParam *param);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;

    while (!quit_comp_thread) {
        qemu_mutex_lock(&param->mutex);
        /* Re-check quit_comp_thread in case terminate_compression_threads()
         * was called just before qemu_mutex_lock(&param->mutex) and after
         * while (!quit_comp_thread); re-checking here makes sure the
         * compression thread terminates as expected.
         */
        while (!param->start && !quit_comp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        if (!quit_comp_thread) {
            do_compress_ram_page(param);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);

        qemu_mutex_lock(comp_done_lock);
        param->done = true;
        qemu_cond_signal(comp_done_cond);
        qemu_mutex_unlock(comp_done_lock);
    }

    return NULL;
}
static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    quit_comp_thread = true;
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(comp_done_lock);
    qemu_cond_destroy(comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    g_free(comp_done_cond);
    g_free(comp_done_lock);
    compress_threads = NULL;
    comp_param = NULL;
    comp_done_cond = NULL;
    comp_done_lock = NULL;
}
void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    quit_comp_thread = false;
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    comp_done_cond = g_new0(QemuCond, 1);
    comp_done_lock = g_new0(QemuMutex, 1);
    qemu_cond_init(comp_done_cond);
    qemu_mutex_init(comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
/**
 * save_page_header: Write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr,
                        strlen(block->idstr));
        size += 1 + strlen(block->idstr);
    }
    return size;
}
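
/* Illustrative counterpart, not part of the original source: a reader for
 * the header written above; ram_load() below is the real consumer. The
 * CONTINUE flag elides the block name for consecutive pages from the same
 * block. The name example_load_page_header is hypothetical. */
static inline uint64_t example_load_page_header(QEMUFile *f, char *idstr)
{
    uint64_t addr = qemu_get_be64(f);       /* page offset | flag bits */

    if (!(addr & RAM_SAVE_FLAG_CONTINUE)) {
        int len = qemu_get_byte(f);         /* 1-byte block-name length */
        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';                  /* caller provides >= 256 bytes */
    }
    return addr;
}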
/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    if (ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 bitmap_sync_count);
}
#define ENCODING_FLAG_XBZRLE 0x1
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @f: QEMUFile where to send the data
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}
static inline
ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
                                                 ram_addr_t start)
{
    unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
    unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);

    unsigned long next;

    if (ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(migration_bitmap, size, nr);
    }

    if (next < size) {
        clear_bit(next, migration_bitmap);
        migration_dirty_pages--;
    }
    return (next - base) << TARGET_PAGE_BITS;
}
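
/* Illustrative mapping, not in the original source: bit (base + n) of
 * migration_bitmap covers the page at block offset (n << TARGET_PAGE_BITS).
 * For a hypothetical block whose MemoryRegion starts at ram_addr 0x100000
 * on a 4 KiB-page target, base = 0x100 and the page at block offset
 * 0x3000 is tracked by bit 0x103. */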
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
    migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(migration_bitmap, start, length);
}
/* Fix me: there are too many global variables used in migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
static int64_t num_dirty_pages_period;
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;

static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}
/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    address_space_sync_dirty_bitmap(&address_space_memory);

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
    }
    rcu_read_unlock();

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes is 50% more than the approx.
               amount of bytes that just got transferred since the last time we
               were in this routine. If that happens >N times (for now N==4)
               we turn on the throttle down logic */
            bytes_xfer_now = ram_bytes_transferred();
            if (s->dirty_pages_rate &&
                (num_dirty_pages_period * TARGET_PAGE_SIZE >
                 (bytes_xfer_now - bytes_xfer_prev) / 2) &&
                (dirty_rate_high_cnt++ > 4)) {
                trace_migration_throttle();
                mig_throttle_on = true;
                dirty_rate_high_cnt = 0;
            }
            bytes_xfer_prev = bytes_xfer_now;
        } else {
            mig_throttle_on = false;
        }
        if (migrate_use_xbzrle()) {
            if (iterations_prev != acct_info.iterations) {
                acct_info.xbzrle_cache_miss_rate =
                    (double)(acct_info.xbzrle_cache_miss -
                             xbzrle_cache_miss_prev) /
                    (acct_info.iterations - iterations_prev);
            }
            iterations_prev = acct_info.iterations;
            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
        }
        s->dirty_pages_rate = num_dirty_pages_period * 1000
            / (end_time - start_time);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        start_time = end_time;
        num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = bitmap_sync_count;
}
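
/* Worked example with illustrative numbers, not from the source: with
 * 4 KiB target pages, if 51,200 pages were dirtied in a ~1000 ms window
 * (200 MiB) while only 256 MiB were transferred in the same window, then
 * num_dirty_pages_period * TARGET_PAGE_SIZE = 200 MiB exceeds
 * (bytes_xfer_now - bytes_xfer_prev) / 2 = 128 MiB, so the
 * high-dirty-rate counter advances; once it exceeds 4 such windows,
 * mig_throttle_on is set. In that window, dirty_pages_rate would be
 * 51200 * 1000 / 1000 = 51200 pages/s. */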
/**
 * save_zero_page: Send the zero page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p, uint64_t *bytes_transferred)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        acct_info.dup_pages++;
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        *bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}
/**
 * ram_save_page: Send the given page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    MemoryRegion *mr = block->mr;
    uint8_t *p;
    int ret;
    bool send_async = true;

    p = memory_region_get_ram_ptr(mr) + offset;

    /* In doubt send page as normal */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
static int do_compress_ram_page(CompressParam *param)
{
    int bytes_sent, blen;
    uint8_t *p;
    RAMBlock *block = param->block;
    ram_addr_t offset = param->offset;

    p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(param->file, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    bytes_sent += blen;

    return bytes_sent;
}
static inline void start_compression(CompressParam *param)
{
    param->done = false;
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}

static inline void start_decompression(DecompressParam *param)
{
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}
static uint64_t bytes_transferred;
static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        if (!comp_param[idx].done) {
            qemu_mutex_lock(comp_done_lock);
            while (!comp_param[idx].done && !quit_comp_thread) {
                qemu_cond_wait(comp_done_cond, comp_done_lock);
            }
            qemu_mutex_unlock(comp_done_lock);
        }
        if (!quit_comp_thread) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            bytes_transferred += len;
        }
    }
}
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                set_compress_params(&comp_param[idx], block, offset);
                start_compression(&comp_param[idx]);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(comp_done_cond, comp_done_lock);
        }
    }
    qemu_mutex_unlock(comp_done_lock);

    return pages;
}
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
                                    ram_addr_t offset, bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    MemoryRegion *mr = block->mr;
    uint8_t *p;
    int ret;

    p = memory_region_get_ram_ptr(mr) + offset;

    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out; keeping this order is important because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                set_compress_params(&comp_param[0], block, offset);
                /* Use the qemu thread to compress the data to make sure the
                 * first page is sent out before other pages
                 */
                bytes_xmit = do_compress_ram_page(&comp_param[0]);
                acct_info.norm_pages++;
                qemu_put_qemu_file(f, comp_param[0].file);
                *bytes_transferred += bytes_xmit;
                pages = 1;
            }
        } else {
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}
/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns: The number of pages written
 *          0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                   uint64_t *bytes_transferred)
{
    RAMBlock *block = last_seen_block;
    ram_addr_t offset = last_offset;
    bool complete_round = false;
    int pages = 0;
    MemoryRegion *mr;

    if (!block) {
        block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    while (true) {
        mr = block->mr;
        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
        if (complete_round && block == last_seen_block &&
            offset >= last_offset) {
            break;
        }
        if (offset >= block->used_length) {
            offset = 0;
            block = QLIST_NEXT_RCU(block, next);
            if (!block) {
                block = QLIST_FIRST_RCU(&ram_list.blocks);
                complete_round = true;
                ram_bulk_stage = false;
                if (migrate_use_xbzrle()) {
                    /* If xbzrle is on, stop using the data compression at this
                     * point. In theory, xbzrle can do better than compression.
                     */
                    flush_compressed_data(f);
                    compression_switch = false;
                }
            }
        } else {
            if (compression_switch && migrate_use_compression()) {
                pages = ram_save_compressed_page(f, block, offset, last_stage,
                                                 bytes_transferred);
            } else {
                pages = ram_save_page(f, block, offset, last_stage,
                                      bytes_transferred);
            }

            /* if page is unmodified, continue to the next */
            if (pages > 0) {
                last_sent_block = block;
                break;
            }
        }
    }

    last_seen_block = block;
    last_offset = offset;

    return pages;
}
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    if (zero) {
        acct_info.dup_pages += pages;
    } else {
        acct_info.norm_pages += pages;
        bytes_transferred += size;
        qemu_update_position(f, size);
    }
}
static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}

uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}
uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}
static void migration_end(void)
{
    if (migration_bitmap) {
        memory_global_dirty_log_stop();
        g_free(migration_bitmap);
        migration_bitmap = NULL;
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}
static void ram_migration_cancel(void *opaque)
{
    migration_end();
}

static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */


/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    mig_throttle_on = false;
    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* iothread lock needed for ram_list.dirty_memory[] */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    migration_bitmap = bitmap_new(ram_bitmap_pages);
    bitmap_set(migration_bitmap, 0, ram_bitmap_pages);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int pages_sent = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            break;
        }
        pages_sent += pages;
        acct_info.iterations++;
        check_guest_throttling();
        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check each some
           iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                        t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return pages_sent;
}
/* Called with iothread lock */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    migration_bitmap_sync();

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, true, &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    migration_end();

    rcu_read_unlock();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }
    return remaining_size;
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
static inline void *host_from_stream_offset(QEMUFile *f,
                                            ram_addr_t offset,
                                            int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block || block->max_length <= offset) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }

        return memory_region_get_ram_ptr(block->mr) + offset;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (!strncmp(id, block->idstr, sizeof(id)) &&
            block->max_length > offset) {
            return memory_region_get_ram_ptr(block->mr) + offset;
        }
    }

    error_report("Can't find block %s!", id);
    return NULL;
}
/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
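
/* Illustrative usage, not from the original source: the receive path reads
 * the single fill byte that save_zero_page() wrote and only touches the
 * destination when it is not already zero, so untouched (still-zero) guest
 * pages stay unfaulted on the destination:
 *
 *     uint8_t ch = qemu_get_byte(f);
 *     ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 */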
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;

    while (!quit_decomp_thread) {
        qemu_mutex_lock(&param->mutex);
        while (!param->start && !quit_decomp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        pagesize = TARGET_PAGE_SIZE;
        if (!quit_decomp_thread) {
            /* uncompress() can fail in some cases, especially when the
             * page is dirtied while being compressed. That's not a
             * problem, because the dirty page will be retransferred and
             * uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)param->des, &pagesize,
                       (const Bytef *)param->compbuf, param->len);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);
    }

    return NULL;
}
void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
    quit_decomp_thread = false;
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    quit_decomp_thread = true;
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    g_free(compressed_data_buf);
    decompress_threads = NULL;
    decomp_param = NULL;
    compressed_data_buf = NULL;
}
static void decompress_data_with_multi_threads(uint8_t *compbuf,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (!decomp_param[idx].start) {
                memcpy(decomp_param[idx].compbuf, compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                start_decompression(&decomp_param[idx]);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        }
    }
}
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();
    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
                    if (!strncmp(id, block->idstr, sizeof(id))) {
                        if (length != block->used_length) {
                            Error *local_err = NULL;

                            ret = qemu_ram_resize(block->offset, length, &local_err);
                            if (local_err) {
                                error_report_err(local_err);
                            }
                        }
                        break;
                    }
                }

                if (!block) {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            qemu_get_buffer(f, compressed_data_buf, len);
            decompress_data_with_multi_threads(compressed_data_buf, host, len);
            break;
        case RAM_SAVE_FLAG_XBZRLE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, flags);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    rcu_read_unlock();
    DPRINTF("Completed load of VM with exit code %d seq iteration "
            "%" PRIu64 "\n", ret, seq_iter);
    return ret;
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cancel = ram_migration_cancel,
};
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}
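
/* Illustrative call order - a schematic of how the migration core drives
 * the handlers above, not the real caller:
 *
 *     ram_save_setup(f, NULL);                    // once, writes block list
 *     while (ram_save_pending(f, NULL, max) > max) {
 *         ram_save_iterate(f, NULL);              // dirty pages, rate limited
 *     }
 *     ram_save_complete(f, NULL);                 // final pass, VM stopped
 *
 * On the destination, ram_load() consumes the stream until it sees a
 * RAM_SAVE_FLAG_EOS marker for the final section. */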
/* Stub function that gets run on the vcpu when it's brought out of the
   VM to run inside qemu via async_run_on_cpu() */
static void mig_sleep_cpu(void *opq)
{
    qemu_mutex_unlock_iothread();
    g_usleep(30*1000);
    qemu_mutex_lock_iothread();
}
/* To reduce the dirty rate explicitly disallow the VCPUs from spending
   much time in the VM.  The migration thread will try to catch up.
   Workload will experience a performance drop.
*/
static void mig_throttle_guest_down(void)
{
    CPUState *cpu;

    qemu_mutex_lock_iothread();
    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
    }
    qemu_mutex_unlock_iothread();
}
static void check_guest_throttling(void)
{
    static int64_t t0;
    int64_t        t1;

    if (!mig_throttle_on) {
        return;
    }

    if (!t0)  {
        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        return;
    }

    t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    /* If it has been more than 40 ms since the last time the guest
     * was throttled then do it again.
     */
    if (40 < (t1-t0)/1000000) {
        mig_throttle_guest_down();
        t0 = t1;
    }
}