/*
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
#include "qemu/bitops.h"
#include "qemu/bitmap.h"
#include "qemu/timer.h"
#include "qemu/main-loop.h"
#include "migration/migration.h"
#include "exec/address-spaces.h"
#include "migration/page_cache.h"
#include "qemu/error-report.h"
#include "exec/ram_addr.h"
#include "qemu/rcu_queue.h"
#ifdef DEBUG_MIGRATION_RAM
#define DPRINTF(fmt, ...) \
    do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif
static bool mig_throttle_on;
static int dirty_rate_high_cnt;
static void check_guest_throttling(void);

static uint64_t bitmap_sync_count;
/***********************************************************/
/* ram save/restore */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_COMPRESS 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
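/* These flags are OR'd into the low bits of the page offset written by
 * save_page_header() and picked apart again at the top of ram_load(). */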
static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE];
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_find_nonzero_offset(p, size) == size;
}
/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
} XBZRLE;

/* buffer used for XBZRLE decoding */
static uint8_t *xbzrle_decoded_buf;
static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_lock(&XBZRLE.lock);
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle())
        qemu_mutex_unlock(&XBZRLE.lock);
}
/*
 * called from qmp_migrate_set_cache_size in main thread, possibly while
 * a migration is in progress.
 * A running migration may be using the cache and might finish during this
 * call, hence changes to the cache are protected by XBZRLE.lock().
 */
int64_t xbzrle_cache_resize(int64_t new_size)
{
    PageCache *new_cache;
    int64_t ret;

    if (new_size < TARGET_PAGE_SIZE) {
        return -1;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
            goto out_new_size;
        }
        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
                               TARGET_PAGE_SIZE);
        if (!new_cache) {
            error_report("Error creating cache");
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }

out_new_size:
    ret = pow2floor(new_size);
out:
    XBZRLE_cache_unlock();
    return ret;
}
/* accounting for migration statistics */
typedef struct AccountingInfo {
    uint64_t dup_pages;
    uint64_t skipped_pages;
    uint64_t norm_pages;
    uint64_t iterations;
    uint64_t xbzrle_bytes;
    uint64_t xbzrle_pages;
    uint64_t xbzrle_cache_miss;
    double xbzrle_cache_miss_rate;
    uint64_t xbzrle_overflows;
} AccountingInfo;

static AccountingInfo acct_info;
static void acct_clear(void)
{
    memset(&acct_info, 0, sizeof(acct_info));
}
uint64_t dup_mig_bytes_transferred(void)
{
    return acct_info.dup_pages * TARGET_PAGE_SIZE;
}

uint64_t dup_mig_pages_transferred(void)
{
    return acct_info.dup_pages;
}

uint64_t skipped_mig_bytes_transferred(void)
{
    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
}

uint64_t skipped_mig_pages_transferred(void)
{
    return acct_info.skipped_pages;
}

uint64_t norm_mig_bytes_transferred(void)
{
    return acct_info.norm_pages * TARGET_PAGE_SIZE;
}

uint64_t norm_mig_pages_transferred(void)
{
    return acct_info.norm_pages;
}

uint64_t xbzrle_mig_bytes_transferred(void)
{
    return acct_info.xbzrle_bytes;
}

uint64_t xbzrle_mig_pages_transferred(void)
{
    return acct_info.xbzrle_pages;
}

uint64_t xbzrle_mig_pages_cache_miss(void)
{
    return acct_info.xbzrle_cache_miss;
}

double xbzrle_mig_cache_miss_rate(void)
{
    return acct_info.xbzrle_cache_miss_rate;
}

uint64_t xbzrle_mig_pages_overflow(void)
{
    return acct_info.xbzrle_overflows;
}
/* This is the last block that we have visited searching for dirty pages */
static RAMBlock *last_seen_block;
/* This is the last block from where we have sent data */
static RAMBlock *last_sent_block;
static ram_addr_t last_offset;
static unsigned long *migration_bitmap;
static uint64_t migration_dirty_pages;
static uint32_t last_version;
static bool ram_bulk_stage;
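/* last_seen_block/last_offset remember where the dirty-page scan stopped so
 * the next call can resume there; last_sent_block is what lets us send
 * RAM_SAVE_FLAG_CONTINUE instead of repeating the block name on the wire. */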
struct CompressParam {
    bool start;
    bool done;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool start;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
};
typedef struct DecompressParam DecompressParam;
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex *comp_done_lock;
static QemuCond *comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static bool compression_switch;
static bool quit_comp_thread;
static bool quit_decomp_thread;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static uint8_t *compressed_data_buf;
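/* Hand-off protocol between the migration thread and the compression
 * threads: the producer fills a CompressParam, sets ->start and signals
 * ->cond; the worker compresses into its private QEMUFile, clears ->start,
 * sets ->done and signals comp_done_cond so the buffered output can be
 * copied into the real migration stream. */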
static int do_compress_ram_page(CompressParam *param);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;

    while (!quit_comp_thread) {
        qemu_mutex_lock(&param->mutex);
        /* Re-check quit_comp_thread in case terminate_compression_threads()
         * is called just before qemu_mutex_lock(&param->mutex) and after
         * while (!quit_comp_thread); re-checking it here makes sure the
         * compression thread terminates as expected.
         */
        while (!param->start && !quit_comp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        if (!quit_comp_thread) {
            do_compress_ram_page(param);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);

        qemu_mutex_lock(comp_done_lock);
        param->done = true;
        qemu_cond_signal(comp_done_cond);
        qemu_mutex_unlock(comp_done_lock);
    }

    return NULL;
}
static inline void terminate_compression_threads(void)
{
    int idx, thread_count;

    thread_count = migrate_compress_threads();
    quit_comp_thread = true;
    for (idx = 0; idx < thread_count; idx++) {
        qemu_mutex_lock(&comp_param[idx].mutex);
        qemu_cond_signal(&comp_param[idx].cond);
        qemu_mutex_unlock(&comp_param[idx].mutex);
    }
}
void migrate_compress_threads_join(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    terminate_compression_threads();
    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(compress_threads + i);
        qemu_fclose(comp_param[i].file);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
    }
    qemu_mutex_destroy(comp_done_lock);
    qemu_cond_destroy(comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    g_free(comp_done_cond);
    g_free(comp_done_lock);
    compress_threads = NULL;
    comp_param = NULL;
    comp_done_cond = NULL;
    comp_done_lock = NULL;
}
void migrate_compress_threads_create(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    quit_comp_thread = false;
    compression_switch = true;
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    comp_done_cond = g_new0(QemuCond, 1);
    comp_done_lock = g_new0(QemuMutex, 1);
    qemu_cond_init(comp_done_cond);
    qemu_mutex_init(comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
/**
 * save_page_header: Write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns: Number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
{
    size_t size;

    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr,
                        strlen(block->idstr));
        size += 1 + strlen(block->idstr);
    }
    return size;
}
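/* On the wire this is: an 8-byte big-endian word carrying the page offset
 * with RAM_SAVE_FLAG_* bits in its low bits, optionally followed by a
 * one-byte idstr length and the idstr itself whenever the block changes
 * (i.e. RAM_SAVE_FLAG_CONTINUE is not set). */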
/* Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(ram_addr_t current_addr)
{
    if (ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
                 bitmap_sync_count);
}
#define ENCODING_FLAG_XBZRLE 0x1
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage,
                            uint64_t *bytes_transferred)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) {
        acct_info.xbzrle_cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             bitmap_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);
    if (encoded_len == 0) {
        DPRINTF("Skipping unmodified page\n");
        return 0;
    } else if (encoded_len == -1) {
        DPRINTF("Overflow\n");
        acct_info.xbzrle_overflows++;
        /* update data in the cache */
        if (!last_stage) {
            memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
            *current_data = prev_cached_page;
        }
        return -1;
    }

    /* we need to update the data in the cache, in order to get the same data */
    if (!last_stage) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(f, encoded_len);
    qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;
    acct_info.xbzrle_pages++;
    acct_info.xbzrle_bytes += bytes_xbzrle;
    *bytes_transferred += bytes_xbzrle;

    return 1;
}
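/* The XBZRLE record sent above is the usual page header, then a one-byte
 * ENCODING_FLAG_XBZRLE marker, a big-endian 16-bit encoded length, and the
 * encoded delta itself; load_xbzrle() undoes this on the destination. */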
/* Called with rcu_read_lock() to protect migration_bitmap */
static inline
ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
                                                 ram_addr_t start)
{
    unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
    unsigned long nr = base + (start >> TARGET_PAGE_BITS);
    uint64_t mr_size = TARGET_PAGE_ALIGN(memory_region_size(mr));
    unsigned long size = base + (mr_size >> TARGET_PAGE_BITS);
    unsigned long *bitmap;

    unsigned long next;

    bitmap = atomic_rcu_read(&migration_bitmap);
    if (ram_bulk_stage && nr > base) {
        next = nr + 1;
    } else {
        next = find_next_bit(bitmap, size, nr);
    }

    if (next < size) {
        clear_bit(next, bitmap);
        migration_dirty_pages--;
    }
    return (next - base) << TARGET_PAGE_BITS;
}
/* Called with rcu_read_lock() to protect migration_bitmap */
static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
{
    unsigned long *bitmap;
    bitmap = atomic_rcu_read(&migration_bitmap);
    migration_dirty_pages +=
        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
}
/* Fix me: there are too many global variables used in migration process. */
static int64_t start_time;
static int64_t bytes_xfer_prev;
static int64_t num_dirty_pages_period;
static uint64_t xbzrle_cache_miss_prev;
static uint64_t iterations_prev;
static void migration_bitmap_sync_init(void)
{
    start_time = 0;
    bytes_xfer_prev = 0;
    num_dirty_pages_period = 0;
    xbzrle_cache_miss_prev = 0;
    iterations_prev = 0;
}
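/* migration_bitmap_sync() below pulls the dirty log from the memory core
 * into migration_bitmap for every RAM block and recomputes the dirty-page
 * rate that drives auto-converge throttling and the XBZRLE cache-miss
 * statistics. */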
/* Called with iothread lock held, to protect ram_list.dirty_memory[] */
static void migration_bitmap_sync(void)
{
    RAMBlock *block;
    uint64_t num_dirty_pages_init = migration_dirty_pages;
    MigrationState *s = migrate_get_current();
    int64_t end_time;
    int64_t bytes_xfer_now;

    bitmap_sync_count++;

    if (!bytes_xfer_prev) {
        bytes_xfer_prev = ram_bytes_transferred();
    }

    if (!start_time) {
        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    address_space_sync_dirty_bitmap(&address_space_memory);

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        migration_bitmap_sync_range(block->mr->ram_addr, block->used_length);
    }
    rcu_read_unlock();

    trace_migration_bitmap_sync_end(migration_dirty_pages
                                    - num_dirty_pages_init);
    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > start_time + 1000) {
        if (migrate_auto_converge()) {
            /* The following detection logic can be refined later. For now:
               Check to see if the dirtied bytes are 50% more than the approx.
               amount of bytes that just got transferred since the last time
               we were in this routine. If that happens >N times (for now N==4)
               we turn on the throttle down logic */
            bytes_xfer_now = ram_bytes_transferred();
            if (s->dirty_pages_rate &&
                (num_dirty_pages_period * TARGET_PAGE_SIZE >
                    (bytes_xfer_now - bytes_xfer_prev) / 2) &&
                (dirty_rate_high_cnt++ > 4)) {
                trace_migration_throttle();
                mig_throttle_on = true;
                dirty_rate_high_cnt = 0;
            }
            bytes_xfer_prev = bytes_xfer_now;
        } else {
            mig_throttle_on = false;
        }
        if (migrate_use_xbzrle()) {
            if (iterations_prev != acct_info.iterations) {
                acct_info.xbzrle_cache_miss_rate =
                    (double)(acct_info.xbzrle_cache_miss -
                             xbzrle_cache_miss_prev) /
                    (acct_info.iterations - iterations_prev);
            }
            iterations_prev = acct_info.iterations;
            xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss;
        }
        s->dirty_pages_rate = num_dirty_pages_period * 1000
            / (end_time - start_time);
        s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
        start_time = end_time;
        num_dirty_pages_period = 0;
    }
    s->dirty_sync_count = bitmap_sync_count;
}
/**
 * save_zero_page: Send the zero page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @p: pointer to the page
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                          uint8_t *p, uint64_t *bytes_transferred)
{
    int pages = -1;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        acct_info.dup_pages++;
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_COMPRESS);
        qemu_put_byte(f, 0);
        *bytes_transferred += 1;
        pages = 1;
    }

    return pages;
}
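/* A zero page therefore costs only the page header plus one byte (the fill
 * byte 0) on the wire; the destination recreates it in
 * ram_handle_compressed(). */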
/**
 * ram_save_page: Send the given page to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset,
                         bool last_stage, uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    ram_addr_t current_addr;
    MemoryRegion *mr = block->mr;
    uint8_t *p;
    int ret;
    bool send_async = true;

    p = memory_region_get_ram_ptr(mr) + offset;

    /* When in doubt, send the page as a normal page */
    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }

    XBZRLE_cache_lock();

    current_addr = block->offset + offset;

    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        pages = save_zero_page(f, block, offset, p, bytes_transferred);
        if (pages > 0) {
            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
             * page would be stale
             */
            xbzrle_cache_zero_page(current_addr);
        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
            pages = save_xbzrle_page(f, &p, current_addr, block,
                                     offset, last_stage, bytes_transferred);
            if (!last_stage) {
                /* Can't send this cached data async, since the cache page
                 * might get updated before it gets to the wire
                 */
                send_async = false;
            }
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        *bytes_transferred += save_page_header(f, block,
                                               offset | RAM_SAVE_FLAG_PAGE);
        if (send_async) {
            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
        } else {
            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
        }
        *bytes_transferred += TARGET_PAGE_SIZE;
        pages = 1;
        acct_info.norm_pages++;
    }

    XBZRLE_cache_unlock();

    return pages;
}
static int do_compress_ram_page(CompressParam *param)
{
    int bytes_sent, blen;
    uint8_t *p;
    RAMBlock *block = param->block;
    ram_addr_t offset = param->offset;

    p = memory_region_get_ram_ptr(block->mr) + (offset & TARGET_PAGE_MASK);

    bytes_sent = save_page_header(param->file, block, offset |
                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
                                     migrate_compress_level());
    bytes_sent += blen;

    return bytes_sent;
}
static inline void start_compression(CompressParam *param)
{
    param->done = false;
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}

static inline void start_decompression(DecompressParam *param)
{
    qemu_mutex_lock(&param->mutex);
    param->start = true;
    qemu_cond_signal(&param->cond);
    qemu_mutex_unlock(&param->mutex);
}
static uint64_t bytes_transferred;
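/* flush_compressed_data: wait for every compression thread to finish its
 * current page and copy whatever it has buffered in its private QEMUFile
 * into the real migration stream. */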
static void flush_compressed_data(QEMUFile *f)
{
    int idx, len, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_compress_threads();
    for (idx = 0; idx < thread_count; idx++) {
        if (!comp_param[idx].done) {
            qemu_mutex_lock(comp_done_lock);
            while (!comp_param[idx].done && !quit_comp_thread) {
                qemu_cond_wait(comp_done_cond, comp_done_lock);
            }
            qemu_mutex_unlock(comp_done_lock);
        }
        if (!quit_comp_thread) {
            len = qemu_put_qemu_file(f, comp_param[idx].file);
            bytes_transferred += len;
        }
    }
}
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block,
                                           ram_addr_t offset,
                                           uint64_t *bytes_transferred)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(comp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (comp_param[idx].done) {
                bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file);
                set_compress_params(&comp_param[idx], block, offset);
                start_compression(&comp_param[idx]);
                pages = 1;
                acct_info.norm_pages++;
                *bytes_transferred += bytes_xmit;
                break;
            }
        }
        if (pages > 0) {
            break;
        } else {
            qemu_cond_wait(comp_done_cond, comp_done_lock);
        }
    }
    qemu_mutex_unlock(comp_done_lock);

    return pages;
}
/**
 * ram_save_compressed_page: compress the given page and send it to the stream
 *
 * Returns: Number of pages written.
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
                                    ram_addr_t offset, bool last_stage,
                                    uint64_t *bytes_transferred)
{
    int pages = -1;
    uint64_t bytes_xmit;
    MemoryRegion *mr = block->mr;
    uint8_t *p;
    int ret;

    p = memory_region_get_ram_ptr(mr) + offset;

    bytes_xmit = 0;
    ret = ram_control_save_page(f, block->offset,
                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
    if (bytes_xmit) {
        *bytes_transferred += bytes_xmit;
        pages = 1;
    }
    if (block == last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
        if (ret != RAM_SAVE_CONTROL_DELAYED) {
            if (bytes_xmit > 0) {
                acct_info.norm_pages++;
            } else if (bytes_xmit == 0) {
                acct_info.dup_pages++;
            }
        }
    } else {
        /* When starting the process of a new block, the first page of
         * the block should be sent out before other pages in the same
         * block, and all the pages in last block should have been sent
         * out, keeping this order is important, because the 'cont' flag
         * is used to avoid resending the block name.
         */
        if (block != last_sent_block) {
            flush_compressed_data(f);
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                set_compress_params(&comp_param[0], block, offset);
                /* Use the qemu thread to compress the data to make sure the
                 * first page is sent out before other pages
                 */
                bytes_xmit = do_compress_ram_page(&comp_param[0]);
                acct_info.norm_pages++;
                qemu_put_qemu_file(f, comp_param[0].file);
                *bytes_transferred += bytes_xmit;
                pages = 1;
            }
        } else {
            pages = save_zero_page(f, block, offset, p, bytes_transferred);
            if (pages == -1) {
                pages = compress_page_with_multi_thread(f, block, offset,
                                                        bytes_transferred);
            }
        }
    }

    return pages;
}
/**
 * ram_find_and_save_block: Finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns:  The number of pages written
 *           0 means no dirty pages
 *
 * @f: QEMUFile where to send the data
 * @last_stage: if we are at the completion stage
 * @bytes_transferred: increase it with the number of transferred bytes
 */
static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                   uint64_t *bytes_transferred)
{
    RAMBlock *block = last_seen_block;
    ram_addr_t offset = last_offset;
    bool complete_round = false;
    int pages = 0;
    MemoryRegion *mr;

    if (!block) {
        block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    while (true) {
        mr = block->mr;
        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
        if (complete_round && block == last_seen_block &&
            offset >= last_offset) {
            break;
        }
        if (offset >= block->used_length) {
            offset = 0;
            block = QLIST_NEXT_RCU(block, next);
            if (!block) {
                block = QLIST_FIRST_RCU(&ram_list.blocks);
                complete_round = true;
                ram_bulk_stage = false;
                if (migrate_use_xbzrle()) {
                    /* If xbzrle is on, stop using the data compression at this
                     * point. In theory, xbzrle can do better than compression.
                     */
                    flush_compressed_data(f);
                    compression_switch = false;
                }
            }
        } else {
            if (compression_switch && migrate_use_compression()) {
                pages = ram_save_compressed_page(f, block, offset, last_stage,
                                                 bytes_transferred);
            } else {
                pages = ram_save_page(f, block, offset, last_stage,
                                      bytes_transferred);
            }

            /* if page is unmodified, continue to the next */
            if (pages > 0) {
                last_sent_block = block;
                break;
            }
        }
    }

    last_seen_block = block;
    last_offset = offset;

    return pages;
}
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;
    if (zero) {
        acct_info.dup_pages += pages;
    } else {
        acct_info.norm_pages += pages;
        bytes_transferred += size;
        qemu_update_position(f, size);
    }
}
static ram_addr_t ram_save_remaining(void)
{
    return migration_dirty_pages;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_save_remaining() * TARGET_PAGE_SIZE;
}

uint64_t ram_bytes_transferred(void)
{
    return bytes_transferred;
}

uint64_t ram_bytes_total(void)
{
    RAMBlock *block;
    uint64_t total = 0;

    rcu_read_lock();
    QLIST_FOREACH_RCU(block, &ram_list.blocks, next)
        total += block->used_length;
    rcu_read_unlock();
    return total;
}
void free_xbzrle_decoded_buf(void)
{
    g_free(xbzrle_decoded_buf);
    xbzrle_decoded_buf = NULL;
}
static void migration_end(void)
{
    /* The caller holds the iothread lock or is in a bh, so there is
     * no writing race against this migration_bitmap.
     */
    unsigned long *bitmap = migration_bitmap;
    atomic_rcu_set(&migration_bitmap, NULL);
    if (bitmap) {
        memory_global_dirty_log_stop();
        synchronize_rcu();
        g_free(bitmap);
    }

    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
    }
    XBZRLE_cache_unlock();
}
static void ram_migration_cancel(void *opaque)
{
    migration_end();
}
static void reset_ram_globals(void)
{
    last_seen_block = NULL;
    last_sent_block = NULL;
    last_offset = 0;
    last_version = ram_list.version;
    ram_bulk_stage = true;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */

/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */
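/* ram_save_setup: runs once at the start of migration.  It allocates the
 * XBZRLE cache and the migration bitmap, marks every page dirty, and writes
 * the RAM block directory (idstr plus used_length for each block) to the
 * stream. */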
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMBlock *block;
    int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */

    mig_throttle_on = false;
    dirty_rate_high_cnt = 0;
    bitmap_sync_count = 0;
    migration_bitmap_sync_init();

    if (migrate_use_xbzrle()) {
        XBZRLE_cache_lock();
        XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
                                  TARGET_PAGE_SIZE,
                                  TARGET_PAGE_SIZE);
        if (!XBZRLE.cache) {
            XBZRLE_cache_unlock();
            error_report("Error creating cache");
            return -1;
        }
        XBZRLE_cache_unlock();

        /* We prefer not to abort if there is no memory */
        XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
        if (!XBZRLE.encoded_buf) {
            error_report("Error allocating encoded_buf");
            return -1;
        }

        XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!XBZRLE.current_buf) {
            error_report("Error allocating current_buf");
            g_free(XBZRLE.encoded_buf);
            XBZRLE.encoded_buf = NULL;
            return -1;
        }

        acct_clear();
    }

    /* iothread lock needed for ram_list.dirty_memory[] */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();
    rcu_read_lock();
    bytes_transferred = 0;
    reset_ram_globals();

    ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
    migration_bitmap = bitmap_new(ram_bitmap_pages);
    bitmap_set(migration_bitmap, 0, ram_bitmap_pages);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     */
    migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;

    memory_global_dirty_log_start();
    migration_bitmap_sync();
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();

    qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        qemu_put_byte(f, strlen(block->idstr));
        qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
        qemu_put_be64(f, block->used_length);
    }

    rcu_read_unlock();

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
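/* ram_save_iterate: one pass of the live phase.  It keeps calling
 * ram_find_and_save_block() until the QEMUFile rate limit is hit or no dirty
 * pages are left, checking the elapsed time every 64 iterations so a single
 * pass cannot stall the main loop for too long. */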
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    int ret;
    int i;
    int64_t t0;
    int pages_sent = 0;

    rcu_read_lock();
    if (ram_list.version != last_version) {
        reset_ram_globals();
    }

    /* Read version before ram_list.blocks */
    smp_rmb();

    ram_control_before_iterate(f, RAM_CONTROL_ROUND);

    t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
    i = 0;
    while ((ret = qemu_file_rate_limit(f)) == 0) {
        int pages;

        pages = ram_find_and_save_block(f, false, &bytes_transferred);
        /* no more pages to send */
        if (pages == 0) {
            break;
        }
        pages_sent += pages;
        acct_info.iterations++;
        check_guest_throttling();
        /* we want to check in the 1st loop, just in case it was the 1st time
           and we had to sync the dirty bitmap.
           qemu_get_clock_ns() is a bit expensive, so we only check every few
           iterations
        */
        if ((i & 63) == 0) {
            uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
            if (t1 > MAX_WAIT) {
                DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                        t1, i);
                break;
            }
        }
        i++;
    }
    flush_compressed_data(f);
    rcu_read_unlock();

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    bytes_transferred += 8;

    ret = qemu_file_get_error(f);
    if (ret < 0) {
        return ret;
    }

    return pages_sent;
}
/* Called with iothread lock */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    rcu_read_lock();

    migration_bitmap_sync();

    ram_control_before_iterate(f, RAM_CONTROL_FINISH);

    /* try transferring iterative blocks of memory */

    /* flush all remaining blocks regardless of rate limiting */
    while (true) {
        int pages;

        pages = ram_find_and_save_block(f, true, &bytes_transferred);
        /* no more blocks to send */
        if (pages == 0) {
            break;
        }
    }

    flush_compressed_data(f);
    ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    migration_end();

    rcu_read_unlock();
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

    return 0;
}
static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
{
    uint64_t remaining_size;

    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;

    if (remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        rcu_read_lock();
        migration_bitmap_sync();
        rcu_read_unlock();
        qemu_mutex_unlock_iothread();
        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
    }
    return remaining_size;
}
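/* load_xbzrle: read one XBZRLE record (flag byte, big-endian 16-bit length,
 * encoded data) from the stream and decode it over the current contents of
 * the destination page.  Returns 0 on success, -1 on a malformed record. */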
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;

    if (!xbzrle_decoded_buf) {
        xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
    }

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    /* load data and decode */
    qemu_get_buffer(f, xbzrle_decoded_buf, xh_len);

    if (xbzrle_decode_buffer(xbzrle_decoded_buf, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
/* Must be called from within a rcu critical section.
 * Returns a pointer from within the RCU-protected ram_list.
 */
static inline void *host_from_stream_offset(QEMUFile *f,
                                            ram_addr_t offset,
                                            int flags)
{
    static RAMBlock *block = NULL;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block || block->max_length <= offset) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }

        return memory_region_get_ram_ptr(block->mr) + offset;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
        if (!strncmp(id, block->idstr, sizeof(id)) &&
            block->max_length > offset) {
            return memory_region_get_ram_ptr(block->mr) + offset;
        }
    }

    error_report("Can't find block %s!", id);
    return NULL;
}
/*
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;

    while (!quit_decomp_thread) {
        qemu_mutex_lock(&param->mutex);
        while (!param->start && !quit_decomp_thread) {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
        pagesize = TARGET_PAGE_SIZE;
        if (!quit_decomp_thread) {
            /* uncompress() can fail in some cases, especially when the page
             * was dirtied while it was being compressed; that's not a
             * problem because the dirty page will be retransferred and
             * uncompress() won't break the data in other pages.
             */
            uncompress((Bytef *)param->des, &pagesize,
                       (const Bytef *)param->compbuf, param->len);
        }
        param->start = false;
        qemu_mutex_unlock(&param->mutex);
    }

    return NULL;
}
void migrate_decompress_threads_create(void)
{
    int i, thread_count;

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    compressed_data_buf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
    quit_decomp_thread = false;
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
}
void migrate_decompress_threads_join(void)
{
    int i, thread_count;

    quit_decomp_thread = true;
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        qemu_mutex_lock(&decomp_param[i].mutex);
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        g_free(decomp_param[i].compbuf);
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    g_free(compressed_data_buf);
    decompress_threads = NULL;
    decomp_param = NULL;
    compressed_data_buf = NULL;
}
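/* Hand one compressed page to an idle decompression thread; the caller has
 * already read the compressed bytes from the stream into compbuf. */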
static void decompress_data_with_multi_threads(uint8_t *compbuf,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (!decomp_param[idx].start) {
                memcpy(decomp_param[idx].compbuf, compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                start_decompression(&decomp_param[idx]);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        }
    }
}
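/* ram_load: parse the incoming stream.  Each record starts with a be64 word
 * whose low bits are RAM_SAVE_FLAG_* and whose upper bits are the page
 * offset; the switch below dispatches on the flag to load the block list,
 * a zero page, a full page, a zlib-compressed page or an XBZRLE page. */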
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int flags = 0, ret = 0;
    static uint64_t seq_iter;
    int len = 0;

    seq_iter++;

    if (version_id != 4) {
        ret = -EINVAL;
    }

    /* This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    rcu_read_lock();
    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host;
        uint8_t ch;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
                    if (!strncmp(id, block->idstr, sizeof(id))) {
                        if (length != block->used_length) {
                            Error *local_err = NULL;

                            ret = qemu_ram_resize(block->offset, length,
                                                  &local_err);
                            if (local_err) {
                                error_report_err(local_err);
                            }
                        }
                        ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                              block->idstr);
                        break;
                    }
                }

                if (!block) {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }

            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            qemu_get_buffer(f, compressed_data_buf, len);
            decompress_data_with_multi_threads(compressed_data_buf, host, len);
            break;
        case RAM_SAVE_FLAG_XBZRLE:
            host = host_from_stream_offset(f, addr, flags);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: %#x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
    }

    rcu_read_unlock();
    DPRINTF("Completed load of VM with exit code %d seq iteration "
            "%" PRIu64 "\n", ret, seq_iter);
    return ret;
}
static SaveVMHandlers savevm_ram_handlers = {
    .save_live_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete = ram_save_complete,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .cancel = ram_migration_cancel,
};
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL);
}
/* Stub function that gets run on the vcpu when it's brought out of the
   VM to run inside qemu via async_run_on_cpu() */
static void mig_sleep_cpu(void *opq)
{
    qemu_mutex_unlock_iothread();
    g_usleep(30 * 1000);
    qemu_mutex_lock_iothread();
}
/* To reduce the dirty rate, explicitly disallow the VCPUs from spending
   much time in the VM. The migration thread will try to catch up.
   Workload will experience a performance drop.
 */
static void mig_throttle_guest_down(void)
{
    CPUState *cpu;

    qemu_mutex_lock_iothread();
    CPU_FOREACH(cpu) {
        async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
    }
    qemu_mutex_unlock_iothread();
}
static void check_guest_throttling(void)
{
    static int64_t t0;
    int64_t        t1;

    if (!mig_throttle_on) {
        return;
    }

    if (!t0) {
        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        return;
    }

    t1 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);

    /* If it has been more than 40 ms since the last time the guest
     * was throttled then do it again.
     */
    if (40 < (t1 - t0) / 1000000) {
        mig_throttle_guest_down();
        t0 = t1;
    }
}