osdep.h: Remove <sys/signal.h> include
[qemu/ar7.git] / migration / ram.c
blob7811cde643a1f0ebf12e870373ba5eef07a6436d
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "cpu.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
35 #include "xbzrle.h"
36 #include "ram.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
48 #include "trace.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
53 #include "block.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
60 /***********************************************************/
61 /* ram save/restore */
/*
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  We switched
 * it to only search for the zero value.  To avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE it was simply renamed.
 *
 * These flags are OR'ed into the low bits of the page offset written
 * by save_page_header().
 */

#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO 0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE 0x08
#define RAM_SAVE_FLAG_EOS 0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE 0x40
/* 0x80 is reserved in migration.h start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
/*
 * is_zero_range: thin wrapper around buffer_is_zero()
 *
 * Returns whatever buffer_is_zero() reports for the @size bytes at @p.
 */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool zero = buffer_is_zero(p, size);

    return zero;
}
/* XBZRLE statistics; updated by save_xbzrle_page() and migration_update_rates() */
XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from qmp_migrate_set_cache_size in main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
129 int xbzrle_cache_resize(int64_t new_size, Error **errp)
131 PageCache *new_cache;
132 int64_t ret = 0;
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
146 XBZRLE_cache_lock();
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
163 bool ramblock_is_ignored(RAMBlock *block)
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 RAMBlock *block;
174 int ret = 0;
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
184 return ret;
187 static void ramblock_recv_map_init(void)
189 RAMBlock *rb;
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 if success with sent bytes, or <0 if error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
240 nbits = block->used_length >> TARGET_PAGE_BITS;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit before hand.
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 * Always use little endian when sending the bitmap. This is
251 * required that when source and destination VMs are not using the
252 * same endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
265 size = ROUND_UP(size, 8);
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
276 g_free(le_bitmap);
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
282 return size + sizeof(size);
/*
 * An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    /* Block the request refers to */
    RAMBlock *rb;
    /* NOTE(review): presumably byte offset/length within @rb — confirm */
    hwaddr    offset;
    hwaddr    len;

    /* Link in RAMState.src_page_requests */
    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* amount of count that no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

/* Current RAM migration state; readers in this file tolerate NULL */
static RAMState *ram_state;

/* Notifiers invoked by precopy_notify() */
static NotifierWithReturnList precopy_notifier_list;
357 void precopy_infrastructure_init(void)
359 notifier_with_return_list_init(&precopy_notifier_list);
362 void precopy_add_notifier(NotifierWithReturn *n)
364 notifier_with_return_list_add(&precopy_notifier_list, n);
367 void precopy_remove_notifier(NotifierWithReturn *n)
369 notifier_with_return_remove(n);
372 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
374 PrecopyNotifyData pnd;
375 pnd.reason = reason;
376 pnd.errp = errp;
378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 void precopy_enable_free_page_optimization(void)
383 if (!ram_state) {
384 return;
387 ram_state->fpo_enabled = true;
390 uint64_t ram_bytes_remaining(void)
392 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
/* RAM migration counters (transferred, normal, duplicate, dirty_sync_count, ...) */
MigrationStats ram_counters;

/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;
/* Compression statistics (busy, pages, compressed_size, rates) */
CompressionStats compression_counters;

/* Per-thread state of one compression worker (see do_data_compress()) */
struct CompressParam {
    /* Set by the worker, under comp_done_lock, once a page is compressed */
    bool done;
    /* Set under @mutex to ask the worker to exit */
    bool quit;
    /* Result of do_compress_ram_page() for the last page */
    bool zero_page;
    /* Dummy QEMUFile used as an output buffer; non-NULL once initialised */
    QEMUFile *file;
    /* @mutex/@cond guard and signal @block, @offset and @quit */
    QemuMutex mutex;
    QemuCond cond;
    /* Pending request; @block is reset to NULL when the worker takes it */
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;
/*
 * Per-thread state of one decompression worker.
 * NOTE(review): the users of these fields are outside this view;
 * presumably the layout mirrors CompressParam — confirm.
 */
struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;
/* Arrays of migrate_compress_threads() entries, NULL when not set up */
static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

/* Decompression-side counterparts; their users are outside this view */
static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

/* Forward declaration: defined later in the file, used by do_data_compress() */
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
459 static void *do_data_compress(void *opaque)
461 CompressParam *param = opaque;
462 RAMBlock *block;
463 ram_addr_t offset;
464 bool zero_page;
466 qemu_mutex_lock(&param->mutex);
467 while (!param->quit) {
468 if (param->block) {
469 block = param->block;
470 offset = param->offset;
471 param->block = NULL;
472 qemu_mutex_unlock(&param->mutex);
474 zero_page = do_compress_ram_page(param->file, &param->stream,
475 block, offset, param->originbuf);
477 qemu_mutex_lock(&comp_done_lock);
478 param->done = true;
479 param->zero_page = zero_page;
480 qemu_cond_signal(&comp_done_cond);
481 qemu_mutex_unlock(&comp_done_lock);
483 qemu_mutex_lock(&param->mutex);
484 } else {
485 qemu_cond_wait(&param->cond, &param->mutex);
488 qemu_mutex_unlock(&param->mutex);
490 return NULL;
493 static void compress_threads_save_cleanup(void)
495 int i, thread_count;
497 if (!migrate_use_compression() || !comp_param) {
498 return;
501 thread_count = migrate_compress_threads();
502 for (i = 0; i < thread_count; i++) {
504 * we use it as a indicator which shows if the thread is
505 * properly init'd or not
507 if (!comp_param[i].file) {
508 break;
511 qemu_mutex_lock(&comp_param[i].mutex);
512 comp_param[i].quit = true;
513 qemu_cond_signal(&comp_param[i].cond);
514 qemu_mutex_unlock(&comp_param[i].mutex);
516 qemu_thread_join(compress_threads + i);
517 qemu_mutex_destroy(&comp_param[i].mutex);
518 qemu_cond_destroy(&comp_param[i].cond);
519 deflateEnd(&comp_param[i].stream);
520 g_free(comp_param[i].originbuf);
521 qemu_fclose(comp_param[i].file);
522 comp_param[i].file = NULL;
524 qemu_mutex_destroy(&comp_done_lock);
525 qemu_cond_destroy(&comp_done_cond);
526 g_free(compress_threads);
527 g_free(comp_param);
528 compress_threads = NULL;
529 comp_param = NULL;
532 static int compress_threads_save_setup(void)
534 int i, thread_count;
536 if (!migrate_use_compression()) {
537 return 0;
539 thread_count = migrate_compress_threads();
540 compress_threads = g_new0(QemuThread, thread_count);
541 comp_param = g_new0(CompressParam, thread_count);
542 qemu_cond_init(&comp_done_cond);
543 qemu_mutex_init(&comp_done_lock);
544 for (i = 0; i < thread_count; i++) {
545 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
546 if (!comp_param[i].originbuf) {
547 goto exit;
550 if (deflateInit(&comp_param[i].stream,
551 migrate_compress_level()) != Z_OK) {
552 g_free(comp_param[i].originbuf);
553 goto exit;
556 /* comp_param[i].file is just used as a dummy buffer to save data,
557 * set its ops to empty.
559 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
560 comp_param[i].done = true;
561 comp_param[i].quit = false;
562 qemu_mutex_init(&comp_param[i].mutex);
563 qemu_cond_init(&comp_param[i].cond);
564 qemu_thread_create(compress_threads + i, "compress",
565 do_data_compress, comp_param + i,
566 QEMU_THREAD_JOINABLE);
568 return 0;
570 exit:
571 compress_threads_save_cleanup();
572 return -1;
576 * save_page_header: write page header to wire
578 * If this is the 1st block, it also writes the block identification
580 * Returns the number of bytes written
582 * @f: QEMUFile where to send the data
583 * @block: block that contains the page we want to send
584 * @offset: offset inside the block for the page
585 * in the lower bits, it contains flags
587 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
588 ram_addr_t offset)
590 size_t size, len;
592 if (block == rs->last_sent_block) {
593 offset |= RAM_SAVE_FLAG_CONTINUE;
595 qemu_put_be64(f, offset);
596 size = 8;
598 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
599 len = strlen(block->idstr);
600 qemu_put_byte(f, len);
601 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
602 size += 1 + len;
603 rs->last_sent_block = block;
605 return size;
609 * mig_throttle_guest_down: throotle down the guest
611 * Reduce amount of guest cpu execution to hopefully slow down memory
612 * writes. If guest dirty memory rate is reduced below the rate at
613 * which we can transfer pages to the destination then we should be
614 * able to complete migration. Some workloads dirty memory way too
615 * fast and will not effectively converge, even with auto-converge.
617 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
618 uint64_t bytes_dirty_threshold)
620 MigrationState *s = migrate_get_current();
621 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
622 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
623 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
624 int pct_max = s->parameters.max_cpu_throttle;
626 uint64_t throttle_now = cpu_throttle_get_percentage();
627 uint64_t cpu_now, cpu_ideal, throttle_inc;
629 /* We have not started throttling yet. Let's start it. */
630 if (!cpu_throttle_active()) {
631 cpu_throttle_set(pct_initial);
632 } else {
633 /* Throttling already on, just increase the rate */
634 if (!pct_tailslow) {
635 throttle_inc = pct_increment;
636 } else {
637 /* Compute the ideal CPU percentage used by Guest, which may
638 * make the dirty rate match the dirty rate threshold. */
639 cpu_now = 100 - throttle_now;
640 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
641 bytes_dirty_period);
642 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
644 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
651 * @rs: current RAM state
652 * @current_addr: address for the zero page
654 * Update the xbzrle cache to reflect a page that's been sent as all 0.
655 * The important thing is that a stale (not-yet-0'd) page be replaced
656 * by the new data.
657 * As a bonus, if the page wasn't in the cache it gets added so that
658 * when a small write is made into the 0'd page it gets XBZRLE sent.
660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
662 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
663 return;
666 /* We don't care if this fails to allocate a new cache page
667 * as long as it updated an old one */
668 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
669 ram_counters.dirty_sync_count);
672 #define ENCODING_FLAG_XBZRLE 0x1
675 * save_xbzrle_page: compress and send current page
677 * Returns: 1 means that we wrote the page
678 * 0 means that page is identical to the one already sent
679 * -1 means that xbzrle would be longer than normal
681 * @rs: current RAM state
682 * @current_data: pointer to the address of the page contents
683 * @current_addr: addr of the page
684 * @block: block that contains the page we want to send
685 * @offset: offset inside the block for the page
686 * @last_stage: if we are at the completion stage
688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
689 ram_addr_t current_addr, RAMBlock *block,
690 ram_addr_t offset, bool last_stage)
692 int encoded_len = 0, bytes_xbzrle;
693 uint8_t *prev_cached_page;
695 if (!cache_is_cached(XBZRLE.cache, current_addr,
696 ram_counters.dirty_sync_count)) {
697 xbzrle_counters.cache_miss++;
698 if (!last_stage) {
699 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
700 ram_counters.dirty_sync_count) == -1) {
701 return -1;
702 } else {
703 /* update *current_data when the page has been
704 inserted into cache */
705 *current_data = get_cached_data(XBZRLE.cache, current_addr);
708 return -1;
712 * Reaching here means the page has hit the xbzrle cache, no matter what
713 * encoding result it is (normal encoding, overflow or skipping the page),
714 * count the page as encoded. This is used to calculate the encoding rate.
716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
717 * 2nd page turns out to be skipped (i.e. no new bytes written to the
718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
719 * skipped page included. In this way, the encoding rate can tell if the
720 * guest page is good for xbzrle encoding.
722 xbzrle_counters.pages++;
723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
725 /* save current buffer into memory */
726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
728 /* XBZRLE encoding (if there is no overflow) */
729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
731 TARGET_PAGE_SIZE);
734 * Update the cache contents, so that it corresponds to the data
735 * sent, in all cases except where we skip the page.
737 if (!last_stage && encoded_len != 0) {
738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
740 * In the case where we couldn't compress, ensure that the caller
741 * sends the data from the cache, since the guest might have
742 * changed the RAM since we copied it.
744 *current_data = prev_cached_page;
747 if (encoded_len == 0) {
748 trace_save_xbzrle_page_skipping();
749 return 0;
750 } else if (encoded_len == -1) {
751 trace_save_xbzrle_page_overflow();
752 xbzrle_counters.overflow++;
753 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
754 return -1;
757 /* Send XBZRLE based compressed page */
758 bytes_xbzrle = save_page_header(rs, rs->f, block,
759 offset | RAM_SAVE_FLAG_XBZRLE);
760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
761 qemu_put_be16(rs->f, encoded_len);
762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
763 bytes_xbzrle += encoded_len + 1 + 2;
765 * Like compressed_size (please see update_compress_thread_counts),
766 * the xbzrle encoded bytes don't count the 8 byte header with
767 * RAM_SAVE_FLAG_CONTINUE.
769 xbzrle_counters.bytes += bytes_xbzrle - 8;
770 ram_counters.transferred += bytes_xbzrle;
772 return 1;
776 * migration_bitmap_find_dirty: find the next dirty page from start
778 * Returns the page offset within memory region of the start of a dirty page
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
790 unsigned long next;
792 if (ramblock_is_ignored(rb)) {
793 return size;
797 * When the free page optimization is enabled, we need to check the bitmap
798 * to send the non-free pages rather than all the pages in the bulk stage.
800 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
801 next = start + 1;
802 } else {
803 next = find_next_bit(bitmap, size, start);
806 return next;
809 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
810 RAMBlock *rb,
811 unsigned long page)
813 bool ret;
815 qemu_mutex_lock(&rs->bitmap_mutex);
818 * Clear dirty bitmap if needed. This _must_ be called before we
819 * send any of the page in the chunk because we need to make sure
820 * we can capture further page content changes when we sync dirty
821 * log the next time. So as long as we are going to send any of
822 * the page in the chunk we clear the remote dirty bitmap for all.
823 * Clearing it earlier won't be a problem, but too late will.
825 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
826 uint8_t shift = rb->clear_bmap_shift;
827 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
828 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
831 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
832 * can make things easier sometimes since then start address
833 * of the small chunk will always be 64 pages aligned so the
834 * bitmap will always be aligned to unsigned long. We should
835 * even be able to remove this restriction but I'm simply
836 * keeping it.
838 assert(shift >= 6);
839 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
840 memory_region_clear_dirty_bitmap(rb->mr, start, size);
843 ret = test_and_clear_bit(page, rb->bmap);
845 if (ret) {
846 rs->migration_dirty_pages--;
848 qemu_mutex_unlock(&rs->bitmap_mutex);
850 return ret;
853 /* Called with RCU critical section */
854 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
856 uint64_t new_dirty_pages =
857 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
859 rs->migration_dirty_pages += new_dirty_pages;
860 rs->num_dirty_pages_period += new_dirty_pages;
864 * ram_pagesize_summary: calculate all the pagesizes of a VM
866 * Returns a summary bitmap of the page sizes of all RAMBlocks
868 * For VMs with just normal pages this is equivalent to the host page
869 * size. If it's got some huge pages then it's the OR of all the
870 * different page sizes.
872 uint64_t ram_pagesize_summary(void)
874 RAMBlock *block;
875 uint64_t summary = 0;
877 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
878 summary |= block->page_size;
881 return summary;
884 uint64_t ram_get_total_transferred_pages(void)
886 return ram_counters.normal + ram_counters.duplicate +
887 compression_counters.pages + xbzrle_counters.pages;
890 static void migration_update_rates(RAMState *rs, int64_t end_time)
892 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
893 double compressed_size;
895 /* calculate period counters */
896 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
897 / (end_time - rs->time_last_bitmap_sync);
899 if (!page_count) {
900 return;
903 if (migrate_use_xbzrle()) {
904 double encoded_size, unencoded_size;
906 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
907 rs->xbzrle_cache_miss_prev) / page_count;
908 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
909 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
910 TARGET_PAGE_SIZE;
911 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
912 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
913 xbzrle_counters.encoding_rate = 0;
914 } else {
915 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
917 rs->xbzrle_pages_prev = xbzrle_counters.pages;
918 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
921 if (migrate_use_compression()) {
922 compression_counters.busy_rate = (double)(compression_counters.busy -
923 rs->compress_thread_busy_prev) / page_count;
924 rs->compress_thread_busy_prev = compression_counters.busy;
926 compressed_size = compression_counters.compressed_size -
927 rs->compressed_size_prev;
928 if (compressed_size) {
929 double uncompressed_size = (compression_counters.pages -
930 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
932 /* Compression-Ratio = Uncompressed-size / Compressed-size */
933 compression_counters.compression_rate =
934 uncompressed_size / compressed_size;
936 rs->compress_pages_prev = compression_counters.pages;
937 rs->compressed_size_prev = compression_counters.compressed_size;
942 static void migration_trigger_throttle(RAMState *rs)
944 MigrationState *s = migrate_get_current();
945 uint64_t threshold = s->parameters.throttle_trigger_threshold;
947 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
948 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
949 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
951 /* During block migration the auto-converge logic incorrectly detects
952 * that ram migration makes no progress. Avoid this by disabling the
953 * throttling logic during the bulk phase of block migration. */
954 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
955 /* The following detection logic can be refined later. For now:
956 Check to see if the ratio between dirtied bytes and the approx.
957 amount of bytes that just got transferred since the last time
958 we were in this routine reaches the threshold. If that happens
959 twice, start or increase throttling. */
961 if ((bytes_dirty_period > bytes_dirty_threshold) &&
962 (++rs->dirty_rate_high_cnt >= 2)) {
963 trace_migration_throttle();
964 rs->dirty_rate_high_cnt = 0;
965 mig_throttle_guest_down(bytes_dirty_period,
966 bytes_dirty_threshold);
971 static void migration_bitmap_sync(RAMState *rs)
973 RAMBlock *block;
974 int64_t end_time;
976 ram_counters.dirty_sync_count++;
978 if (!rs->time_last_bitmap_sync) {
979 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
982 trace_migration_bitmap_sync_start();
983 memory_global_dirty_log_sync();
985 qemu_mutex_lock(&rs->bitmap_mutex);
986 WITH_RCU_READ_LOCK_GUARD() {
987 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
988 ramblock_sync_dirty_bitmap(rs, block);
990 ram_counters.remaining = ram_bytes_remaining();
992 qemu_mutex_unlock(&rs->bitmap_mutex);
994 memory_global_after_dirty_log_sync();
995 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
997 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
999 /* more than 1 second = 1000 millisecons */
1000 if (end_time > rs->time_last_bitmap_sync + 1000) {
1001 migration_trigger_throttle(rs);
1003 migration_update_rates(rs, end_time);
1005 rs->target_page_count_prev = rs->target_page_count;
1007 /* reset period counters */
1008 rs->time_last_bitmap_sync = end_time;
1009 rs->num_dirty_pages_period = 0;
1010 rs->bytes_xfer_prev = ram_counters.transferred;
1012 if (migrate_use_events()) {
1013 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1017 static void migration_bitmap_sync_precopy(RAMState *rs)
1019 Error *local_err = NULL;
1022 * The current notifier usage is just an optimization to migration, so we
1023 * don't stop the normal migration process in the error case.
1025 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1026 error_report_err(local_err);
1027 local_err = NULL;
1030 migration_bitmap_sync(rs);
1032 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1033 error_report_err(local_err);
1038 * save_zero_page_to_file: send the zero page to the file
1040 * Returns the size of data written to the file, 0 means the page is not
1041 * a zero page
1043 * @rs: current RAM state
1044 * @file: the file where the data is saved
1045 * @block: block that contains the page we want to send
1046 * @offset: offset inside the block for the page
1048 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1049 RAMBlock *block, ram_addr_t offset)
1051 uint8_t *p = block->host + offset;
1052 int len = 0;
1054 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1055 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1056 qemu_put_byte(file, 0);
1057 len += 1;
1059 return len;
1063 * save_zero_page: send the zero page to the stream
1065 * Returns the number of pages written.
1067 * @rs: current RAM state
1068 * @block: block that contains the page we want to send
1069 * @offset: offset inside the block for the page
1071 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1073 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1075 if (len) {
1076 ram_counters.duplicate++;
1077 ram_counters.transferred += len;
1078 return 1;
1080 return -1;
1083 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1085 if (!migrate_release_ram() || !migration_in_postcopy()) {
1086 return;
1089 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1093 * @pages: the number of pages written by the control path,
1094 * < 0 - error
1095 * > 0 - number of pages written
1097 * Return true if the pages has been saved, otherwise false is returned.
1099 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1100 int *pages)
1102 uint64_t bytes_xmit = 0;
1103 int ret;
1105 *pages = -1;
1106 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1107 &bytes_xmit);
1108 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1109 return false;
1112 if (bytes_xmit) {
1113 ram_counters.transferred += bytes_xmit;
1114 *pages = 1;
1117 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1118 return true;
1121 if (bytes_xmit > 0) {
1122 ram_counters.normal++;
1123 } else if (bytes_xmit == 0) {
1124 ram_counters.duplicate++;
1127 return true;
1131 * directly send the page to the stream
1133 * Returns the number of pages written.
1135 * @rs: current RAM state
1136 * @block: block that contains the page we want to send
1137 * @offset: offset inside the block for the page
1138 * @buf: the page to be sent
1139 * @async: send to page asyncly
1141 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1142 uint8_t *buf, bool async)
1144 ram_counters.transferred += save_page_header(rs, rs->f, block,
1145 offset | RAM_SAVE_FLAG_PAGE);
1146 if (async) {
1147 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1148 migrate_release_ram() &
1149 migration_in_postcopy());
1150 } else {
1151 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1153 ram_counters.transferred += TARGET_PAGE_SIZE;
1154 ram_counters.normal++;
1155 return 1;
1159 * ram_save_page: send the given page to the stream
1161 * Returns the number of pages written.
1162 * < 0 - error
1163 * >=0 - Number of pages written - this might legally be 0
1164 * if xbzrle noticed the page was the same.
1166 * @rs: current RAM state
1167 * @block: block that contains the page we want to send
1168 * @offset: offset inside the block for the page
1169 * @last_stage: if we are at the completion stage
1171 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1173 int pages = -1;
1174 uint8_t *p;
1175 bool send_async = true;
1176 RAMBlock *block = pss->block;
1177 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1178 ram_addr_t current_addr = block->offset + offset;
1180 p = block->host + offset;
1181 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1183 XBZRLE_cache_lock();
1184 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1185 migrate_use_xbzrle()) {
1186 pages = save_xbzrle_page(rs, &p, current_addr, block,
1187 offset, last_stage);
1188 if (!last_stage) {
1189 /* Can't send this cached data async, since the cache page
1190 * might get updated before it gets to the wire
1192 send_async = false;
1196 /* XBZRLE overflow or normal page */
1197 if (pages == -1) {
1198 pages = save_normal_page(rs, block, offset, p, send_async);
1201 XBZRLE_cache_unlock();
1203 return pages;
1206 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1207 ram_addr_t offset)
1209 if (multifd_queue_page(rs->f, block, offset) < 0) {
1210 return -1;
1212 ram_counters.normal++;
1214 return 1;
1217 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1218 ram_addr_t offset, uint8_t *source_buf)
1220 RAMState *rs = ram_state;
1221 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1222 bool zero_page = false;
1223 int ret;
1225 if (save_zero_page_to_file(rs, f, block, offset)) {
1226 zero_page = true;
1227 goto exit;
1230 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1233 * copy it to a internal buffer to avoid it being modified by VM
1234 * so that we can catch up the error during compression and
1235 * decompression
1237 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1238 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1239 if (ret < 0) {
1240 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1241 error_report("compressed data failed!");
1242 return false;
1245 exit:
1246 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1247 return zero_page;
1250 static void
1251 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1253 ram_counters.transferred += bytes_xmit;
1255 if (param->zero_page) {
1256 ram_counters.duplicate++;
1257 return;
1260 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1261 compression_counters.compressed_size += bytes_xmit - 8;
1262 compression_counters.pages++;
1265 static bool save_page_use_compression(RAMState *rs);
1267 static void flush_compressed_data(RAMState *rs)
1269 int idx, len, thread_count;
1271 if (!save_page_use_compression(rs)) {
1272 return;
1274 thread_count = migrate_compress_threads();
1276 qemu_mutex_lock(&comp_done_lock);
1277 for (idx = 0; idx < thread_count; idx++) {
1278 while (!comp_param[idx].done) {
1279 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1282 qemu_mutex_unlock(&comp_done_lock);
1284 for (idx = 0; idx < thread_count; idx++) {
1285 qemu_mutex_lock(&comp_param[idx].mutex);
1286 if (!comp_param[idx].quit) {
1287 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1289 * it's safe to fetch zero_page without holding comp_done_lock
1290 * as there is no further request submitted to the thread,
1291 * i.e, the thread should be waiting for a request at this point.
1293 update_compress_thread_counts(&comp_param[idx], len);
1295 qemu_mutex_unlock(&comp_param[idx].mutex);
/*
 * set_compress_params: record the page a compression thread should handle
 *
 * @param: parameters of the compression thread
 * @block: block that contains the page
 * @offset: offset of the page inside the block
 */
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}
1306 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1307 ram_addr_t offset)
1309 int idx, thread_count, bytes_xmit = -1, pages = -1;
1310 bool wait = migrate_compress_wait_thread();
1312 thread_count = migrate_compress_threads();
1313 qemu_mutex_lock(&comp_done_lock);
1314 retry:
1315 for (idx = 0; idx < thread_count; idx++) {
1316 if (comp_param[idx].done) {
1317 comp_param[idx].done = false;
1318 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1319 qemu_mutex_lock(&comp_param[idx].mutex);
1320 set_compress_params(&comp_param[idx], block, offset);
1321 qemu_cond_signal(&comp_param[idx].cond);
1322 qemu_mutex_unlock(&comp_param[idx].mutex);
1323 pages = 1;
1324 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1325 break;
1330 * wait for the free thread if the user specifies 'compress-wait-thread',
1331 * otherwise we will post the page out in the main thread as normal page.
1333 if (pages < 0 && wait) {
1334 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1335 goto retry;
1337 qemu_mutex_unlock(&comp_done_lock);
1339 return pages;
1343 * find_dirty_block: find the next dirty page and update any state
1344 * associated with the search process.
1346 * Returns true if a page is found
1348 * @rs: current RAM state
1349 * @pss: data about the state of the current dirty page scan
1350 * @again: set to false if the search has scanned the whole of RAM
1352 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1354 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1355 if (pss->complete_round && pss->block == rs->last_seen_block &&
1356 pss->page >= rs->last_page) {
1358 * We've been once around the RAM and haven't found anything.
1359 * Give up.
1361 *again = false;
1362 return false;
1364 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1365 >= pss->block->used_length) {
1366 /* Didn't find anything in this RAM Block */
1367 pss->page = 0;
1368 pss->block = QLIST_NEXT_RCU(pss->block, next);
1369 if (!pss->block) {
1371 * If memory migration starts over, we will meet a dirtied page
1372 * which may still exists in compression threads's ring, so we
1373 * should flush the compressed data to make sure the new page
1374 * is not overwritten by the old one in the destination.
1376 * Also If xbzrle is on, stop using the data compression at this
1377 * point. In theory, xbzrle can do better than compression.
1379 flush_compressed_data(rs);
1381 /* Hit the end of the list */
1382 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1383 /* Flag that we've looped */
1384 pss->complete_round = true;
1385 rs->ram_bulk_stage = false;
1387 /* Didn't find anything this time, but try again on the new block */
1388 *again = true;
1389 return false;
1390 } else {
1391 /* Can go around again, but... */
1392 *again = true;
1393 /* We've found something so probably don't need to */
1394 return true;
1399 * unqueue_page: gets a page of the queue
1401 * Helper for 'get_queued_page' - gets a page off the queue
1403 * Returns the block of the page (or NULL if none available)
1405 * @rs: current RAM state
1406 * @offset: used to return the offset within the RAMBlock
1408 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1410 RAMBlock *block = NULL;
1412 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1413 return NULL;
1416 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1417 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1418 struct RAMSrcPageRequest *entry =
1419 QSIMPLEQ_FIRST(&rs->src_page_requests);
1420 block = entry->rb;
1421 *offset = entry->offset;
1423 if (entry->len > TARGET_PAGE_SIZE) {
1424 entry->len -= TARGET_PAGE_SIZE;
1425 entry->offset += TARGET_PAGE_SIZE;
1426 } else {
1427 memory_region_unref(block->mr);
1428 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1429 g_free(entry);
1430 migration_consume_urgent_request();
1434 return block;
1438 * get_queued_page: unqueue a page from the postcopy requests
1440 * Skips pages that are already sent (!dirty)
1442 * Returns true if a queued page is found
1444 * @rs: current RAM state
1445 * @pss: data about the state of the current dirty page scan
1447 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1449 RAMBlock *block;
1450 ram_addr_t offset;
1451 bool dirty;
1453 do {
1454 block = unqueue_page(rs, &offset);
1456 * We're sending this page, and since it's postcopy nothing else
1457 * will dirty it, and we must make sure it doesn't get sent again
1458 * even if this queue request was received after the background
1459 * search already sent it.
1461 if (block) {
1462 unsigned long page;
1464 page = offset >> TARGET_PAGE_BITS;
1465 dirty = test_bit(page, block->bmap);
1466 if (!dirty) {
1467 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1468 page);
1469 } else {
1470 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1474 } while (block && !dirty);
1476 if (block) {
1478 * As soon as we start servicing pages out of order, then we have
1479 * to kill the bulk stage, since the bulk stage assumes
1480 * in (migration_bitmap_find_and_reset_dirty) that every page is
1481 * dirty, that's no longer true.
1483 rs->ram_bulk_stage = false;
1486 * We want the background search to continue from the queued page
1487 * since the guest is likely to want other pages near to the page
1488 * it just requested.
1490 pss->block = block;
1491 pss->page = offset >> TARGET_PAGE_BITS;
1494 * This unqueued page would break the "one round" check, even is
1495 * really rare.
1497 pss->complete_round = false;
1500 return !!block;
1504 * migration_page_queue_free: drop any remaining pages in the ram
1505 * request queue
1507 * It should be empty at the end anyway, but in error cases there may
1508 * be some left. in case that there is any page left, we drop it.
1511 static void migration_page_queue_free(RAMState *rs)
1513 struct RAMSrcPageRequest *mspr, *next_mspr;
1514 /* This queue generally should be empty - but in the case of a failed
1515 * migration might have some droppings in.
1517 RCU_READ_LOCK_GUARD();
1518 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1519 memory_region_unref(mspr->rb->mr);
1520 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1521 g_free(mspr);
1526 * ram_save_queue_pages: queue the page for transmission
1528 * A request from postcopy destination for example.
1530 * Returns zero on success or negative on error
1532 * @rbname: Name of the RAMBLock of the request. NULL means the
1533 * same that last one.
1534 * @start: starting address from the start of the RAMBlock
1535 * @len: length (in bytes) to send
1537 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1539 RAMBlock *ramblock;
1540 RAMState *rs = ram_state;
1542 ram_counters.postcopy_requests++;
1543 RCU_READ_LOCK_GUARD();
1545 if (!rbname) {
1546 /* Reuse last RAMBlock */
1547 ramblock = rs->last_req_rb;
1549 if (!ramblock) {
1551 * Shouldn't happen, we can't reuse the last RAMBlock if
1552 * it's the 1st request.
1554 error_report("ram_save_queue_pages no previous block");
1555 return -1;
1557 } else {
1558 ramblock = qemu_ram_block_by_name(rbname);
1560 if (!ramblock) {
1561 /* We shouldn't be asked for a non-existent RAMBlock */
1562 error_report("ram_save_queue_pages no block '%s'", rbname);
1563 return -1;
1565 rs->last_req_rb = ramblock;
1567 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1568 if (start + len > ramblock->used_length) {
1569 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1570 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1571 __func__, start, len, ramblock->used_length);
1572 return -1;
1575 struct RAMSrcPageRequest *new_entry =
1576 g_malloc0(sizeof(struct RAMSrcPageRequest));
1577 new_entry->rb = ramblock;
1578 new_entry->offset = start;
1579 new_entry->len = len;
1581 memory_region_ref(ramblock->mr);
1582 qemu_mutex_lock(&rs->src_page_req_mutex);
1583 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1584 migration_make_urgent_request();
1585 qemu_mutex_unlock(&rs->src_page_req_mutex);
1587 return 0;
1590 static bool save_page_use_compression(RAMState *rs)
1592 if (!migrate_use_compression()) {
1593 return false;
1597 * If xbzrle is on, stop using the data compression after first
1598 * round of migration even if compression is enabled. In theory,
1599 * xbzrle can do better than compression.
1601 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1602 return true;
1605 return false;
1609 * try to compress the page before posting it out, return true if the page
1610 * has been properly handled by compression, otherwise needs other
1611 * paths to handle it
1613 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1615 if (!save_page_use_compression(rs)) {
1616 return false;
1620 * When starting the process of a new block, the first page of
1621 * the block should be sent out before other pages in the same
1622 * block, and all the pages in last block should have been sent
1623 * out, keeping this order is important, because the 'cont' flag
1624 * is used to avoid resending the block name.
1626 * We post the fist page as normal page as compression will take
1627 * much CPU resource.
1629 if (block != rs->last_sent_block) {
1630 flush_compressed_data(rs);
1631 return false;
1634 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1635 return true;
1638 compression_counters.busy++;
1639 return false;
1643 * ram_save_target_page: save one target page
1645 * Returns the number of pages written
1647 * @rs: current RAM state
1648 * @pss: data about the page we want to send
1649 * @last_stage: if we are at the completion stage
1651 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1652 bool last_stage)
1654 RAMBlock *block = pss->block;
1655 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1656 int res;
1658 if (control_save_page(rs, block, offset, &res)) {
1659 return res;
1662 if (save_compress_page(rs, block, offset)) {
1663 return 1;
1666 res = save_zero_page(rs, block, offset);
1667 if (res > 0) {
1668 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1669 * page would be stale
1671 if (!save_page_use_compression(rs)) {
1672 XBZRLE_cache_lock();
1673 xbzrle_cache_zero_page(rs, block->offset + offset);
1674 XBZRLE_cache_unlock();
1676 ram_release_pages(block->idstr, offset, res);
1677 return res;
1681 * Do not use multifd for:
1682 * 1. Compression as the first page in the new block should be posted out
1683 * before sending the compressed page
1684 * 2. In postcopy as one whole host page should be placed
1686 if (!save_page_use_compression(rs) && migrate_use_multifd()
1687 && !migration_in_postcopy()) {
1688 return ram_save_multifd_page(rs, block, offset);
1691 return ram_save_page(rs, pss, last_stage);
1695 * ram_save_host_page: save a whole host page
1697 * Starting at *offset send pages up to the end of the current host
1698 * page. It's valid for the initial offset to point into the middle of
1699 * a host page in which case the remainder of the hostpage is sent.
1700 * Only dirty target pages are sent. Note that the host page size may
1701 * be a huge page for this block.
1702 * The saving stops at the boundary of the used_length of the block
1703 * if the RAMBlock isn't a multiple of the host page size.
1705 * Returns the number of pages written or negative on error
1707 * @rs: current RAM state
1708 * @ms: current migration state
1709 * @pss: data about the page we want to send
1710 * @last_stage: if we are at the completion stage
1712 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1713 bool last_stage)
1715 int tmppages, pages = 0;
1716 size_t pagesize_bits =
1717 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1719 if (ramblock_is_ignored(pss->block)) {
1720 error_report("block %s should not be migrated !", pss->block->idstr);
1721 return 0;
1724 do {
1725 /* Check the pages is dirty and if it is send it */
1726 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1727 pss->page++;
1728 continue;
1731 tmppages = ram_save_target_page(rs, pss, last_stage);
1732 if (tmppages < 0) {
1733 return tmppages;
1736 pages += tmppages;
1737 pss->page++;
1738 /* Allow rate limiting to happen in the middle of huge pages */
1739 migration_rate_limit();
1740 } while ((pss->page & (pagesize_bits - 1)) &&
1741 offset_in_ramblock(pss->block,
1742 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
1744 /* The offset we leave with is the last one we looked at */
1745 pss->page--;
1746 return pages;
1750 * ram_find_and_save_block: finds a dirty page and sends it to f
1752 * Called within an RCU critical section.
1754 * Returns the number of pages written where zero means no dirty pages,
1755 * or negative on error
1757 * @rs: current RAM state
1758 * @last_stage: if we are at the completion stage
1760 * On systems where host-page-size > target-page-size it will send all the
1761 * pages in a host page that are dirty.
1764 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1766 PageSearchStatus pss;
1767 int pages = 0;
1768 bool again, found;
1770 /* No dirty page as there is zero RAM */
1771 if (!ram_bytes_total()) {
1772 return pages;
1775 pss.block = rs->last_seen_block;
1776 pss.page = rs->last_page;
1777 pss.complete_round = false;
1779 if (!pss.block) {
1780 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1783 do {
1784 again = true;
1785 found = get_queued_page(rs, &pss);
1787 if (!found) {
1788 /* priority queue empty, so just search for something dirty */
1789 found = find_dirty_block(rs, &pss, &again);
1792 if (found) {
1793 pages = ram_save_host_page(rs, &pss, last_stage);
1795 } while (!pages && again);
1797 rs->last_seen_block = pss.block;
1798 rs->last_page = pss.page;
1800 return pages;
1803 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1805 uint64_t pages = size / TARGET_PAGE_SIZE;
1807 if (zero) {
1808 ram_counters.duplicate += pages;
1809 } else {
1810 ram_counters.normal += pages;
1811 ram_counters.transferred += size;
1812 qemu_update_position(f, size);
1816 static uint64_t ram_bytes_total_common(bool count_ignored)
1818 RAMBlock *block;
1819 uint64_t total = 0;
1821 RCU_READ_LOCK_GUARD();
1823 if (count_ignored) {
1824 RAMBLOCK_FOREACH_MIGRATABLE(block) {
1825 total += block->used_length;
1827 } else {
1828 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1829 total += block->used_length;
1832 return total;
/*
 * ram_bytes_total: total used_length in bytes of the RAM blocks visited
 * by ram_bytes_total_common() with count_ignored set to false
 */
uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
/* Allocate the TARGET_PAGE_SIZE buffer stored in XBZRLE.decoded_buf */
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}
/* Free XBZRLE.decoded_buf and reset the pointer to NULL */
static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}
1851 static void ram_state_cleanup(RAMState **rsp)
1853 if (*rsp) {
1854 migration_page_queue_free(*rsp);
1855 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
1856 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
1857 g_free(*rsp);
1858 *rsp = NULL;
1862 static void xbzrle_cleanup(void)
1864 XBZRLE_cache_lock();
1865 if (XBZRLE.cache) {
1866 cache_fini(XBZRLE.cache);
1867 g_free(XBZRLE.encoded_buf);
1868 g_free(XBZRLE.current_buf);
1869 g_free(XBZRLE.zero_target_page);
1870 XBZRLE.cache = NULL;
1871 XBZRLE.encoded_buf = NULL;
1872 XBZRLE.current_buf = NULL;
1873 XBZRLE.zero_target_page = NULL;
1875 XBZRLE_cache_unlock();
1878 static void ram_save_cleanup(void *opaque)
1880 RAMState **rsp = opaque;
1881 RAMBlock *block;
1883 /* caller have hold iothread lock or is in a bh, so there is
1884 * no writing race against the migration bitmap
1886 memory_global_dirty_log_stop();
1888 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1889 g_free(block->clear_bmap);
1890 block->clear_bmap = NULL;
1891 g_free(block->bmap);
1892 block->bmap = NULL;
1895 xbzrle_cleanup();
1896 compress_threads_save_cleanup();
1897 ram_state_cleanup(rsp);
1900 static void ram_state_reset(RAMState *rs)
1902 rs->last_seen_block = NULL;
1903 rs->last_sent_block = NULL;
1904 rs->last_page = 0;
1905 rs->last_version = ram_list.version;
1906 rs->ram_bulk_stage = true;
1907 rs->fpo_enabled = false;
#define MAX_WAIT 50 /* in milliseconds; half of the buffered_file limit */
/*
 * ram_debug_dump_bitmap: print a bitmap to stderr, 128 bits per line
 *
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is NULL there is nothing to dump and the function returns
 * without printing anything.
 *
 * @todump: bitmap to dump
 * @expected: the bit value that is not worth printing
 * @pages: number of bits in the bitmap
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    /* Nothing to fall back to, so never hand NULL to test_bit() */
    if (!todump) {
        return;
    }

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            /* curb == linelen here, which is at most 128 */
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
1946 /* **** functions for postcopy ***** */
1948 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1950 struct RAMBlock *block;
1952 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1953 unsigned long *bitmap = block->bmap;
1954 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1955 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1957 while (run_start < range) {
1958 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1959 ram_discard_range(block->idstr,
1960 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
1961 ((ram_addr_t)(run_end - run_start))
1962 << TARGET_PAGE_BITS);
1963 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1969 * postcopy_send_discard_bm_ram: discard a RAMBlock
1971 * Returns zero on success
1973 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1975 * @ms: current migration state
1976 * @block: RAMBlock to discard
1978 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
1980 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1981 unsigned long current;
1982 unsigned long *bitmap = block->bmap;
1984 for (current = 0; current < end; ) {
1985 unsigned long one = find_next_bit(bitmap, end, current);
1986 unsigned long zero, discard_length;
1988 if (one >= end) {
1989 break;
1992 zero = find_next_zero_bit(bitmap, end, one + 1);
1994 if (zero >= end) {
1995 discard_length = end - one;
1996 } else {
1997 discard_length = zero - one;
1999 postcopy_discard_send_range(ms, one, discard_length);
2000 current = one + discard_length;
2003 return 0;
2007 * postcopy_each_ram_send_discard: discard all RAMBlocks
2009 * Returns 0 for success or negative for error
2011 * Utility for the outgoing postcopy code.
2012 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2013 * passing it bitmap indexes and name.
2014 * (qemu_ram_foreach_block ends up passing unscaled lengths
2015 * which would mean postcopy code would have to deal with target page)
2017 * @ms: current migration state
2019 static int postcopy_each_ram_send_discard(MigrationState *ms)
2021 struct RAMBlock *block;
2022 int ret;
2024 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2025 postcopy_discard_send_init(ms, block->idstr);
2028 * Postcopy sends chunks of bitmap over the wire, but it
2029 * just needs indexes at this point, avoids it having
2030 * target page specific code.
2032 ret = postcopy_send_discard_bm_ram(ms, block);
2033 postcopy_discard_send_finish(ms);
2034 if (ret) {
2035 return ret;
2039 return 0;
2043 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2045 * Helper for postcopy_chunk_hostpages; it's called twice to
2046 * canonicalize the two bitmaps, that are similar, but one is
2047 * inverted.
2049 * Postcopy requires that all target pages in a hostpage are dirty or
2050 * clean, not a mix. This function canonicalizes the bitmaps.
2052 * @ms: current migration state
2053 * @block: block that contains the page we want to canonicalize
2055 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2057 RAMState *rs = ram_state;
2058 unsigned long *bitmap = block->bmap;
2059 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2060 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2061 unsigned long run_start;
2063 if (block->page_size == TARGET_PAGE_SIZE) {
2064 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2065 return;
2068 /* Find a dirty page */
2069 run_start = find_next_bit(bitmap, pages, 0);
2071 while (run_start < pages) {
2074 * If the start of this run of pages is in the middle of a host
2075 * page, then we need to fixup this host page.
2077 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2078 /* Find the end of this run */
2079 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2081 * If the end isn't at the start of a host page, then the
2082 * run doesn't finish at the end of a host page
2083 * and we need to discard.
2087 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2088 unsigned long page;
2089 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2090 host_ratio);
2091 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2093 /* Clean up the bitmap */
2094 for (page = fixup_start_addr;
2095 page < fixup_start_addr + host_ratio; page++) {
2097 * Remark them as dirty, updating the count for any pages
2098 * that weren't previously dirty.
2100 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2104 /* Find the next dirty page for the next iteration */
2105 run_start = find_next_bit(bitmap, pages, run_start);
2110 * postcopy_chunk_hostpages: discard any partially sent host page
2112 * Utility for the outgoing postcopy code.
2114 * Discard any partially sent host-page size chunks, mark any partially
2115 * dirty host-page size chunks as all dirty. In this case the host-page
2116 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2118 * Returns zero on success
2120 * @ms: current migration state
2121 * @block: block we want to work with
2123 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2125 postcopy_discard_send_init(ms, block->idstr);
2128 * Ensure that all partially dirty host pages are made fully dirty.
2130 postcopy_chunk_hostpages_pass(ms, block);
2132 postcopy_discard_send_finish(ms);
2133 return 0;
2137 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2139 * Returns zero on success
2141 * Transmit the set of pages to be discarded after precopy to the target
2142 * these are pages that:
2143 * a) Have been previously transmitted but are now dirty again
2144 * b) Pages that have never been transmitted, this ensures that
2145 * any pages on the destination that have been mapped by background
2146 * tasks get discarded (transparent huge pages is the specific concern)
2147 * Hopefully this is pretty sparse
2149 * @ms: current migration state
2151 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2153 RAMState *rs = ram_state;
2154 RAMBlock *block;
2155 int ret;
2157 RCU_READ_LOCK_GUARD();
2159 /* This should be our last sync, the src is now paused */
2160 migration_bitmap_sync(rs);
2162 /* Easiest way to make sure we don't resume in the middle of a host-page */
2163 rs->last_seen_block = NULL;
2164 rs->last_sent_block = NULL;
2165 rs->last_page = 0;
2167 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2168 /* Deal with TPS != HPS and huge pages */
2169 ret = postcopy_chunk_hostpages(ms, block);
2170 if (ret) {
2171 return ret;
2174 #ifdef DEBUG_POSTCOPY
2175 ram_debug_dump_bitmap(block->bmap, true,
2176 block->used_length >> TARGET_PAGE_BITS);
2177 #endif
2179 trace_ram_postcopy_send_discard_bitmap();
2181 return postcopy_each_ram_send_discard(ms);
2185 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2187 * Returns zero on success
2189 * @rbname: name of the RAMBlock of the request. NULL means the
2190 * same that last one.
2191 * @start: RAMBlock starting page
2192 * @length: RAMBlock size
2194 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2196 trace_ram_discard_range(rbname, start, length);
2198 RCU_READ_LOCK_GUARD();
2199 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2201 if (!rb) {
2202 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2203 return -1;
2207 * On source VM, we don't need to update the received bitmap since
2208 * we don't even have one.
2210 if (rb->receivedmap) {
2211 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2212 length >> qemu_target_page_bits());
2215 return ram_block_discard_range(rb, start, length);
2219 * For every allocation, we will try not to crash the VM if the
2220 * allocation failed.
2222 static int xbzrle_init(void)
2224 Error *local_err = NULL;
2226 if (!migrate_use_xbzrle()) {
2227 return 0;
2230 XBZRLE_cache_lock();
2232 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2233 if (!XBZRLE.zero_target_page) {
2234 error_report("%s: Error allocating zero page", __func__);
2235 goto err_out;
2238 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2239 TARGET_PAGE_SIZE, &local_err);
2240 if (!XBZRLE.cache) {
2241 error_report_err(local_err);
2242 goto free_zero_page;
2245 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2246 if (!XBZRLE.encoded_buf) {
2247 error_report("%s: Error allocating encoded_buf", __func__);
2248 goto free_cache;
2251 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2252 if (!XBZRLE.current_buf) {
2253 error_report("%s: Error allocating current_buf", __func__);
2254 goto free_encoded_buf;
2257 /* We are all good */
2258 XBZRLE_cache_unlock();
2259 return 0;
2261 free_encoded_buf:
2262 g_free(XBZRLE.encoded_buf);
2263 XBZRLE.encoded_buf = NULL;
2264 free_cache:
2265 cache_fini(XBZRLE.cache);
2266 XBZRLE.cache = NULL;
2267 free_zero_page:
2268 g_free(XBZRLE.zero_target_page);
2269 XBZRLE.zero_target_page = NULL;
2270 err_out:
2271 XBZRLE_cache_unlock();
2272 return -ENOMEM;
2275 static int ram_state_init(RAMState **rsp)
2277 *rsp = g_try_new0(RAMState, 1);
2279 if (!*rsp) {
2280 error_report("%s: Init ramstate fail", __func__);
2281 return -1;
2284 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2285 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2286 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2289 * Count the total number of pages used by ram blocks not including any
2290 * gaps due to alignment or unplugs.
2291 * This must match with the initial values of dirty bitmap.
2293 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2294 ram_state_reset(*rsp);
2296 return 0;
2299 static void ram_list_init_bitmaps(void)
2301 MigrationState *ms = migrate_get_current();
2302 RAMBlock *block;
2303 unsigned long pages;
2304 uint8_t shift;
2306 /* Skip setting bitmap if there is no RAM */
2307 if (ram_bytes_total()) {
2308 shift = ms->clear_bitmap_shift;
2309 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2310 error_report("clear_bitmap_shift (%u) too big, using "
2311 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2312 shift = CLEAR_BITMAP_SHIFT_MAX;
2313 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2314 error_report("clear_bitmap_shift (%u) too small, using "
2315 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2316 shift = CLEAR_BITMAP_SHIFT_MIN;
2319 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2320 pages = block->max_length >> TARGET_PAGE_BITS;
2322 * The initial dirty bitmap for migration must be set with all
2323 * ones to make sure we'll migrate every guest RAM page to
2324 * destination.
2325 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2326 * new migration after a failed migration, ram_list.
2327 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2328 * guest memory.
2330 block->bmap = bitmap_new(pages);
2331 bitmap_set(block->bmap, 0, pages);
2332 block->clear_bmap_shift = shift;
2333 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2338 static void ram_init_bitmaps(RAMState *rs)
2340 /* For memory_global_dirty_log_start below. */
2341 qemu_mutex_lock_iothread();
2342 qemu_mutex_lock_ramlist();
2344 WITH_RCU_READ_LOCK_GUARD() {
2345 ram_list_init_bitmaps();
2346 memory_global_dirty_log_start();
2347 migration_bitmap_sync_precopy(rs);
2349 qemu_mutex_unlock_ramlist();
2350 qemu_mutex_unlock_iothread();
2353 static int ram_init_all(RAMState **rsp)
2355 if (ram_state_init(rsp)) {
2356 return -1;
2359 if (xbzrle_init()) {
2360 ram_state_cleanup(rsp);
2361 return -1;
2364 ram_init_bitmaps(*rsp);
2366 return 0;
2369 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2371 RAMBlock *block;
2372 uint64_t pages = 0;
2375 * Postcopy is not using xbzrle/compression, so no need for that.
2376 * Also, since source are already halted, we don't need to care
2377 * about dirty page logging as well.
2380 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2381 pages += bitmap_count_one(block->bmap,
2382 block->used_length >> TARGET_PAGE_BITS);
2385 /* This may not be aligned with current bitmaps. Recalculate. */
2386 rs->migration_dirty_pages = pages;
2388 rs->last_seen_block = NULL;
2389 rs->last_sent_block = NULL;
2390 rs->last_page = 0;
2391 rs->last_version = ram_list.version;
2393 * Disable the bulk stage, otherwise we'll resend the whole RAM no
2394 * matter what we have sent.
2396 rs->ram_bulk_stage = false;
2398 /* Update RAMState cache of output QEMUFile */
2399 rs->f = out;
2401 trace_ram_state_resume_prepare(pages);
2405 * This function clears bits of the free pages reported by the caller from the
2406 * migration dirty bitmap. @addr is the host address corresponding to the
2407 * start of the continuous guest free pages, and @len is the total bytes of
2408 * those pages.
2410 void qemu_guest_free_page_hint(void *addr, size_t len)
2412 RAMBlock *block;
2413 ram_addr_t offset;
2414 size_t used_len, start, npages;
2415 MigrationState *s = migrate_get_current();
2417 /* This function is currently expected to be used during live migration */
2418 if (!migration_is_setup_or_active(s->state)) {
2419 return;
2422 for (; len > 0; len -= used_len, addr += used_len) {
2423 block = qemu_ram_block_from_host(addr, false, &offset);
2424 if (unlikely(!block || offset >= block->used_length)) {
2426 * The implementation might not support RAMBlock resize during
2427 * live migration, but it could happen in theory with future
2428 * updates. So we add a check here to capture that case.
2430 error_report_once("%s unexpected error", __func__);
2431 return;
2434 if (len <= block->used_length - offset) {
2435 used_len = len;
2436 } else {
2437 used_len = block->used_length - offset;
2440 start = offset >> TARGET_PAGE_BITS;
2441 npages = used_len >> TARGET_PAGE_BITS;
2443 qemu_mutex_lock(&ram_state->bitmap_mutex);
2444 ram_state->migration_dirty_pages -=
2445 bitmap_count_one_with_offset(block->bmap, start, npages);
2446 bitmap_clear(block->bmap, start, npages);
2447 qemu_mutex_unlock(&ram_state->bitmap_mutex);
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * a long-running RCU critical section.  When RCU reclaims in the code
 * start to become numerous, it will be necessary to reduce the
 * granularity of these critical sections.
 */

2459 * ram_save_setup: Setup RAM for migration
2461 * Returns zero to indicate success and negative for error
2463 * @f: QEMUFile where to send the data
2464 * @opaque: RAMState pointer
2466 static int ram_save_setup(QEMUFile *f, void *opaque)
2468 RAMState **rsp = opaque;
2469 RAMBlock *block;
2471 if (compress_threads_save_setup()) {
2472 return -1;
2475 /* migration has already setup the bitmap, reuse it. */
2476 if (!migration_in_colo_state()) {
2477 if (ram_init_all(rsp) != 0) {
2478 compress_threads_save_cleanup();
2479 return -1;
2482 (*rsp)->f = f;
2484 WITH_RCU_READ_LOCK_GUARD() {
2485 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2487 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2488 qemu_put_byte(f, strlen(block->idstr));
2489 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2490 qemu_put_be64(f, block->used_length);
2491 if (migrate_postcopy_ram() && block->page_size !=
2492 qemu_host_page_size) {
2493 qemu_put_be64(f, block->page_size);
2495 if (migrate_ignore_shared()) {
2496 qemu_put_be64(f, block->mr->addr);
2501 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2502 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2504 multifd_send_sync_main(f);
2505 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2506 qemu_fflush(f);
2508 return 0;
2512 * ram_save_iterate: iterative stage for migration
2514 * Returns zero to indicate success and negative for error
2516 * @f: QEMUFile where to send the data
2517 * @opaque: RAMState pointer
2519 static int ram_save_iterate(QEMUFile *f, void *opaque)
2521 RAMState **temp = opaque;
2522 RAMState *rs = *temp;
2523 int ret = 0;
2524 int i;
2525 int64_t t0;
2526 int done = 0;
2528 if (blk_mig_bulk_active()) {
2529 /* Avoid transferring ram during bulk phase of block migration as
2530 * the bulk phase will usually take a long time and transferring
2531 * ram updates during that time is pointless. */
2532 goto out;
2535 WITH_RCU_READ_LOCK_GUARD() {
2536 if (ram_list.version != rs->last_version) {
2537 ram_state_reset(rs);
2540 /* Read version before ram_list.blocks */
2541 smp_rmb();
2543 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2545 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2546 i = 0;
2547 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2548 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2549 int pages;
2551 if (qemu_file_get_error(f)) {
2552 break;
2555 pages = ram_find_and_save_block(rs, false);
2556 /* no more pages to sent */
2557 if (pages == 0) {
2558 done = 1;
2559 break;
2562 if (pages < 0) {
2563 qemu_file_set_error(f, pages);
2564 break;
2567 rs->target_page_count += pages;
2570 * During postcopy, it is necessary to make sure one whole host
2571 * page is sent in one chunk.
2573 if (migrate_postcopy_ram()) {
2574 flush_compressed_data(rs);
2578 * we want to check in the 1st loop, just in case it was the 1st
2579 * time and we had to sync the dirty bitmap.
2580 * qemu_clock_get_ns() is a bit expensive, so we only check each
2581 * some iterations
2583 if ((i & 63) == 0) {
2584 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2585 1000000;
2586 if (t1 > MAX_WAIT) {
2587 trace_ram_save_iterate_big_wait(t1, i);
2588 break;
2591 i++;
2596 * Must occur before EOS (or any QEMUFile operation)
2597 * because of RDMA protocol.
2599 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2601 out:
2602 if (ret >= 0
2603 && migration_is_setup_or_active(migrate_get_current()->state)) {
2604 multifd_send_sync_main(rs->f);
2605 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2606 qemu_fflush(f);
2607 ram_counters.transferred += 8;
2609 ret = qemu_file_get_error(f);
2611 if (ret < 0) {
2612 return ret;
2615 return done;
2619 * ram_save_complete: function called to send the remaining amount of ram
2621 * Returns zero to indicate success or negative on error
2623 * Called with iothread lock
2625 * @f: QEMUFile where to send the data
2626 * @opaque: RAMState pointer
2628 static int ram_save_complete(QEMUFile *f, void *opaque)
2630 RAMState **temp = opaque;
2631 RAMState *rs = *temp;
2632 int ret = 0;
2634 WITH_RCU_READ_LOCK_GUARD() {
2635 if (!migration_in_postcopy()) {
2636 migration_bitmap_sync_precopy(rs);
2639 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2641 /* try transferring iterative blocks of memory */
2643 /* flush all remaining blocks regardless of rate limiting */
2644 while (true) {
2645 int pages;
2647 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2648 /* no more blocks to sent */
2649 if (pages == 0) {
2650 break;
2652 if (pages < 0) {
2653 ret = pages;
2654 break;
2658 flush_compressed_data(rs);
2659 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2662 if (ret >= 0) {
2663 multifd_send_sync_main(rs->f);
2664 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2665 qemu_fflush(f);
2668 return ret;
2671 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2672 uint64_t *res_precopy_only,
2673 uint64_t *res_compatible,
2674 uint64_t *res_postcopy_only)
2676 RAMState **temp = opaque;
2677 RAMState *rs = *temp;
2678 uint64_t remaining_size;
2680 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2682 if (!migration_in_postcopy() &&
2683 remaining_size < max_size) {
2684 qemu_mutex_lock_iothread();
2685 WITH_RCU_READ_LOCK_GUARD() {
2686 migration_bitmap_sync_precopy(rs);
2688 qemu_mutex_unlock_iothread();
2689 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2692 if (migrate_postcopy_ram()) {
2693 /* We can do postcopy, and all the data is postcopiable */
2694 *res_compatible += remaining_size;
2695 } else {
2696 *res_precopy_only += remaining_size;
2700 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2702 unsigned int xh_len;
2703 int xh_flags;
2704 uint8_t *loaded_data;
2706 /* extract RLE header */
2707 xh_flags = qemu_get_byte(f);
2708 xh_len = qemu_get_be16(f);
2710 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2711 error_report("Failed to load XBZRLE page - wrong compression!");
2712 return -1;
2715 if (xh_len > TARGET_PAGE_SIZE) {
2716 error_report("Failed to load XBZRLE page - len overflow!");
2717 return -1;
2719 loaded_data = XBZRLE.decoded_buf;
2720 /* load data and decode */
2721 /* it can change loaded_data to point to an internal buffer */
2722 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2724 /* decode RLE */
2725 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2726 TARGET_PAGE_SIZE) == -1) {
2727 error_report("Failed to load XBZRLE page - decode error!");
2728 return -1;
2731 return 0;
2735 * ram_block_from_stream: read a RAMBlock id from the migration stream
2737 * Must be called from within a rcu critical section.
2739 * Returns a pointer from within the RCU-protected ram_list.
2741 * @f: QEMUFile where to read the data from
2742 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2744 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2746 static RAMBlock *block;
2747 char id[256];
2748 uint8_t len;
2750 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2751 if (!block) {
2752 error_report("Ack, bad migration stream!");
2753 return NULL;
2755 return block;
2758 len = qemu_get_byte(f);
2759 qemu_get_buffer(f, (uint8_t *)id, len);
2760 id[len] = 0;
2762 block = qemu_ram_block_by_name(id);
2763 if (!block) {
2764 error_report("Can't find block %s", id);
2765 return NULL;
2768 if (ramblock_is_ignored(block)) {
2769 error_report("block %s should not be migrated !", id);
2770 return NULL;
2773 return block;
2776 static inline void *host_from_ram_block_offset(RAMBlock *block,
2777 ram_addr_t offset)
2779 if (!offset_in_ramblock(block, offset)) {
2780 return NULL;
2783 return block->host + offset;
2786 static inline void *colo_cache_from_block_offset(RAMBlock *block,
2787 ram_addr_t offset, bool record_bitmap)
2789 if (!offset_in_ramblock(block, offset)) {
2790 return NULL;
2792 if (!block->colo_cache) {
2793 error_report("%s: colo_cache is NULL in block :%s",
2794 __func__, block->idstr);
2795 return NULL;
2799 * During colo checkpoint, we need bitmap of these migrated pages.
2800 * It help us to decide which pages in ram cache should be flushed
2801 * into VM's RAM later.
2803 if (record_bitmap &&
2804 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
2805 ram_state->migration_dirty_pages++;
2807 return block->colo_cache + offset;
2811 * ram_handle_compressed: handle the zero page case
2813 * If a page (or a whole RDMA chunk) has been
2814 * determined to be zero, then zap it.
2816 * @host: host address for the zero page
2817 * @ch: what the page is filled from. We only support zero
2818 * @size: size of the zero page
2820 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2822 if (ch != 0 || !is_zero_range(host, size)) {
2823 memset(host, ch, size);
2827 /* return the size after decompression, or negative value on error */
2828 static int
2829 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
2830 const uint8_t *source, size_t source_len)
2832 int err;
2834 err = inflateReset(stream);
2835 if (err != Z_OK) {
2836 return -1;
2839 stream->avail_in = source_len;
2840 stream->next_in = (uint8_t *)source;
2841 stream->avail_out = dest_len;
2842 stream->next_out = dest;
2844 err = inflate(stream, Z_NO_FLUSH);
2845 if (err != Z_STREAM_END) {
2846 return -1;
2849 return stream->total_out;
2852 static void *do_data_decompress(void *opaque)
2854 DecompressParam *param = opaque;
2855 unsigned long pagesize;
2856 uint8_t *des;
2857 int len, ret;
2859 qemu_mutex_lock(&param->mutex);
2860 while (!param->quit) {
2861 if (param->des) {
2862 des = param->des;
2863 len = param->len;
2864 param->des = 0;
2865 qemu_mutex_unlock(&param->mutex);
2867 pagesize = TARGET_PAGE_SIZE;
2869 ret = qemu_uncompress_data(&param->stream, des, pagesize,
2870 param->compbuf, len);
2871 if (ret < 0 && migrate_get_current()->decompress_error_check) {
2872 error_report("decompress data failed");
2873 qemu_file_set_error(decomp_file, ret);
2876 qemu_mutex_lock(&decomp_done_lock);
2877 param->done = true;
2878 qemu_cond_signal(&decomp_done_cond);
2879 qemu_mutex_unlock(&decomp_done_lock);
2881 qemu_mutex_lock(&param->mutex);
2882 } else {
2883 qemu_cond_wait(&param->cond, &param->mutex);
2886 qemu_mutex_unlock(&param->mutex);
2888 return NULL;
2891 static int wait_for_decompress_done(void)
2893 int idx, thread_count;
2895 if (!migrate_use_compression()) {
2896 return 0;
2899 thread_count = migrate_decompress_threads();
2900 qemu_mutex_lock(&decomp_done_lock);
2901 for (idx = 0; idx < thread_count; idx++) {
2902 while (!decomp_param[idx].done) {
2903 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2906 qemu_mutex_unlock(&decomp_done_lock);
2907 return qemu_file_get_error(decomp_file);
2910 static void compress_threads_load_cleanup(void)
2912 int i, thread_count;
2914 if (!migrate_use_compression()) {
2915 return;
2917 thread_count = migrate_decompress_threads();
2918 for (i = 0; i < thread_count; i++) {
2920 * we use it as a indicator which shows if the thread is
2921 * properly init'd or not
2923 if (!decomp_param[i].compbuf) {
2924 break;
2927 qemu_mutex_lock(&decomp_param[i].mutex);
2928 decomp_param[i].quit = true;
2929 qemu_cond_signal(&decomp_param[i].cond);
2930 qemu_mutex_unlock(&decomp_param[i].mutex);
2932 for (i = 0; i < thread_count; i++) {
2933 if (!decomp_param[i].compbuf) {
2934 break;
2937 qemu_thread_join(decompress_threads + i);
2938 qemu_mutex_destroy(&decomp_param[i].mutex);
2939 qemu_cond_destroy(&decomp_param[i].cond);
2940 inflateEnd(&decomp_param[i].stream);
2941 g_free(decomp_param[i].compbuf);
2942 decomp_param[i].compbuf = NULL;
2944 g_free(decompress_threads);
2945 g_free(decomp_param);
2946 decompress_threads = NULL;
2947 decomp_param = NULL;
2948 decomp_file = NULL;
2951 static int compress_threads_load_setup(QEMUFile *f)
2953 int i, thread_count;
2955 if (!migrate_use_compression()) {
2956 return 0;
2959 thread_count = migrate_decompress_threads();
2960 decompress_threads = g_new0(QemuThread, thread_count);
2961 decomp_param = g_new0(DecompressParam, thread_count);
2962 qemu_mutex_init(&decomp_done_lock);
2963 qemu_cond_init(&decomp_done_cond);
2964 decomp_file = f;
2965 for (i = 0; i < thread_count; i++) {
2966 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
2967 goto exit;
2970 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2971 qemu_mutex_init(&decomp_param[i].mutex);
2972 qemu_cond_init(&decomp_param[i].cond);
2973 decomp_param[i].done = true;
2974 decomp_param[i].quit = false;
2975 qemu_thread_create(decompress_threads + i, "decompress",
2976 do_data_decompress, decomp_param + i,
2977 QEMU_THREAD_JOINABLE);
2979 return 0;
2980 exit:
2981 compress_threads_load_cleanup();
2982 return -1;
2985 static void decompress_data_with_multi_threads(QEMUFile *f,
2986 void *host, int len)
2988 int idx, thread_count;
2990 thread_count = migrate_decompress_threads();
2991 qemu_mutex_lock(&decomp_done_lock);
2992 while (true) {
2993 for (idx = 0; idx < thread_count; idx++) {
2994 if (decomp_param[idx].done) {
2995 decomp_param[idx].done = false;
2996 qemu_mutex_lock(&decomp_param[idx].mutex);
2997 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2998 decomp_param[idx].des = host;
2999 decomp_param[idx].len = len;
3000 qemu_cond_signal(&decomp_param[idx].cond);
3001 qemu_mutex_unlock(&decomp_param[idx].mutex);
3002 break;
3005 if (idx < thread_count) {
3006 break;
3007 } else {
3008 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3011 qemu_mutex_unlock(&decomp_done_lock);
3015 * we must set ram_bulk_stage to false, otherwise in
3016 * migation_bitmap_find_dirty the bitmap will be unused and
3017 * all the pages in ram cache wil be flushed to the ram of
3018 * secondary VM.
3020 static void colo_init_ram_state(void)
3022 ram_state_init(&ram_state);
3023 ram_state->ram_bulk_stage = false;
3027 * colo cache: this is for secondary VM, we cache the whole
3028 * memory of the secondary VM, it is need to hold the global lock
3029 * to call this helper.
3031 int colo_init_ram_cache(void)
3033 RAMBlock *block;
3035 WITH_RCU_READ_LOCK_GUARD() {
3036 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3037 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3038 NULL,
3039 false);
3040 if (!block->colo_cache) {
3041 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3042 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3043 block->used_length);
3044 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3045 if (block->colo_cache) {
3046 qemu_anon_ram_free(block->colo_cache, block->used_length);
3047 block->colo_cache = NULL;
3050 return -errno;
3056 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3057 * with to decide which page in cache should be flushed into SVM's RAM. Here
3058 * we use the same name 'ram_bitmap' as for migration.
3060 if (ram_bytes_total()) {
3061 RAMBlock *block;
3063 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3064 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3065 block->bmap = bitmap_new(pages);
3069 colo_init_ram_state();
3070 return 0;
3073 /* TODO: duplicated with ram_init_bitmaps */
3074 void colo_incoming_start_dirty_log(void)
3076 RAMBlock *block = NULL;
3077 /* For memory_global_dirty_log_start below. */
3078 qemu_mutex_lock_iothread();
3079 qemu_mutex_lock_ramlist();
3081 memory_global_dirty_log_sync();
3082 WITH_RCU_READ_LOCK_GUARD() {
3083 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3084 ramblock_sync_dirty_bitmap(ram_state, block);
3085 /* Discard this dirty bitmap record */
3086 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3088 memory_global_dirty_log_start();
3090 ram_state->migration_dirty_pages = 0;
3091 qemu_mutex_unlock_ramlist();
3092 qemu_mutex_unlock_iothread();
3095 /* It is need to hold the global lock to call this helper */
3096 void colo_release_ram_cache(void)
3098 RAMBlock *block;
3100 memory_global_dirty_log_stop();
3101 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3102 g_free(block->bmap);
3103 block->bmap = NULL;
3106 WITH_RCU_READ_LOCK_GUARD() {
3107 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3108 if (block->colo_cache) {
3109 qemu_anon_ram_free(block->colo_cache, block->used_length);
3110 block->colo_cache = NULL;
3114 ram_state_cleanup(&ram_state);
3118 * ram_load_setup: Setup RAM for migration incoming side
3120 * Returns zero to indicate success and negative for error
3122 * @f: QEMUFile where to receive the data
3123 * @opaque: RAMState pointer
3125 static int ram_load_setup(QEMUFile *f, void *opaque)
3127 if (compress_threads_load_setup(f)) {
3128 return -1;
3131 xbzrle_load_setup();
3132 ramblock_recv_map_init();
3134 return 0;
3137 static int ram_load_cleanup(void *opaque)
3139 RAMBlock *rb;
3141 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3142 qemu_ram_block_writeback(rb);
3145 xbzrle_load_cleanup();
3146 compress_threads_load_cleanup();
3148 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3149 g_free(rb->receivedmap);
3150 rb->receivedmap = NULL;
3153 return 0;
3157 * ram_postcopy_incoming_init: allocate postcopy data structures
3159 * Returns 0 for success and negative if there was one error
3161 * @mis: current migration incoming state
3163 * Allocate data structures etc needed by incoming migration with
3164 * postcopy-ram. postcopy-ram's similarly names
3165 * postcopy_ram_incoming_init does the work.
3167 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3169 return postcopy_ram_incoming_init(mis);
3173 * ram_load_postcopy: load a page in postcopy case
3175 * Returns 0 for success or -errno in case of error
3177 * Called in postcopy mode by ram_load().
3178 * rcu_read_lock is taken prior to this being called.
3180 * @f: QEMUFile where to send the data
3182 static int ram_load_postcopy(QEMUFile *f)
3184 int flags = 0, ret = 0;
3185 bool place_needed = false;
3186 bool matches_target_page_size = false;
3187 MigrationIncomingState *mis = migration_incoming_get_current();
3188 /* Temporary page that is later 'placed' */
3189 void *postcopy_host_page = mis->postcopy_tmp_page;
3190 void *this_host = NULL;
3191 bool all_zero = true;
3192 int target_pages = 0;
3194 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3195 ram_addr_t addr;
3196 void *host = NULL;
3197 void *page_buffer = NULL;
3198 void *place_source = NULL;
3199 RAMBlock *block = NULL;
3200 uint8_t ch;
3201 int len;
3203 addr = qemu_get_be64(f);
3206 * If qemu file error, we should stop here, and then "addr"
3207 * may be invalid
3209 ret = qemu_file_get_error(f);
3210 if (ret) {
3211 break;
3214 flags = addr & ~TARGET_PAGE_MASK;
3215 addr &= TARGET_PAGE_MASK;
3217 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3218 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3219 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3220 block = ram_block_from_stream(f, flags);
3222 host = host_from_ram_block_offset(block, addr);
3223 if (!host) {
3224 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3225 ret = -EINVAL;
3226 break;
3228 target_pages++;
3229 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3231 * Postcopy requires that we place whole host pages atomically;
3232 * these may be huge pages for RAMBlocks that are backed by
3233 * hugetlbfs.
3234 * To make it atomic, the data is read into a temporary page
3235 * that's moved into place later.
3236 * The migration protocol uses, possibly smaller, target-pages
3237 * however the source ensures it always sends all the components
3238 * of a host page in one chunk.
3240 page_buffer = postcopy_host_page +
3241 ((uintptr_t)host & (block->page_size - 1));
3242 if (target_pages == 1) {
3243 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3244 block->page_size);
3245 } else {
3246 /* not the 1st TP within the HP */
3247 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3248 (uintptr_t)this_host) {
3249 error_report("Non-same host page %p/%p",
3250 host, this_host);
3251 ret = -EINVAL;
3252 break;
3257 * If it's the last part of a host page then we place the host
3258 * page
3260 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3261 place_needed = true;
3263 place_source = postcopy_host_page;
3266 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3267 case RAM_SAVE_FLAG_ZERO:
3268 ch = qemu_get_byte(f);
3270 * Can skip to set page_buffer when
3271 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3273 if (ch || !matches_target_page_size) {
3274 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3276 if (ch) {
3277 all_zero = false;
3279 break;
3281 case RAM_SAVE_FLAG_PAGE:
3282 all_zero = false;
3283 if (!matches_target_page_size) {
3284 /* For huge pages, we always use temporary buffer */
3285 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3286 } else {
3288 * For small pages that matches target page size, we
3289 * avoid the qemu_file copy. Instead we directly use
3290 * the buffer of QEMUFile to place the page. Note: we
3291 * cannot do any QEMUFile operation before using that
3292 * buffer to make sure the buffer is valid when
3293 * placing the page.
3295 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3296 TARGET_PAGE_SIZE);
3298 break;
3299 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3300 all_zero = false;
3301 len = qemu_get_be32(f);
3302 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3303 error_report("Invalid compressed data length: %d", len);
3304 ret = -EINVAL;
3305 break;
3307 decompress_data_with_multi_threads(f, page_buffer, len);
3308 break;
3310 case RAM_SAVE_FLAG_EOS:
3311 /* normal exit */
3312 multifd_recv_sync_main();
3313 break;
3314 default:
3315 error_report("Unknown combination of migration flags: 0x%x"
3316 " (postcopy mode)", flags);
3317 ret = -EINVAL;
3318 break;
3321 /* Got the whole host page, wait for decompress before placing. */
3322 if (place_needed) {
3323 ret |= wait_for_decompress_done();
3326 /* Detect for any possible file errors */
3327 if (!ret && qemu_file_get_error(f)) {
3328 ret = qemu_file_get_error(f);
3331 if (!ret && place_needed) {
3332 /* This gets called at the last target page in the host page */
3333 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3334 block->page_size);
3336 if (all_zero) {
3337 ret = postcopy_place_page_zero(mis, place_dest,
3338 block);
3339 } else {
3340 ret = postcopy_place_page(mis, place_dest,
3341 place_source, block);
3343 place_needed = false;
3344 target_pages = 0;
3345 /* Assume we have a zero page until we detect something different */
3346 all_zero = true;
3350 return ret;
3353 static bool postcopy_is_advised(void)
3355 PostcopyState ps = postcopy_state_get();
3356 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3359 static bool postcopy_is_running(void)
3361 PostcopyState ps = postcopy_state_get();
3362 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3366 * Flush content of RAM cache into SVM's memory.
3367 * Only flush the pages that be dirtied by PVM or SVM or both.
3369 void colo_flush_ram_cache(void)
3371 RAMBlock *block = NULL;
3372 void *dst_host;
3373 void *src_host;
3374 unsigned long offset = 0;
3376 memory_global_dirty_log_sync();
3377 WITH_RCU_READ_LOCK_GUARD() {
3378 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3379 ramblock_sync_dirty_bitmap(ram_state, block);
3383 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3384 WITH_RCU_READ_LOCK_GUARD() {
3385 block = QLIST_FIRST_RCU(&ram_list.blocks);
3387 while (block) {
3388 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3390 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3391 >= block->used_length) {
3392 offset = 0;
3393 block = QLIST_NEXT_RCU(block, next);
3394 } else {
3395 migration_bitmap_clear_dirty(ram_state, block, offset);
3396 dst_host = block->host
3397 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3398 src_host = block->colo_cache
3399 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3400 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3404 trace_colo_flush_ram_cache_end();
3408 * ram_load_precopy: load pages in precopy case
3410 * Returns 0 for success or -errno in case of error
3412 * Called in precopy mode by ram_load().
3413 * rcu_read_lock is taken prior to this being called.
3415 * @f: QEMUFile where to send the data
3417 static int ram_load_precopy(QEMUFile *f)
3419 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3420 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3421 bool postcopy_advised = postcopy_is_advised();
3422 if (!migrate_use_compression()) {
3423 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3426 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3427 ram_addr_t addr, total_ram_bytes;
3428 void *host = NULL, *host_bak = NULL;
3429 uint8_t ch;
3432 * Yield periodically to let main loop run, but an iteration of
3433 * the main loop is expensive, so do it each some iterations
3435 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3436 aio_co_schedule(qemu_get_current_aio_context(),
3437 qemu_coroutine_self());
3438 qemu_coroutine_yield();
3440 i++;
3442 addr = qemu_get_be64(f);
3443 flags = addr & ~TARGET_PAGE_MASK;
3444 addr &= TARGET_PAGE_MASK;
3446 if (flags & invalid_flags) {
3447 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3448 error_report("Received an unexpected compressed page");
3451 ret = -EINVAL;
3452 break;
3455 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3456 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3457 RAMBlock *block = ram_block_from_stream(f, flags);
3459 host = host_from_ram_block_offset(block, addr);
3461 * After going into COLO stage, we should not load the page
3462 * into SVM's memory directly, we put them into colo_cache firstly.
3463 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3464 * Previously, we copied all these memory in preparing stage of COLO
3465 * while we need to stop VM, which is a time-consuming process.
3466 * Here we optimize it by a trick, back-up every page while in
3467 * migration process while COLO is enabled, though it affects the
3468 * speed of the migration, but it obviously reduce the downtime of
3469 * back-up all SVM'S memory in COLO preparing stage.
3471 if (migration_incoming_colo_enabled()) {
3472 if (migration_incoming_in_colo_state()) {
3473 /* In COLO stage, put all pages into cache temporarily */
3474 host = colo_cache_from_block_offset(block, addr, true);
3475 } else {
3477 * In migration stage but before COLO stage,
3478 * Put all pages into both cache and SVM's memory.
3480 host_bak = colo_cache_from_block_offset(block, addr, false);
3483 if (!host) {
3484 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3485 ret = -EINVAL;
3486 break;
3488 if (!migration_incoming_in_colo_state()) {
3489 ramblock_recv_bitmap_set(block, host);
3492 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3495 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3496 case RAM_SAVE_FLAG_MEM_SIZE:
3497 /* Synchronize RAM block list */
3498 total_ram_bytes = addr;
3499 while (!ret && total_ram_bytes) {
3500 RAMBlock *block;
3501 char id[256];
3502 ram_addr_t length;
3504 len = qemu_get_byte(f);
3505 qemu_get_buffer(f, (uint8_t *)id, len);
3506 id[len] = 0;
3507 length = qemu_get_be64(f);
3509 block = qemu_ram_block_by_name(id);
3510 if (block && !qemu_ram_is_migratable(block)) {
3511 error_report("block %s should not be migrated !", id);
3512 ret = -EINVAL;
3513 } else if (block) {
3514 if (length != block->used_length) {
3515 Error *local_err = NULL;
3517 ret = qemu_ram_resize(block, length,
3518 &local_err);
3519 if (local_err) {
3520 error_report_err(local_err);
3523 /* For postcopy we need to check hugepage sizes match */
3524 if (postcopy_advised &&
3525 block->page_size != qemu_host_page_size) {
3526 uint64_t remote_page_size = qemu_get_be64(f);
3527 if (remote_page_size != block->page_size) {
3528 error_report("Mismatched RAM page size %s "
3529 "(local) %zd != %" PRId64,
3530 id, block->page_size,
3531 remote_page_size);
3532 ret = -EINVAL;
3535 if (migrate_ignore_shared()) {
3536 hwaddr addr = qemu_get_be64(f);
3537 if (ramblock_is_ignored(block) &&
3538 block->mr->addr != addr) {
3539 error_report("Mismatched GPAs for block %s "
3540 "%" PRId64 "!= %" PRId64,
3541 id, (uint64_t)addr,
3542 (uint64_t)block->mr->addr);
3543 ret = -EINVAL;
3546 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3547 block->idstr);
3548 } else {
3549 error_report("Unknown ramblock \"%s\", cannot "
3550 "accept migration", id);
3551 ret = -EINVAL;
3554 total_ram_bytes -= length;
3556 break;
3558 case RAM_SAVE_FLAG_ZERO:
3559 ch = qemu_get_byte(f);
3560 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3561 break;
3563 case RAM_SAVE_FLAG_PAGE:
3564 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3565 break;
3567 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3568 len = qemu_get_be32(f);
3569 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3570 error_report("Invalid compressed data length: %d", len);
3571 ret = -EINVAL;
3572 break;
3574 decompress_data_with_multi_threads(f, host, len);
3575 break;
3577 case RAM_SAVE_FLAG_XBZRLE:
3578 if (load_xbzrle(f, addr, host) < 0) {
3579 error_report("Failed to decompress XBZRLE page at "
3580 RAM_ADDR_FMT, addr);
3581 ret = -EINVAL;
3582 break;
3584 break;
3585 case RAM_SAVE_FLAG_EOS:
3586 /* normal exit */
3587 multifd_recv_sync_main();
3588 break;
3589 default:
3590 if (flags & RAM_SAVE_FLAG_HOOK) {
3591 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3592 } else {
3593 error_report("Unknown combination of migration flags: 0x%x",
3594 flags);
3595 ret = -EINVAL;
3598 if (!ret) {
3599 ret = qemu_file_get_error(f);
3601 if (!ret && host_bak) {
3602 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3606 ret |= wait_for_decompress_done();
3607 return ret;
3610 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3612 int ret = 0;
3613 static uint64_t seq_iter;
3615 * If system is running in postcopy mode, page inserts to host memory must
3616 * be atomic
3618 bool postcopy_running = postcopy_is_running();
3620 seq_iter++;
3622 if (version_id != 4) {
3623 return -EINVAL;
3627 * This RCU critical section can be very long running.
3628 * When RCU reclaims in the code start to become numerous,
3629 * it will be necessary to reduce the granularity of this
3630 * critical section.
3632 WITH_RCU_READ_LOCK_GUARD() {
3633 if (postcopy_running) {
3634 ret = ram_load_postcopy(f);
3635 } else {
3636 ret = ram_load_precopy(f);
3639 trace_ram_load_complete(ret, seq_iter);
3641 return ret;
3644 static bool ram_has_postcopy(void *opaque)
3646 RAMBlock *rb;
3647 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3648 if (ramblock_is_pmem(rb)) {
3649 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
3650 "is not supported now!", rb->idstr, rb->host);
3651 return false;
3655 return migrate_postcopy_ram();
3658 /* Sync all the dirty bitmap with destination VM. */
3659 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3661 RAMBlock *block;
3662 QEMUFile *file = s->to_dst_file;
3663 int ramblock_count = 0;
3665 trace_ram_dirty_bitmap_sync_start();
3667 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3668 qemu_savevm_send_recv_bitmap(file, block->idstr);
3669 trace_ram_dirty_bitmap_request(block->idstr);
3670 ramblock_count++;
3673 trace_ram_dirty_bitmap_sync_wait();
3675 /* Wait until all the ramblocks' dirty bitmap synced */
3676 while (ramblock_count--) {
3677 qemu_sem_wait(&s->rp_state.rp_sem);
3680 trace_ram_dirty_bitmap_sync_complete();
3682 return 0;
3685 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3687 qemu_sem_post(&s->rp_state.rp_sem);
3691 * Read the received bitmap, revert it as the initial dirty bitmap.
3692 * This is only used when the postcopy migration is paused but wants
3693 * to resume from a middle point.
3695 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3697 int ret = -EINVAL;
3698 QEMUFile *file = s->rp_state.from_dst_file;
3699 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3700 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3701 uint64_t size, end_mark;
3703 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3705 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3706 error_report("%s: incorrect state %s", __func__,
3707 MigrationStatus_str(s->state));
3708 return -EINVAL;
3712 * Note: see comments in ramblock_recv_bitmap_send() on why we
3713 * need the endianness conversion, and the paddings.
3715 local_size = ROUND_UP(local_size, 8);
3717 /* Add paddings */
3718 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
3720 size = qemu_get_be64(file);
3722 /* The size of the bitmap should match with our ramblock */
3723 if (size != local_size) {
3724 error_report("%s: ramblock '%s' bitmap size mismatch "
3725 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
3726 block->idstr, size, local_size);
3727 ret = -EINVAL;
3728 goto out;
3731 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
3732 end_mark = qemu_get_be64(file);
3734 ret = qemu_file_get_error(file);
3735 if (ret || size != local_size) {
3736 error_report("%s: read bitmap failed for ramblock '%s': %d"
3737 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
3738 __func__, block->idstr, ret, local_size, size);
3739 ret = -EIO;
3740 goto out;
3743 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
3744 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
3745 __func__, block->idstr, end_mark);
3746 ret = -EINVAL;
3747 goto out;
3751 * Endianness conversion. We are during postcopy (though paused).
3752 * The dirty bitmap won't change. We can directly modify it.
3754 bitmap_from_le(block->bmap, le_bitmap, nbits);
3757 * What we received is "received bitmap". Revert it as the initial
3758 * dirty bitmap for this ramblock.
3760 bitmap_complement(block->bmap, block->bmap, nbits);
3762 trace_ram_dirty_bitmap_reload_complete(block->idstr);
3765 * We succeeded to sync bitmap for current ramblock. If this is
3766 * the last one to sync, we need to notify the main send thread.
3768 ram_dirty_bitmap_reload_notify(s);
3770 ret = 0;
3771 out:
3772 g_free(le_bitmap);
3773 return ret;
3776 static int ram_resume_prepare(MigrationState *s, void *opaque)
3778 RAMState *rs = *(RAMState **)opaque;
3779 int ret;
3781 ret = ram_dirty_bitmap_sync_all(s, rs);
3782 if (ret) {
3783 return ret;
3786 ram_state_resume_prepare(rs, s->to_dst_file);
3788 return 0;
3791 static SaveVMHandlers savevm_ram_handlers = {
3792 .save_setup = ram_save_setup,
3793 .save_live_iterate = ram_save_iterate,
3794 .save_live_complete_postcopy = ram_save_complete,
3795 .save_live_complete_precopy = ram_save_complete,
3796 .has_postcopy = ram_has_postcopy,
3797 .save_live_pending = ram_save_pending,
3798 .load_state = ram_load,
3799 .save_cleanup = ram_save_cleanup,
3800 .load_setup = ram_load_setup,
3801 .load_cleanup = ram_load_cleanup,
3802 .resume_prepare = ram_resume_prepare,
3805 void ram_mig_init(void)
3807 qemu_mutex_init(&XBZRLE.lock);
3808 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);