[qemu.git] / migration / ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value, and renamed it to avoid
71 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h; start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
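/*
 * The flags above travel in the low bits of the 64-bit offset word that
 * save_page_header() below emits; page offsets are target-page aligned, so
 * those bits are otherwise zero. A minimal sketch of splitting such a word,
 * assuming 4 KiB target pages; all sketch_* names are illustrative
 * assumptions, not part of this file, and types/printf come from the
 * headers already included above:
 */
#define SKETCH_PAGE_MASK  0xfffULL   /* assumes 4 KiB target pages */
#define SKETCH_FLAG_ZERO  0x02ULL    /* mirrors RAM_SAVE_FLAG_ZERO */
#define SKETCH_FLAG_CONT  0x20ULL    /* mirrors RAM_SAVE_FLAG_CONTINUE */

static void sketch_split_header(uint64_t wire_word)
{
    uint64_t flags  = wire_word & SKETCH_PAGE_MASK;    /* low bits: flags */
    uint64_t offset = wire_word & ~SKETCH_PAGE_MASK;   /* high bits: page offset */

    printf("offset=0x%" PRIx64 " zero=%d continue=%d\n", offset,
           !!(flags & SKETCH_FLAG_ZERO), !!(flags & SKETCH_FLAG_CONT));
}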
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from migrate_params_apply in the main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock.
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
131 PageCache *new_cache;
132 int64_t ret = 0;
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
146 XBZRLE_cache_lock();
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
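/*
 * A hedged caller sketch for the resize helper above, showing the Error **
 * convention it relies on; the context in which this would actually run is
 * an assumption, not something this file defines:
 */
static void sketch_apply_cache_size(uint64_t requested_size)
{
    Error *err = NULL;

    if (xbzrle_cache_resize(requested_size, &err) < 0) {
        /* the resize failed; err carries the reason and is consumed here */
        error_report_err(err);
    }
}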
163 bool ramblock_is_ignored(RAMBlock *block)
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 RAMBlock *block;
174 int ret = 0;
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
184 return ret;
187 static void ramblock_recv_map_init(void)
189 RAMBlock *rb;
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 (the number of bytes sent) on success, or <0 on error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit beforehand.
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 * Always use little endian when sending the bitmap, so that it works
251 * even when source and destination VMs are not using the same
252 * endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
265 size = ROUND_UP(size, 8);
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270 * Mark the end, in case the middle part got screwed up for
271 * some "mysterious" reason.
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
276 g_free(le_bitmap);
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
282 return size + sizeof(size);
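/*
 * The consumer of this layout lives on the other end of the stream. As a
 * rough illustration only (not this file's actual receive path), a reader
 * of the format produced above could look like this:
 */
static int sketch_read_recv_bitmap(QEMUFile *file, unsigned long *le_bitmap,
                                   uint64_t expected_size)
{
    uint64_t size = qemu_get_be64(file);

    if (size != expected_size) {
        return -1;                              /* unexpected bitmap size */
    }
    qemu_get_buffer(file, (uint8_t *)le_bitmap, size);
    if (qemu_get_be64(file) != RAMBLOCK_RECV_BITMAP_ENDING) {
        return -1;                              /* end marker missing or corrupt */
    }
    return 0;
}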
286 * An outstanding page request, on the source, having been received
287 * and queued
289 struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 /* State of RAM for migration */
298 struct RAMState {
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
309 /* last ram version we have seen */
310 uint32_t last_version;
311 /* How many times we have dirtied too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
328 /* Are we on the last stage of migration */
329 bool last_stage;
330 /* compression statistics since the beginning of the period */
331 /* number of times there was no free thread to compress data */
332 uint64_t compress_thread_busy_prev;
333 /* amount of bytes after compression */
334 uint64_t compressed_size_prev;
335 /* amount of compressed pages */
336 uint64_t compress_pages_prev;
338 /* total handled target pages at the beginning of period */
339 uint64_t target_page_count_prev;
340 /* total handled target pages since start */
341 uint64_t target_page_count;
342 /* number of dirty bits in the bitmap */
343 uint64_t migration_dirty_pages;
344 /* Protects modification of the bitmap and migration dirty pages */
345 QemuMutex bitmap_mutex;
346 /* The RAMBlock used in the last src_page_requests */
347 RAMBlock *last_req_rb;
348 /* Queue of outstanding page requests from the destination */
349 QemuMutex src_page_req_mutex;
350 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 typedef struct RAMState RAMState;
354 static RAMState *ram_state;
356 static NotifierWithReturnList precopy_notifier_list;
358 /* Whether postcopy has queued requests */
359 static bool postcopy_has_request(RAMState *rs)
361 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
364 void precopy_infrastructure_init(void)
366 notifier_with_return_list_init(&precopy_notifier_list);
369 void precopy_add_notifier(NotifierWithReturn *n)
371 notifier_with_return_list_add(&precopy_notifier_list, n);
374 void precopy_remove_notifier(NotifierWithReturn *n)
376 notifier_with_return_remove(n);
379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
381 PrecopyNotifyData pnd;
382 pnd.reason = reason;
383 pnd.errp = errp;
385 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
388 uint64_t ram_bytes_remaining(void)
390 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
394 MigrationStats ram_counters;
396 static void ram_transferred_add(uint64_t bytes)
398 if (runstate_is_running()) {
399 ram_counters.precopy_bytes += bytes;
400 } else if (migration_in_postcopy()) {
401 ram_counters.postcopy_bytes += bytes;
402 } else {
403 ram_counters.downtime_bytes += bytes;
405 ram_counters.transferred += bytes;
408 /* used by the search for pages to send */
409 struct PageSearchStatus {
410 /* Current block being searched */
411 RAMBlock *block;
412 /* Current page to search from */
413 unsigned long page;
414 /* Set once we wrap around */
415 bool complete_round;
417 typedef struct PageSearchStatus PageSearchStatus;
419 CompressionStats compression_counters;
421 struct CompressParam {
422 bool done;
423 bool quit;
424 bool zero_page;
425 QEMUFile *file;
426 QemuMutex mutex;
427 QemuCond cond;
428 RAMBlock *block;
429 ram_addr_t offset;
431 /* internally used fields */
432 z_stream stream;
433 uint8_t *originbuf;
435 typedef struct CompressParam CompressParam;
437 struct DecompressParam {
438 bool done;
439 bool quit;
440 QemuMutex mutex;
441 QemuCond cond;
442 void *des;
443 uint8_t *compbuf;
444 int len;
445 z_stream stream;
447 typedef struct DecompressParam DecompressParam;
449 static CompressParam *comp_param;
450 static QemuThread *compress_threads;
451 /* comp_done_cond is used to wake up the migration thread when
452 * one of the compression threads has finished the compression.
453 * comp_done_lock is used together with comp_done_cond.
455 static QemuMutex comp_done_lock;
456 static QemuCond comp_done_cond;
457 /* The empty QEMUFileOps will be used by file in CompressParam */
458 static const QEMUFileOps empty_ops = { };
460 static QEMUFile *decomp_file;
461 static DecompressParam *decomp_param;
462 static QemuThread *decompress_threads;
463 static QemuMutex decomp_done_lock;
464 static QemuCond decomp_done_cond;
466 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
467 ram_addr_t offset, uint8_t *source_buf);
469 static void *do_data_compress(void *opaque)
471 CompressParam *param = opaque;
472 RAMBlock *block;
473 ram_addr_t offset;
474 bool zero_page;
476 qemu_mutex_lock(&param->mutex);
477 while (!param->quit) {
478 if (param->block) {
479 block = param->block;
480 offset = param->offset;
481 param->block = NULL;
482 qemu_mutex_unlock(&param->mutex);
484 zero_page = do_compress_ram_page(param->file, &param->stream,
485 block, offset, param->originbuf);
487 qemu_mutex_lock(&comp_done_lock);
488 param->done = true;
489 param->zero_page = zero_page;
490 qemu_cond_signal(&comp_done_cond);
491 qemu_mutex_unlock(&comp_done_lock);
493 qemu_mutex_lock(&param->mutex);
494 } else {
495 qemu_cond_wait(&param->cond, &param->mutex);
498 qemu_mutex_unlock(&param->mutex);
500 return NULL;
503 static void compress_threads_save_cleanup(void)
505 int i, thread_count;
507 if (!migrate_use_compression() || !comp_param) {
508 return;
511 thread_count = migrate_compress_threads();
512 for (i = 0; i < thread_count; i++) {
514 * we use it as an indicator which shows whether the thread is
515 * properly init'd or not
517 if (!comp_param[i].file) {
518 break;
521 qemu_mutex_lock(&comp_param[i].mutex);
522 comp_param[i].quit = true;
523 qemu_cond_signal(&comp_param[i].cond);
524 qemu_mutex_unlock(&comp_param[i].mutex);
526 qemu_thread_join(compress_threads + i);
527 qemu_mutex_destroy(&comp_param[i].mutex);
528 qemu_cond_destroy(&comp_param[i].cond);
529 deflateEnd(&comp_param[i].stream);
530 g_free(comp_param[i].originbuf);
531 qemu_fclose(comp_param[i].file);
532 comp_param[i].file = NULL;
534 qemu_mutex_destroy(&comp_done_lock);
535 qemu_cond_destroy(&comp_done_cond);
536 g_free(compress_threads);
537 g_free(comp_param);
538 compress_threads = NULL;
539 comp_param = NULL;
542 static int compress_threads_save_setup(void)
544 int i, thread_count;
546 if (!migrate_use_compression()) {
547 return 0;
549 thread_count = migrate_compress_threads();
550 compress_threads = g_new0(QemuThread, thread_count);
551 comp_param = g_new0(CompressParam, thread_count);
552 qemu_cond_init(&comp_done_cond);
553 qemu_mutex_init(&comp_done_lock);
554 for (i = 0; i < thread_count; i++) {
555 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
556 if (!comp_param[i].originbuf) {
557 goto exit;
560 if (deflateInit(&comp_param[i].stream,
561 migrate_compress_level()) != Z_OK) {
562 g_free(comp_param[i].originbuf);
563 goto exit;
566 /* comp_param[i].file is just used as a dummy buffer to save data,
567 * set its ops to empty.
569 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
570 comp_param[i].done = true;
571 comp_param[i].quit = false;
572 qemu_mutex_init(&comp_param[i].mutex);
573 qemu_cond_init(&comp_param[i].cond);
574 qemu_thread_create(compress_threads + i, "compress",
575 do_data_compress, comp_param + i,
576 QEMU_THREAD_JOINABLE);
578 return 0;
580 exit:
581 compress_threads_save_cleanup();
582 return -1;
586 * save_page_header: write page header to wire
588 * If this is the 1st block, it also writes the block identification
590 * Returns the number of bytes written
592 * @f: QEMUFile where to send the data
593 * @block: block that contains the page we want to send
594 * @offset: offset inside the block for the page
595 * in the lower bits, it contains flags
597 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
598 ram_addr_t offset)
600 size_t size, len;
602 if (block == rs->last_sent_block) {
603 offset |= RAM_SAVE_FLAG_CONTINUE;
605 qemu_put_be64(f, offset);
606 size = 8;
608 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
609 len = strlen(block->idstr);
610 qemu_put_byte(f, len);
611 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
612 size += 1 + len;
613 rs->last_sent_block = block;
615 return size;
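/*
 * Undoing this header on the load side means pulling the flags out of the
 * low bits and reading the idstr only when RAM_SAVE_FLAG_CONTINUE is clear.
 * A schematic sketch (illustration only; the real parsing lives in the load
 * path, not in this function):
 */
static void sketch_parse_page_header(QEMUFile *f)
{
    char idstr[256];
    uint64_t addr = qemu_get_be64(f);
    uint64_t flags = addr & ~TARGET_PAGE_MASK;   /* low bits carry the flags */

    addr &= TARGET_PAGE_MASK;                    /* aligned page offset */
    if (!(flags & RAM_SAVE_FLAG_CONTINUE)) {
        int len = qemu_get_byte(f);

        qemu_get_buffer(f, (uint8_t *)idstr, len);
        idstr[len] = '\0';                       /* block name for lookup */
    }
    (void)addr;
}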
619 * mig_throttle_guest_down: throttle down the guest
621 * Reduce amount of guest cpu execution to hopefully slow down memory
622 * writes. If guest dirty memory rate is reduced below the rate at
623 * which we can transfer pages to the destination then we should be
624 * able to complete migration. Some workloads dirty memory way too
625 * fast and will not effectively converge, even with auto-converge.
627 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
628 uint64_t bytes_dirty_threshold)
630 MigrationState *s = migrate_get_current();
631 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
632 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
633 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
634 int pct_max = s->parameters.max_cpu_throttle;
636 uint64_t throttle_now = cpu_throttle_get_percentage();
637 uint64_t cpu_now, cpu_ideal, throttle_inc;
639 /* We have not started throttling yet. Let's start it. */
640 if (!cpu_throttle_active()) {
641 cpu_throttle_set(pct_initial);
642 } else {
643 /* Throttling already on, just increase the rate */
644 if (!pct_tailslow) {
645 throttle_inc = pct_increment;
646 } else {
647 /* Compute the ideal CPU percentage used by the guest, which would
648 * make the dirty rate match the dirty rate threshold. */
649 cpu_now = 100 - throttle_now;
650 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
651 bytes_dirty_period);
652 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
654 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
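/*
 * A worked instance of the tailslow branch above, with numbers invented
 * purely for illustration, to make the bounding behaviour explicit:
 */
static void sketch_tailslow_example(void)
{
    uint64_t throttle_now = 20;                 /* guest already throttled 20% */
    uint64_t cpu_now = 100 - throttle_now;      /* 80% CPU left to the guest */
    double ratio = 0.5;                         /* threshold / dirtied bytes */
    uint64_t cpu_ideal = cpu_now * ratio;       /* 40% would match the target */
    uint64_t pct_increment = 10;
    uint64_t throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);

    /* throttle_inc is capped at 10, so throttling rises to 30%, not to 60% */
    (void)throttle_inc;
}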
658 void mig_throttle_counter_reset(void)
660 RAMState *rs = ram_state;
662 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
663 rs->num_dirty_pages_period = 0;
664 rs->bytes_xfer_prev = ram_counters.transferred;
668 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
670 * @rs: current RAM state
671 * @current_addr: address for the zero page
673 * Update the xbzrle cache to reflect a page that's been sent as all 0.
674 * The important thing is that a stale (not-yet-0'd) page be replaced
675 * by the new data.
676 * As a bonus, if the page wasn't in the cache it gets added so that
677 * when a small write is made into the 0'd page it gets XBZRLE sent.
679 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
681 if (!rs->xbzrle_enabled) {
682 return;
685 /* We don't care if this fails to allocate a new cache page
686 * as long as it updates an old one */
687 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
688 ram_counters.dirty_sync_count);
691 #define ENCODING_FLAG_XBZRLE 0x1
694 * save_xbzrle_page: compress and send current page
696 * Returns: 1 means that we wrote the page
697 * 0 means that page is identical to the one already sent
698 * -1 means that xbzrle would be longer than normal
700 * @rs: current RAM state
701 * @current_data: pointer to the address of the page contents
702 * @current_addr: addr of the page
703 * @block: block that contains the page we want to send
704 * @offset: offset inside the block for the page
706 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
707 ram_addr_t current_addr, RAMBlock *block,
708 ram_addr_t offset)
710 int encoded_len = 0, bytes_xbzrle;
711 uint8_t *prev_cached_page;
713 if (!cache_is_cached(XBZRLE.cache, current_addr,
714 ram_counters.dirty_sync_count)) {
715 xbzrle_counters.cache_miss++;
716 if (!rs->last_stage) {
717 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
718 ram_counters.dirty_sync_count) == -1) {
719 return -1;
720 } else {
721 /* update *current_data when the page has been
722 inserted into cache */
723 *current_data = get_cached_data(XBZRLE.cache, current_addr);
726 return -1;
730 * Reaching here means the page has hit the xbzrle cache; no matter what
731 * the encoding result is (normal encoding, overflow or skipping the page),
732 * count the page as encoded. This is used to calculate the encoding rate.
734 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
735 * 2nd page turns out to be skipped (i.e. no new bytes written to the
736 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
737 * skipped page included. In this way, the encoding rate can tell if the
738 * guest page is good for xbzrle encoding.
740 xbzrle_counters.pages++;
741 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
743 /* save current buffer into memory */
744 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
746 /* XBZRLE encoding (if there is no overflow) */
747 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
748 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
749 TARGET_PAGE_SIZE);
752 * Update the cache contents, so that it corresponds to the data
753 * sent, in all cases except where we skip the page.
755 if (!rs->last_stage && encoded_len != 0) {
756 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
758 * In the case where we couldn't compress, ensure that the caller
759 * sends the data from the cache, since the guest might have
760 * changed the RAM since we copied it.
762 *current_data = prev_cached_page;
765 if (encoded_len == 0) {
766 trace_save_xbzrle_page_skipping();
767 return 0;
768 } else if (encoded_len == -1) {
769 trace_save_xbzrle_page_overflow();
770 xbzrle_counters.overflow++;
771 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
772 return -1;
775 /* Send XBZRLE based compressed page */
776 bytes_xbzrle = save_page_header(rs, rs->f, block,
777 offset | RAM_SAVE_FLAG_XBZRLE);
778 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
779 qemu_put_be16(rs->f, encoded_len);
780 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
781 bytes_xbzrle += encoded_len + 1 + 2;
783 * Like compressed_size (please see update_compress_thread_counts),
784 * the xbzrle encoded bytes don't count the 8 byte header with
785 * RAM_SAVE_FLAG_CONTINUE.
787 xbzrle_counters.bytes += bytes_xbzrle - 8;
788 ram_transferred_add(bytes_xbzrle);
790 return 1;
794 * migration_bitmap_find_dirty: find the next dirty page from start
796 * Returns the page offset within memory region of the start of a dirty page
798 * @rs: current RAM state
799 * @rb: RAMBlock where to search for dirty pages
800 * @start: page where we start the search
802 static inline
803 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
804 unsigned long start)
806 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
807 unsigned long *bitmap = rb->bmap;
809 if (ramblock_is_ignored(rb)) {
810 return size;
813 return find_next_bit(bitmap, size, start);
816 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
817 unsigned long page)
819 uint8_t shift;
820 hwaddr size, start;
822 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
823 return;
826 shift = rb->clear_bmap_shift;
828 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It can make
829 * things easier sometimes, since the start address of the small
830 * chunk will always be aligned to 64 pages, so the bitmap will
831 * always be aligned to unsigned long. We should even be able to
832 * remove this restriction, but I'm simply keeping it.
835 assert(shift >= 6);
837 size = 1ULL << (TARGET_PAGE_BITS + shift);
838 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
839 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
840 memory_region_clear_dirty_bitmap(rb->mr, start, size);
843 static void
844 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
845 unsigned long start,
846 unsigned long npages)
848 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
849 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
850 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
853 * Clear pages from start to start + npages - 1, so the end boundary is
854 * exclusive.
856 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
857 migration_clear_memory_region_dirty_bitmap(rb, i);
862 * colo_bitmap_find_dirty: find contiguous dirty pages from start
864 * Returns the page offset within the memory region of the start of the
865 * contiguous dirty pages
867 * @rs: current RAM state
868 * @rb: RAMBlock where to search for dirty pages
869 * @start: page where we start the search
870 * @num: the number of contiguous dirty pages
872 static inline
873 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
874 unsigned long start, unsigned long *num)
876 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
877 unsigned long *bitmap = rb->bmap;
878 unsigned long first, next;
880 *num = 0;
882 if (ramblock_is_ignored(rb)) {
883 return size;
886 first = find_next_bit(bitmap, size, start);
887 if (first >= size) {
888 return first;
890 next = find_next_zero_bit(bitmap, size, first + 1);
891 assert(next >= first);
892 *num = next - first;
893 return first;
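/*
 * An illustrative walk over every contiguous dirty run of a block using the
 * helper above (a sketch under the same bitmap conventions; the real
 * consumer of this helper is the COLO flush code elsewhere in this file):
 */
static void sketch_walk_dirty_runs(RAMState *rs, RAMBlock *rb)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long start = 0, num = 0;

    while (start < size) {
        unsigned long first = colo_bitmap_find_dirty(rs, rb, start, &num);

        if (first >= size) {
            break;                      /* no more dirty pages in this block */
        }
        /* pages [first, first + num) are dirty and would be handled here */
        start = first + num;
    }
}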
896 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
897 RAMBlock *rb,
898 unsigned long page)
900 bool ret;
903 * Clear the dirty bitmap if needed. This _must_ be called before we
904 * send any of the pages in the chunk, because we need to make sure
905 * we can capture further page content changes when we sync the dirty
906 * log the next time. So as long as we are going to send any of the
907 * pages in the chunk, we clear the remote dirty bitmap for all of them.
908 * Clearing it earlier won't be a problem, but clearing it too late will.
910 migration_clear_memory_region_dirty_bitmap(rb, page);
912 ret = test_and_clear_bit(page, rb->bmap);
913 if (ret) {
914 rs->migration_dirty_pages--;
917 return ret;
920 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
921 void *opaque)
923 const hwaddr offset = section->offset_within_region;
924 const hwaddr size = int128_get64(section->size);
925 const unsigned long start = offset >> TARGET_PAGE_BITS;
926 const unsigned long npages = size >> TARGET_PAGE_BITS;
927 RAMBlock *rb = section->mr->ram_block;
928 uint64_t *cleared_bits = opaque;
931 * We don't grab ram_state->bitmap_mutex because we expect to run
932 * only when starting migration or during postcopy recovery where
933 * we don't have concurrent access.
935 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
936 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
938 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
939 bitmap_clear(rb->bmap, start, npages);
943 * Exclude all dirty pages from migration that fall into a discarded range as
944 * managed by a RamDiscardManager responsible for the mapped memory region of
945 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
947 * Discarded pages ("logically unplugged") have undefined content and must
948 * not get migrated, because even reading these pages for migration might
949 * result in undesired behavior.
951 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
953 * Note: The result is only stable while migrating (precopy/postcopy).
955 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
957 uint64_t cleared_bits = 0;
959 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
960 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
961 MemoryRegionSection section = {
962 .mr = rb->mr,
963 .offset_within_region = 0,
964 .size = int128_make64(qemu_ram_get_used_length(rb)),
967 ram_discard_manager_replay_discarded(rdm, &section,
968 dirty_bitmap_clear_section,
969 &cleared_bits);
971 return cleared_bits;
975 * Check if a host-page aligned page falls into a discarded range as managed by
976 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
978 * Note: The result is only stable while migrating (precopy/postcopy).
980 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
982 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
983 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
984 MemoryRegionSection section = {
985 .mr = rb->mr,
986 .offset_within_region = start,
987 .size = int128_make64(qemu_ram_pagesize(rb)),
990 return !ram_discard_manager_is_populated(rdm, &section);
992 return false;
995 /* Called with RCU critical section */
996 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
998 uint64_t new_dirty_pages =
999 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1001 rs->migration_dirty_pages += new_dirty_pages;
1002 rs->num_dirty_pages_period += new_dirty_pages;
1006 * ram_pagesize_summary: calculate all the pagesizes of a VM
1008 * Returns a summary bitmap of the page sizes of all RAMBlocks
1010 * For VMs with just normal pages this is equivalent to the host page
1011 * size. If it's got some huge pages then it's the OR of all the
1012 * different page sizes.
1014 uint64_t ram_pagesize_summary(void)
1016 RAMBlock *block;
1017 uint64_t summary = 0;
1019 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1020 summary |= block->page_size;
1023 return summary;
1026 uint64_t ram_get_total_transferred_pages(void)
1028 return ram_counters.normal + ram_counters.duplicate +
1029 compression_counters.pages + xbzrle_counters.pages;
1032 static void migration_update_rates(RAMState *rs, int64_t end_time)
1034 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1035 double compressed_size;
1037 /* calculate period counters */
1038 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1039 / (end_time - rs->time_last_bitmap_sync);
1041 if (!page_count) {
1042 return;
1045 if (migrate_use_xbzrle()) {
1046 double encoded_size, unencoded_size;
1048 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1049 rs->xbzrle_cache_miss_prev) / page_count;
1050 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1051 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1052 TARGET_PAGE_SIZE;
1053 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1054 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1055 xbzrle_counters.encoding_rate = 0;
1056 } else {
1057 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1059 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1060 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1063 if (migrate_use_compression()) {
1064 compression_counters.busy_rate = (double)(compression_counters.busy -
1065 rs->compress_thread_busy_prev) / page_count;
1066 rs->compress_thread_busy_prev = compression_counters.busy;
1068 compressed_size = compression_counters.compressed_size -
1069 rs->compressed_size_prev;
1070 if (compressed_size) {
1071 double uncompressed_size = (compression_counters.pages -
1072 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1074 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1075 compression_counters.compression_rate =
1076 uncompressed_size / compressed_size;
1078 rs->compress_pages_prev = compression_counters.pages;
1079 rs->compressed_size_prev = compression_counters.compressed_size;
1084 static void migration_trigger_throttle(RAMState *rs)
1086 MigrationState *s = migrate_get_current();
1087 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1089 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1090 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1091 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1093 /* During block migration the auto-converge logic incorrectly detects
1094 * that ram migration makes no progress. Avoid this by disabling the
1095 * throttling logic during the bulk phase of block migration. */
1096 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1097 /* The following detection logic can be refined later. For now:
1098 Check to see if the ratio between dirtied bytes and the approx.
1099 amount of bytes that just got transferred since the last time
1100 we were in this routine reaches the threshold. If that happens
1101 twice, start or increase throttling. */
1103 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1104 (++rs->dirty_rate_high_cnt >= 2)) {
1105 trace_migration_throttle();
1106 rs->dirty_rate_high_cnt = 0;
1107 mig_throttle_guest_down(bytes_dirty_period,
1108 bytes_dirty_threshold);
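/*
 * Worked numbers for the trigger above (values invented for illustration):
 */
static void sketch_throttle_trigger_example(void)
{
    uint64_t threshold = 50;                             /* percent, example value */
    uint64_t bytes_xfer_period = 1024ULL * 1024 * 1024;  /* 1 GiB sent this period */
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* 512 MiB: dirtying more than this within the same period, twice in a
     * row, starts or increases throttling */
    (void)bytes_dirty_threshold;
}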
1113 static void migration_bitmap_sync(RAMState *rs)
1115 RAMBlock *block;
1116 int64_t end_time;
1118 ram_counters.dirty_sync_count++;
1120 if (!rs->time_last_bitmap_sync) {
1121 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1124 trace_migration_bitmap_sync_start();
1125 memory_global_dirty_log_sync();
1127 qemu_mutex_lock(&rs->bitmap_mutex);
1128 WITH_RCU_READ_LOCK_GUARD() {
1129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1130 ramblock_sync_dirty_bitmap(rs, block);
1132 ram_counters.remaining = ram_bytes_remaining();
1134 qemu_mutex_unlock(&rs->bitmap_mutex);
1136 memory_global_after_dirty_log_sync();
1137 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1139 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1141 /* more than 1 second = 1000 milliseconds */
1142 if (end_time > rs->time_last_bitmap_sync + 1000) {
1143 migration_trigger_throttle(rs);
1145 migration_update_rates(rs, end_time);
1147 rs->target_page_count_prev = rs->target_page_count;
1149 /* reset period counters */
1150 rs->time_last_bitmap_sync = end_time;
1151 rs->num_dirty_pages_period = 0;
1152 rs->bytes_xfer_prev = ram_counters.transferred;
1154 if (migrate_use_events()) {
1155 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1159 static void migration_bitmap_sync_precopy(RAMState *rs)
1161 Error *local_err = NULL;
1164 * The current notifier usage is just an optimization for migration, so we
1165 * don't stop the normal migration process in the error case.
1167 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1168 error_report_err(local_err);
1169 local_err = NULL;
1172 migration_bitmap_sync(rs);
1174 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1175 error_report_err(local_err);
1179 static void ram_release_page(const char *rbname, uint64_t offset)
1181 if (!migrate_release_ram() || !migration_in_postcopy()) {
1182 return;
1185 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1189 * save_zero_page_to_file: send the zero page to the file
1191 * Returns the size of the data written to the file; 0 means the page is
1192 * not a zero page
1194 * @rs: current RAM state
1195 * @file: the file where the data is saved
1196 * @block: block that contains the page we want to send
1197 * @offset: offset inside the block for the page
1199 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1200 RAMBlock *block, ram_addr_t offset)
1202 uint8_t *p = block->host + offset;
1203 int len = 0;
1205 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1206 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1207 qemu_put_byte(file, 0);
1208 len += 1;
1209 ram_release_page(block->idstr, offset);
1211 return len;
1215 * save_zero_page: send the zero page to the stream
1217 * Returns the number of pages written.
1219 * @rs: current RAM state
1220 * @block: block that contains the page we want to send
1221 * @offset: offset inside the block for the page
1223 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1225 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1227 if (len) {
1228 ram_counters.duplicate++;
1229 ram_transferred_add(len);
1230 return 1;
1232 return -1;
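/*
 * On the wire a zero page is therefore just the header with
 * RAM_SAVE_FLAG_ZERO set plus one byte for the fill value. A schematic
 * receive step (illustration only, not this file's actual load path):
 */
static void sketch_load_zero_page(QEMUFile *f, void *host_page)
{
    int fill = qemu_get_byte(f);        /* the single byte written above (0) */

    memset(host_page, fill, TARGET_PAGE_SIZE);
}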
1236 * @pages: the number of pages written by the control path,
1237 * < 0 - error
1238 * > 0 - number of pages written
1240 * Return true if the page has been saved, otherwise false is returned.
1242 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1243 int *pages)
1245 uint64_t bytes_xmit = 0;
1246 int ret;
1248 *pages = -1;
1249 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1250 &bytes_xmit);
1251 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1252 return false;
1255 if (bytes_xmit) {
1256 ram_transferred_add(bytes_xmit);
1257 *pages = 1;
1260 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1261 return true;
1264 if (bytes_xmit > 0) {
1265 ram_counters.normal++;
1266 } else if (bytes_xmit == 0) {
1267 ram_counters.duplicate++;
1270 return true;
1274 * directly send the page to the stream
1276 * Returns the number of pages written.
1278 * @rs: current RAM state
1279 * @block: block that contains the page we want to send
1280 * @offset: offset inside the block for the page
1281 * @buf: the page to be sent
1282 * @async: send the page asynchronously
1284 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1285 uint8_t *buf, bool async)
1287 ram_transferred_add(save_page_header(rs, rs->f, block,
1288 offset | RAM_SAVE_FLAG_PAGE));
1289 if (async) {
1290 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1291 migrate_release_ram() &
1292 migration_in_postcopy());
1293 } else {
1294 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1296 ram_transferred_add(TARGET_PAGE_SIZE);
1297 ram_counters.normal++;
1298 return 1;
1302 * ram_save_page: send the given page to the stream
1304 * Returns the number of pages written.
1305 * < 0 - error
1306 * >=0 - Number of pages written - this might legally be 0
1307 * if xbzrle noticed the page was the same.
1309 * @rs: current RAM state
1310 * @block: block that contains the page we want to send
1311 * @offset: offset inside the block for the page
1313 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1315 int pages = -1;
1316 uint8_t *p;
1317 bool send_async = true;
1318 RAMBlock *block = pss->block;
1319 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1320 ram_addr_t current_addr = block->offset + offset;
1322 p = block->host + offset;
1323 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1325 XBZRLE_cache_lock();
1326 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1327 pages = save_xbzrle_page(rs, &p, current_addr, block,
1328 offset);
1329 if (!rs->last_stage) {
1330 /* Can't send this cached data async, since the cache page
1331 * might get updated before it gets to the wire
1333 send_async = false;
1337 /* XBZRLE overflow or normal page */
1338 if (pages == -1) {
1339 pages = save_normal_page(rs, block, offset, p, send_async);
1342 XBZRLE_cache_unlock();
1344 return pages;
1347 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1348 ram_addr_t offset)
1350 if (multifd_queue_page(rs->f, block, offset) < 0) {
1351 return -1;
1353 ram_counters.normal++;
1355 return 1;
1358 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1359 ram_addr_t offset, uint8_t *source_buf)
1361 RAMState *rs = ram_state;
1362 uint8_t *p = block->host + offset;
1363 int ret;
1365 if (save_zero_page_to_file(rs, f, block, offset)) {
1366 return true;
1369 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1372 * copy it to an internal buffer to avoid it being modified by the VM,
1373 * so that we can catch errors during compression and
1374 * decompression
1376 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1377 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1378 if (ret < 0) {
1379 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1380 error_report("compressed data failed!");
1382 return false;
1385 static void
1386 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1388 ram_transferred_add(bytes_xmit);
1390 if (param->zero_page) {
1391 ram_counters.duplicate++;
1392 return;
1395 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1396 compression_counters.compressed_size += bytes_xmit - 8;
1397 compression_counters.pages++;
1400 static bool save_page_use_compression(RAMState *rs);
1402 static void flush_compressed_data(RAMState *rs)
1404 int idx, len, thread_count;
1406 if (!save_page_use_compression(rs)) {
1407 return;
1409 thread_count = migrate_compress_threads();
1411 qemu_mutex_lock(&comp_done_lock);
1412 for (idx = 0; idx < thread_count; idx++) {
1413 while (!comp_param[idx].done) {
1414 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1417 qemu_mutex_unlock(&comp_done_lock);
1419 for (idx = 0; idx < thread_count; idx++) {
1420 qemu_mutex_lock(&comp_param[idx].mutex);
1421 if (!comp_param[idx].quit) {
1422 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1424 * it's safe to fetch zero_page without holding comp_done_lock
1425 * as there is no further request submitted to the thread,
1426 * i.e., the thread should be waiting for a request at this point.
1428 update_compress_thread_counts(&comp_param[idx], len);
1430 qemu_mutex_unlock(&comp_param[idx].mutex);
1434 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1435 ram_addr_t offset)
1437 param->block = block;
1438 param->offset = offset;
1441 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1442 ram_addr_t offset)
1444 int idx, thread_count, bytes_xmit = -1, pages = -1;
1445 bool wait = migrate_compress_wait_thread();
1447 thread_count = migrate_compress_threads();
1448 qemu_mutex_lock(&comp_done_lock);
1449 retry:
1450 for (idx = 0; idx < thread_count; idx++) {
1451 if (comp_param[idx].done) {
1452 comp_param[idx].done = false;
1453 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1454 qemu_mutex_lock(&comp_param[idx].mutex);
1455 set_compress_params(&comp_param[idx], block, offset);
1456 qemu_cond_signal(&comp_param[idx].cond);
1457 qemu_mutex_unlock(&comp_param[idx].mutex);
1458 pages = 1;
1459 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1460 break;
1465 * wait for a free thread if the user specifies 'compress-wait-thread',
1466 * otherwise we will post the page out in the main thread as a normal page.
1468 if (pages < 0 && wait) {
1469 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1470 goto retry;
1472 qemu_mutex_unlock(&comp_done_lock);
1474 return pages;
1478 * find_dirty_block: find the next dirty page and update any state
1479 * associated with the search process.
1481 * Returns true if a page is found
1483 * @rs: current RAM state
1484 * @pss: data about the state of the current dirty page scan
1485 * @again: set to false if the search has scanned the whole of RAM
1487 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1489 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1490 if (pss->complete_round && pss->block == rs->last_seen_block &&
1491 pss->page >= rs->last_page) {
1493 * We've been once around the RAM and haven't found anything.
1494 * Give up.
1496 *again = false;
1497 return false;
1499 if (!offset_in_ramblock(pss->block,
1500 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1501 /* Didn't find anything in this RAM Block */
1502 pss->page = 0;
1503 pss->block = QLIST_NEXT_RCU(pss->block, next);
1504 if (!pss->block) {
1506 * If memory migration starts over, we will meet a dirtied page
1507 * which may still exist in the compression threads' ring, so we
1508 * should flush the compressed data to make sure the new page
1509 * is not overwritten by the old one in the destination.
1511 * Also, if xbzrle is on, stop using the data compression at this
1512 * point. In theory, xbzrle can do better than compression.
1514 flush_compressed_data(rs);
1516 /* Hit the end of the list */
1517 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1518 /* Flag that we've looped */
1519 pss->complete_round = true;
1520 /* After the first round, enable XBZRLE. */
1521 if (migrate_use_xbzrle()) {
1522 rs->xbzrle_enabled = true;
1525 /* Didn't find anything this time, but try again on the new block */
1526 *again = true;
1527 return false;
1528 } else {
1529 /* Can go around again, but... */
1530 *again = true;
1531 /* We've found something so probably don't need to */
1532 return true;
1537 * unqueue_page: gets a page off the queue
1539 * Helper for 'get_queued_page' - gets a page off the queue
1541 * Returns the block of the page (or NULL if none available)
1543 * @rs: current RAM state
1544 * @offset: used to return the offset within the RAMBlock
1546 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1548 struct RAMSrcPageRequest *entry;
1549 RAMBlock *block = NULL;
1550 size_t page_size;
1552 if (!postcopy_has_request(rs)) {
1553 return NULL;
1556 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1559 * This should _never_ change even after we take the lock, because no one
1560 * should be taking anything off the request list other than us.
1562 assert(postcopy_has_request(rs));
1564 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1565 block = entry->rb;
1566 *offset = entry->offset;
1567 page_size = qemu_ram_pagesize(block);
1568 /* Each page request should only be a multiple of the ramblock's page size */
1569 assert((entry->len % page_size) == 0);
1571 if (entry->len > page_size) {
1572 entry->len -= page_size;
1573 entry->offset += page_size;
1574 } else {
1575 memory_region_unref(block->mr);
1576 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1577 g_free(entry);
1578 migration_consume_urgent_request();
1581 trace_unqueue_page(block->idstr, *offset,
1582 test_bit((*offset >> TARGET_PAGE_BITS), block->bmap));
1584 return block;
1587 #if defined(__linux__)
1589 * poll_fault_page: try to get the next UFFD write fault page and, if a
1590 * pending fault is found, return the RAM block pointer and page offset
1592 * Returns pointer to the RAMBlock containing faulting page,
1593 * NULL if no write faults are pending
1595 * @rs: current RAM state
1596 * @offset: page offset from the beginning of the block
1598 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1600 struct uffd_msg uffd_msg;
1601 void *page_address;
1602 RAMBlock *block;
1603 int res;
1605 if (!migrate_background_snapshot()) {
1606 return NULL;
1609 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1610 if (res <= 0) {
1611 return NULL;
1614 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1615 block = qemu_ram_block_from_host(page_address, false, offset);
1616 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1617 return block;
1621 * ram_save_release_protection: release UFFD write protection after
1622 * a range of pages has been saved
1624 * @rs: current RAM state
1625 * @pss: page-search-status structure
1626 * @start_page: index of the first page in the range relative to pss->block
1628 * Returns 0 on success, negative value in case of an error
1630 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1631 unsigned long start_page)
1633 int res = 0;
1635 /* Check if page is from UFFD-managed region. */
1636 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1637 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1638 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1640 /* Flush async buffers before un-protect. */
1641 qemu_fflush(rs->f);
1642 /* Un-protect memory range. */
1643 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1644 false, false);
1647 return res;
1650 /* ram_write_tracking_available: check if the kernel supports the required UFFD features
1652 * Returns true if it does, false otherwise
1654 bool ram_write_tracking_available(void)
1656 uint64_t uffd_features;
1657 int res;
1659 res = uffd_query_features(&uffd_features);
1660 return (res == 0 &&
1661 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1664 /* ram_write_tracking_compatible: check if guest configuration is
1665 * compatible with 'write-tracking'
1667 * Returns true if compatible, false otherwise
1669 bool ram_write_tracking_compatible(void)
1671 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1672 int uffd_fd;
1673 RAMBlock *block;
1674 bool ret = false;
1676 /* Open UFFD file descriptor */
1677 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1678 if (uffd_fd < 0) {
1679 return false;
1682 RCU_READ_LOCK_GUARD();
1684 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1685 uint64_t uffd_ioctls;
1687 /* Nothing to do for read-only and MMIO-writable regions */
1688 if (block->mr->readonly || block->mr->rom_device) {
1689 continue;
1691 /* Try to register block memory via UFFD-IO to track writes */
1692 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1693 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1694 goto out;
1696 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1697 goto out;
1700 ret = true;
1702 out:
1703 uffd_close_fd(uffd_fd);
1704 return ret;
1707 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1708 ram_addr_t size)
1711 * We read one byte of each page; this will preallocate page tables if
1712 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1713 * where no page was populated yet. This might require adaptation when
1714 * supporting other mappings, like shmem.
1716 for (; offset < size; offset += block->page_size) {
1717 char tmp = *((char *)block->host + offset);
1719 /* Don't optimize the read out */
1720 asm volatile("" : "+r" (tmp));
1724 static inline int populate_read_section(MemoryRegionSection *section,
1725 void *opaque)
1727 const hwaddr size = int128_get64(section->size);
1728 hwaddr offset = section->offset_within_region;
1729 RAMBlock *block = section->mr->ram_block;
1731 populate_read_range(block, offset, size);
1732 return 0;
1736 * ram_block_populate_read: preallocate page tables and populate pages in the
1737 * RAM block by reading a byte of each page.
1739 * Since it's solely used for the userfault_fd WP feature, we just
1740 * hardcode the page size to qemu_real_host_page_size here.
1742 * @block: RAM block to populate
1744 static void ram_block_populate_read(RAMBlock *rb)
1747 * Skip populating all pages that fall into a discarded range as managed by
1748 * a RamDiscardManager responsible for the mapped memory region of the
1749 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1750 * must not get populated automatically. We don't have to track
1751 * modifications via userfaultfd WP reliably, because these pages will
1752 * not be part of the migration stream either way -- see
1753 * ramblock_dirty_bitmap_clear_discarded_pages().
1755 * Note: The result is only stable while migrating (precopy/postcopy).
1757 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1758 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1759 MemoryRegionSection section = {
1760 .mr = rb->mr,
1761 .offset_within_region = 0,
1762 .size = rb->mr->size,
1765 ram_discard_manager_replay_populated(rdm, &section,
1766 populate_read_section, NULL);
1767 } else {
1768 populate_read_range(rb, 0, rb->used_length);
1773 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1775 void ram_write_tracking_prepare(void)
1777 RAMBlock *block;
1779 RCU_READ_LOCK_GUARD();
1781 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1782 /* Nothing to do for read-only and MMIO-writable regions */
1783 if (block->mr->readonly || block->mr->rom_device) {
1784 continue;
1788 * Populate pages of the RAM block before enabling userfault_fd
1789 * write protection.
1791 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1792 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1793 * pages with pte_none() entries in the page table.
1795 ram_block_populate_read(block);
1800 * ram_write_tracking_start: start UFFD-WP memory tracking
1802 * Returns 0 for success or negative value in case of error
1804 int ram_write_tracking_start(void)
1806 int uffd_fd;
1807 RAMState *rs = ram_state;
1808 RAMBlock *block;
1810 /* Open UFFD file descriptor */
1811 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1812 if (uffd_fd < 0) {
1813 return uffd_fd;
1815 rs->uffdio_fd = uffd_fd;
1817 RCU_READ_LOCK_GUARD();
1819 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1820 /* Nothing to do for read-only and MMIO-writable regions */
1821 if (block->mr->readonly || block->mr->rom_device) {
1822 continue;
1825 /* Register block memory with UFFD to track writes */
1826 if (uffd_register_memory(rs->uffdio_fd, block->host,
1827 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1828 goto fail;
1830 /* Apply UFFD write protection to the block memory range */
1831 if (uffd_change_protection(rs->uffdio_fd, block->host,
1832 block->max_length, true, false)) {
1833 goto fail;
1835 block->flags |= RAM_UF_WRITEPROTECT;
1836 memory_region_ref(block->mr);
1838 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1839 block->host, block->max_length);
1842 return 0;
1844 fail:
1845 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1847 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1848 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1849 continue;
1852 * In case some memory block failed to be write-protected, remove
1853 * protection from and unregister all RAM blocks that succeeded
1855 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1856 false, false);
1857 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1858 /* Cleanup flags and remove reference */
1859 block->flags &= ~RAM_UF_WRITEPROTECT;
1860 memory_region_unref(block->mr);
1863 uffd_close_fd(uffd_fd);
1864 rs->uffdio_fd = -1;
1865 return -1;
1869 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1871 void ram_write_tracking_stop(void)
1873 RAMState *rs = ram_state;
1874 RAMBlock *block;
1876 RCU_READ_LOCK_GUARD();
1878 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1879 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1880 continue;
1882 /* Remove protection and unregister all affected RAM blocks */
1883 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1884 false, false);
1885 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1887 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1888 block->host, block->max_length);
1890 /* Cleanup flags and remove reference */
1891 block->flags &= ~RAM_UF_WRITEPROTECT;
1892 memory_region_unref(block->mr);
1895 /* Finally close UFFD file descriptor */
1896 uffd_close_fd(rs->uffdio_fd);
1897 rs->uffdio_fd = -1;
1900 #else
1901 /* No target OS support, stubs just fail or ignore */
1903 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1905 (void) rs;
1906 (void) offset;
1908 return NULL;
1911 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1912 unsigned long start_page)
1914 (void) rs;
1915 (void) pss;
1916 (void) start_page;
1918 return 0;
1921 bool ram_write_tracking_available(void)
1923 return false;
1926 bool ram_write_tracking_compatible(void)
1928 assert(0);
1929 return false;
1932 int ram_write_tracking_start(void)
1934 assert(0);
1935 return -1;
1938 void ram_write_tracking_stop(void)
1940 assert(0);
1942 #endif /* defined(__linux__) */
1945 * get_queued_page: unqueue a page from the postcopy requests
1947 * Skips pages that are already sent (!dirty)
1949 * Returns true if a queued page is found
1951 * @rs: current RAM state
1952 * @pss: data about the state of the current dirty page scan
1954 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1956 RAMBlock *block;
1957 ram_addr_t offset;
1959 block = unqueue_page(rs, &offset);
1961 if (!block) {
1963 * Poll write faults too if background snapshot is enabled; that's
1964 * when vCPUs may get blocked by write-protected pages.
1966 block = poll_fault_page(rs, &offset);
1969 if (block) {
1971 * We want the background search to continue from the queued page
1972 * since the guest is likely to want other pages near to the page
1973 * it just requested.
1975 pss->block = block;
1976 pss->page = offset >> TARGET_PAGE_BITS;
1979 * This unqueued page would break the "one round" check, even if
1980 * it is really rare.
1982 pss->complete_round = false;
1985 return !!block;
1989 * migration_page_queue_free: drop any remaining pages in the ram
1990 * request queue
1992 * It should be empty at the end anyway, but in error cases there may
1993 * be some left. In case any page is left, we drop it.
1996 static void migration_page_queue_free(RAMState *rs)
1998 struct RAMSrcPageRequest *mspr, *next_mspr;
1999 /* This queue generally should be empty - but in the case of a failed
2000 * migration it might have some droppings in.
2002 RCU_READ_LOCK_GUARD();
2003 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2004 memory_region_unref(mspr->rb->mr);
2005 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2006 g_free(mspr);
2011 * ram_save_queue_pages: queue the page for transmission
2013 * A request from the postcopy destination, for example.
2015 * Returns zero on success or negative on error
2017 * @rbname: Name of the RAMBlock of the request. NULL means the
2018 * same as the last one.
2019 * @start: starting address from the start of the RAMBlock
2020 * @len: length (in bytes) to send
2022 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2024 RAMBlock *ramblock;
2025 RAMState *rs = ram_state;
2027 ram_counters.postcopy_requests++;
2028 RCU_READ_LOCK_GUARD();
2030 if (!rbname) {
2031 /* Reuse last RAMBlock */
2032 ramblock = rs->last_req_rb;
2034 if (!ramblock) {
2036 * Shouldn't happen, we can't reuse the last RAMBlock if
2037 * it's the 1st request.
2039 error_report("ram_save_queue_pages no previous block");
2040 return -1;
2042 } else {
2043 ramblock = qemu_ram_block_by_name(rbname);
2045 if (!ramblock) {
2046 /* We shouldn't be asked for a non-existent RAMBlock */
2047 error_report("ram_save_queue_pages no block '%s'", rbname);
2048 return -1;
2050 rs->last_req_rb = ramblock;
2052 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2053 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2054 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2055 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2056 __func__, start, len, ramblock->used_length);
2057 return -1;
2060 struct RAMSrcPageRequest *new_entry =
2061 g_malloc0(sizeof(struct RAMSrcPageRequest));
2062 new_entry->rb = ramblock;
2063 new_entry->offset = start;
2064 new_entry->len = len;
2066 memory_region_ref(ramblock->mr);
2067 qemu_mutex_lock(&rs->src_page_req_mutex);
2068 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2069 migration_make_urgent_request();
2070 qemu_mutex_unlock(&rs->src_page_req_mutex);
2072 return 0;
2075 static bool save_page_use_compression(RAMState *rs)
2077 if (!migrate_use_compression()) {
2078 return false;
2082 * If xbzrle is enabled (e.g., after the first round of migration), stop
2083 * using the data compression. In theory, xbzrle can do better than
2084 * compression.
2086 if (rs->xbzrle_enabled) {
2087 return false;
2090 return true;
2094 * Try to compress the page before posting it out; return true if the page
2095 * has been properly handled by compression, otherwise it needs other
2096 * paths to handle it.
2098 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2100 if (!save_page_use_compression(rs)) {
2101 return false;
2105 * When starting the process of a new block, the first page of
2106 * the block should be sent out before other pages in the same
2107 * block, and all the pages in the last block should have been sent
2108 * out. Keeping this order is important, because the 'cont' flag
2109 * is used to avoid resending the block name.
2111 * We post the first page as a normal page since compression takes
2112 * a lot of CPU.
2114 if (block != rs->last_sent_block) {
2115 flush_compressed_data(rs);
2116 return false;
2119 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2120 return true;
2123 compression_counters.busy++;
2124 return false;
2128 * ram_save_target_page: save one target page
2130 * Returns the number of pages written
2132 * @rs: current RAM state
2133 * @pss: data about the page we want to send
2135 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2137 RAMBlock *block = pss->block;
2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2139 int res;
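/*
 * The page is offered to each sender in turn: an RDMA/control hook may
 * consume it, then compression, then zero-page detection; otherwise it
 * goes out via multifd or the regular (possibly XBZRLE) path below.
 */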
2141 if (control_save_page(rs, block, offset, &res)) {
2142 return res;
2145 if (save_compress_page(rs, block, offset)) {
2146 return 1;
2149 res = save_zero_page(rs, block, offset);
2150 if (res > 0) {
2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2152 * page would be stale
2154 if (!save_page_use_compression(rs)) {
2155 XBZRLE_cache_lock();
2156 xbzrle_cache_zero_page(rs, block->offset + offset);
2157 XBZRLE_cache_unlock();
2159 return res;
2163 * Do not use multifd for:
2164 * 1. Compression as the first page in the new block should be posted out
2165 * before sending the compressed page
2166 * 2. In postcopy as one whole host page should be placed
2168 if (!save_page_use_compression(rs) && migrate_use_multifd()
2169 && !migration_in_postcopy()) {
2170 return ram_save_multifd_page(rs, block, offset);
2173 return ram_save_page(rs, pss);
2177 * ram_save_host_page: save a whole host page
2179 * Starting at *offset send pages up to the end of the current host
2180 * page. It's valid for the initial offset to point into the middle of
2181 * a host page in which case the remainder of the hostpage is sent.
2182 * Only dirty target pages are sent. Note that the host page size may
2183 * be a huge page for this block.
2184 * The saving stops at the boundary of the used_length of the block
2185 * if the RAMBlock isn't a multiple of the host page size.
2187 * Returns the number of pages written or negative on error
2189 * @rs: current RAM state
2190 * @pss: data about the page we want to send
2192 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2194 int tmppages, pages = 0;
2195 size_t pagesize_bits =
2196 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2197 unsigned long hostpage_boundary =
2198 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
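/*
 * pagesize_bits is the number of target pages per host page, and
 * hostpage_boundary is the index of the first target page of the next
 * host page, so the loop below covers exactly the dirty target pages
 * of the host page containing pss->page.
 */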
2199 unsigned long start_page = pss->page;
2200 int res;
2202 if (ramblock_is_ignored(pss->block)) {
2203 error_report("block %s should not be migrated !", pss->block->idstr);
2204 return 0;
2207 do {
2208 /* Check if the page is dirty and if so, send it */
2209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2210 tmppages = ram_save_target_page(rs, pss);
2211 if (tmppages < 0) {
2212 return tmppages;
2215 pages += tmppages;
2217 * Allow rate limiting to happen in the middle of huge pages if
2218 * something is sent in the current iteration.
2220 if (pagesize_bits > 1 && tmppages > 0) {
2221 migration_rate_limit();
2224 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2225 } while ((pss->page < hostpage_boundary) &&
2226 offset_in_ramblock(pss->block,
2227 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2228 /* The offset we leave with is the min boundary of host page and block */
2229 pss->page = MIN(pss->page, hostpage_boundary);
2231 res = ram_save_release_protection(rs, pss, start_page);
2232 return (res < 0 ? res : pages);
2236 * ram_find_and_save_block: finds a dirty page and sends it to f
2238 * Called within an RCU critical section.
2240 * Returns the number of pages written where zero means no dirty pages,
2241 * or negative on error
2243 * @rs: current RAM state
2245 * On systems where host-page-size > target-page-size it will send all the
2246 * pages in a host page that are dirty.
2248 static int ram_find_and_save_block(RAMState *rs)
2250 PageSearchStatus pss;
2251 int pages = 0;
2252 bool again, found;
2254 /* No dirty page as there is zero RAM */
2255 if (!ram_bytes_total()) {
2256 return pages;
2259 pss.block = rs->last_seen_block;
2260 pss.page = rs->last_page;
2261 pss.complete_round = false;
2263 if (!pss.block) {
2264 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2267 do {
2268 again = true;
2269 found = get_queued_page(rs, &pss);
2271 if (!found) {
2272 /* priority queue empty, so just search for something dirty */
2273 found = find_dirty_block(rs, &pss, &again);
2276 if (found) {
2277 pages = ram_save_host_page(rs, &pss);
2279 } while (!pages && again);
2281 rs->last_seen_block = pss.block;
2282 rs->last_page = pss.page;
2284 return pages;
2287 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2289 uint64_t pages = size / TARGET_PAGE_SIZE;
2291 if (zero) {
2292 ram_counters.duplicate += pages;
2293 } else {
2294 ram_counters.normal += pages;
2295 ram_transferred_add(size);
2296 qemu_update_position(f, size);
2300 static uint64_t ram_bytes_total_common(bool count_ignored)
2302 RAMBlock *block;
2303 uint64_t total = 0;
2305 RCU_READ_LOCK_GUARD();
2307 if (count_ignored) {
2308 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2309 total += block->used_length;
2311 } else {
2312 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2313 total += block->used_length;
2316 return total;
2319 uint64_t ram_bytes_total(void)
2321 return ram_bytes_total_common(false);
2324 static void xbzrle_load_setup(void)
2326 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2329 static void xbzrle_load_cleanup(void)
2331 g_free(XBZRLE.decoded_buf);
2332 XBZRLE.decoded_buf = NULL;
2335 static void ram_state_cleanup(RAMState **rsp)
2337 if (*rsp) {
2338 migration_page_queue_free(*rsp);
2339 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2340 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2341 g_free(*rsp);
2342 *rsp = NULL;
2346 static void xbzrle_cleanup(void)
2348 XBZRLE_cache_lock();
2349 if (XBZRLE.cache) {
2350 cache_fini(XBZRLE.cache);
2351 g_free(XBZRLE.encoded_buf);
2352 g_free(XBZRLE.current_buf);
2353 g_free(XBZRLE.zero_target_page);
2354 XBZRLE.cache = NULL;
2355 XBZRLE.encoded_buf = NULL;
2356 XBZRLE.current_buf = NULL;
2357 XBZRLE.zero_target_page = NULL;
2359 XBZRLE_cache_unlock();
2362 static void ram_save_cleanup(void *opaque)
2364 RAMState **rsp = opaque;
2365 RAMBlock *block;
2367 /* We don't use dirty log with background snapshots */
2368 if (!migrate_background_snapshot()) {
2369 /* the caller holds the iothread lock or is in a BH, so there is
2370 * no write race against the migration bitmap
2372 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2374 * do not stop dirty log without starting it, since
2375 * memory_global_dirty_log_stop will assert that
2376 * memory_global_dirty_log_start/stop are used in pairs
2378 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2382 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2383 g_free(block->clear_bmap);
2384 block->clear_bmap = NULL;
2385 g_free(block->bmap);
2386 block->bmap = NULL;
2389 xbzrle_cleanup();
2390 compress_threads_save_cleanup();
2391 ram_state_cleanup(rsp);
2394 static void ram_state_reset(RAMState *rs)
2396 rs->last_seen_block = NULL;
2397 rs->last_sent_block = NULL;
2398 rs->last_page = 0;
2399 rs->last_version = ram_list.version;
2400 rs->xbzrle_enabled = false;
2403 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2405 /* **** functions for postcopy ***** */
2407 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2409 struct RAMBlock *block;
2411 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2412 unsigned long *bitmap = block->bmap;
2413 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2414 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2416 while (run_start < range) {
2417 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2418 ram_discard_range(block->idstr,
2419 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2420 ((ram_addr_t)(run_end - run_start))
2421 << TARGET_PAGE_BITS);
2422 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2428 * postcopy_send_discard_bm_ram: discard a RAMBlock
2430 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2432 * @ms: current migration state
2433 * @block: RAMBlock to discard
2435 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2437 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2438 unsigned long current;
2439 unsigned long *bitmap = block->bmap;
2441 for (current = 0; current < end; ) {
2442 unsigned long one = find_next_bit(bitmap, end, current);
2443 unsigned long zero, discard_length;
2445 if (one >= end) {
2446 break;
2449 zero = find_next_zero_bit(bitmap, end, one + 1);
2451 if (zero >= end) {
2452 discard_length = end - one;
2453 } else {
2454 discard_length = zero - one;
2456 postcopy_discard_send_range(ms, one, discard_length);
2457 current = one + discard_length;
2461 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2464 * postcopy_each_ram_send_discard: discard all RAMBlocks
2466 * Utility for the outgoing postcopy code.
2467 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2468 * passing it bitmap indexes and name.
2469 * (qemu_ram_foreach_block ends up passing unscaled lengths
2470 * which would mean postcopy code would have to deal with target page)
2472 * @ms: current migration state
2474 static void postcopy_each_ram_send_discard(MigrationState *ms)
2476 struct RAMBlock *block;
2478 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2479 postcopy_discard_send_init(ms, block->idstr);
2482 * Deal with TPS != HPS and huge pages. It discards any partially sent
2483 * host-page size chunks and marks any partially dirty host-page size
2484 * chunks as all dirty. In this case the host page is the host page
2485 * for the particular RAMBlock, i.e. it might be a huge page.
2487 postcopy_chunk_hostpages_pass(ms, block);
2490 * Postcopy sends chunks of bitmap over the wire, but it
2491 * just needs indexes at this point, which avoids it having
2492 * target-page-specific code.
2494 postcopy_send_discard_bm_ram(ms, block);
2495 postcopy_discard_send_finish(ms);
2500 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2502 * Helper for postcopy_chunk_hostpages; it's called twice to
2503 * canonicalize the two bitmaps, that are similar, but one is
2504 * inverted.
2506 * Postcopy requires that all target pages in a hostpage are dirty or
2507 * clean, not a mix. This function canonicalizes the bitmaps.
2509 * @ms: current migration state
2510 * @block: block that contains the page we want to canonicalize
2512 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2514 RAMState *rs = ram_state;
2515 unsigned long *bitmap = block->bmap;
2516 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2517 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2518 unsigned long run_start;
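/*
 * host_ratio is the number of target pages per host page. The loop
 * below walks runs of dirty target pages and re-dirties any host page
 * that is only partially covered, so every host page ends up either
 * fully dirty or fully clean.
 */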
2520 if (block->page_size == TARGET_PAGE_SIZE) {
2521 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2522 return;
2525 /* Find a dirty page */
2526 run_start = find_next_bit(bitmap, pages, 0);
2528 while (run_start < pages) {
2531 * If the start of this run of pages is in the middle of a host
2532 * page, then we need to fixup this host page.
2534 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2535 /* Find the end of this run */
2536 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2538 * If the end isn't at the start of a host page, then the
2539 * run doesn't finish at the end of a host page
2540 * and we need to discard.
2544 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2545 unsigned long page;
2546 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2547 host_ratio);
2548 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2550 /* Clean up the bitmap */
2551 for (page = fixup_start_addr;
2552 page < fixup_start_addr + host_ratio; page++) {
2554 * Remark them as dirty, updating the count for any pages
2555 * that weren't previously dirty.
2557 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2561 /* Find the next dirty page for the next iteration */
2562 run_start = find_next_bit(bitmap, pages, run_start);
2567 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2569 * Transmit the set of pages to be discarded after precopy to the target;
2570 * these are pages that:
2571 * a) have been previously transmitted but are now dirty again
2572 * b) have never been transmitted; this ensures that
2573 * any pages on the destination that have been mapped by background
2574 * tasks get discarded (transparent huge pages is the specific concern)
2575 * Hopefully this is pretty sparse
2577 * @ms: current migration state
2579 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2581 RAMState *rs = ram_state;
2583 RCU_READ_LOCK_GUARD();
2585 /* This should be our last sync, the src is now paused */
2586 migration_bitmap_sync(rs);
2588 /* Easiest way to make sure we don't resume in the middle of a host-page */
2589 rs->last_seen_block = NULL;
2590 rs->last_sent_block = NULL;
2591 rs->last_page = 0;
2593 postcopy_each_ram_send_discard(ms);
2595 trace_ram_postcopy_send_discard_bitmap();
2599 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2601 * Returns zero on success
2603 * @rbname: name of the RAMBlock of the request. NULL means the
2604 * same as the last one.
2605 * @start: RAMBlock starting page
2606 * @length: RAMBlock size
2608 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2610 trace_ram_discard_range(rbname, start, length);
2612 RCU_READ_LOCK_GUARD();
2613 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2615 if (!rb) {
2616 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2617 return -1;
2621 * On source VM, we don't need to update the received bitmap since
2622 * we don't even have one.
2624 if (rb->receivedmap) {
2625 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2626 length >> qemu_target_page_bits());
2629 return ram_block_discard_range(rb, start, length);
2633 * For every allocation, we will try not to crash the VM if the
2634 * allocation fails.
2636 static int xbzrle_init(void)
2638 Error *local_err = NULL;
2640 if (!migrate_use_xbzrle()) {
2641 return 0;
2644 XBZRLE_cache_lock();
2646 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2647 if (!XBZRLE.zero_target_page) {
2648 error_report("%s: Error allocating zero page", __func__);
2649 goto err_out;
2652 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2653 TARGET_PAGE_SIZE, &local_err);
2654 if (!XBZRLE.cache) {
2655 error_report_err(local_err);
2656 goto free_zero_page;
2659 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2660 if (!XBZRLE.encoded_buf) {
2661 error_report("%s: Error allocating encoded_buf", __func__);
2662 goto free_cache;
2665 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2666 if (!XBZRLE.current_buf) {
2667 error_report("%s: Error allocating current_buf", __func__);
2668 goto free_encoded_buf;
2671 /* We are all good */
2672 XBZRLE_cache_unlock();
2673 return 0;
2675 free_encoded_buf:
2676 g_free(XBZRLE.encoded_buf);
2677 XBZRLE.encoded_buf = NULL;
2678 free_cache:
2679 cache_fini(XBZRLE.cache);
2680 XBZRLE.cache = NULL;
2681 free_zero_page:
2682 g_free(XBZRLE.zero_target_page);
2683 XBZRLE.zero_target_page = NULL;
2684 err_out:
2685 XBZRLE_cache_unlock();
2686 return -ENOMEM;
2689 static int ram_state_init(RAMState **rsp)
2691 *rsp = g_try_new0(RAMState, 1);
2693 if (!*rsp) {
2694 error_report("%s: Init ramstate fail", __func__);
2695 return -1;
2698 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2699 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2700 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2703 * Count the total number of pages used by ram blocks not including any
2704 * gaps due to alignment or unplugs.
2705 * This must match with the initial values of dirty bitmap.
2707 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2708 ram_state_reset(*rsp);
2710 return 0;
2713 static void ram_list_init_bitmaps(void)
2715 MigrationState *ms = migrate_get_current();
2716 RAMBlock *block;
2717 unsigned long pages;
2718 uint8_t shift;
2720 /* Skip setting bitmap if there is no RAM */
2721 if (ram_bytes_total()) {
2722 shift = ms->clear_bitmap_shift;
2723 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2724 error_report("clear_bitmap_shift (%u) too big, using "
2725 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2726 shift = CLEAR_BITMAP_SHIFT_MAX;
2727 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2728 error_report("clear_bitmap_shift (%u) too small, using "
2729 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2730 shift = CLEAR_BITMAP_SHIFT_MIN;
2733 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2734 pages = block->max_length >> TARGET_PAGE_BITS;
2736 * The initial dirty bitmap for migration must be set with all
2737 * ones to make sure we'll migrate every guest RAM page to
2738 * destination.
2739 * Here we set RAMBlock.bmap all to 1 because when we restart a
2740 * new migration after a failed one, ram_list.
2741 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2742 * guest memory.
2744 block->bmap = bitmap_new(pages);
2745 bitmap_set(block->bmap, 0, pages);
2746 block->clear_bmap_shift = shift;
2747 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2752 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2754 unsigned long pages;
2755 RAMBlock *rb;
2757 RCU_READ_LOCK_GUARD();
2759 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2760 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2761 rs->migration_dirty_pages -= pages;
2765 static void ram_init_bitmaps(RAMState *rs)
2767 /* For memory_global_dirty_log_start below. */
2768 qemu_mutex_lock_iothread();
2769 qemu_mutex_lock_ramlist();
2771 WITH_RCU_READ_LOCK_GUARD() {
2772 ram_list_init_bitmaps();
2773 /* We don't use dirty log with background snapshots */
2774 if (!migrate_background_snapshot()) {
2775 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2776 migration_bitmap_sync_precopy(rs);
2779 qemu_mutex_unlock_ramlist();
2780 qemu_mutex_unlock_iothread();
2783 * After an eventual first bitmap sync, fixup the initial bitmap
2784 * containing all 1s to exclude any discarded pages from migration.
2786 migration_bitmap_clear_discarded_pages(rs);
2789 static int ram_init_all(RAMState **rsp)
2791 if (ram_state_init(rsp)) {
2792 return -1;
2795 if (xbzrle_init()) {
2796 ram_state_cleanup(rsp);
2797 return -1;
2800 ram_init_bitmaps(*rsp);
2802 return 0;
2805 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2807 RAMBlock *block;
2808 uint64_t pages = 0;
2811 * Postcopy is not using xbzrle/compression, so no need for that.
2812 * Also, since the source is already halted, we don't need to care
2813 * about dirty page logging either.
2816 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2817 pages += bitmap_count_one(block->bmap,
2818 block->used_length >> TARGET_PAGE_BITS);
2821 /* This may not be aligned with current bitmaps. Recalculate. */
2822 rs->migration_dirty_pages = pages;
2824 ram_state_reset(rs);
2826 /* Update RAMState cache of output QEMUFile */
2827 rs->f = out;
2829 trace_ram_state_resume_prepare(pages);
2833 * This function clears bits of the free pages reported by the caller from the
2834 * migration dirty bitmap. @addr is the host address corresponding to the
2835 * start of the contiguous guest free pages, and @len is the total bytes of
2836 * those pages.
2838 void qemu_guest_free_page_hint(void *addr, size_t len)
2840 RAMBlock *block;
2841 ram_addr_t offset;
2842 size_t used_len, start, npages;
2843 MigrationState *s = migrate_get_current();
2845 /* This function is currently expected to be used during live migration */
2846 if (!migration_is_setup_or_active(s->state)) {
2847 return;
2850 for (; len > 0; len -= used_len, addr += used_len) {
2851 block = qemu_ram_block_from_host(addr, false, &offset);
2852 if (unlikely(!block || offset >= block->used_length)) {
2854 * The implementation might not support RAMBlock resize during
2855 * live migration, but it could happen in theory with future
2856 * updates. So we add a check here to capture that case.
2858 error_report_once("%s unexpected error", __func__);
2859 return;
2862 if (len <= block->used_length - offset) {
2863 used_len = len;
2864 } else {
2865 used_len = block->used_length - offset;
2868 start = offset >> TARGET_PAGE_BITS;
2869 npages = used_len >> TARGET_PAGE_BITS;
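/*
 * start and npages are in target-page units relative to the RAMBlock;
 * used_len was clamped above so the range never goes past the block's
 * used_length.
 */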
2871 qemu_mutex_lock(&ram_state->bitmap_mutex);
2873 * The skipped free pages are equivalent to having been sent, from clear_bmap's
2874 * perspective, so clear the bits from the memory region bitmap which
2875 * are initially set. Otherwise those skipped pages will be sent in
2876 * the next round after syncing from the memory region bitmap.
2878 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2879 ram_state->migration_dirty_pages -=
2880 bitmap_count_one_with_offset(block->bmap, start, npages);
2881 bitmap_clear(block->bmap, start, npages);
2882 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2887 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2888 * a long-running RCU critical section. When RCU reclaims in the code
2889 * start to become numerous it will be necessary to reduce the
2890 * granularity of these critical sections.
2894 * ram_save_setup: Setup RAM for migration
2896 * Returns zero to indicate success and negative for error
2898 * @f: QEMUFile where to send the data
2899 * @opaque: RAMState pointer
2901 static int ram_save_setup(QEMUFile *f, void *opaque)
2903 RAMState **rsp = opaque;
2904 RAMBlock *block;
2906 if (compress_threads_save_setup()) {
2907 return -1;
2910 /* migration has already setup the bitmap, reuse it. */
2911 if (!migration_in_colo_state()) {
2912 if (ram_init_all(rsp) != 0) {
2913 compress_threads_save_cleanup();
2914 return -1;
2917 (*rsp)->f = f;
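/*
 * The setup record written below is: a be64 of the total RAM size with
 * RAM_SAVE_FLAG_MEM_SIZE set, then for each migratable block its idstr
 * (length byte plus bytes), its used_length as a be64, its page size as
 * a be64 when postcopy is enabled and it differs from the host page
 * size, and its GPA as a be64 when ignore-shared is enabled; the
 * section ends with RAM_SAVE_FLAG_EOS.
 */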
2919 WITH_RCU_READ_LOCK_GUARD() {
2920 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2922 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2923 qemu_put_byte(f, strlen(block->idstr));
2924 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2925 qemu_put_be64(f, block->used_length);
2926 if (migrate_postcopy_ram() && block->page_size !=
2927 qemu_host_page_size) {
2928 qemu_put_be64(f, block->page_size);
2930 if (migrate_ignore_shared()) {
2931 qemu_put_be64(f, block->mr->addr);
2936 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2937 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2939 multifd_send_sync_main(f);
2940 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2941 qemu_fflush(f);
2943 return 0;
2947 * ram_save_iterate: iterative stage for migration
2949 * Returns zero to indicate success and negative for error
2951 * @f: QEMUFile where to send the data
2952 * @opaque: RAMState pointer
2954 static int ram_save_iterate(QEMUFile *f, void *opaque)
2956 RAMState **temp = opaque;
2957 RAMState *rs = *temp;
2958 int ret = 0;
2959 int i;
2960 int64_t t0;
2961 int done = 0;
2963 if (blk_mig_bulk_active()) {
2964 /* Avoid transferring ram during bulk phase of block migration as
2965 * the bulk phase will usually take a long time and transferring
2966 * ram updates during that time is pointless. */
2967 goto out;
2971 * We'll hold this lock for a little while, but that's okay for two reasons.
2972 * Firstly, the only other thread that may take it is the one calling
2973 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2974 * MAX_WAIT below (if curious, see also commit 4508bd9ed8053ce), which
2975 * guarantees that we'll release it on a regular basis.
2977 qemu_mutex_lock(&rs->bitmap_mutex);
2978 WITH_RCU_READ_LOCK_GUARD() {
2979 if (ram_list.version != rs->last_version) {
2980 ram_state_reset(rs);
2983 /* Read version before ram_list.blocks */
2984 smp_rmb();
2986 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2988 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2989 i = 0;
2990 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2991 postcopy_has_request(rs)) {
2992 int pages;
2994 if (qemu_file_get_error(f)) {
2995 break;
2998 pages = ram_find_and_save_block(rs);
2999 /* no more pages to send */
3000 if (pages == 0) {
3001 done = 1;
3002 break;
3005 if (pages < 0) {
3006 qemu_file_set_error(f, pages);
3007 break;
3010 rs->target_page_count += pages;
3013 * During postcopy, it is necessary to make sure one whole host
3014 * page is sent in one chunk.
3016 if (migrate_postcopy_ram()) {
3017 flush_compressed_data(rs);
3021 * we want to check in the 1st loop, just in case it was the 1st
3022 * time and we had to sync the dirty bitmap.
3023 * qemu_clock_get_ns() is a bit expensive, so we only check every
3024 * few iterations
3026 if ((i & 63) == 0) {
3027 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3028 1000000;
3029 if (t1 > MAX_WAIT) {
3030 trace_ram_save_iterate_big_wait(t1, i);
3031 break;
3034 i++;
3037 qemu_mutex_unlock(&rs->bitmap_mutex);
3040 * Must occur before EOS (or any QEMUFile operation)
3041 * because of RDMA protocol.
3043 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3045 out:
3046 if (ret >= 0
3047 && migration_is_setup_or_active(migrate_get_current()->state)) {
3048 multifd_send_sync_main(rs->f);
3049 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3050 qemu_fflush(f);
3051 ram_transferred_add(8);
3053 ret = qemu_file_get_error(f);
3055 if (ret < 0) {
3056 return ret;
3059 return done;
3063 * ram_save_complete: function called to send the remaining amount of ram
3065 * Returns zero to indicate success or negative on error
3067 * Called with iothread lock
3069 * @f: QEMUFile where to send the data
3070 * @opaque: RAMState pointer
3072 static int ram_save_complete(QEMUFile *f, void *opaque)
3074 RAMState **temp = opaque;
3075 RAMState *rs = *temp;
3076 int ret = 0;
3078 rs->last_stage = !migration_in_colo_state();
3080 WITH_RCU_READ_LOCK_GUARD() {
3081 if (!migration_in_postcopy()) {
3082 migration_bitmap_sync_precopy(rs);
3085 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3087 /* try transferring iterative blocks of memory */
3089 /* flush all remaining blocks regardless of rate limiting */
3090 while (true) {
3091 int pages;
3093 pages = ram_find_and_save_block(rs);
3094 /* no more blocks to send */
3095 if (pages == 0) {
3096 break;
3098 if (pages < 0) {
3099 ret = pages;
3100 break;
3104 flush_compressed_data(rs);
3105 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3108 if (ret >= 0) {
3109 multifd_send_sync_main(rs->f);
3110 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3111 qemu_fflush(f);
3114 return ret;
3117 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3118 uint64_t *res_precopy_only,
3119 uint64_t *res_compatible,
3120 uint64_t *res_postcopy_only)
3122 RAMState **temp = opaque;
3123 RAMState *rs = *temp;
3124 uint64_t remaining_size;
3126 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
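/*
 * This is only an estimate based on the dirty page count. When it
 * drops below max_size and we are still in precopy, the bitmap is
 * re-synced under the iothread lock below to give the caller a
 * fresher number.
 */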
3128 if (!migration_in_postcopy() &&
3129 remaining_size < max_size) {
3130 qemu_mutex_lock_iothread();
3131 WITH_RCU_READ_LOCK_GUARD() {
3132 migration_bitmap_sync_precopy(rs);
3134 qemu_mutex_unlock_iothread();
3135 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3138 if (migrate_postcopy_ram()) {
3139 /* We can do postcopy, and all the data is postcopiable */
3140 *res_compatible += remaining_size;
3141 } else {
3142 *res_precopy_only += remaining_size;
3146 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3148 unsigned int xh_len;
3149 int xh_flags;
3150 uint8_t *loaded_data;
3152 /* extract RLE header */
3153 xh_flags = qemu_get_byte(f);
3154 xh_len = qemu_get_be16(f);
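/*
 * Wire format of an XBZRLE page: a one-byte encoding flag
 * (ENCODING_FLAG_XBZRLE), a be16 length of the encoded data, then the
 * encoded bytes, which are decoded against the current page content
 * below.
 */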
3156 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3157 error_report("Failed to load XBZRLE page - wrong compression!");
3158 return -1;
3161 if (xh_len > TARGET_PAGE_SIZE) {
3162 error_report("Failed to load XBZRLE page - len overflow!");
3163 return -1;
3165 loaded_data = XBZRLE.decoded_buf;
3166 /* load data and decode */
3167 /* it can change loaded_data to point to an internal buffer */
3168 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3170 /* decode RLE */
3171 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3172 TARGET_PAGE_SIZE) == -1) {
3173 error_report("Failed to load XBZRLE page - decode error!");
3174 return -1;
3177 return 0;
3181 * ram_block_from_stream: read a RAMBlock id from the migration stream
3183 * Must be called from within a rcu critical section.
3185 * Returns a pointer from within the RCU-protected ram_list.
3187 * @f: QEMUFile where to read the data from
3188 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3190 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3192 static RAMBlock *block;
3193 char id[256];
3194 uint8_t len;
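/*
 * RAM_SAVE_FLAG_CONTINUE means "same RAMBlock as the previous page",
 * so the last block is cached in a static variable and the idstr is
 * only read from the stream when the flag is absent.
 */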
3196 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3197 if (!block) {
3198 error_report("Ack, bad migration stream!");
3199 return NULL;
3201 return block;
3204 len = qemu_get_byte(f);
3205 qemu_get_buffer(f, (uint8_t *)id, len);
3206 id[len] = 0;
3208 block = qemu_ram_block_by_name(id);
3209 if (!block) {
3210 error_report("Can't find block %s", id);
3211 return NULL;
3214 if (ramblock_is_ignored(block)) {
3215 error_report("block %s should not be migrated !", id);
3216 return NULL;
3219 return block;
3222 static inline void *host_from_ram_block_offset(RAMBlock *block,
3223 ram_addr_t offset)
3225 if (!offset_in_ramblock(block, offset)) {
3226 return NULL;
3229 return block->host + offset;
3232 static void *host_page_from_ram_block_offset(RAMBlock *block,
3233 ram_addr_t offset)
3235 /* Note: Explicitly no check against offset_in_ramblock(). */
3236 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3237 block->page_size);
3240 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3241 ram_addr_t offset)
3243 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3246 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3247 ram_addr_t offset, bool record_bitmap)
3249 if (!offset_in_ramblock(block, offset)) {
3250 return NULL;
3252 if (!block->colo_cache) {
3253 error_report("%s: colo_cache is NULL in block :%s",
3254 __func__, block->idstr);
3255 return NULL;
3259 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3260 * It helps us decide which pages in the RAM cache should be flushed
3261 * into the VM's RAM later.
3263 if (record_bitmap &&
3264 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3265 ram_state->migration_dirty_pages++;
3267 return block->colo_cache + offset;
3271 * ram_handle_compressed: handle the zero page case
3273 * If a page (or a whole RDMA chunk) has been
3274 * determined to be zero, then zap it.
3276 * @host: host address for the zero page
3277 * @ch: what the page is filled from. We only support zero
3278 * @size: size of the zero page
3280 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3282 if (ch != 0 || !buffer_is_zero(host, size)) {
3283 memset(host, ch, size);
3287 /* return the size after decompression, or negative value on error */
3288 static int
3289 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3290 const uint8_t *source, size_t source_len)
3292 int err;
3294 err = inflateReset(stream);
3295 if (err != Z_OK) {
3296 return -1;
3299 stream->avail_in = source_len;
3300 stream->next_in = (uint8_t *)source;
3301 stream->avail_out = dest_len;
3302 stream->next_out = dest;
3304 err = inflate(stream, Z_NO_FLUSH);
3305 if (err != Z_STREAM_END) {
3306 return -1;
3309 return stream->total_out;
3312 static void *do_data_decompress(void *opaque)
3314 DecompressParam *param = opaque;
3315 unsigned long pagesize;
3316 uint8_t *des;
3317 int len, ret;
3319 qemu_mutex_lock(&param->mutex);
3320 while (!param->quit) {
3321 if (param->des) {
3322 des = param->des;
3323 len = param->len;
3324 param->des = 0;
3325 qemu_mutex_unlock(&param->mutex);
3327 pagesize = TARGET_PAGE_SIZE;
3329 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3330 param->compbuf, len);
3331 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3332 error_report("decompress data failed");
3333 qemu_file_set_error(decomp_file, ret);
3336 qemu_mutex_lock(&decomp_done_lock);
3337 param->done = true;
3338 qemu_cond_signal(&decomp_done_cond);
3339 qemu_mutex_unlock(&decomp_done_lock);
3341 qemu_mutex_lock(&param->mutex);
3342 } else {
3343 qemu_cond_wait(&param->cond, &param->mutex);
3346 qemu_mutex_unlock(&param->mutex);
3348 return NULL;
3351 static int wait_for_decompress_done(void)
3353 int idx, thread_count;
3355 if (!migrate_use_compression()) {
3356 return 0;
3359 thread_count = migrate_decompress_threads();
3360 qemu_mutex_lock(&decomp_done_lock);
3361 for (idx = 0; idx < thread_count; idx++) {
3362 while (!decomp_param[idx].done) {
3363 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3366 qemu_mutex_unlock(&decomp_done_lock);
3367 return qemu_file_get_error(decomp_file);
3370 static void compress_threads_load_cleanup(void)
3372 int i, thread_count;
3374 if (!migrate_use_compression()) {
3375 return;
3377 thread_count = migrate_decompress_threads();
3378 for (i = 0; i < thread_count; i++) {
3380 * we use it as an indicator of whether the thread is
3381 * properly initialized or not
3383 if (!decomp_param[i].compbuf) {
3384 break;
3387 qemu_mutex_lock(&decomp_param[i].mutex);
3388 decomp_param[i].quit = true;
3389 qemu_cond_signal(&decomp_param[i].cond);
3390 qemu_mutex_unlock(&decomp_param[i].mutex);
3392 for (i = 0; i < thread_count; i++) {
3393 if (!decomp_param[i].compbuf) {
3394 break;
3397 qemu_thread_join(decompress_threads + i);
3398 qemu_mutex_destroy(&decomp_param[i].mutex);
3399 qemu_cond_destroy(&decomp_param[i].cond);
3400 inflateEnd(&decomp_param[i].stream);
3401 g_free(decomp_param[i].compbuf);
3402 decomp_param[i].compbuf = NULL;
3404 g_free(decompress_threads);
3405 g_free(decomp_param);
3406 decompress_threads = NULL;
3407 decomp_param = NULL;
3408 decomp_file = NULL;
3411 static int compress_threads_load_setup(QEMUFile *f)
3413 int i, thread_count;
3415 if (!migrate_use_compression()) {
3416 return 0;
3419 thread_count = migrate_decompress_threads();
3420 decompress_threads = g_new0(QemuThread, thread_count);
3421 decomp_param = g_new0(DecompressParam, thread_count);
3422 qemu_mutex_init(&decomp_done_lock);
3423 qemu_cond_init(&decomp_done_cond);
3424 decomp_file = f;
3425 for (i = 0; i < thread_count; i++) {
3426 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3427 goto exit;
3430 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3431 qemu_mutex_init(&decomp_param[i].mutex);
3432 qemu_cond_init(&decomp_param[i].cond);
3433 decomp_param[i].done = true;
3434 decomp_param[i].quit = false;
3435 qemu_thread_create(decompress_threads + i, "decompress",
3436 do_data_decompress, decomp_param + i,
3437 QEMU_THREAD_JOINABLE);
3439 return 0;
3440 exit:
3441 compress_threads_load_cleanup();
3442 return -1;
3445 static void decompress_data_with_multi_threads(QEMUFile *f,
3446 void *host, int len)
3448 int idx, thread_count;
3450 thread_count = migrate_decompress_threads();
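/*
 * Hand the compressed buffer to the first idle decompression thread;
 * if all threads are busy, wait on decomp_done_cond until one of them
 * finishes.
 */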
3451 QEMU_LOCK_GUARD(&decomp_done_lock);
3452 while (true) {
3453 for (idx = 0; idx < thread_count; idx++) {
3454 if (decomp_param[idx].done) {
3455 decomp_param[idx].done = false;
3456 qemu_mutex_lock(&decomp_param[idx].mutex);
3457 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3458 decomp_param[idx].des = host;
3459 decomp_param[idx].len = len;
3460 qemu_cond_signal(&decomp_param[idx].cond);
3461 qemu_mutex_unlock(&decomp_param[idx].mutex);
3462 break;
3465 if (idx < thread_count) {
3466 break;
3467 } else {
3468 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3473 static void colo_init_ram_state(void)
3475 ram_state_init(&ram_state);
3479 * COLO cache: this is for the secondary VM; we cache the whole
3480 * memory of the secondary VM. The global lock needs to be held
3481 * to call this helper.
3483 int colo_init_ram_cache(void)
3485 RAMBlock *block;
3487 WITH_RCU_READ_LOCK_GUARD() {
3488 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3489 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3490 NULL, false, false);
3491 if (!block->colo_cache) {
3492 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3493 " size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3494 block->used_length);
3495 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3496 if (block->colo_cache) {
3497 qemu_anon_ram_free(block->colo_cache, block->used_length);
3498 block->colo_cache = NULL;
3501 return -errno;
3503 if (!machine_dump_guest_core(current_machine)) {
3504 qemu_madvise(block->colo_cache, block->used_length,
3505 QEMU_MADV_DONTDUMP);
3511 * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3512 * to decide which pages in the cache should be flushed into the SVM's RAM.
3513 * Here we use the same name 'ram_bitmap' as for migration.
3515 if (ram_bytes_total()) {
3516 RAMBlock *block;
3518 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3519 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3520 block->bmap = bitmap_new(pages);
3524 colo_init_ram_state();
3525 return 0;
3528 /* TODO: duplicated with ram_init_bitmaps */
3529 void colo_incoming_start_dirty_log(void)
3531 RAMBlock *block = NULL;
3532 /* For memory_global_dirty_log_start below. */
3533 qemu_mutex_lock_iothread();
3534 qemu_mutex_lock_ramlist();
3536 memory_global_dirty_log_sync();
3537 WITH_RCU_READ_LOCK_GUARD() {
3538 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3539 ramblock_sync_dirty_bitmap(ram_state, block);
3540 /* Discard this dirty bitmap record */
3541 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3543 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3545 ram_state->migration_dirty_pages = 0;
3546 qemu_mutex_unlock_ramlist();
3547 qemu_mutex_unlock_iothread();
3550 /* The global lock needs to be held to call this helper */
3551 void colo_release_ram_cache(void)
3553 RAMBlock *block;
3555 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3556 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3557 g_free(block->bmap);
3558 block->bmap = NULL;
3561 WITH_RCU_READ_LOCK_GUARD() {
3562 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3563 if (block->colo_cache) {
3564 qemu_anon_ram_free(block->colo_cache, block->used_length);
3565 block->colo_cache = NULL;
3569 ram_state_cleanup(&ram_state);
3573 * ram_load_setup: Setup RAM for migration incoming side
3575 * Returns zero to indicate success and negative for error
3577 * @f: QEMUFile where to receive the data
3578 * @opaque: RAMState pointer
3580 static int ram_load_setup(QEMUFile *f, void *opaque)
3582 if (compress_threads_load_setup(f)) {
3583 return -1;
3586 xbzrle_load_setup();
3587 ramblock_recv_map_init();
3589 return 0;
3592 static int ram_load_cleanup(void *opaque)
3594 RAMBlock *rb;
3596 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3597 qemu_ram_block_writeback(rb);
3600 xbzrle_load_cleanup();
3601 compress_threads_load_cleanup();
3603 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3604 g_free(rb->receivedmap);
3605 rb->receivedmap = NULL;
3608 return 0;
3612 * ram_postcopy_incoming_init: allocate postcopy data structures
3614 * Returns 0 for success and negative if there was one error
3616 * @mis: current migration incoming state
3618 * Allocate data structures etc needed by incoming migration with
3619 * postcopy-ram. postcopy-ram's similarly named
3620 * postcopy_ram_incoming_init does the work.
3622 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3624 return postcopy_ram_incoming_init(mis);
3628 * ram_load_postcopy: load a page in postcopy case
3630 * Returns 0 for success or -errno in case of error
3632 * Called in postcopy mode by ram_load().
3633 * rcu_read_lock is taken prior to this being called.
3635 * @f: QEMUFile where to send the data
3637 static int ram_load_postcopy(QEMUFile *f)
3639 int flags = 0, ret = 0;
3640 bool place_needed = false;
3641 bool matches_target_page_size = false;
3642 MigrationIncomingState *mis = migration_incoming_get_current();
3643 /* Temporary page that is later 'placed' */
3644 void *postcopy_host_page = mis->postcopy_tmp_page;
3645 void *host_page = NULL;
3646 bool all_zero = true;
3647 int target_pages = 0;
3649 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3650 ram_addr_t addr;
3651 void *page_buffer = NULL;
3652 void *place_source = NULL;
3653 RAMBlock *block = NULL;
3654 uint8_t ch;
3655 int len;
3657 addr = qemu_get_be64(f);
3660 * If there is a qemu file error, we should stop here; then "addr"
3661 * may be invalid
3663 ret = qemu_file_get_error(f);
3664 if (ret) {
3665 break;
3668 flags = addr & ~TARGET_PAGE_MASK;
3669 addr &= TARGET_PAGE_MASK;
3671 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3672 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3673 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3674 block = ram_block_from_stream(f, flags);
3675 if (!block) {
3676 ret = -EINVAL;
3677 break;
3681 * Relying on used_length is racy and can result in false positives.
3682 * We might place pages beyond used_length in case RAM was shrunk
3683 * while in postcopy, which is fine - trying to place via
3684 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3686 if (!block->host || addr >= block->postcopy_length) {
3687 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3688 ret = -EINVAL;
3689 break;
3691 target_pages++;
3692 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3694 * Postcopy requires that we place whole host pages atomically;
3695 * these may be huge pages for RAMBlocks that are backed by
3696 * hugetlbfs.
3697 * To make it atomic, the data is read into a temporary page
3698 * that's moved into place later.
3699 * The migration protocol uses, possibly smaller, target pages;
3700 * however, the source ensures it always sends all the components
3701 * of a host page in one chunk.
3703 page_buffer = postcopy_host_page +
3704 host_page_offset_from_ram_block_offset(block, addr);
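/*
 * Each target page is staged at its offset within the temporary host
 * page; the whole host page is placed atomically once its last target
 * page has been received.
 */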
3705 /* If all TP are zero then we can optimise the place */
3706 if (target_pages == 1) {
3707 host_page = host_page_from_ram_block_offset(block, addr);
3708 } else if (host_page != host_page_from_ram_block_offset(block,
3709 addr)) {
3710 /* not the 1st TP within the HP */
3711 error_report("Non-same host page %p/%p", host_page,
3712 host_page_from_ram_block_offset(block, addr));
3713 ret = -EINVAL;
3714 break;
3718 * If it's the last part of a host page then we place the host
3719 * page
3721 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3722 place_needed = true;
3724 place_source = postcopy_host_page;
3727 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3728 case RAM_SAVE_FLAG_ZERO:
3729 ch = qemu_get_byte(f);
3731 * We can skip setting page_buffer when
3732 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3734 if (ch || !matches_target_page_size) {
3735 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3737 if (ch) {
3738 all_zero = false;
3740 break;
3742 case RAM_SAVE_FLAG_PAGE:
3743 all_zero = false;
3744 if (!matches_target_page_size) {
3745 /* For huge pages, we always use temporary buffer */
3746 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3747 } else {
3749 * For small pages that match the target page size, we
3750 * avoid the qemu_file copy. Instead we directly use
3751 * the buffer of QEMUFile to place the page. Note: we
3752 * cannot do any QEMUFile operation before using that
3753 * buffer to make sure the buffer is valid when
3754 * placing the page.
3756 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3757 TARGET_PAGE_SIZE);
3759 break;
3760 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3761 all_zero = false;
3762 len = qemu_get_be32(f);
3763 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3764 error_report("Invalid compressed data length: %d", len);
3765 ret = -EINVAL;
3766 break;
3768 decompress_data_with_multi_threads(f, page_buffer, len);
3769 break;
3771 case RAM_SAVE_FLAG_EOS:
3772 /* normal exit */
3773 multifd_recv_sync_main();
3774 break;
3775 default:
3776 error_report("Unknown combination of migration flags: 0x%x"
3777 " (postcopy mode)", flags);
3778 ret = -EINVAL;
3779 break;
3782 /* Got the whole host page, wait for decompress before placing. */
3783 if (place_needed) {
3784 ret |= wait_for_decompress_done();
3787 /* Detect for any possible file errors */
3788 if (!ret && qemu_file_get_error(f)) {
3789 ret = qemu_file_get_error(f);
3792 if (!ret && place_needed) {
3793 if (all_zero) {
3794 ret = postcopy_place_page_zero(mis, host_page, block);
3795 } else {
3796 ret = postcopy_place_page(mis, host_page, place_source,
3797 block);
3799 place_needed = false;
3800 target_pages = 0;
3801 /* Assume we have a zero page until we detect something different */
3802 all_zero = true;
3806 return ret;
3809 static bool postcopy_is_advised(void)
3811 PostcopyState ps = postcopy_state_get();
3812 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3815 static bool postcopy_is_running(void)
3817 PostcopyState ps = postcopy_state_get();
3818 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3822 * Flush the content of the RAM cache into the SVM's memory.
3823 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3825 void colo_flush_ram_cache(void)
3827 RAMBlock *block = NULL;
3828 void *dst_host;
3829 void *src_host;
3830 unsigned long offset = 0;
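/*
 * First fold the latest dirty log into the per-block bitmaps, then
 * walk the dirty runs and copy them from colo_cache back into the
 * SVM's RAM.
 */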
3832 memory_global_dirty_log_sync();
3833 WITH_RCU_READ_LOCK_GUARD() {
3834 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3835 ramblock_sync_dirty_bitmap(ram_state, block);
3839 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3840 WITH_RCU_READ_LOCK_GUARD() {
3841 block = QLIST_FIRST_RCU(&ram_list.blocks);
3843 while (block) {
3844 unsigned long num = 0;
3846 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3847 if (!offset_in_ramblock(block,
3848 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3849 offset = 0;
3850 num = 0;
3851 block = QLIST_NEXT_RCU(block, next);
3852 } else {
3853 unsigned long i = 0;
3855 for (i = 0; i < num; i++) {
3856 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3858 dst_host = block->host
3859 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3860 src_host = block->colo_cache
3861 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3862 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3863 offset += num;
3867 trace_colo_flush_ram_cache_end();
3871 * ram_load_precopy: load pages in precopy case
3873 * Returns 0 for success or -errno in case of error
3875 * Called in precopy mode by ram_load().
3876 * rcu_read_lock is taken prior to this being called.
3878 * @f: QEMUFile where to send the data
3880 static int ram_load_precopy(QEMUFile *f)
3882 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3883 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3884 bool postcopy_advised = postcopy_is_advised();
3885 if (!migrate_use_compression()) {
3886 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3889 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3890 ram_addr_t addr, total_ram_bytes;
3891 void *host = NULL, *host_bak = NULL;
3892 uint8_t ch;
3895 * Yield periodically to let the main loop run, but an iteration of
3896 * the main loop is expensive, so only do it every so many iterations
3898 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3899 aio_co_schedule(qemu_get_current_aio_context(),
3900 qemu_coroutine_self());
3901 qemu_coroutine_yield();
3903 i++;
3905 addr = qemu_get_be64(f);
3906 flags = addr & ~TARGET_PAGE_MASK;
3907 addr &= TARGET_PAGE_MASK;
3909 if (flags & invalid_flags) {
3910 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3911 error_report("Received an unexpected compressed page");
3914 ret = -EINVAL;
3915 break;
3918 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3919 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3920 RAMBlock *block = ram_block_from_stream(f, flags);
3922 host = host_from_ram_block_offset(block, addr);
3924 * After going into the COLO stage, we should not load the page
3925 * into the SVM's memory directly; we put it into colo_cache first.
3926 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3927 * Previously, we copied all this memory in the COLO preparation stage,
3928 * during which we need to stop the VM, which is a time-consuming process.
3929 * Here we optimize it with a trick: back up every page during the
3930 * migration process while COLO is enabled. Though it affects the
3931 * speed of the migration, it obviously reduces the downtime of
3932 * backing up all the SVM's memory in the COLO preparation stage.
3934 if (migration_incoming_colo_enabled()) {
3935 if (migration_incoming_in_colo_state()) {
3936 /* In COLO stage, put all pages into cache temporarily */
3937 host = colo_cache_from_block_offset(block, addr, true);
3938 } else {
3940 * In the migration stage but before the COLO stage,
3941 * put all pages into both the cache and the SVM's memory.
3943 host_bak = colo_cache_from_block_offset(block, addr, false);
3946 if (!host) {
3947 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3948 ret = -EINVAL;
3949 break;
3951 if (!migration_incoming_in_colo_state()) {
3952 ramblock_recv_bitmap_set(block, host);
3955 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3958 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3959 case RAM_SAVE_FLAG_MEM_SIZE:
3960 /* Synchronize RAM block list */
3961 total_ram_bytes = addr;
3962 while (!ret && total_ram_bytes) {
3963 RAMBlock *block;
3964 char id[256];
3965 ram_addr_t length;
3967 len = qemu_get_byte(f);
3968 qemu_get_buffer(f, (uint8_t *)id, len);
3969 id[len] = 0;
3970 length = qemu_get_be64(f);
3972 block = qemu_ram_block_by_name(id);
3973 if (block && !qemu_ram_is_migratable(block)) {
3974 error_report("block %s should not be migrated !", id);
3975 ret = -EINVAL;
3976 } else if (block) {
3977 if (length != block->used_length) {
3978 Error *local_err = NULL;
3980 ret = qemu_ram_resize(block, length,
3981 &local_err);
3982 if (local_err) {
3983 error_report_err(local_err);
3986 /* For postcopy we need to check hugepage sizes match */
3987 if (postcopy_advised && migrate_postcopy_ram() &&
3988 block->page_size != qemu_host_page_size) {
3989 uint64_t remote_page_size = qemu_get_be64(f);
3990 if (remote_page_size != block->page_size) {
3991 error_report("Mismatched RAM page size %s "
3992 "(local) %zd != %" PRId64,
3993 id, block->page_size,
3994 remote_page_size);
3995 ret = -EINVAL;
3998 if (migrate_ignore_shared()) {
3999 hwaddr addr = qemu_get_be64(f);
4000 if (ramblock_is_ignored(block) &&
4001 block->mr->addr != addr) {
4002 error_report("Mismatched GPAs for block %s "
4003 "%" PRId64 "!= %" PRId64,
4004 id, (uint64_t)addr,
4005 (uint64_t)block->mr->addr);
4006 ret = -EINVAL;
4009 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4010 block->idstr);
4011 } else {
4012 error_report("Unknown ramblock \"%s\", cannot "
4013 "accept migration", id);
4014 ret = -EINVAL;
4017 total_ram_bytes -= length;
4019 break;
4021 case RAM_SAVE_FLAG_ZERO:
4022 ch = qemu_get_byte(f);
4023 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4024 break;
4026 case RAM_SAVE_FLAG_PAGE:
4027 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4028 break;
4030 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4031 len = qemu_get_be32(f);
4032 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4033 error_report("Invalid compressed data length: %d", len);
4034 ret = -EINVAL;
4035 break;
4037 decompress_data_with_multi_threads(f, host, len);
4038 break;
4040 case RAM_SAVE_FLAG_XBZRLE:
4041 if (load_xbzrle(f, addr, host) < 0) {
4042 error_report("Failed to decompress XBZRLE page at "
4043 RAM_ADDR_FMT, addr);
4044 ret = -EINVAL;
4045 break;
4047 break;
4048 case RAM_SAVE_FLAG_EOS:
4049 /* normal exit */
4050 multifd_recv_sync_main();
4051 break;
4052 default:
4053 if (flags & RAM_SAVE_FLAG_HOOK) {
4054 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4055 } else {
4056 error_report("Unknown combination of migration flags: 0x%x",
4057 flags);
4058 ret = -EINVAL;
4061 if (!ret) {
4062 ret = qemu_file_get_error(f);
4064 if (!ret && host_bak) {
4065 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4069 ret |= wait_for_decompress_done();
4070 return ret;
4073 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4075 int ret = 0;
4076 static uint64_t seq_iter;
4078 * If system is running in postcopy mode, page inserts to host memory must
4079 * be atomic
4081 bool postcopy_running = postcopy_is_running();
4083 seq_iter++;
4085 if (version_id != 4) {
4086 return -EINVAL;
4090 * This RCU critical section can be very long running.
4091 * When RCU reclaims in the code start to become numerous,
4092 * it will be necessary to reduce the granularity of this
4093 * critical section.
4095 WITH_RCU_READ_LOCK_GUARD() {
4096 if (postcopy_running) {
4097 ret = ram_load_postcopy(f);
4098 } else {
4099 ret = ram_load_precopy(f);
4102 trace_ram_load_complete(ret, seq_iter);
4104 return ret;
4107 static bool ram_has_postcopy(void *opaque)
4109 RAMBlock *rb;
4110 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4111 if (ramblock_is_pmem(rb)) {
4112 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4113 "is not supported now!", rb->idstr, rb->host);
4114 return false;
4118 return migrate_postcopy_ram();
4121 /* Sync all the dirty bitmaps with the destination VM. */
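/*
 * One recv-bitmap request is sent per non-ignored block, and rp_sem is
 * then waited on once per block; ram_dirty_bitmap_reload_notify() posts
 * it from the return-path thread as each bitmap is reloaded.
 */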
4122 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4123 {
4124 RAMBlock *block;
4125 QEMUFile *file = s->to_dst_file;
4126 int ramblock_count = 0;
4128 trace_ram_dirty_bitmap_sync_start();
4130 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4131 qemu_savevm_send_recv_bitmap(file, block->idstr);
4132 trace_ram_dirty_bitmap_request(block->idstr);
4133 ramblock_count++;
4134 }
4136 trace_ram_dirty_bitmap_sync_wait();
4138 /* Wait until all the ramblocks' dirty bitmaps are synced */
4139 while (ramblock_count--) {
4140 qemu_sem_wait(&s->rp_state.rp_sem);
4141 }
4143 trace_ram_dirty_bitmap_sync_complete();
4145 return 0;
4146 }
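/* Wake up one waiter in ram_dirty_bitmap_sync_all() above. */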
4148 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4149 {
4150 qemu_sem_post(&s->rp_state.rp_sem);
4151 }
4153 /*
4154 * Read the received bitmap and invert it to form the initial dirty bitmap.
4155 * This is only used when the postcopy migration is paused but wants
4156 * to resume from a middle point.
4157 */
4158 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4159 {
4160 int ret = -EINVAL;
4161 /* from_dst_file is always valid because we're within rp_thread */
4162 QEMUFile *file = s->rp_state.from_dst_file;
4163 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4164 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4165 uint64_t size, end_mark;
4167 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4169 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4170 error_report("%s: incorrect state %s", __func__,
4171 MigrationStatus_str(s->state));
4172 return -EINVAL;
4173 }
4175 /*
4176 * Note: see comments in ramblock_recv_bitmap_send() on why we
4177 * need the endianness conversion, and the paddings.
4178 */
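/*
 * For illustration only (ramblock_recv_bitmap_send() on the other side is
 * authoritative), the message read below is roughly:
 *   be64  size         : padded bitmap size in bytes
 *   u8    bitmap[size] : little-endian bitmap, one bit per target page
 *   be64  end_mark     : must be RAMBLOCK_RECV_BITMAP_ENDING
 * As a worked example, assuming a 4 KiB target page, a 1 GiB block has
 * nbits = 262144 and local_size = 32768 bytes, already a multiple of 8,
 * so the ROUND_UP below leaves it unchanged.
 */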
4179 local_size = ROUND_UP(local_size, 8);
4181 /* Add paddings */
4182 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4184 size = qemu_get_be64(file);
4186 /* The size of the bitmap should match with our ramblock */
4187 if (size != local_size) {
4188 error_report("%s: ramblock '%s' bitmap size mismatch "
4189 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4190 block->idstr, size, local_size);
4191 ret = -EINVAL;
4192 goto out;
4193 }
4195 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4196 end_mark = qemu_get_be64(file);
4198 ret = qemu_file_get_error(file);
4199 if (ret || size != local_size) {
4200 error_report("%s: read bitmap failed for ramblock '%s': %d"
4201 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4202 __func__, block->idstr, ret, local_size, size);
4203 ret = -EIO;
4204 goto out;
4205 }
4207 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4208 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4209 __func__, block->idstr, end_mark);
4210 ret = -EINVAL;
4211 goto out;
4212 }
4214 /*
4215 * Endianness conversion. We are in postcopy (though paused).
4216 * The dirty bitmap won't change, so we can modify it directly.
4217 */
4218 bitmap_from_le(block->bmap, le_bitmap, nbits);
4220 /*
4221 * What we received is the "received bitmap". Invert it to get the
4222 * initial dirty bitmap for this ramblock.
4223 */
4224 bitmap_complement(block->bmap, block->bmap, nbits);
4226 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4227 ramblock_dirty_bitmap_clear_discarded_pages(block);
4229 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4230 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4232 /*
4233 * We succeeded in syncing the bitmap for the current ramblock. If this
4234 * is the last one to sync, we need to notify the main send thread.
4235 */
4236 ram_dirty_bitmap_reload_notify(s);
4238 ret = 0;
4239 out:
4240 g_free(le_bitmap);
4241 return ret;
4242 }
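/*
 * Resuming a broken postcopy migration: re-sync the dirty bitmaps of all
 * RAM blocks with the destination before the source starts sending again.
 */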
4244 static int ram_resume_prepare(MigrationState *s, void *opaque)
4245 {
4246 RAMState *rs = *(RAMState **)opaque;
4247 int ret;
4249 ret = ram_dirty_bitmap_sync_all(s, rs);
4250 if (ret) {
4251 return ret;
4252 }
4254 ram_state_resume_prepare(rs, s->to_dst_file);
4256 return 0;
4257 }
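/*
 * Registered below by ram_mig_init() as the live "ram" section, stream
 * version 4.
 */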
4259 static SaveVMHandlers savevm_ram_handlers = {
4260 .save_setup = ram_save_setup,
4261 .save_live_iterate = ram_save_iterate,
4262 .save_live_complete_postcopy = ram_save_complete,
4263 .save_live_complete_precopy = ram_save_complete,
4264 .has_postcopy = ram_has_postcopy,
4265 .save_live_pending = ram_save_pending,
4266 .load_state = ram_load,
4267 .save_cleanup = ram_save_cleanup,
4268 .load_setup = ram_load_setup,
4269 .load_cleanup = ram_load_cleanup,
4270 .resume_prepare = ram_resume_prepare,
4271 };
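/*
 * RAM block resize notifier: a resize while a migration is active cancels
 * it on the source. For incoming postcopy, a resize in the ADVISE state
 * discards the grown part and records the new postcopy_length; once the
 * guest runs, resizes need no handling here.
 */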
4273 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4274 size_t old_size, size_t new_size)
4275 {
4276 PostcopyState ps = postcopy_state_get();
4277 ram_addr_t offset;
4278 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4279 Error *err = NULL;
4281 if (ramblock_is_ignored(rb)) {
4282 return;
4283 }
4285 if (!migration_is_idle()) {
4286 /*
4287 * Precopy code on the source cannot deal with the size of RAM blocks
4288 * changing at random points in time - especially after sending the
4289 * RAM block sizes in the migration stream, they must no longer change.
4290 * Abort and indicate a proper reason.
4291 */
4292 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4293 migration_cancel(err);
4294 error_free(err);
4295 }
4297 switch (ps) {
4298 case POSTCOPY_INCOMING_ADVISE:
4299 /*
4300 * Update what ram_postcopy_incoming_init()->init_range() does at the
4301 * time postcopy was advised. Syncing RAM blocks with the source will
4302 * result in RAM resizes.
4303 */
4304 if (old_size < new_size) {
4305 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4306 error_report("RAM block '%s' discard of resized RAM failed",
4307 rb->idstr);
4308 }
4309 }
4310 rb->postcopy_length = new_size;
4311 break;
4312 case POSTCOPY_INCOMING_NONE:
4313 case POSTCOPY_INCOMING_RUNNING:
4314 case POSTCOPY_INCOMING_END:
4315 /*
4316 * Once our guest is running, postcopy no longer cares about
4317 * resizes. When growing, the new memory was not available on the
4318 * source, so no handler is needed.
4319 */
4320 break;
4321 default:
4322 error_report("RAM block '%s' resized during postcopy state: %d",
4323 rb->idstr, ps);
4324 exit(-1);
4325 }
4326 }
4328 static RAMBlockNotifier ram_mig_ram_notifier = {
4329 .ram_block_resized = ram_mig_ram_block_resized,
4330 };
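/*
 * Set up RAM migration: initialize the XBZRLE lock, register the "ram"
 * savevm handlers (section version 4) and the RAM block resize notifier
 * above.
 */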
4332 void ram_mig_init(void)
4333 {
4334 qemu_mutex_init(&XBZRLE.lock);
4335 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4336 ram_block_notifier_add(&ram_mig_ram_notifier);
4337 }