migration: Split save_live_pending() into state_pending_*
[qemu/armbru.git] / migration / ram.c
blob 56ff9cd29d927f164792ce67be198238cac4f74f
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
67 /***********************************************************/
68 /* ram save/restore */
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71 * worked for pages that were filled with the same char. We switched
72 * it to only search for the zero value, and renamed it to avoid
73 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO 0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE 0x08
80 #define RAM_SAVE_FLAG_EOS 0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE 0x40
83 /* 0x80 is reserved in migration.h; start with 0x100 for the next flag */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
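/*
 * These flags are OR'ed into the page-aligned offset that save_page_header()
 * puts on the wire, so they occupy the low bits left free by the page
 * alignment (see e.g. RAM_SAVE_FLAG_ZERO in save_zero_page_to_file()).
 */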
86 XBZRLECacheStats xbzrle_counters;
88 /* used by the search for pages to send */
89 struct PageSearchStatus {
90 /* The migration channel used for a specific host page */
91 QEMUFile *pss_channel;
92 /* Last block from where we have sent data */
93 RAMBlock *last_sent_block;
94 /* Current block being searched */
95 RAMBlock *block;
96 /* Current page to search from */
97 unsigned long page;
98 /* Set once we wrap around */
99 bool complete_round;
100 /* Whether we're sending a host page */
101 bool host_page_sending;
102 /* The start/end of current host page. Invalid if host_page_sending==false */
103 unsigned long host_page_start;
104 unsigned long host_page_end;
106 typedef struct PageSearchStatus PageSearchStatus;
108 /* This struct contains the XBZRLE cache and a static page
109 used by the compression */
110 static struct {
111 /* buffer used for XBZRLE encoding */
112 uint8_t *encoded_buf;
113 /* buffer for storing page content */
114 uint8_t *current_buf;
115 /* Cache for XBZRLE, Protected by lock. */
116 PageCache *cache;
117 QemuMutex lock;
118 /* it will store a page full of zeros */
119 uint8_t *zero_target_page;
120 /* buffer used for XBZRLE decoding */
121 uint8_t *decoded_buf;
122 } XBZRLE;
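/*
 * Helpers to take/release the XBZRLE cache lock. They are no-ops when
 * XBZRLE is not enabled, so callers may use them unconditionally.
 */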
124 static void XBZRLE_cache_lock(void)
126 if (migrate_use_xbzrle()) {
127 qemu_mutex_lock(&XBZRLE.lock);
131 static void XBZRLE_cache_unlock(void)
133 if (migrate_use_xbzrle()) {
134 qemu_mutex_unlock(&XBZRLE.lock);
139 * xbzrle_cache_resize: resize the xbzrle cache
141 * This function is called from migrate_params_apply in main
142 * thread, possibly while a migration is in progress. A running
143 * migration may be using the cache and might finish during this call,
144 * hence changes to the cache are protected by the XBZRLE.lock mutex.
146 * Returns 0 for success or -1 for error
148 * @new_size: new cache size
149 * @errp: set *errp if the check failed, with reason
151 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
153 PageCache *new_cache;
154 int64_t ret = 0;
156 /* Check for truncation */
157 if (new_size != (size_t)new_size) {
158 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
159 "exceeding address space");
160 return -1;
163 if (new_size == migrate_xbzrle_cache_size()) {
164 /* nothing to do */
165 return 0;
168 XBZRLE_cache_lock();
170 if (XBZRLE.cache != NULL) {
171 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
172 if (!new_cache) {
173 ret = -1;
174 goto out;
177 cache_fini(XBZRLE.cache);
178 XBZRLE.cache = new_cache;
180 out:
181 XBZRLE_cache_unlock();
182 return ret;
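/*
 * postcopy_preempt_active: true when the postcopy-preempt capability is
 * enabled and we are already in the postcopy phase.
 */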
185 static bool postcopy_preempt_active(void)
187 return migrate_postcopy_preempt() && migration_in_postcopy();
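/*
 * ramblock_is_ignored: true for blocks that must be skipped by RAM
 * migration, either because the block is not migratable at all, or
 * because ignore-shared is enabled and the block is shared.
 */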
190 bool ramblock_is_ignored(RAMBlock *block)
192 return !qemu_ram_is_migratable(block) ||
193 (migrate_ignore_shared() && qemu_ram_is_shared(block));
196 #undef RAMBLOCK_FOREACH
198 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
200 RAMBlock *block;
201 int ret = 0;
203 RCU_READ_LOCK_GUARD();
205 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
206 ret = func(block, opaque);
207 if (ret) {
208 break;
211 return ret;
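/*
 * ramblock_recv_map_init: allocate the receivedmap bitmap of every
 * migratable RAMBlock; it is consulted by the ramblock_recv_bitmap_*
 * helpers below to track which target pages have been received.
 */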
214 static void ramblock_recv_map_init(void)
216 RAMBlock *rb;
218 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
219 assert(!rb->receivedmap);
220 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
224 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
226 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
227 rb->receivedmap);
230 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
232 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
235 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
237 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
240 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
241 size_t nr)
243 bitmap_set_atomic(rb->receivedmap,
244 ramblock_recv_bitmap_offset(host_addr, rb),
245 nr);
248 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
251 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
253 * Returns the number of bytes sent (>0) on success, or <0 on error.
255 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
256 const char *block_name)
258 RAMBlock *block = qemu_ram_block_by_name(block_name);
259 unsigned long *le_bitmap, nbits;
260 uint64_t size;
262 if (!block) {
263 error_report("%s: invalid block name: %s", __func__, block_name);
264 return -1;
267 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
270 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
271 * machines we may need 4 more bytes for padding (see below
272 * comment). So extend it a bit beforehand.
274 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
277 * Always use little endian when sending the bitmap. This is
278 * required when source and destination VMs are not using the
279 * same endianness. (Note: big endian won't work.)
281 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
283 /* Size of the bitmap, in bytes */
284 size = DIV_ROUND_UP(nbits, 8);
287 * size is always aligned to 8 bytes for 64bit machines, but it
288 * may not be true for 32bit machines. We need this padding to
289 * make sure the migration can survive even between 32bit and
290 * 64bit machines.
292 size = ROUND_UP(size, 8);
294 qemu_put_be64(file, size);
295 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
297 * Mark as an end, in case the middle part is screwed up due to
298 * some "mysterious" reason.
300 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
301 qemu_fflush(file);
303 g_free(le_bitmap);
305 if (qemu_file_get_error(file)) {
306 return qemu_file_get_error(file);
309 return size + sizeof(size);
313 * An outstanding page request, on the source, having been received
314 * and queued
316 struct RAMSrcPageRequest {
317 RAMBlock *rb;
318 hwaddr offset;
319 hwaddr len;
321 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
324 /* State of RAM for migration */
325 struct RAMState {
327 * PageSearchStatus structures for the channels when sending pages.
328 * Protected by the bitmap_mutex.
330 PageSearchStatus pss[RAM_CHANNEL_MAX];
331 /* UFFD file descriptor, used in 'write-tracking' migration */
332 int uffdio_fd;
333 /* Last block that we have visited searching for dirty pages */
334 RAMBlock *last_seen_block;
335 /* Last dirty target page we have sent */
336 ram_addr_t last_page;
337 /* last ram version we have seen */
338 uint32_t last_version;
339 /* How many times we have dirtied too many pages */
340 int dirty_rate_high_cnt;
341 /* these variables are used for bitmap sync */
342 /* last time we did a full bitmap_sync */
343 int64_t time_last_bitmap_sync;
344 /* bytes transferred at start_time */
345 uint64_t bytes_xfer_prev;
346 /* number of dirty pages since start_time */
347 uint64_t num_dirty_pages_period;
348 /* xbzrle misses since the beginning of the period */
349 uint64_t xbzrle_cache_miss_prev;
350 /* Amount of xbzrle pages since the beginning of the period */
351 uint64_t xbzrle_pages_prev;
352 /* Amount of xbzrle encoded bytes since the beginning of the period */
353 uint64_t xbzrle_bytes_prev;
354 /* Start using XBZRLE (e.g., after the first round). */
355 bool xbzrle_enabled;
356 /* Are we on the last stage of migration */
357 bool last_stage;
358 /* compression statistics since the beginning of the period */
359 /* number of times there was no free thread to compress data */
360 uint64_t compress_thread_busy_prev;
361 /* amount of bytes after compression */
362 uint64_t compressed_size_prev;
363 /* amount of compressed pages */
364 uint64_t compress_pages_prev;
366 /* total handled target pages at the beginning of period */
367 uint64_t target_page_count_prev;
368 /* total handled target pages since start */
369 uint64_t target_page_count;
370 /* number of dirty bits in the bitmap */
371 uint64_t migration_dirty_pages;
373 * Protects:
374 * - dirty/clear bitmap
375 * - migration_dirty_pages
376 * - pss structures
378 QemuMutex bitmap_mutex;
379 /* The RAMBlock used in the last src_page_requests */
380 RAMBlock *last_req_rb;
381 /* Queue of outstanding page requests from the destination */
382 QemuMutex src_page_req_mutex;
383 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
385 typedef struct RAMState RAMState;
387 static RAMState *ram_state;
389 static NotifierWithReturnList precopy_notifier_list;
391 /* Whether postcopy has queued requests? */
392 static bool postcopy_has_request(RAMState *rs)
394 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
397 void precopy_infrastructure_init(void)
399 notifier_with_return_list_init(&precopy_notifier_list);
402 void precopy_add_notifier(NotifierWithReturn *n)
404 notifier_with_return_list_add(&precopy_notifier_list, n);
407 void precopy_remove_notifier(NotifierWithReturn *n)
409 notifier_with_return_remove(n);
412 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
414 PrecopyNotifyData pnd;
415 pnd.reason = reason;
416 pnd.errp = errp;
418 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
421 uint64_t ram_bytes_remaining(void)
423 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
424 0;
428 * NOTE: not all stats in ram_counters are used in reality. See comments
429 * for struct MigrationAtomicStats. The ultimate result of ram migration
430 * counters will be a merged version with both ram_counters and the atomic
431 * fields in ram_atomic_counters.
433 MigrationStats ram_counters;
434 MigrationAtomicStats ram_atomic_counters;
436 void ram_transferred_add(uint64_t bytes)
438 if (runstate_is_running()) {
439 ram_counters.precopy_bytes += bytes;
440 } else if (migration_in_postcopy()) {
441 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
442 } else {
443 ram_counters.downtime_bytes += bytes;
445 stat64_add(&ram_atomic_counters.transferred, bytes);
448 void dirty_sync_missed_zero_copy(void)
450 ram_counters.dirty_sync_missed_zero_copy++;
453 CompressionStats compression_counters;
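/*
 * Per-thread state of one compression worker: the page to compress
 * (block + offset), synchronization flags, and a private zlib stream
 * plus bounce buffer.
 */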
455 struct CompressParam {
456 bool done;
457 bool quit;
458 bool zero_page;
459 QEMUFile *file;
460 QemuMutex mutex;
461 QemuCond cond;
462 RAMBlock *block;
463 ram_addr_t offset;
465 /* internally used fields */
466 z_stream stream;
467 uint8_t *originbuf;
469 typedef struct CompressParam CompressParam;
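/*
 * Per-thread state of one decompression worker on the destination:
 * the destination host pointer, the compressed input buffer and a
 * private zlib stream.
 */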
471 struct DecompressParam {
472 bool done;
473 bool quit;
474 QemuMutex mutex;
475 QemuCond cond;
476 void *des;
477 uint8_t *compbuf;
478 int len;
479 z_stream stream;
481 typedef struct DecompressParam DecompressParam;
483 static CompressParam *comp_param;
484 static QemuThread *compress_threads;
485 /* comp_done_cond is used to wake up the migration thread when
486 * one of the compression threads has finished the compression.
487 * comp_done_lock is used together with comp_done_cond.
489 static QemuMutex comp_done_lock;
490 static QemuCond comp_done_cond;
492 static QEMUFile *decomp_file;
493 static DecompressParam *decomp_param;
494 static QemuThread *decompress_threads;
495 static QemuMutex decomp_done_lock;
496 static QemuCond decomp_done_cond;
498 static int ram_save_host_page_urgent(PageSearchStatus *pss);
500 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
501 ram_addr_t offset, uint8_t *source_buf);
503 /* NOTE: page is the PFN not real ram_addr_t. */
504 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
506 pss->block = rb;
507 pss->page = page;
508 pss->complete_round = false;
512 * Check whether two PSSs are actively sending the same page. Return true
513 * if it is, false otherwise.
515 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
517 return pss1->host_page_sending && pss2->host_page_sending &&
518 (pss1->host_page_start == pss2->host_page_start);
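/*
 * do_data_compress: body of a compression worker thread. It waits for a
 * page to be posted in its CompressParam, compresses it into the
 * per-thread QEMUFile and signals comp_done_cond when done.
 */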
521 static void *do_data_compress(void *opaque)
523 CompressParam *param = opaque;
524 RAMBlock *block;
525 ram_addr_t offset;
526 bool zero_page;
528 qemu_mutex_lock(&param->mutex);
529 while (!param->quit) {
530 if (param->block) {
531 block = param->block;
532 offset = param->offset;
533 param->block = NULL;
534 qemu_mutex_unlock(&param->mutex);
536 zero_page = do_compress_ram_page(param->file, &param->stream,
537 block, offset, param->originbuf);
539 qemu_mutex_lock(&comp_done_lock);
540 param->done = true;
541 param->zero_page = zero_page;
542 qemu_cond_signal(&comp_done_cond);
543 qemu_mutex_unlock(&comp_done_lock);
545 qemu_mutex_lock(&param->mutex);
546 } else {
547 qemu_cond_wait(&param->cond, &param->mutex);
550 qemu_mutex_unlock(&param->mutex);
552 return NULL;
555 static void compress_threads_save_cleanup(void)
557 int i, thread_count;
559 if (!migrate_use_compression() || !comp_param) {
560 return;
563 thread_count = migrate_compress_threads();
564 for (i = 0; i < thread_count; i++) {
566 * we use it as an indicator of whether the thread is
567 * properly initialized or not
569 if (!comp_param[i].file) {
570 break;
573 qemu_mutex_lock(&comp_param[i].mutex);
574 comp_param[i].quit = true;
575 qemu_cond_signal(&comp_param[i].cond);
576 qemu_mutex_unlock(&comp_param[i].mutex);
578 qemu_thread_join(compress_threads + i);
579 qemu_mutex_destroy(&comp_param[i].mutex);
580 qemu_cond_destroy(&comp_param[i].cond);
581 deflateEnd(&comp_param[i].stream);
582 g_free(comp_param[i].originbuf);
583 qemu_fclose(comp_param[i].file);
584 comp_param[i].file = NULL;
586 qemu_mutex_destroy(&comp_done_lock);
587 qemu_cond_destroy(&comp_done_cond);
588 g_free(compress_threads);
589 g_free(comp_param);
590 compress_threads = NULL;
591 comp_param = NULL;
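/*
 * compress_threads_save_setup: allocate and start the compression worker
 * threads. Returns 0 on success, -1 on error; partially created state is
 * torn down via compress_threads_save_cleanup().
 */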
594 static int compress_threads_save_setup(void)
596 int i, thread_count;
598 if (!migrate_use_compression()) {
599 return 0;
601 thread_count = migrate_compress_threads();
602 compress_threads = g_new0(QemuThread, thread_count);
603 comp_param = g_new0(CompressParam, thread_count);
604 qemu_cond_init(&comp_done_cond);
605 qemu_mutex_init(&comp_done_lock);
606 for (i = 0; i < thread_count; i++) {
607 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
608 if (!comp_param[i].originbuf) {
609 goto exit;
612 if (deflateInit(&comp_param[i].stream,
613 migrate_compress_level()) != Z_OK) {
614 g_free(comp_param[i].originbuf);
615 goto exit;
618 /* comp_param[i].file is just used as a dummy buffer to save data,
619 * set its ops to empty.
621 comp_param[i].file = qemu_file_new_output(
622 QIO_CHANNEL(qio_channel_null_new()));
623 comp_param[i].done = true;
624 comp_param[i].quit = false;
625 qemu_mutex_init(&comp_param[i].mutex);
626 qemu_cond_init(&comp_param[i].cond);
627 qemu_thread_create(compress_threads + i, "compress",
628 do_data_compress, comp_param + i,
629 QEMU_THREAD_JOINABLE);
631 return 0;
633 exit:
634 compress_threads_save_cleanup();
635 return -1;
639 * save_page_header: write page header to wire
641 * If this is the 1st block, it also writes the block identification
643 * Returns the number of bytes written
645 * @pss: current PSS channel status
646 * @block: block that contains the page we want to send
647 * @offset: offset inside the block for the page
648 * in the lower bits, it contains flags
650 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
651 ram_addr_t offset)
653 size_t size, len;
654 bool same_block = (block == pss->last_sent_block);
655 QEMUFile *f = pss->pss_channel;
657 if (same_block) {
658 offset |= RAM_SAVE_FLAG_CONTINUE;
660 qemu_put_be64(f, offset);
661 size = 8;
663 if (!same_block) {
664 len = strlen(block->idstr);
665 qemu_put_byte(f, len);
666 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
667 size += 1 + len;
668 pss->last_sent_block = block;
670 return size;
674 * mig_throttle_guest_down: throttle down the guest
676 * Reduce amount of guest cpu execution to hopefully slow down memory
677 * writes. If guest dirty memory rate is reduced below the rate at
678 * which we can transfer pages to the destination then we should be
679 * able to complete migration. Some workloads dirty memory way too
680 * fast and will not effectively converge, even with auto-converge.
682 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
683 uint64_t bytes_dirty_threshold)
685 MigrationState *s = migrate_get_current();
686 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
687 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
688 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
689 int pct_max = s->parameters.max_cpu_throttle;
691 uint64_t throttle_now = cpu_throttle_get_percentage();
692 uint64_t cpu_now, cpu_ideal, throttle_inc;
694 /* We have not started throttling yet. Let's start it. */
695 if (!cpu_throttle_active()) {
696 cpu_throttle_set(pct_initial);
697 } else {
698 /* Throttling already on, just increase the rate */
699 if (!pct_tailslow) {
700 throttle_inc = pct_increment;
701 } else {
702 /* Compute the ideal CPU percentage used by Guest, which may
703 * make the dirty rate match the dirty rate threshold. */
704 cpu_now = 100 - throttle_now;
705 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
706 bytes_dirty_period);
707 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
709 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
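/*
 * mig_throttle_counter_reset: start a new auto-converge accounting period
 * (sync timestamp, dirty page count and bytes transferred so far).
 */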
713 void mig_throttle_counter_reset(void)
715 RAMState *rs = ram_state;
717 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
718 rs->num_dirty_pages_period = 0;
719 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
723 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
725 * @rs: current RAM state
726 * @current_addr: address for the zero page
728 * Update the xbzrle cache to reflect a page that's been sent as all 0.
729 * The important thing is that a stale (not-yet-0'd) page be replaced
730 * by the new data.
731 * As a bonus, if the page wasn't in the cache it gets added so that
732 * when a small write is made into the 0'd page it gets XBZRLE sent.
734 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
736 /* We don't care if this fails to allocate a new cache page
737 * as long as it updates an old one */
738 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
739 ram_counters.dirty_sync_count);
742 #define ENCODING_FLAG_XBZRLE 0x1
745 * save_xbzrle_page: compress and send current page
747 * Returns: 1 means that we wrote the page
748 * 0 means that page is identical to the one already sent
749 * -1 means that xbzrle would be longer than normal
751 * @rs: current RAM state
752 * @pss: current PSS channel
753 * @current_data: pointer to the address of the page contents
754 * @current_addr: addr of the page
755 * @block: block that contains the page we want to send
756 * @offset: offset inside the block for the page
758 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
759 uint8_t **current_data, ram_addr_t current_addr,
760 RAMBlock *block, ram_addr_t offset)
762 int encoded_len = 0, bytes_xbzrle;
763 uint8_t *prev_cached_page;
764 QEMUFile *file = pss->pss_channel;
766 if (!cache_is_cached(XBZRLE.cache, current_addr,
767 ram_counters.dirty_sync_count)) {
768 xbzrle_counters.cache_miss++;
769 if (!rs->last_stage) {
770 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
771 ram_counters.dirty_sync_count) == -1) {
772 return -1;
773 } else {
774 /* update *current_data when the page has been
775 inserted into cache */
776 *current_data = get_cached_data(XBZRLE.cache, current_addr);
779 return -1;
783 * Reaching here means the page has hit the xbzrle cache, no matter what
784 * encoding result it is (normal encoding, overflow or skipping the page),
785 * count the page as encoded. This is used to calculate the encoding rate.
787 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
788 * 2nd page turns out to be skipped (i.e. no new bytes written to the
789 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
790 * skipped page included. In this way, the encoding rate can tell if the
791 * guest page is good for xbzrle encoding.
793 xbzrle_counters.pages++;
794 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
796 /* save current buffer into memory */
797 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
799 /* XBZRLE encoding (if there is no overflow) */
800 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
801 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
802 TARGET_PAGE_SIZE);
805 * Update the cache contents, so that it corresponds to the data
806 * sent, in all cases except where we skip the page.
808 if (!rs->last_stage && encoded_len != 0) {
809 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
811 * In the case where we couldn't compress, ensure that the caller
812 * sends the data from the cache, since the guest might have
813 * changed the RAM since we copied it.
815 *current_data = prev_cached_page;
818 if (encoded_len == 0) {
819 trace_save_xbzrle_page_skipping();
820 return 0;
821 } else if (encoded_len == -1) {
822 trace_save_xbzrle_page_overflow();
823 xbzrle_counters.overflow++;
824 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
825 return -1;
828 /* Send XBZRLE based compressed page */
829 bytes_xbzrle = save_page_header(pss, block,
830 offset | RAM_SAVE_FLAG_XBZRLE);
831 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
832 qemu_put_be16(file, encoded_len);
833 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
834 bytes_xbzrle += encoded_len + 1 + 2;
836 * Like compressed_size (please see update_compress_thread_counts),
837 * the xbzrle encoded bytes don't count the 8 byte header with
838 * RAM_SAVE_FLAG_CONTINUE.
840 xbzrle_counters.bytes += bytes_xbzrle - 8;
841 ram_transferred_add(bytes_xbzrle);
843 return 1;
847 * pss_find_next_dirty: find the next dirty page of current ramblock
849 * This function updates pss->page to point to the next dirty page index
850 * within the ramblock to migrate, or the end of ramblock when nothing
851 * is found. Note that when pss->host_page_sending==true it means we're
852 * in the middle of sending a host page, so we won't look for dirty pages
853 * outside the host page boundary.
855 * @pss: the current page search status
857 static void pss_find_next_dirty(PageSearchStatus *pss)
859 RAMBlock *rb = pss->block;
860 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
861 unsigned long *bitmap = rb->bmap;
863 if (ramblock_is_ignored(rb)) {
864 /* Points directly to the end, so we know no dirty page */
865 pss->page = size;
866 return;
870 * If during sending a host page, only look for dirty pages within the
871 * current host page being sent.
873 if (pss->host_page_sending) {
874 assert(pss->host_page_end);
875 size = MIN(size, pss->host_page_end);
878 pss->page = find_next_bit(bitmap, size, pss->page);
881 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
882 unsigned long page)
884 uint8_t shift;
885 hwaddr size, start;
887 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
888 return;
891 shift = rb->clear_bmap_shift;
893 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
894 * can make things easier sometimes since then the start address
895 * of the small chunk will always be aligned to 64 pages, so the
896 * bitmap will always be aligned to unsigned long. We should
897 * even be able to remove this restriction but I'm simply
898 * keeping it.
900 assert(shift >= 6);
902 size = 1ULL << (TARGET_PAGE_BITS + shift);
903 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
904 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
905 memory_region_clear_dirty_bitmap(rb->mr, start, size);
908 static void
909 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
910 unsigned long start,
911 unsigned long npages)
913 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
914 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
915 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
918 * Clear pages from start to start + npages - 1, so the end boundary is
919 * exclusive.
921 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
922 migration_clear_memory_region_dirty_bitmap(rb, i);
927 * colo_bitmap_find_dirty: find contiguous dirty pages from start
929 * Returns the page offset within the memory region of the start of the
930 * contiguous dirty pages
932 * @rs: current RAM state
933 * @rb: RAMBlock where to search for dirty pages
934 * @start: page where we start the search
935 * @num: the number of contiguous dirty pages
937 static inline
938 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
939 unsigned long start, unsigned long *num)
941 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
942 unsigned long *bitmap = rb->bmap;
943 unsigned long first, next;
945 *num = 0;
947 if (ramblock_is_ignored(rb)) {
948 return size;
951 first = find_next_bit(bitmap, size, start);
952 if (first >= size) {
953 return first;
955 next = find_next_zero_bit(bitmap, size, first + 1);
956 assert(next >= first);
957 *num = next - first;
958 return first;
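/*
 * migration_bitmap_clear_dirty: clear the dirty bit of one page in the
 * RAMBlock bitmap, clearing the backing memory-region dirty bitmap chunk
 * first if needed. Returns whether the page was dirty before clearing.
 */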
961 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
962 RAMBlock *rb,
963 unsigned long page)
965 bool ret;
968 * Clear dirty bitmap if needed. This _must_ be called before we
969 * send any of the pages in the chunk, because we need to make sure
970 * we can capture further page content changes when we sync the dirty
971 * log the next time. So as long as we are going to send any of
972 * the pages in the chunk we clear the remote dirty bitmap for all.
973 * Clearing it earlier won't be a problem, but clearing it too late will.
975 migration_clear_memory_region_dirty_bitmap(rb, page);
977 ret = test_and_clear_bit(page, rb->bmap);
978 if (ret) {
979 rs->migration_dirty_pages--;
982 return ret;
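/*
 * dirty_bitmap_clear_section: RamDiscardManager replay callback that drops
 * the dirty bits of one discarded section and accumulates the number of
 * cleared bits into *opaque.
 */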
985 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
986 void *opaque)
988 const hwaddr offset = section->offset_within_region;
989 const hwaddr size = int128_get64(section->size);
990 const unsigned long start = offset >> TARGET_PAGE_BITS;
991 const unsigned long npages = size >> TARGET_PAGE_BITS;
992 RAMBlock *rb = section->mr->ram_block;
993 uint64_t *cleared_bits = opaque;
996 * We don't grab ram_state->bitmap_mutex because we expect to run
997 * only when starting migration or during postcopy recovery where
998 * we don't have concurrent access.
1000 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1001 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1003 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1004 bitmap_clear(rb->bmap, start, npages);
1008 * Exclude all dirty pages from migration that fall into a discarded range as
1009 * managed by a RamDiscardManager responsible for the mapped memory region of
1010 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1012 * Discarded pages ("logically unplugged") have undefined content and must
1013 * not get migrated, because even reading these pages for migration might
1014 * result in undesired behavior.
1016 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1018 * Note: The result is only stable while migrating (precopy/postcopy).
1020 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1022 uint64_t cleared_bits = 0;
1024 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1025 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1026 MemoryRegionSection section = {
1027 .mr = rb->mr,
1028 .offset_within_region = 0,
1029 .size = int128_make64(qemu_ram_get_used_length(rb)),
1032 ram_discard_manager_replay_discarded(rdm, &section,
1033 dirty_bitmap_clear_section,
1034 &cleared_bits);
1036 return cleared_bits;
1040 * Check if a host-page aligned page falls into a discarded range as managed by
1041 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1043 * Note: The result is only stable while migrating (precopy/postcopy).
1045 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1047 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1048 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1049 MemoryRegionSection section = {
1050 .mr = rb->mr,
1051 .offset_within_region = start,
1052 .size = int128_make64(qemu_ram_pagesize(rb)),
1055 return !ram_discard_manager_is_populated(rdm, &section);
1057 return false;
1060 /* Called with RCU critical section */
1061 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1063 uint64_t new_dirty_pages =
1064 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1066 rs->migration_dirty_pages += new_dirty_pages;
1067 rs->num_dirty_pages_period += new_dirty_pages;
1071 * ram_pagesize_summary: calculate all the pagesizes of a VM
1073 * Returns a summary bitmap of the page sizes of all RAMBlocks
1075 * For VMs with just normal pages this is equivalent to the host page
1076 * size. If it has some huge pages then it's the OR of all the
1077 * different page sizes.
1079 uint64_t ram_pagesize_summary(void)
1081 RAMBlock *block;
1082 uint64_t summary = 0;
1084 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1085 summary |= block->page_size;
1088 return summary;
1091 uint64_t ram_get_total_transferred_pages(void)
1093 return stat64_get(&ram_atomic_counters.normal) +
1094 stat64_get(&ram_atomic_counters.duplicate) +
1095 compression_counters.pages + xbzrle_counters.pages;
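/*
 * migration_update_rates: recompute the per-period statistics (dirty page
 * rate, xbzrle cache-miss and encoding rates, compression rates) at the
 * end of a bitmap sync period.
 */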
1098 static void migration_update_rates(RAMState *rs, int64_t end_time)
1100 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1101 double compressed_size;
1103 /* calculate period counters */
1104 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1105 / (end_time - rs->time_last_bitmap_sync);
1107 if (!page_count) {
1108 return;
1111 if (migrate_use_xbzrle()) {
1112 double encoded_size, unencoded_size;
1114 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1115 rs->xbzrle_cache_miss_prev) / page_count;
1116 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1117 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1118 TARGET_PAGE_SIZE;
1119 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1120 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1121 xbzrle_counters.encoding_rate = 0;
1122 } else {
1123 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1125 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1126 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1129 if (migrate_use_compression()) {
1130 compression_counters.busy_rate = (double)(compression_counters.busy -
1131 rs->compress_thread_busy_prev) / page_count;
1132 rs->compress_thread_busy_prev = compression_counters.busy;
1134 compressed_size = compression_counters.compressed_size -
1135 rs->compressed_size_prev;
1136 if (compressed_size) {
1137 double uncompressed_size = (compression_counters.pages -
1138 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1140 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1141 compression_counters.compression_rate =
1142 uncompressed_size / compressed_size;
1144 rs->compress_pages_prev = compression_counters.pages;
1145 rs->compressed_size_prev = compression_counters.compressed_size;
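/*
 * migration_trigger_throttle: start or increase CPU throttling when the
 * guest dirties memory faster than we transfer it (auto-converge).
 */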
1150 static void migration_trigger_throttle(RAMState *rs)
1152 MigrationState *s = migrate_get_current();
1153 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1154 uint64_t bytes_xfer_period =
1155 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1156 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1157 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1159 /* During block migration the auto-converge logic incorrectly detects
1160 * that ram migration makes no progress. Avoid this by disabling the
1161 * throttling logic during the bulk phase of block migration. */
1162 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1163 /* The following detection logic can be refined later. For now:
1164 Check to see if the ratio between dirtied bytes and the approx.
1165 amount of bytes that just got transferred since the last time
1166 we were in this routine reaches the threshold. If that happens
1167 twice, start or increase throttling. */
1169 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1170 (++rs->dirty_rate_high_cnt >= 2)) {
1171 trace_migration_throttle();
1172 rs->dirty_rate_high_cnt = 0;
1173 mig_throttle_guest_down(bytes_dirty_period,
1174 bytes_dirty_threshold);
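/*
 * migration_bitmap_sync: synchronize the dirty log from the memory core
 * into the per-RAMBlock migration bitmaps and update the period counters.
 */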
1179 static void migration_bitmap_sync(RAMState *rs)
1181 RAMBlock *block;
1182 int64_t end_time;
1184 ram_counters.dirty_sync_count++;
1186 if (!rs->time_last_bitmap_sync) {
1187 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1190 trace_migration_bitmap_sync_start();
1191 memory_global_dirty_log_sync();
1193 qemu_mutex_lock(&rs->bitmap_mutex);
1194 WITH_RCU_READ_LOCK_GUARD() {
1195 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1196 ramblock_sync_dirty_bitmap(rs, block);
1198 ram_counters.remaining = ram_bytes_remaining();
1200 qemu_mutex_unlock(&rs->bitmap_mutex);
1202 memory_global_after_dirty_log_sync();
1203 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1205 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1207 /* more than 1 second = 1000 milliseconds */
1208 if (end_time > rs->time_last_bitmap_sync + 1000) {
1209 migration_trigger_throttle(rs);
1211 migration_update_rates(rs, end_time);
1213 rs->target_page_count_prev = rs->target_page_count;
1215 /* reset period counters */
1216 rs->time_last_bitmap_sync = end_time;
1217 rs->num_dirty_pages_period = 0;
1218 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
1220 if (migrate_use_events()) {
1221 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1225 static void migration_bitmap_sync_precopy(RAMState *rs)
1227 Error *local_err = NULL;
1230 * The current notifier usage is just an optimization for migration, so we
1231 * don't stop the normal migration process in the error case.
1233 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1234 error_report_err(local_err);
1235 local_err = NULL;
1238 migration_bitmap_sync(rs);
1240 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1241 error_report_err(local_err);
1245 void ram_release_page(const char *rbname, uint64_t offset)
1247 if (!migrate_release_ram() || !migration_in_postcopy()) {
1248 return;
1251 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1255 * save_zero_page_to_file: send the zero page to the file
1257 * Returns the size of the data written to the file; 0 means the page is
1258 * not a zero page
1260 * @pss: current PSS channel
1261 * @block: block that contains the page we want to send
1262 * @offset: offset inside the block for the page
1264 static int save_zero_page_to_file(PageSearchStatus *pss,
1265 RAMBlock *block, ram_addr_t offset)
1267 uint8_t *p = block->host + offset;
1268 QEMUFile *file = pss->pss_channel;
1269 int len = 0;
1271 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1272 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
1273 qemu_put_byte(file, 0);
1274 len += 1;
1275 ram_release_page(block->idstr, offset);
1277 return len;
1281 * save_zero_page: send the zero page to the stream
1283 * Returns the number of pages written.
1285 * @pss: current PSS channel
1286 * @block: block that contains the page we want to send
1287 * @offset: offset inside the block for the page
1289 static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
1290 ram_addr_t offset)
1292 int len = save_zero_page_to_file(pss, block, offset);
1294 if (len) {
1295 stat64_add(&ram_atomic_counters.duplicate, 1);
1296 ram_transferred_add(len);
1297 return 1;
1299 return -1;
1303 * @pages: the number of pages written by the control path,
1304 * < 0 - error
1305 * > 0 - number of pages written
1307 * Return true if the page has been saved, otherwise return false.
1309 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1310 ram_addr_t offset, int *pages)
1312 uint64_t bytes_xmit = 0;
1313 int ret;
1315 *pages = -1;
1316 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1317 TARGET_PAGE_SIZE, &bytes_xmit);
1318 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1319 return false;
1322 if (bytes_xmit) {
1323 ram_transferred_add(bytes_xmit);
1324 *pages = 1;
1327 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1328 return true;
1331 if (bytes_xmit > 0) {
1332 stat64_add(&ram_atomic_counters.normal, 1);
1333 } else if (bytes_xmit == 0) {
1334 stat64_add(&ram_atomic_counters.duplicate, 1);
1337 return true;
1341 * directly send the page to the stream
1343 * Returns the number of pages written.
1345 * @pss: current PSS channel
1346 * @block: block that contains the page we want to send
1347 * @offset: offset inside the block for the page
1348 * @buf: the page to be sent
1349 * @async: send the page asynchronously
1351 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1352 ram_addr_t offset, uint8_t *buf, bool async)
1354 QEMUFile *file = pss->pss_channel;
1356 ram_transferred_add(save_page_header(pss, block,
1357 offset | RAM_SAVE_FLAG_PAGE));
1358 if (async) {
1359 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1360 migrate_release_ram() &&
1361 migration_in_postcopy());
1362 } else {
1363 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1365 ram_transferred_add(TARGET_PAGE_SIZE);
1366 stat64_add(&ram_atomic_counters.normal, 1);
1367 return 1;
1371 * ram_save_page: send the given page to the stream
1373 * Returns the number of pages written.
1374 * < 0 - error
1375 * >=0 - Number of pages written - this might legally be 0
1376 * if xbzrle noticed the page was the same.
1378 * @rs: current RAM state
1379 * @block: block that contains the page we want to send
1380 * @offset: offset inside the block for the page
1382 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1384 int pages = -1;
1385 uint8_t *p;
1386 bool send_async = true;
1387 RAMBlock *block = pss->block;
1388 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1389 ram_addr_t current_addr = block->offset + offset;
1391 p = block->host + offset;
1392 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1394 XBZRLE_cache_lock();
1395 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1396 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1397 block, offset);
1398 if (!rs->last_stage) {
1399 /* Can't send this cached data async, since the cache page
1400 * might get updated before it gets to the wire
1402 send_async = false;
1406 /* XBZRLE overflow or normal page */
1407 if (pages == -1) {
1408 pages = save_normal_page(pss, block, offset, p, send_async);
1411 XBZRLE_cache_unlock();
1413 return pages;
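/*
 * ram_save_multifd_page: queue the page on a multifd channel instead of
 * sending it through the main migration stream. Returns 1 on success,
 * -1 on error.
 */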
1416 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1417 ram_addr_t offset)
1419 if (multifd_queue_page(file, block, offset) < 0) {
1420 return -1;
1422 stat64_add(&ram_atomic_counters.normal, 1);
1424 return 1;
1427 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1428 ram_addr_t offset, uint8_t *source_buf)
1430 RAMState *rs = ram_state;
1431 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1432 uint8_t *p = block->host + offset;
1433 int ret;
1435 if (save_zero_page_to_file(pss, block, offset)) {
1436 return true;
1439 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1442 * copy it to an internal buffer to avoid it being modified by the VM,
1443 * so that we can catch errors during compression and
1444 * decompression
1446 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1447 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1448 if (ret < 0) {
1449 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1450 error_report("compressed data failed!");
1452 return false;
1455 static void
1456 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1458 ram_transferred_add(bytes_xmit);
1460 if (param->zero_page) {
1461 stat64_add(&ram_atomic_counters.duplicate, 1);
1462 return;
1465 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1466 compression_counters.compressed_size += bytes_xmit - 8;
1467 compression_counters.pages++;
1470 static bool save_page_use_compression(RAMState *rs);
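/*
 * flush_compressed_data: wait for every compression thread to finish its
 * current page and push any buffered compressed data to the migration
 * stream.
 */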
1472 static void flush_compressed_data(RAMState *rs)
1474 MigrationState *ms = migrate_get_current();
1475 int idx, len, thread_count;
1477 if (!save_page_use_compression(rs)) {
1478 return;
1480 thread_count = migrate_compress_threads();
1482 qemu_mutex_lock(&comp_done_lock);
1483 for (idx = 0; idx < thread_count; idx++) {
1484 while (!comp_param[idx].done) {
1485 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1488 qemu_mutex_unlock(&comp_done_lock);
1490 for (idx = 0; idx < thread_count; idx++) {
1491 qemu_mutex_lock(&comp_param[idx].mutex);
1492 if (!comp_param[idx].quit) {
1493 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1495 * it's safe to fetch zero_page without holding comp_done_lock
1496 * as there is no further request submitted to the thread,
1497 * i.e., the thread should be waiting for a request at this point.
1499 update_compress_thread_counts(&comp_param[idx], len);
1501 qemu_mutex_unlock(&comp_param[idx].mutex);
1505 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1506 ram_addr_t offset)
1508 param->block = block;
1509 param->offset = offset;
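/*
 * compress_page_with_multi_thread: hand the page to an idle compression
 * thread, or wait for one when 'compress-wait-thread' is set. Returns 1
 * if the page was queued, -1 if the caller should send it as a normal page.
 */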
1512 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1514 int idx, thread_count, bytes_xmit = -1, pages = -1;
1515 bool wait = migrate_compress_wait_thread();
1516 MigrationState *ms = migrate_get_current();
1518 thread_count = migrate_compress_threads();
1519 qemu_mutex_lock(&comp_done_lock);
1520 retry:
1521 for (idx = 0; idx < thread_count; idx++) {
1522 if (comp_param[idx].done) {
1523 comp_param[idx].done = false;
1524 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1525 comp_param[idx].file);
1526 qemu_mutex_lock(&comp_param[idx].mutex);
1527 set_compress_params(&comp_param[idx], block, offset);
1528 qemu_cond_signal(&comp_param[idx].cond);
1529 qemu_mutex_unlock(&comp_param[idx].mutex);
1530 pages = 1;
1531 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1532 break;
1537 * wait for a free thread if the user specifies 'compress-wait-thread',
1538 * otherwise we will post the page out in the main thread as a normal page.
1540 if (pages < 0 && wait) {
1541 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542 goto retry;
1544 qemu_mutex_unlock(&comp_done_lock);
1546 return pages;
1550 * find_dirty_block: find the next dirty page and update any state
1551 * associated with the search process.
1553 * Returns true if a page is found
1555 * @rs: current RAM state
1556 * @pss: data about the state of the current dirty page scan
1557 * @again: set to false if the search has scanned the whole of RAM
1559 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1561 /* Update pss->page for the next dirty bit in ramblock */
1562 pss_find_next_dirty(pss);
1564 if (pss->complete_round && pss->block == rs->last_seen_block &&
1565 pss->page >= rs->last_page) {
1567 * We've been once around the RAM and haven't found anything.
1568 * Give up.
1570 *again = false;
1571 return false;
1573 if (!offset_in_ramblock(pss->block,
1574 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1575 /* Didn't find anything in this RAM Block */
1576 pss->page = 0;
1577 pss->block = QLIST_NEXT_RCU(pss->block, next);
1578 if (!pss->block) {
1580 * If memory migration starts over, we will meet a dirtied page
1581 * which may still exist in the compression threads' ring, so we
1582 * should flush the compressed data to make sure the new page
1583 * is not overwritten by the old one in the destination.
1585 * Also, if xbzrle is on, stop using data compression at this
1586 * point. In theory, xbzrle can do better than compression.
1588 flush_compressed_data(rs);
1590 /* Hit the end of the list */
1591 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1592 /* Flag that we've looped */
1593 pss->complete_round = true;
1594 /* After the first round, enable XBZRLE. */
1595 if (migrate_use_xbzrle()) {
1596 rs->xbzrle_enabled = true;
1599 /* Didn't find anything this time, but try again on the new block */
1600 *again = true;
1601 return false;
1602 } else {
1603 /* Can go around again, but... */
1604 *again = true;
1605 /* We've found something so probably don't need to */
1606 return true;
1611 * unqueue_page: gets a page off the queue
1613 * Helper for 'get_queued_page' - gets a page off the queue
1615 * Returns the block of the page (or NULL if none available)
1617 * @rs: current RAM state
1618 * @offset: used to return the offset within the RAMBlock
1620 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1622 struct RAMSrcPageRequest *entry;
1623 RAMBlock *block = NULL;
1625 if (!postcopy_has_request(rs)) {
1626 return NULL;
1629 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1632 * This should _never_ change even after we take the lock, because no one
1633 * should be taking anything off the request list other than us.
1635 assert(postcopy_has_request(rs));
1637 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1638 block = entry->rb;
1639 *offset = entry->offset;
1641 if (entry->len > TARGET_PAGE_SIZE) {
1642 entry->len -= TARGET_PAGE_SIZE;
1643 entry->offset += TARGET_PAGE_SIZE;
1644 } else {
1645 memory_region_unref(block->mr);
1646 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1647 g_free(entry);
1648 migration_consume_urgent_request();
1651 return block;
1654 #if defined(__linux__)
1656 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1657 * is found, return RAM block pointer and page offset
1659 * Returns pointer to the RAMBlock containing faulting page,
1660 * NULL if no write faults are pending
1662 * @rs: current RAM state
1663 * @offset: page offset from the beginning of the block
1665 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1667 struct uffd_msg uffd_msg;
1668 void *page_address;
1669 RAMBlock *block;
1670 int res;
1672 if (!migrate_background_snapshot()) {
1673 return NULL;
1676 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1677 if (res <= 0) {
1678 return NULL;
1681 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1682 block = qemu_ram_block_from_host(page_address, false, offset);
1683 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1684 return block;
1688 * ram_save_release_protection: release UFFD write protection after
1689 * a range of pages has been saved
1691 * @rs: current RAM state
1692 * @pss: page-search-status structure
1693 * @start_page: index of the first page in the range relative to pss->block
1695 * Returns 0 on success, negative value in case of an error
1697 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1698 unsigned long start_page)
1700 int res = 0;
1702 /* Check if page is from UFFD-managed region. */
1703 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1704 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1705 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1707 /* Flush async buffers before un-protect. */
1708 qemu_fflush(pss->pss_channel);
1709 /* Un-protect memory range. */
1710 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1711 false, false);
1714 return res;
1717 /* ram_write_tracking_available: check if kernel supports required UFFD features
1719 * Returns true if it does, false otherwise
1721 bool ram_write_tracking_available(void)
1723 uint64_t uffd_features;
1724 int res;
1726 res = uffd_query_features(&uffd_features);
1727 return (res == 0 &&
1728 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1731 /* ram_write_tracking_compatible: check if guest configuration is
1732 * compatible with 'write-tracking'
1734 * Returns true if compatible, false otherwise
1736 bool ram_write_tracking_compatible(void)
1738 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1739 int uffd_fd;
1740 RAMBlock *block;
1741 bool ret = false;
1743 /* Open UFFD file descriptor */
1744 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1745 if (uffd_fd < 0) {
1746 return false;
1749 RCU_READ_LOCK_GUARD();
1751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1752 uint64_t uffd_ioctls;
1754 /* Nothing to do with read-only and MMIO-writable regions */
1755 if (block->mr->readonly || block->mr->rom_device) {
1756 continue;
1758 /* Try to register block memory via UFFD-IO to track writes */
1759 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1760 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1761 goto out;
1763 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1764 goto out;
1767 ret = true;
1769 out:
1770 uffd_close_fd(uffd_fd);
1771 return ret;
1774 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1775 ram_addr_t size)
1778 * We read one byte of each page; this will preallocate page tables if
1779 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1780 * where no page was populated yet. This might require adaptation when
1781 * supporting other mappings, like shmem.
1783 for (; offset < size; offset += block->page_size) {
1784 char tmp = *((char *)block->host + offset);
1786 /* Don't optimize the read out */
1787 asm volatile("" : "+r" (tmp));
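/*
 * populate_read_section: RamDiscardManager replay callback that populates
 * one populated section of a RAMBlock via populate_read_range().
 */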
1791 static inline int populate_read_section(MemoryRegionSection *section,
1792 void *opaque)
1794 const hwaddr size = int128_get64(section->size);
1795 hwaddr offset = section->offset_within_region;
1796 RAMBlock *block = section->mr->ram_block;
1798 populate_read_range(block, offset, size);
1799 return 0;
1803 * ram_block_populate_read: preallocate page tables and populate pages in the
1804 * RAM block by reading a byte of each page.
1806 * Since it's solely used for the userfault_fd WP feature, here we just
1807 * hardcode the page size to qemu_real_host_page_size.
1809 * @block: RAM block to populate
1811 static void ram_block_populate_read(RAMBlock *rb)
1814 * Skip populating all pages that fall into a discarded range as managed by
1815 * a RamDiscardManager responsible for the mapped memory region of the
1816 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1817 * must not get populated automatically. We don't have to track
1818 * modifications via userfaultfd WP reliably, because these pages will
1819 * not be part of the migration stream either way -- see
1820 * ramblock_dirty_bitmap_exclude_discarded_pages().
1822 * Note: The result is only stable while migrating (precopy/postcopy).
1824 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1825 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1826 MemoryRegionSection section = {
1827 .mr = rb->mr,
1828 .offset_within_region = 0,
1829 .size = rb->mr->size,
1832 ram_discard_manager_replay_populated(rdm, &section,
1833 populate_read_section, NULL);
1834 } else {
1835 populate_read_range(rb, 0, rb->used_length);
1840 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1842 void ram_write_tracking_prepare(void)
1844 RAMBlock *block;
1846 RCU_READ_LOCK_GUARD();
1848 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849 /* Nothing to do with read-only and MMIO-writable regions */
1850 if (block->mr->readonly || block->mr->rom_device) {
1851 continue;
1855 * Populate pages of the RAM block before enabling userfault_fd
1856 * write protection.
1858 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1859 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1860 * pages with pte_none() entries in page table.
1862 ram_block_populate_read(block);
1867 * ram_write_tracking_start: start UFFD-WP memory tracking
1869 * Returns 0 for success or negative value in case of error
1871 int ram_write_tracking_start(void)
1873 int uffd_fd;
1874 RAMState *rs = ram_state;
1875 RAMBlock *block;
1877 /* Open UFFD file descriptor */
1878 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1879 if (uffd_fd < 0) {
1880 return uffd_fd;
1882 rs->uffdio_fd = uffd_fd;
1884 RCU_READ_LOCK_GUARD();
1886 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1887 /* Nothing to do with read-only and MMIO-writable regions */
1888 if (block->mr->readonly || block->mr->rom_device) {
1889 continue;
1892 /* Register block memory with UFFD to track writes */
1893 if (uffd_register_memory(rs->uffdio_fd, block->host,
1894 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1895 goto fail;
1897 /* Apply UFFD write protection to the block memory range */
1898 if (uffd_change_protection(rs->uffdio_fd, block->host,
1899 block->max_length, true, false)) {
1900 goto fail;
1902 block->flags |= RAM_UF_WRITEPROTECT;
1903 memory_region_ref(block->mr);
1905 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1906 block->host, block->max_length);
1909 return 0;
1911 fail:
1912 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1914 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1915 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1916 continue;
1919 * In case some memory block failed to be write-protected
1920 * remove protection and unregister all succeeded RAM blocks
1922 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1923 false, false);
1924 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1925 /* Cleanup flags and remove reference */
1926 block->flags &= ~RAM_UF_WRITEPROTECT;
1927 memory_region_unref(block->mr);
1930 uffd_close_fd(uffd_fd);
1931 rs->uffdio_fd = -1;
1932 return -1;
1936 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1938 void ram_write_tracking_stop(void)
1940 RAMState *rs = ram_state;
1941 RAMBlock *block;
1943 RCU_READ_LOCK_GUARD();
1945 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1946 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1947 continue;
1949 /* Remove protection and unregister all affected RAM blocks */
1950 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1951 false, false);
1952 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1954 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1955 block->host, block->max_length);
1957 /* Cleanup flags and remove reference */
1958 block->flags &= ~RAM_UF_WRITEPROTECT;
1959 memory_region_unref(block->mr);
1962 /* Finally close UFFD file descriptor */
1963 uffd_close_fd(rs->uffdio_fd);
1964 rs->uffdio_fd = -1;
1967 #else
1968 /* No target OS support, stubs just fail or ignore */
1970 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1972 (void) rs;
1973 (void) offset;
1975 return NULL;
1978 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1979 unsigned long start_page)
1981 (void) rs;
1982 (void) pss;
1983 (void) start_page;
1985 return 0;
1988 bool ram_write_tracking_available(void)
1990 return false;
1993 bool ram_write_tracking_compatible(void)
1995 assert(0);
1996 return false;
1999 int ram_write_tracking_start(void)
2001 assert(0);
2002 return -1;
2005 void ram_write_tracking_stop(void)
2007 assert(0);
2009 #endif /* defined(__linux__) */
2012 * get_queued_page: unqueue a page from the postcopy requests
2014 * Skips pages that are already sent (!dirty)
2016 * Returns true if a queued page is found
2018 * @rs: current RAM state
2019 * @pss: data about the state of the current dirty page scan
2021 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2023 RAMBlock *block;
2024 ram_addr_t offset;
2025 bool dirty;
2027 do {
2028 block = unqueue_page(rs, &offset);
2030 * We're sending this page, and since it's postcopy nothing else
2031 * will dirty it, and we must make sure it doesn't get sent again
2032 * even if this queue request was received after the background
2033 * search already sent it.
2035 if (block) {
2036 unsigned long page;
2038 page = offset >> TARGET_PAGE_BITS;
2039 dirty = test_bit(page, block->bmap);
2040 if (!dirty) {
2041 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2042 page);
2043 } else {
2044 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2048 } while (block && !dirty);
2050 if (!block) {
2052 * Poll write faults too if background snapshot is enabled; that's
2053 * when vCPUs get blocked by write-protected pages.
2055 block = poll_fault_page(rs, &offset);
2058 if (block) {
2060 * We want the background search to continue from the queued page
2061 * since the guest is likely to want other pages near to the page
2062 * it just requested.
2064 pss->block = block;
2065 pss->page = offset >> TARGET_PAGE_BITS;
2068 * This unqueued page would break the "one round" check, even if it
2069 * is really rare.
2071 pss->complete_round = false;
2074 return !!block;
2078 * migration_page_queue_free: drop any remaining pages in the ram
2079 * request queue
2081 * It should be empty at the end anyway, but in error cases there may
2082 * be some left; in that case, we drop them.
2085 static void migration_page_queue_free(RAMState *rs)
2087 struct RAMSrcPageRequest *mspr, *next_mspr;
2088 /* This queue should generally be empty - but in the case of a failed
2089 * migration it might have some entries left over.
2091 RCU_READ_LOCK_GUARD();
2092 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2093 memory_region_unref(mspr->rb->mr);
2094 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2095 g_free(mspr);
2100 * ram_save_queue_pages: queue the page for transmission
2102 * A request from postcopy destination for example.
2104 * Returns zero on success or negative on error
2106 * @rbname: Name of the RAMBlock of the request. NULL means the
2107 * same as the last one.
2108 * @start: starting address from the start of the RAMBlock
2109 * @len: length (in bytes) to send
2111 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2113 RAMBlock *ramblock;
2114 RAMState *rs = ram_state;
2116 ram_counters.postcopy_requests++;
2117 RCU_READ_LOCK_GUARD();
2119 if (!rbname) {
2120 /* Reuse last RAMBlock */
2121 ramblock = rs->last_req_rb;
2123 if (!ramblock) {
2125 * Shouldn't happen, we can't reuse the last RAMBlock if
2126 * it's the 1st request.
2128 error_report("ram_save_queue_pages no previous block");
2129 return -1;
2131 } else {
2132 ramblock = qemu_ram_block_by_name(rbname);
2134 if (!ramblock) {
2135 /* We shouldn't be asked for a non-existent RAMBlock */
2136 error_report("ram_save_queue_pages no block '%s'", rbname);
2137 return -1;
2139 rs->last_req_rb = ramblock;
2141 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2142 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2143 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2144 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2145 __func__, start, len, ramblock->used_length);
2146 return -1;
2150 * When with postcopy preempt, we send back the page directly in the
2151 * rp-return thread.
2153 if (postcopy_preempt_active()) {
2154 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2155 size_t page_size = qemu_ram_pagesize(ramblock);
2156 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2157 int ret = 0;
2159 qemu_mutex_lock(&rs->bitmap_mutex);
2161 pss_init(pss, ramblock, page_start);
2163 * Always use the preempt channel, and make sure it's there. It's
2164 * safe to access without lock, because when rp-thread is running
2165 * we should be the only one who operates on the qemufile
2167 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2168 assert(pss->pss_channel);
2171 * It must be one host page or a multiple of the host page size. Just
2172 * assert; if something is wrong we're mostly split-brain anyway.
2174 assert(len % page_size == 0);
2175 while (len) {
2176 if (ram_save_host_page_urgent(pss)) {
2177 error_report("%s: ram_save_host_page_urgent() failed: "
2178 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2179 __func__, ramblock->idstr, start);
2180 ret = -1;
2181 break;
2184 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2185 * will automatically be moved and point to the next host page
2186 * we're going to send, so no need to update here.
2188 * Normally QEMU never sends >1 host page in requests, so
2189 * logically we don't even need that as the loop should only
2190 * run once, but just to be consistent.
2192 len -= page_size;
2194 qemu_mutex_unlock(&rs->bitmap_mutex);
2196 return ret;
2199 struct RAMSrcPageRequest *new_entry =
2200 g_new0(struct RAMSrcPageRequest, 1);
2201 new_entry->rb = ramblock;
2202 new_entry->offset = start;
2203 new_entry->len = len;
2205 memory_region_ref(ramblock->mr);
2206 qemu_mutex_lock(&rs->src_page_req_mutex);
2207 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2208 migration_make_urgent_request();
2209 qemu_mutex_unlock(&rs->src_page_req_mutex);
2211 return 0;
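/*
 * Usage sketch (hypothetical values): when the destination faults on a
 * page of RAMBlock "pc.ram" during postcopy, the source's return-path
 * thread ends up doing something equivalent to
 *
 *     ram_save_queue_pages("pc.ram", host_page_aligned_offset,
 *                          qemu_ram_pagesize(ramblock));
 *
 * i.e. roughly one host page per request.  Without preemption the page
 * is queued and later picked up by get_queued_page(); with preemption it
 * is sent right away on the postcopy channel as shown above.
 */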
2214 static bool save_page_use_compression(RAMState *rs)
2216 if (!migrate_use_compression()) {
2217 return false;
2221 * If xbzrle is enabled (e.g., after first round of migration), stop
2222 * using the data compression. In theory, xbzrle can do better than
2223 * compression.
2225 if (rs->xbzrle_enabled) {
2226 return false;
2229 return true;
2233 * try to compress the page before posting it out, return true if the page
2234 * has been properly handled by compression, otherwise needs other
2235 * paths to handle it
2237 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2238 RAMBlock *block, ram_addr_t offset)
2240 if (!save_page_use_compression(rs)) {
2241 return false;
2245 * When starting the process of a new block, the first page of
2246 * the block should be sent out before other pages in the same
2247 * block, and all the pages in the last block should have been sent
2248 * out. Keeping this order is important, because the 'cont' flag
2249 * is used to avoid resending the block name.
2251 * We post the first page as a normal page because compression takes
2252 * a lot of CPU resources.
2254 if (block != pss->last_sent_block) {
2255 flush_compressed_data(rs);
2256 return false;
2259 if (compress_page_with_multi_thread(block, offset) > 0) {
2260 return true;
2263 compression_counters.busy++;
2264 return false;
2268 * ram_save_target_page: save one target page
2270 * Returns the number of pages written
2272 * @rs: current RAM state
2273 * @pss: data about the page we want to send
2275 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2277 RAMBlock *block = pss->block;
2278 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2279 int res;
2281 if (control_save_page(pss, block, offset, &res)) {
2282 return res;
2285 if (save_compress_page(rs, pss, block, offset)) {
2286 return 1;
2289 res = save_zero_page(pss, block, offset);
2290 if (res > 0) {
2291 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2292 * page would be stale
2294 if (rs->xbzrle_enabled) {
2295 XBZRLE_cache_lock();
2296 xbzrle_cache_zero_page(rs, block->offset + offset);
2297 XBZRLE_cache_unlock();
2299 return res;
2303 * Do not use multifd in postcopy as one whole host page should be
2304 * placed at a time. Postcopy requires atomic updates of pages, so even
2305 * if host page size == guest page size the running destination guest
2306 * may still see partially copied pages, which is data corruption.
2308 if (migrate_use_multifd() && !migration_in_postcopy()) {
2309 return ram_save_multifd_page(pss->pss_channel, block, offset);
2312 return ram_save_page(rs, pss);
2315 /* Should be called before sending a host page */
2316 static void pss_host_page_prepare(PageSearchStatus *pss)
2318 /* How many guest pages are there in one host page? */
2319 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2321 pss->host_page_sending = true;
2322 if (guest_pfns <= 1) {
2324 * This covers both when guest psize == host psize, or when guest
2325 * has larger psize than the host (guest_pfns==0).
2327 * For the latter, we always send one whole guest page per
2328 * iteration of the host page (example: an Alpha VM on x86 host
2329 * will have guest psize 8K while host psize 4K).
2331 pss->host_page_start = pss->page;
2332 pss->host_page_end = pss->page + 1;
2333 } else {
2335 * The host page spans over multiple guest pages, we send them
2336 * within the same host page iteration.
2338 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2339 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
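/*
 * Worked example: with a 2 MiB hugepage-backed block and 4 KiB target
 * pages, guest_pfns = 512.  For pss->page = 1000 this gives
 *
 *     host_page_start = ROUND_DOWN(1000, 512) = 512
 *     host_page_end   = ROUND_UP(1001, 512)   = 1024
 *
 * so the sending loop covers target pages [512, 1024), i.e. exactly one
 * host page.
 */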
2344 * Whether the page pointed by PSS is within the host page being sent.
2345 * Must be called after a previous pss_host_page_prepare().
2347 static bool pss_within_range(PageSearchStatus *pss)
2349 ram_addr_t ram_addr;
2351 assert(pss->host_page_sending);
2353 /* Over host-page boundary? */
2354 if (pss->page >= pss->host_page_end) {
2355 return false;
2358 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2360 return offset_in_ramblock(pss->block, ram_addr);
2363 static void pss_host_page_finish(PageSearchStatus *pss)
2365 pss->host_page_sending = false;
2366 /* This is not needed, but just to reset it */
2367 pss->host_page_start = pss->host_page_end = 0;
2371 * Send an urgent host page specified by `pss'. Must be called with
2372 * bitmap_mutex held.
2374 * Returns 0 if saving the host page succeeded, negative otherwise.
2376 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2378 bool page_dirty, sent = false;
2379 RAMState *rs = ram_state;
2380 int ret = 0;
2382 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2383 pss_host_page_prepare(pss);
2386 * If precopy is sending the same page, let it be done in precopy, or
2387 * we could send the same page in two channels and none of them will
2388 * receive the whole page.
2390 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2391 trace_postcopy_preempt_hit(pss->block->idstr,
2392 pss->page << TARGET_PAGE_BITS);
2393 return 0;
2396 do {
2397 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2399 if (page_dirty) {
2400 /* Be strict about the return code; it must be exactly 1 (one page saved) */
2401 if (ram_save_target_page(rs, pss) != 1) {
2402 error_report_once("%s: ram_save_target_page failed", __func__);
2403 ret = -1;
2404 goto out;
2406 sent = true;
2408 pss_find_next_dirty(pss);
2409 } while (pss_within_range(pss));
2410 out:
2411 pss_host_page_finish(pss);
2412 /* For urgent requests, flush immediately if sent */
2413 if (sent) {
2414 qemu_fflush(pss->pss_channel);
2416 return ret;
2420 * ram_save_host_page: save a whole host page
2422 * Starting at *offset send pages up to the end of the current host
2423 * page. It's valid for the initial offset to point into the middle of
2424 * a host page in which case the remainder of the hostpage is sent.
2425 * Only dirty target pages are sent. Note that the host page size may
2426 * be a huge page for this block.
2428 * The saving stops at the boundary of the used_length of the block
2429 * if the RAMBlock isn't a multiple of the host page size.
2431 * The caller must hold ram_state.bitmap_mutex when calling this
2432 * function. Note that this function can temporarily release the lock, but
2433 * it makes sure the lock is held again before it returns.
2435 * Returns the number of pages written or negative on error
2437 * @rs: current RAM state
2438 * @pss: data about the page we want to send
2440 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2442 bool page_dirty, preempt_active = postcopy_preempt_active();
2443 int tmppages, pages = 0;
2444 size_t pagesize_bits =
2445 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2446 unsigned long start_page = pss->page;
2447 int res;
2449 if (ramblock_is_ignored(pss->block)) {
2450 error_report("block %s should not be migrated !", pss->block->idstr);
2451 return 0;
2454 /* Update host page boundary information */
2455 pss_host_page_prepare(pss);
2457 do {
2458 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2460 /* Check if the page is dirty and, if so, send it */
2461 if (page_dirty) {
2463 * Properly yield the lock only in postcopy preempt mode
2464 * because both migration thread and rp-return thread can
2465 * operate on the bitmaps.
2467 if (preempt_active) {
2468 qemu_mutex_unlock(&rs->bitmap_mutex);
2470 tmppages = ram_save_target_page(rs, pss);
2471 if (tmppages >= 0) {
2472 pages += tmppages;
2474 * Allow rate limiting to happen in the middle of huge pages if
2475 * something is sent in the current iteration.
2477 if (pagesize_bits > 1 && tmppages > 0) {
2478 migration_rate_limit();
2481 if (preempt_active) {
2482 qemu_mutex_lock(&rs->bitmap_mutex);
2484 } else {
2485 tmppages = 0;
2488 if (tmppages < 0) {
2489 pss_host_page_finish(pss);
2490 return tmppages;
2493 pss_find_next_dirty(pss);
2494 } while (pss_within_range(pss));
2496 pss_host_page_finish(pss);
2498 res = ram_save_release_protection(rs, pss, start_page);
2499 return (res < 0 ? res : pages);
2503 * ram_find_and_save_block: finds a dirty page and sends it to f
2505 * Called within an RCU critical section.
2507 * Returns the number of pages written where zero means no dirty pages,
2508 * or negative on error
2510 * @rs: current RAM state
2512 * On systems where host-page-size > target-page-size it will send all the
2513 * pages in a host page that are dirty.
2515 static int ram_find_and_save_block(RAMState *rs)
2517 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2518 int pages = 0;
2519 bool again, found;
2521 /* No dirty page as there is zero RAM */
2522 if (!ram_bytes_total()) {
2523 return pages;
2527 * Always keep last_seen_block/last_page valid during this procedure,
2528 * because find_dirty_block() relies on these values (e.g., we compare
2529 * last_seen_block with pss.block to see whether we searched all the
2530 * ramblocks) to detect the completion of migration. Having a NULL
2531 * last_seen_block can cause the loop below to run forever.
2533 if (!rs->last_seen_block) {
2534 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2535 rs->last_page = 0;
2538 pss_init(pss, rs->last_seen_block, rs->last_page);
2540 do {
2541 again = true;
2542 found = get_queued_page(rs, pss);
2544 if (!found) {
2545 /* priority queue empty, so just search for something dirty */
2546 found = find_dirty_block(rs, pss, &again);
2549 if (found) {
2550 pages = ram_save_host_page(rs, pss);
2552 } while (!pages && again);
2554 rs->last_seen_block = pss->block;
2555 rs->last_page = pss->page;
2557 return pages;
2560 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2562 uint64_t pages = size / TARGET_PAGE_SIZE;
2564 if (zero) {
2565 stat64_add(&ram_atomic_counters.duplicate, pages);
2566 } else {
2567 stat64_add(&ram_atomic_counters.normal, pages);
2568 ram_transferred_add(size);
2569 qemu_file_credit_transfer(f, size);
2573 static uint64_t ram_bytes_total_common(bool count_ignored)
2575 RAMBlock *block;
2576 uint64_t total = 0;
2578 RCU_READ_LOCK_GUARD();
2580 if (count_ignored) {
2581 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2582 total += block->used_length;
2584 } else {
2585 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2586 total += block->used_length;
2589 return total;
2592 uint64_t ram_bytes_total(void)
2594 return ram_bytes_total_common(false);
2597 static void xbzrle_load_setup(void)
2599 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2602 static void xbzrle_load_cleanup(void)
2604 g_free(XBZRLE.decoded_buf);
2605 XBZRLE.decoded_buf = NULL;
2608 static void ram_state_cleanup(RAMState **rsp)
2610 if (*rsp) {
2611 migration_page_queue_free(*rsp);
2612 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2613 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2614 g_free(*rsp);
2615 *rsp = NULL;
2619 static void xbzrle_cleanup(void)
2621 XBZRLE_cache_lock();
2622 if (XBZRLE.cache) {
2623 cache_fini(XBZRLE.cache);
2624 g_free(XBZRLE.encoded_buf);
2625 g_free(XBZRLE.current_buf);
2626 g_free(XBZRLE.zero_target_page);
2627 XBZRLE.cache = NULL;
2628 XBZRLE.encoded_buf = NULL;
2629 XBZRLE.current_buf = NULL;
2630 XBZRLE.zero_target_page = NULL;
2632 XBZRLE_cache_unlock();
2635 static void ram_save_cleanup(void *opaque)
2637 RAMState **rsp = opaque;
2638 RAMBlock *block;
2640 /* We don't use dirty log with background snapshots */
2641 if (!migrate_background_snapshot()) {
2642 /* The caller holds the iothread lock or is in a BH, so there is
2643 * no write race against the migration bitmap.
2645 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2647 * do not stop dirty log without starting it, since
2648 * memory_global_dirty_log_stop will assert that
2649 * memory_global_dirty_log_start/stop are used in pairs
2651 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2655 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2656 g_free(block->clear_bmap);
2657 block->clear_bmap = NULL;
2658 g_free(block->bmap);
2659 block->bmap = NULL;
2662 xbzrle_cleanup();
2663 compress_threads_save_cleanup();
2664 ram_state_cleanup(rsp);
2667 static void ram_state_reset(RAMState *rs)
2669 int i;
2671 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2672 rs->pss[i].last_sent_block = NULL;
2675 rs->last_seen_block = NULL;
2676 rs->last_page = 0;
2677 rs->last_version = ram_list.version;
2678 rs->xbzrle_enabled = false;
2681 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2683 /* **** functions for postcopy ***** */
2685 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2687 struct RAMBlock *block;
2689 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2690 unsigned long *bitmap = block->bmap;
2691 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2692 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2694 while (run_start < range) {
2695 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2696 ram_discard_range(block->idstr,
2697 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2698 ((ram_addr_t)(run_end - run_start))
2699 << TARGET_PAGE_BITS);
2700 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2706 * postcopy_send_discard_bm_ram: discard a RAMBlock
2708 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2710 * @ms: current migration state
2711 * @block: RAMBlock to discard
2713 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2715 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2716 unsigned long current;
2717 unsigned long *bitmap = block->bmap;
2719 for (current = 0; current < end; ) {
2720 unsigned long one = find_next_bit(bitmap, end, current);
2721 unsigned long zero, discard_length;
2723 if (one >= end) {
2724 break;
2727 zero = find_next_zero_bit(bitmap, end, one + 1);
2729 if (zero >= end) {
2730 discard_length = end - one;
2731 } else {
2732 discard_length = zero - one;
2734 postcopy_discard_send_range(ms, one, discard_length);
2735 current = one + discard_length;
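/*
 * Worked example: for a bitmap 0b00110110 (bit 0 == page 0), the loop
 * above emits postcopy_discard_send_range(ms, 1, 2) for pages 1-2 and
 * postcopy_discard_send_range(ms, 4, 2) for pages 4-5; every maximal run
 * of set (dirty) bits becomes one (start, length) discard range.
 */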
2739 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
2742 * postcopy_each_ram_send_discard: discard all RAMBlocks
2744 * Utility for the outgoing postcopy code.
2745 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2746 * passing it bitmap indexes and name.
2747 * (qemu_ram_foreach_block ends up passing unscaled lengths
2748 * which would mean postcopy code would have to deal with target page)
2750 * @ms: current migration state
2752 static void postcopy_each_ram_send_discard(MigrationState *ms)
2754 struct RAMBlock *block;
2756 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2757 postcopy_discard_send_init(ms, block->idstr);
2760 * Deal with TPS != HPS and huge pages. It discards any partially sent
2761 * host-page size chunks and marks any partially dirty host-page size
2762 * chunks as all dirty. In this case the host page is the host page
2763 * for the particular RAMBlock, i.e. it might be a huge page.
2765 postcopy_chunk_hostpages_pass(ms, block);
2768 * Postcopy sends chunks of bitmap over the wire, but it
2769 * just needs indexes at this point, which avoids it having
2770 * target-page-specific code.
2772 postcopy_send_discard_bm_ram(ms, block);
2773 postcopy_discard_send_finish(ms);
2778 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2780 * Helper for postcopy_chunk_hostpages; it's called twice to
2781 * canonicalize the two bitmaps, that are similar, but one is
2782 * inverted.
2784 * Postcopy requires that all target pages in a hostpage are dirty or
2785 * clean, not a mix. This function canonicalizes the bitmaps.
2787 * @ms: current migration state
2788 * @block: block that contains the page we want to canonicalize
2790 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2792 RAMState *rs = ram_state;
2793 unsigned long *bitmap = block->bmap;
2794 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2795 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2796 unsigned long run_start;
2798 if (block->page_size == TARGET_PAGE_SIZE) {
2799 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2800 return;
2803 /* Find a dirty page */
2804 run_start = find_next_bit(bitmap, pages, 0);
2806 while (run_start < pages) {
2809 * If the start of this run of pages is in the middle of a host
2810 * page, then we need to fixup this host page.
2812 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2813 /* Find the end of this run */
2814 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2816 * If the end isn't at the start of a host page, then the
2817 * run doesn't finish at the end of a host page
2818 * and we need to discard.
2822 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2823 unsigned long page;
2824 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2825 host_ratio);
2826 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2828 /* Clean up the bitmap */
2829 for (page = fixup_start_addr;
2830 page < fixup_start_addr + host_ratio; page++) {
2832 * Remark them as dirty, updating the count for any pages
2833 * that weren't previously dirty.
2835 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2839 /* Find the next dirty page for the next iteration */
2840 run_start = find_next_bit(bitmap, pages, run_start);
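/*
 * Worked example (host_ratio = 4, i.e. 16 KiB host pages with 4 KiB
 * target pages): if only target page 5 is dirty, run_start = 5 is not
 * host-page aligned, so pages 4-7 all get marked dirty.  After this pass
 * every host page is either fully dirty or fully clean, which is what
 * the discard logic above relies on.
 */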
2845 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2847 * Transmit the set of pages to be discarded after precopy to the target;
2848 * these are pages that:
2849 * a) Have been previously transmitted but are now dirty again
2850 * b) Pages that have never been transmitted, this ensures that
2851 * any pages on the destination that have been mapped by background
2852 * tasks get discarded (transparent huge pages is the specific concern)
2853 * Hopefully this is pretty sparse
2855 * @ms: current migration state
2857 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
2859 RAMState *rs = ram_state;
2861 RCU_READ_LOCK_GUARD();
2863 /* This should be our last sync, the src is now paused */
2864 migration_bitmap_sync(rs);
2866 /* Easiest way to make sure we don't resume in the middle of a host-page */
2867 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
2868 rs->last_seen_block = NULL;
2869 rs->last_page = 0;
2871 postcopy_each_ram_send_discard(ms);
2873 trace_ram_postcopy_send_discard_bitmap();
2877 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2879 * Returns zero on success
2881 * @rbname: name of the RAMBlock of the request. NULL means the
2882 * same as the last one.
2883 * @start: RAMBlock starting page
2884 * @length: RAMBlock size
2886 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2888 trace_ram_discard_range(rbname, start, length);
2890 RCU_READ_LOCK_GUARD();
2891 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2893 if (!rb) {
2894 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2895 return -1;
2899 * On source VM, we don't need to update the received bitmap since
2900 * we don't even have one.
2902 if (rb->receivedmap) {
2903 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2904 length >> qemu_target_page_bits());
2907 return ram_block_discard_range(rb, start, length);
2911 * For every allocation, we try not to crash the VM if the
2912 * allocation fails.
2914 static int xbzrle_init(void)
2916 Error *local_err = NULL;
2918 if (!migrate_use_xbzrle()) {
2919 return 0;
2922 XBZRLE_cache_lock();
2924 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2925 if (!XBZRLE.zero_target_page) {
2926 error_report("%s: Error allocating zero page", __func__);
2927 goto err_out;
2930 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2931 TARGET_PAGE_SIZE, &local_err);
2932 if (!XBZRLE.cache) {
2933 error_report_err(local_err);
2934 goto free_zero_page;
2937 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2938 if (!XBZRLE.encoded_buf) {
2939 error_report("%s: Error allocating encoded_buf", __func__);
2940 goto free_cache;
2943 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2944 if (!XBZRLE.current_buf) {
2945 error_report("%s: Error allocating current_buf", __func__);
2946 goto free_encoded_buf;
2949 /* We are all good */
2950 XBZRLE_cache_unlock();
2951 return 0;
2953 free_encoded_buf:
2954 g_free(XBZRLE.encoded_buf);
2955 XBZRLE.encoded_buf = NULL;
2956 free_cache:
2957 cache_fini(XBZRLE.cache);
2958 XBZRLE.cache = NULL;
2959 free_zero_page:
2960 g_free(XBZRLE.zero_target_page);
2961 XBZRLE.zero_target_page = NULL;
2962 err_out:
2963 XBZRLE_cache_unlock();
2964 return -ENOMEM;
2967 static int ram_state_init(RAMState **rsp)
2969 *rsp = g_try_new0(RAMState, 1);
2971 if (!*rsp) {
2972 error_report("%s: Init ramstate fail", __func__);
2973 return -1;
2976 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2977 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2978 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2981 * Count the total number of pages used by ram blocks not including any
2982 * gaps due to alignment or unplugs.
2983 * This must match with the initial values of dirty bitmap.
2985 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2986 ram_state_reset(*rsp);
2988 return 0;
2991 static void ram_list_init_bitmaps(void)
2993 MigrationState *ms = migrate_get_current();
2994 RAMBlock *block;
2995 unsigned long pages;
2996 uint8_t shift;
2998 /* Skip setting bitmap if there is no RAM */
2999 if (ram_bytes_total()) {
3000 shift = ms->clear_bitmap_shift;
3001 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3002 error_report("clear_bitmap_shift (%u) too big, using "
3003 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3004 shift = CLEAR_BITMAP_SHIFT_MAX;
3005 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3006 error_report("clear_bitmap_shift (%u) too small, using "
3007 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3008 shift = CLEAR_BITMAP_SHIFT_MIN;
3011 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3012 pages = block->max_length >> TARGET_PAGE_BITS;
3014 * The initial dirty bitmap for migration must be set with all
3015 * ones to make sure we'll migrate every guest RAM page to
3016 * destination.
3017 * Here we set RAMBlock.bmap all to 1 because when we restart a
3018 * new migration after a failed one, ram_list.
3019 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
3020 * guest memory.
3022 block->bmap = bitmap_new(pages);
3023 bitmap_set(block->bmap, 0, pages);
3024 block->clear_bmap_shift = shift;
3025 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
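/*
 * Sizing sketch (assuming 4 KiB target pages and, hypothetically, the
 * default clear_bitmap_shift of 18): a 4 GiB RAMBlock has 1M target
 * pages, so block->bmap needs 1M bits (128 KiB), while block->clear_bmap
 * needs only 1M >> 18 = 4 bits -- one per 1 GiB chunk whose dirty log
 * clearing is deferred until that chunk is first sent.
 */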
3030 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3032 unsigned long pages;
3033 RAMBlock *rb;
3035 RCU_READ_LOCK_GUARD();
3037 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3038 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3039 rs->migration_dirty_pages -= pages;
3043 static void ram_init_bitmaps(RAMState *rs)
3045 /* For memory_global_dirty_log_start below. */
3046 qemu_mutex_lock_iothread();
3047 qemu_mutex_lock_ramlist();
3049 WITH_RCU_READ_LOCK_GUARD() {
3050 ram_list_init_bitmaps();
3051 /* We don't use dirty log with background snapshots */
3052 if (!migrate_background_snapshot()) {
3053 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3054 migration_bitmap_sync_precopy(rs);
3057 qemu_mutex_unlock_ramlist();
3058 qemu_mutex_unlock_iothread();
3061 * After an eventual first bitmap sync, fixup the initial bitmap
3062 * containing all 1s to exclude any discarded pages from migration.
3064 migration_bitmap_clear_discarded_pages(rs);
3067 static int ram_init_all(RAMState **rsp)
3069 if (ram_state_init(rsp)) {
3070 return -1;
3073 if (xbzrle_init()) {
3074 ram_state_cleanup(rsp);
3075 return -1;
3078 ram_init_bitmaps(*rsp);
3080 return 0;
3083 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3085 RAMBlock *block;
3086 uint64_t pages = 0;
3089 * Postcopy is not using xbzrle/compression, so no need for that.
3090 * Also, since the source is already halted, we don't need to care
3091 * about dirty page logging either.
3094 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3095 pages += bitmap_count_one(block->bmap,
3096 block->used_length >> TARGET_PAGE_BITS);
3099 /* This may not be aligned with current bitmaps. Recalculate. */
3100 rs->migration_dirty_pages = pages;
3102 ram_state_reset(rs);
3104 /* Update RAMState cache of output QEMUFile */
3105 rs->pss[RAM_CHANNEL_PRECOPY].pss_channel = out;
3107 trace_ram_state_resume_prepare(pages);
3111 * This function clears bits of the free pages reported by the caller from the
3112 * migration dirty bitmap. @addr is the host address corresponding to the
3113 * start of the continuous guest free pages, and @len is the total bytes of
3114 * those pages.
3116 void qemu_guest_free_page_hint(void *addr, size_t len)
3118 RAMBlock *block;
3119 ram_addr_t offset;
3120 size_t used_len, start, npages;
3121 MigrationState *s = migrate_get_current();
3123 /* This function is currently expected to be used during live migration */
3124 if (!migration_is_setup_or_active(s->state)) {
3125 return;
3128 for (; len > 0; len -= used_len, addr += used_len) {
3129 block = qemu_ram_block_from_host(addr, false, &offset);
3130 if (unlikely(!block || offset >= block->used_length)) {
3132 * The implementation might not support RAMBlock resize during
3133 * live migration, but it could happen in theory with future
3134 * updates. So we add a check here to capture that case.
3136 error_report_once("%s unexpected error", __func__);
3137 return;
3140 if (len <= block->used_length - offset) {
3141 used_len = len;
3142 } else {
3143 used_len = block->used_length - offset;
3146 start = offset >> TARGET_PAGE_BITS;
3147 npages = used_len >> TARGET_PAGE_BITS;
3149 qemu_mutex_lock(&ram_state->bitmap_mutex);
3151 * The skipped free pages are equivalent to having been sent, from
3152 * clear_bmap's perspective, so clear the bits from the memory region
3153 * bitmap which are initially set. Otherwise those skipped pages will be
3154 * sent in the next round after syncing from the memory region bitmap.
3156 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3157 ram_state->migration_dirty_pages -=
3158 bitmap_count_one_with_offset(block->bmap, start, npages);
3159 bitmap_clear(block->bmap, start, npages);
3160 qemu_mutex_unlock(&ram_state->bitmap_mutex);
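/*
 * Worked example (4 KiB target pages, hypothetical values): a hint with
 * addr = block->host + 0x20000 and len = 0x8000 gives offset = 0x20000,
 * start = 0x20 and npages = 8, so bits 32..39 are cleared from both the
 * memory region bitmap and block->bmap, and migration_dirty_pages drops
 * by however many of those bits were still set.
 */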
3165 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3166 * long-running RCU critical section. When RCU reclaims in the code
3167 * start to become numerous it will be necessary to reduce the
3168 * granularity of these critical sections.
3172 * ram_save_setup: Setup RAM for migration
3174 * Returns zero to indicate success and negative for error
3176 * @f: QEMUFile where to send the data
3177 * @opaque: RAMState pointer
3179 static int ram_save_setup(QEMUFile *f, void *opaque)
3181 RAMState **rsp = opaque;
3182 RAMBlock *block;
3183 int ret;
3185 if (compress_threads_save_setup()) {
3186 return -1;
3189 /* migration has already setup the bitmap, reuse it. */
3190 if (!migration_in_colo_state()) {
3191 if (ram_init_all(rsp) != 0) {
3192 compress_threads_save_cleanup();
3193 return -1;
3196 (*rsp)->pss[RAM_CHANNEL_PRECOPY].pss_channel = f;
3198 WITH_RCU_READ_LOCK_GUARD() {
3199 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3201 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3202 qemu_put_byte(f, strlen(block->idstr));
3203 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3204 qemu_put_be64(f, block->used_length);
3205 if (migrate_postcopy_ram() && block->page_size !=
3206 qemu_host_page_size) {
3207 qemu_put_be64(f, block->page_size);
3209 if (migrate_ignore_shared()) {
3210 qemu_put_be64(f, block->mr->addr);
3215 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3216 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3218 ret = multifd_send_sync_main(f);
3219 if (ret < 0) {
3220 return ret;
3223 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3224 qemu_fflush(f);
3226 return 0;
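/*
 * On-the-wire sketch of the setup section produced above (illustrative,
 * all integers big-endian):
 *
 *     be64   total ram size | RAM_SAVE_FLAG_MEM_SIZE
 *     for each migratable block:
 *         u8     strlen(idstr)
 *         byte[] idstr (not NUL terminated)
 *         be64   used_length
 *         [be64  page_size]   only with postcopy-ram and non-host page size
 *         [be64  mr->addr]    only with x-ignore-shared
 *     be64   RAM_SAVE_FLAG_EOS
 */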
3230 * ram_save_iterate: iterative stage for migration
3232 * Returns zero to indicate success and negative for error
3234 * @f: QEMUFile where to send the data
3235 * @opaque: RAMState pointer
3237 static int ram_save_iterate(QEMUFile *f, void *opaque)
3239 RAMState **temp = opaque;
3240 RAMState *rs = *temp;
3241 int ret = 0;
3242 int i;
3243 int64_t t0;
3244 int done = 0;
3246 if (blk_mig_bulk_active()) {
3247 /* Avoid transferring ram during bulk phase of block migration as
3248 * the bulk phase will usually take a long time and transferring
3249 * ram updates during that time is pointless. */
3250 goto out;
3254 * We'll take this lock a little bit long, but it's okay for two reasons.
3255 * Firstly, the only other possible thread to take it is the one that calls
3256 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3257 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3258 * guarantees that we release it on a regular basis.
3260 qemu_mutex_lock(&rs->bitmap_mutex);
3261 WITH_RCU_READ_LOCK_GUARD() {
3262 if (ram_list.version != rs->last_version) {
3263 ram_state_reset(rs);
3266 /* Read version before ram_list.blocks */
3267 smp_rmb();
3269 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3271 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3272 i = 0;
3273 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3274 postcopy_has_request(rs)) {
3275 int pages;
3277 if (qemu_file_get_error(f)) {
3278 break;
3281 pages = ram_find_and_save_block(rs);
3282 /* no more pages to send */
3283 if (pages == 0) {
3284 done = 1;
3285 break;
3288 if (pages < 0) {
3289 qemu_file_set_error(f, pages);
3290 break;
3293 rs->target_page_count += pages;
3296 * During postcopy, it is necessary to make sure one whole host
3297 * page is sent in one chunk.
3299 if (migrate_postcopy_ram()) {
3300 flush_compressed_data(rs);
3304 * We want to check in the 1st loop, just in case it was the 1st
3305 * time and we had to sync the dirty bitmap.
3306 * qemu_clock_get_ns() is a bit expensive, so we only check every
3307 * few iterations.
3309 if ((i & 63) == 0) {
3310 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3311 1000000;
3312 if (t1 > MAX_WAIT) {
3313 trace_ram_save_iterate_big_wait(t1, i);
3314 break;
3317 i++;
3320 qemu_mutex_unlock(&rs->bitmap_mutex);
3323 * Must occur before EOS (or any QEMUFile operation)
3324 * because of RDMA protocol.
3326 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3328 out:
3329 if (ret >= 0
3330 && migration_is_setup_or_active(migrate_get_current()->state)) {
3331 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3332 if (ret < 0) {
3333 return ret;
3336 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3337 qemu_fflush(f);
3338 ram_transferred_add(8);
3340 ret = qemu_file_get_error(f);
3342 if (ret < 0) {
3343 return ret;
3346 return done;
3350 * ram_save_complete: function called to send the remaining amount of ram
3352 * Returns zero to indicate success or negative on error
3354 * Called with iothread lock
3356 * @f: QEMUFile where to send the data
3357 * @opaque: RAMState pointer
3359 static int ram_save_complete(QEMUFile *f, void *opaque)
3361 RAMState **temp = opaque;
3362 RAMState *rs = *temp;
3363 int ret = 0;
3365 rs->last_stage = !migration_in_colo_state();
3367 WITH_RCU_READ_LOCK_GUARD() {
3368 if (!migration_in_postcopy()) {
3369 migration_bitmap_sync_precopy(rs);
3372 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3374 /* try transferring iterative blocks of memory */
3376 /* flush all remaining blocks regardless of rate limiting */
3377 qemu_mutex_lock(&rs->bitmap_mutex);
3378 while (true) {
3379 int pages;
3381 pages = ram_find_and_save_block(rs);
3383 /* no more blocks to send */
3383 if (pages == 0) {
3384 break;
3386 if (pages < 0) {
3387 ret = pages;
3388 break;
3391 qemu_mutex_unlock(&rs->bitmap_mutex);
3393 flush_compressed_data(rs);
3394 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3397 if (ret < 0) {
3398 return ret;
3401 ret = multifd_send_sync_main(rs->pss[RAM_CHANNEL_PRECOPY].pss_channel);
3402 if (ret < 0) {
3403 return ret;
3406 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3407 qemu_fflush(f);
3409 return 0;
3412 static void ram_state_pending_estimate(void *opaque, uint64_t max_size,
3413 uint64_t *res_precopy_only,
3414 uint64_t *res_compatible,
3415 uint64_t *res_postcopy_only)
3417 RAMState **temp = opaque;
3418 RAMState *rs = *temp;
3420 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3422 if (migrate_postcopy_ram()) {
3423 /* We can do postcopy, and all the data is postcopiable */
3424 *res_postcopy_only += remaining_size;
3425 } else {
3426 *res_precopy_only += remaining_size;
3430 static void ram_state_pending_exact(void *opaque, uint64_t max_size,
3431 uint64_t *res_precopy_only,
3432 uint64_t *res_compatible,
3433 uint64_t *res_postcopy_only)
3435 RAMState **temp = opaque;
3436 RAMState *rs = *temp;
3438 uint64_t remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3440 if (!migration_in_postcopy()) {
3441 qemu_mutex_lock_iothread();
3442 WITH_RCU_READ_LOCK_GUARD() {
3443 migration_bitmap_sync_precopy(rs);
3445 qemu_mutex_unlock_iothread();
3446 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3449 if (migrate_postcopy_ram()) {
3450 /* We can do postcopy, and all the data is postcopiable */
3451 *res_compatible += remaining_size;
3452 } else {
3453 *res_precopy_only += remaining_size;
3457 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3459 unsigned int xh_len;
3460 int xh_flags;
3461 uint8_t *loaded_data;
3463 /* extract RLE header */
3464 xh_flags = qemu_get_byte(f);
3465 xh_len = qemu_get_be16(f);
3467 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3468 error_report("Failed to load XBZRLE page - wrong compression!");
3469 return -1;
3472 if (xh_len > TARGET_PAGE_SIZE) {
3473 error_report("Failed to load XBZRLE page - len overflow!");
3474 return -1;
3476 loaded_data = XBZRLE.decoded_buf;
3477 /* load data and decode */
3478 /* it can change loaded_data to point to an internal buffer */
3479 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3481 /* decode RLE */
3482 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3483 TARGET_PAGE_SIZE) == -1) {
3484 error_report("Failed to load XBZRLE page - decode error!");
3485 return -1;
3488 return 0;
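/*
 * Wire-format sketch of an XBZRLE page as parsed above (illustrative):
 *
 *     u8           xh_flags   must be ENCODING_FLAG_XBZRLE
 *     be16         xh_len     length of the encoded delta, <= TARGET_PAGE_SIZE
 *     byte[xh_len]            delta against the previous page contents
 *
 * xbzrle_decode_buffer() patches the existing page in place, which is
 * why the destination must already hold the previously sent version of
 * the page.
 */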
3492 * ram_block_from_stream: read a RAMBlock id from the migration stream
3494 * Must be called from within a rcu critical section.
3496 * Returns a pointer from within the RCU-protected ram_list.
3498 * @mis: the migration incoming state pointer
3499 * @f: QEMUFile where to read the data from
3500 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3501 * @channel: the channel we're using
3503 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3504 QEMUFile *f, int flags,
3505 int channel)
3507 RAMBlock *block = mis->last_recv_block[channel];
3508 char id[256];
3509 uint8_t len;
3511 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3512 if (!block) {
3513 error_report("Ack, bad migration stream!");
3514 return NULL;
3516 return block;
3519 len = qemu_get_byte(f);
3520 qemu_get_buffer(f, (uint8_t *)id, len);
3521 id[len] = 0;
3523 block = qemu_ram_block_by_name(id);
3524 if (!block) {
3525 error_report("Can't find block %s", id);
3526 return NULL;
3529 if (ramblock_is_ignored(block)) {
3530 error_report("block %s should not be migrated !", id);
3531 return NULL;
3534 mis->last_recv_block[channel] = block;
3536 return block;
3539 static inline void *host_from_ram_block_offset(RAMBlock *block,
3540 ram_addr_t offset)
3542 if (!offset_in_ramblock(block, offset)) {
3543 return NULL;
3546 return block->host + offset;
3549 static void *host_page_from_ram_block_offset(RAMBlock *block,
3550 ram_addr_t offset)
3552 /* Note: Explicitly no check against offset_in_ramblock(). */
3553 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3554 block->page_size);
3557 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3558 ram_addr_t offset)
3560 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3563 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3564 ram_addr_t offset, bool record_bitmap)
3566 if (!offset_in_ramblock(block, offset)) {
3567 return NULL;
3569 if (!block->colo_cache) {
3570 error_report("%s: colo_cache is NULL in block :%s",
3571 __func__, block->idstr);
3572 return NULL;
3576 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3577 * It helps us decide which pages in the ram cache should be flushed
3578 * into the VM's RAM later.
3580 if (record_bitmap &&
3581 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3582 ram_state->migration_dirty_pages++;
3584 return block->colo_cache + offset;
3588 * ram_handle_compressed: handle the zero page case
3590 * If a page (or a whole RDMA chunk) has been
3591 * determined to be zero, then zap it.
3593 * @host: host address for the zero page
3594 * @ch: what the page is filled from. We only support zero
3595 * @size: size of the zero page
3597 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3599 if (ch != 0 || !buffer_is_zero(host, size)) {
3600 memset(host, ch, size);
3604 /* return the size after decompression, or negative value on error */
3605 static int
3606 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3607 const uint8_t *source, size_t source_len)
3609 int err;
3611 err = inflateReset(stream);
3612 if (err != Z_OK) {
3613 return -1;
3616 stream->avail_in = source_len;
3617 stream->next_in = (uint8_t *)source;
3618 stream->avail_out = dest_len;
3619 stream->next_out = dest;
3621 err = inflate(stream, Z_NO_FLUSH);
3622 if (err != Z_STREAM_END) {
3623 return -1;
3626 return stream->total_out;
3629 static void *do_data_decompress(void *opaque)
3631 DecompressParam *param = opaque;
3632 unsigned long pagesize;
3633 uint8_t *des;
3634 int len, ret;
3636 qemu_mutex_lock(&param->mutex);
3637 while (!param->quit) {
3638 if (param->des) {
3639 des = param->des;
3640 len = param->len;
3641 param->des = 0;
3642 qemu_mutex_unlock(&param->mutex);
3644 pagesize = TARGET_PAGE_SIZE;
3646 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3647 param->compbuf, len);
3648 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3649 error_report("decompress data failed");
3650 qemu_file_set_error(decomp_file, ret);
3653 qemu_mutex_lock(&decomp_done_lock);
3654 param->done = true;
3655 qemu_cond_signal(&decomp_done_cond);
3656 qemu_mutex_unlock(&decomp_done_lock);
3658 qemu_mutex_lock(&param->mutex);
3659 } else {
3660 qemu_cond_wait(&param->cond, &param->mutex);
3663 qemu_mutex_unlock(&param->mutex);
3665 return NULL;
3668 static int wait_for_decompress_done(void)
3670 int idx, thread_count;
3672 if (!migrate_use_compression()) {
3673 return 0;
3676 thread_count = migrate_decompress_threads();
3677 qemu_mutex_lock(&decomp_done_lock);
3678 for (idx = 0; idx < thread_count; idx++) {
3679 while (!decomp_param[idx].done) {
3680 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3683 qemu_mutex_unlock(&decomp_done_lock);
3684 return qemu_file_get_error(decomp_file);
3687 static void compress_threads_load_cleanup(void)
3689 int i, thread_count;
3691 if (!migrate_use_compression()) {
3692 return;
3694 thread_count = migrate_decompress_threads();
3695 for (i = 0; i < thread_count; i++) {
3697 * we use it as an indicator of whether the thread was
3698 * properly initialized or not
3700 if (!decomp_param[i].compbuf) {
3701 break;
3704 qemu_mutex_lock(&decomp_param[i].mutex);
3705 decomp_param[i].quit = true;
3706 qemu_cond_signal(&decomp_param[i].cond);
3707 qemu_mutex_unlock(&decomp_param[i].mutex);
3709 for (i = 0; i < thread_count; i++) {
3710 if (!decomp_param[i].compbuf) {
3711 break;
3714 qemu_thread_join(decompress_threads + i);
3715 qemu_mutex_destroy(&decomp_param[i].mutex);
3716 qemu_cond_destroy(&decomp_param[i].cond);
3717 inflateEnd(&decomp_param[i].stream);
3718 g_free(decomp_param[i].compbuf);
3719 decomp_param[i].compbuf = NULL;
3721 g_free(decompress_threads);
3722 g_free(decomp_param);
3723 decompress_threads = NULL;
3724 decomp_param = NULL;
3725 decomp_file = NULL;
3728 static int compress_threads_load_setup(QEMUFile *f)
3730 int i, thread_count;
3732 if (!migrate_use_compression()) {
3733 return 0;
3736 thread_count = migrate_decompress_threads();
3737 decompress_threads = g_new0(QemuThread, thread_count);
3738 decomp_param = g_new0(DecompressParam, thread_count);
3739 qemu_mutex_init(&decomp_done_lock);
3740 qemu_cond_init(&decomp_done_cond);
3741 decomp_file = f;
3742 for (i = 0; i < thread_count; i++) {
3743 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3744 goto exit;
3747 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3748 qemu_mutex_init(&decomp_param[i].mutex);
3749 qemu_cond_init(&decomp_param[i].cond);
3750 decomp_param[i].done = true;
3751 decomp_param[i].quit = false;
3752 qemu_thread_create(decompress_threads + i, "decompress",
3753 do_data_decompress, decomp_param + i,
3754 QEMU_THREAD_JOINABLE);
3756 return 0;
3757 exit:
3758 compress_threads_load_cleanup();
3759 return -1;
3762 static void decompress_data_with_multi_threads(QEMUFile *f,
3763 void *host, int len)
3765 int idx, thread_count;
3767 thread_count = migrate_decompress_threads();
3768 QEMU_LOCK_GUARD(&decomp_done_lock);
3769 while (true) {
3770 for (idx = 0; idx < thread_count; idx++) {
3771 if (decomp_param[idx].done) {
3772 decomp_param[idx].done = false;
3773 qemu_mutex_lock(&decomp_param[idx].mutex);
3774 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3775 decomp_param[idx].des = host;
3776 decomp_param[idx].len = len;
3777 qemu_cond_signal(&decomp_param[idx].cond);
3778 qemu_mutex_unlock(&decomp_param[idx].mutex);
3779 break;
3782 if (idx < thread_count) {
3783 break;
3784 } else {
3785 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3790 static void colo_init_ram_state(void)
3792 ram_state_init(&ram_state);
3796 * COLO cache: this is for the secondary VM; we cache the whole
3797 * memory of the secondary VM. The global lock must be held
3798 * to call this helper.
3800 int colo_init_ram_cache(void)
3802 RAMBlock *block;
3804 WITH_RCU_READ_LOCK_GUARD() {
3805 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3806 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3807 NULL, false, false);
3808 if (!block->colo_cache) {
3809 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3810 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3811 block->used_length);
3812 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3813 if (block->colo_cache) {
3814 qemu_anon_ram_free(block->colo_cache, block->used_length);
3815 block->colo_cache = NULL;
3818 return -errno;
3820 if (!machine_dump_guest_core(current_machine)) {
3821 qemu_madvise(block->colo_cache, block->used_length,
3822 QEMU_MADV_DONTDUMP);
3828 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3829 * to decide which pages in the cache should be flushed into the SVM's RAM.
3830 * Here we use the same name 'ram_bitmap' as for migration.
3832 if (ram_bytes_total()) {
3833 RAMBlock *block;
3835 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3836 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3837 block->bmap = bitmap_new(pages);
3841 colo_init_ram_state();
3842 return 0;
3845 /* TODO: duplicated with ram_init_bitmaps */
3846 void colo_incoming_start_dirty_log(void)
3848 RAMBlock *block = NULL;
3849 /* For memory_global_dirty_log_start below. */
3850 qemu_mutex_lock_iothread();
3851 qemu_mutex_lock_ramlist();
3853 memory_global_dirty_log_sync();
3854 WITH_RCU_READ_LOCK_GUARD() {
3855 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3856 ramblock_sync_dirty_bitmap(ram_state, block);
3857 /* Discard this dirty bitmap record */
3858 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3860 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3862 ram_state->migration_dirty_pages = 0;
3863 qemu_mutex_unlock_ramlist();
3864 qemu_mutex_unlock_iothread();
3867 /* The global lock must be held to call this helper */
3868 void colo_release_ram_cache(void)
3870 RAMBlock *block;
3872 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3873 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3874 g_free(block->bmap);
3875 block->bmap = NULL;
3878 WITH_RCU_READ_LOCK_GUARD() {
3879 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3880 if (block->colo_cache) {
3881 qemu_anon_ram_free(block->colo_cache, block->used_length);
3882 block->colo_cache = NULL;
3886 ram_state_cleanup(&ram_state);
3890 * ram_load_setup: Setup RAM for migration incoming side
3892 * Returns zero to indicate success and negative for error
3894 * @f: QEMUFile where to receive the data
3895 * @opaque: RAMState pointer
3897 static int ram_load_setup(QEMUFile *f, void *opaque)
3899 if (compress_threads_load_setup(f)) {
3900 return -1;
3903 xbzrle_load_setup();
3904 ramblock_recv_map_init();
3906 return 0;
3909 static int ram_load_cleanup(void *opaque)
3911 RAMBlock *rb;
3913 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3914 qemu_ram_block_writeback(rb);
3917 xbzrle_load_cleanup();
3918 compress_threads_load_cleanup();
3920 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3921 g_free(rb->receivedmap);
3922 rb->receivedmap = NULL;
3925 return 0;
3929 * ram_postcopy_incoming_init: allocate postcopy data structures
3931 * Returns 0 for success and negative if there was an error
3933 * @mis: current migration incoming state
3935 * Allocate data structures etc needed by incoming migration with
3936 * postcopy-ram. postcopy-ram's similarly named
3937 * postcopy_ram_incoming_init does the work.
3939 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3941 return postcopy_ram_incoming_init(mis);
3945 * ram_load_postcopy: load a page in postcopy case
3947 * Returns 0 for success or -errno in case of error
3949 * Called in postcopy mode by ram_load().
3950 * rcu_read_lock is taken prior to this being called.
3952 * @f: QEMUFile where to send the data
3953 * @channel: the channel to use for loading
3955 int ram_load_postcopy(QEMUFile *f, int channel)
3957 int flags = 0, ret = 0;
3958 bool place_needed = false;
3959 bool matches_target_page_size = false;
3960 MigrationIncomingState *mis = migration_incoming_get_current();
3961 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
3963 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3964 ram_addr_t addr;
3965 void *page_buffer = NULL;
3966 void *place_source = NULL;
3967 RAMBlock *block = NULL;
3968 uint8_t ch;
3969 int len;
3971 addr = qemu_get_be64(f);
3974 * If there is a QEMUFile error, we should stop here; "addr"
3975 * may then be invalid
3977 ret = qemu_file_get_error(f);
3978 if (ret) {
3979 break;
3982 flags = addr & ~TARGET_PAGE_MASK;
3983 addr &= TARGET_PAGE_MASK;
3985 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
3986 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3987 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3988 block = ram_block_from_stream(mis, f, flags, channel);
3989 if (!block) {
3990 ret = -EINVAL;
3991 break;
3995 * Relying on used_length is racy and can result in false positives.
3996 * We might place pages beyond used_length in case RAM was shrunk
3997 * while in postcopy, which is fine - trying to place via
3998 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4000 if (!block->host || addr >= block->postcopy_length) {
4001 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4002 ret = -EINVAL;
4003 break;
4005 tmp_page->target_pages++;
4006 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4008 * Postcopy requires that we place whole host pages atomically;
4009 * these may be huge pages for RAMBlocks that are backed by
4010 * hugetlbfs.
4011 * To make it atomic, the data is read into a temporary page
4012 * that's moved into place later.
4013 * The migration protocol uses possibly smaller target pages;
4014 * however, the source ensures it always sends all the components
4015 * of a host page in one chunk.
4017 page_buffer = tmp_page->tmp_huge_page +
4018 host_page_offset_from_ram_block_offset(block, addr);
4019 /* If all TP are zero then we can optimise the place */
4020 if (tmp_page->target_pages == 1) {
4021 tmp_page->host_addr =
4022 host_page_from_ram_block_offset(block, addr);
4023 } else if (tmp_page->host_addr !=
4024 host_page_from_ram_block_offset(block, addr)) {
4025 /* not the 1st TP within the HP */
4026 error_report("Non-same host page detected on channel %d: "
4027 "Target host page %p, received host page %p "
4028 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4029 channel, tmp_page->host_addr,
4030 host_page_from_ram_block_offset(block, addr),
4031 block->idstr, addr, tmp_page->target_pages);
4032 ret = -EINVAL;
4033 break;
4037 * If it's the last part of a host page then we place the host
4038 * page
4040 if (tmp_page->target_pages ==
4041 (block->page_size / TARGET_PAGE_SIZE)) {
4042 place_needed = true;
4044 place_source = tmp_page->tmp_huge_page;
4047 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4048 case RAM_SAVE_FLAG_ZERO:
4049 ch = qemu_get_byte(f);
4051 * We can skip setting page_buffer when
4052 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
4054 if (ch || !matches_target_page_size) {
4055 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4057 if (ch) {
4058 tmp_page->all_zero = false;
4060 break;
4062 case RAM_SAVE_FLAG_PAGE:
4063 tmp_page->all_zero = false;
4064 if (!matches_target_page_size) {
4065 /* For huge pages, we always use temporary buffer */
4066 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4067 } else {
4069                  * For small pages that match the target page size, we
4070                  * avoid the qemu_file copy.  Instead we use QEMUFile's
4071                  * internal buffer directly to place the page.  Note: we
4072                  * must not perform any QEMUFile operation before using
4073                  * that buffer, to make sure it is still valid when
4074                  * placing the page.
4076 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4077 TARGET_PAGE_SIZE);
4079 break;
4080 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4081 tmp_page->all_zero = false;
4082 len = qemu_get_be32(f);
4083 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4084 error_report("Invalid compressed data length: %d", len);
4085 ret = -EINVAL;
4086 break;
4088 decompress_data_with_multi_threads(f, page_buffer, len);
4089 break;
4091 case RAM_SAVE_FLAG_EOS:
4092 /* normal exit */
4093 multifd_recv_sync_main();
4094 break;
4095 default:
4096 error_report("Unknown combination of migration flags: 0x%x"
4097 " (postcopy mode)", flags);
4098 ret = -EINVAL;
4099 break;
4102 /* Got the whole host page, wait for decompress before placing. */
4103 if (place_needed) {
4104 ret |= wait_for_decompress_done();
4107         /* Check for any possible file errors */
4108 if (!ret && qemu_file_get_error(f)) {
4109 ret = qemu_file_get_error(f);
4112 if (!ret && place_needed) {
4113 if (tmp_page->all_zero) {
4114 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4115 } else {
4116 ret = postcopy_place_page(mis, tmp_page->host_addr,
4117 place_source, block);
4119 place_needed = false;
4120 postcopy_temp_page_reset(tmp_page);
4124 return ret;
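/* Returns true if the incoming side has been advised of postcopy (postcopy
 * state is at least ADVISE) and postcopy has not ended yet. */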
4127 static bool postcopy_is_advised(void)
4129 PostcopyState ps = postcopy_state_get();
4130 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
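/* Returns true once the incoming side has entered the postcopy listening
 * phase and postcopy has not ended yet. */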
4133 static bool postcopy_is_running(void)
4135 PostcopyState ps = postcopy_state_get();
4136 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4140  * Flush the contents of the RAM cache into the SVM's memory.
4141  * Only flush pages that have been dirtied by the PVM, the SVM, or both.
4143 void colo_flush_ram_cache(void)
4145 RAMBlock *block = NULL;
4146 void *dst_host;
4147 void *src_host;
4148 unsigned long offset = 0;
4150 memory_global_dirty_log_sync();
4151 WITH_RCU_READ_LOCK_GUARD() {
4152 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4153 ramblock_sync_dirty_bitmap(ram_state, block);
4157 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4158 WITH_RCU_READ_LOCK_GUARD() {
4159 block = QLIST_FIRST_RCU(&ram_list.blocks);
4161 while (block) {
4162 unsigned long num = 0;
4164 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4165 if (!offset_in_ramblock(block,
4166 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4167 offset = 0;
4168 num = 0;
4169 block = QLIST_NEXT_RCU(block, next);
4170 } else {
4171 unsigned long i = 0;
4173 for (i = 0; i < num; i++) {
4174 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4176 dst_host = block->host
4177 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4178 src_host = block->colo_cache
4179 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4180 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4181 offset += num;
4185 trace_colo_flush_ram_cache_end();
4189 * ram_load_precopy: load pages in precopy case
4191 * Returns 0 for success or -errno in case of error
4193 * Called in precopy mode by ram_load().
4194 * rcu_read_lock is taken prior to this being called.
4196  * @f: QEMUFile to receive the data from
4198 static int ram_load_precopy(QEMUFile *f)
4200 MigrationIncomingState *mis = migration_incoming_get_current();
4201 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4202     /* ADVISE is sent earlier; it indicates the source has the postcopy capability enabled */
4203 bool postcopy_advised = postcopy_is_advised();
4204 if (!migrate_use_compression()) {
4205 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4208 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4209 ram_addr_t addr, total_ram_bytes;
4210 void *host = NULL, *host_bak = NULL;
4211 uint8_t ch;
4214          * Yield periodically to let the main loop run, but an iteration of
4215          * the main loop is expensive, so only do it every so many iterations.
4217 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4218 aio_co_schedule(qemu_get_current_aio_context(),
4219 qemu_coroutine_self());
4220 qemu_coroutine_yield();
4222 i++;
4224 addr = qemu_get_be64(f);
4225 flags = addr & ~TARGET_PAGE_MASK;
4226 addr &= TARGET_PAGE_MASK;
4228 if (flags & invalid_flags) {
4229 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4230 error_report("Received an unexpected compressed page");
4233 ret = -EINVAL;
4234 break;
4237 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4238 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4239 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4240 RAM_CHANNEL_PRECOPY);
4242 host = host_from_ram_block_offset(block, addr);
4244              * After entering the COLO stage, we should not load pages into
4245              * the SVM's memory directly; we put them into colo_cache first.
4246              * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4247              * Previously, we copied all of this memory in the COLO preparation
4248              * stage, which required stopping the VM and was time-consuming.
4249              * Here we optimize it by backing up every page during the migration
4250              * process while COLO is enabled.  Although this slows the migration
4251              * down somewhat, it clearly reduces the downtime of backing up all
4252              * of the SVM's memory in the COLO preparation stage.
4254 if (migration_incoming_colo_enabled()) {
4255 if (migration_incoming_in_colo_state()) {
4256 /* In COLO stage, put all pages into cache temporarily */
4257 host = colo_cache_from_block_offset(block, addr, true);
4258 } else {
4260                      * In the migration stage but before the COLO stage,
4261                      * put all pages into both the cache and the SVM's memory.
4263 host_bak = colo_cache_from_block_offset(block, addr, false);
4266 if (!host) {
4267 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4268 ret = -EINVAL;
4269 break;
4271 if (!migration_incoming_in_colo_state()) {
4272 ramblock_recv_bitmap_set(block, host);
4275 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4278 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4279 case RAM_SAVE_FLAG_MEM_SIZE:
4280 /* Synchronize RAM block list */
4281 total_ram_bytes = addr;
4282 while (!ret && total_ram_bytes) {
4283 RAMBlock *block;
4284 char id[256];
4285 ram_addr_t length;
4287 len = qemu_get_byte(f);
4288 qemu_get_buffer(f, (uint8_t *)id, len);
4289 id[len] = 0;
4290 length = qemu_get_be64(f);
4292 block = qemu_ram_block_by_name(id);
4293 if (block && !qemu_ram_is_migratable(block)) {
4294                     error_report("block %s should not be migrated!", id);
4295 ret = -EINVAL;
4296 } else if (block) {
4297 if (length != block->used_length) {
4298 Error *local_err = NULL;
4300 ret = qemu_ram_resize(block, length,
4301 &local_err);
4302 if (local_err) {
4303 error_report_err(local_err);
4306                     /* For postcopy we need to check that hugepage sizes match */
4307 if (postcopy_advised && migrate_postcopy_ram() &&
4308 block->page_size != qemu_host_page_size) {
4309 uint64_t remote_page_size = qemu_get_be64(f);
4310 if (remote_page_size != block->page_size) {
4311 error_report("Mismatched RAM page size %s "
4312 "(local) %zd != %" PRId64,
4313 id, block->page_size,
4314 remote_page_size);
4315 ret = -EINVAL;
4318 if (migrate_ignore_shared()) {
4319 hwaddr addr = qemu_get_be64(f);
4320 if (ramblock_is_ignored(block) &&
4321 block->mr->addr != addr) {
4322 error_report("Mismatched GPAs for block %s "
4323                                          "%" PRId64 " != %" PRId64,
4324 id, (uint64_t)addr,
4325 (uint64_t)block->mr->addr);
4326 ret = -EINVAL;
4329 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4330 block->idstr);
4331 } else {
4332 error_report("Unknown ramblock \"%s\", cannot "
4333 "accept migration", id);
4334 ret = -EINVAL;
4337 total_ram_bytes -= length;
4339 break;
4341 case RAM_SAVE_FLAG_ZERO:
4342 ch = qemu_get_byte(f);
4343 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4344 break;
4346 case RAM_SAVE_FLAG_PAGE:
4347 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4348 break;
4350 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4351 len = qemu_get_be32(f);
4352 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4353 error_report("Invalid compressed data length: %d", len);
4354 ret = -EINVAL;
4355 break;
4357 decompress_data_with_multi_threads(f, host, len);
4358 break;
4360 case RAM_SAVE_FLAG_XBZRLE:
4361 if (load_xbzrle(f, addr, host) < 0) {
4362 error_report("Failed to decompress XBZRLE page at "
4363 RAM_ADDR_FMT, addr);
4364 ret = -EINVAL;
4365 break;
4367 break;
4368 case RAM_SAVE_FLAG_EOS:
4369 /* normal exit */
4370 multifd_recv_sync_main();
4371 break;
4372 default:
4373 if (flags & RAM_SAVE_FLAG_HOOK) {
4374 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4375 } else {
4376 error_report("Unknown combination of migration flags: 0x%x",
4377 flags);
4378 ret = -EINVAL;
4381 if (!ret) {
4382 ret = qemu_file_get_error(f);
4384 if (!ret && host_bak) {
4385 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4389 ret |= wait_for_decompress_done();
4390 return ret;
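/* Top-level RAM load handler: accepts only stream version 4 and dispatches
 * to ram_load_postcopy() when the destination is already running in postcopy
 * mode (page placement must then be atomic), otherwise to ram_load_precopy(). */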
4393 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4395 int ret = 0;
4396 static uint64_t seq_iter;
4398      * If the system is running in postcopy mode, page insertions into host
4399      * memory must be atomic.
4401 bool postcopy_running = postcopy_is_running();
4403 seq_iter++;
4405 if (version_id != 4) {
4406 return -EINVAL;
4410      * This RCU critical section can be very long-running.
4411      * If RCU reclamations triggered by this code become numerous,
4412      * it will be necessary to reduce the granularity of this
4413      * critical section.
4415 WITH_RCU_READ_LOCK_GUARD() {
4416 if (postcopy_running) {
4418              * Note!  Here RAM_CHANNEL_PRECOPY is the precopy channel of a
4419              * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
4420              * service fast page faults.
4422 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4423 } else {
4424 ret = ram_load_precopy(f);
4427 trace_ram_load_complete(ret, seq_iter);
4429 return ret;
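/* Report whether RAM migration can use postcopy; pmem (nvdimm) backed
 * RAMBlocks currently make it unavailable. */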
4432 static bool ram_has_postcopy(void *opaque)
4434 RAMBlock *rb;
4435 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4436 if (ramblock_is_pmem(rb)) {
4437             info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4438                         "is not supported now!", rb->idstr, rb->host);
4439 return false;
4443 return migrate_postcopy_ram();
4446 /* Sync all the dirty bitmaps with the destination VM. */
4447 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4449 RAMBlock *block;
4450 QEMUFile *file = s->to_dst_file;
4451 int ramblock_count = 0;
4453 trace_ram_dirty_bitmap_sync_start();
4455 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4456 qemu_savevm_send_recv_bitmap(file, block->idstr);
4457 trace_ram_dirty_bitmap_request(block->idstr);
4458 ramblock_count++;
4461 trace_ram_dirty_bitmap_sync_wait();
4463     /* Wait until all the ramblocks' dirty bitmaps have been synced */
4464 while (ramblock_count--) {
4465 qemu_sem_wait(&s->rp_state.rp_sem);
4468 trace_ram_dirty_bitmap_sync_complete();
4470 return 0;
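/* Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose bitmap it requested. */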
4473 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4475 qemu_sem_post(&s->rp_state.rp_sem);
4479  * Read the received bitmap and invert it to form the initial dirty bitmap.
4480  * This is only used when a postcopy migration has been paused and wants
4481  * to resume from a middle point.
4483 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4485 int ret = -EINVAL;
4486 /* from_dst_file is always valid because we're within rp_thread */
4487 QEMUFile *file = s->rp_state.from_dst_file;
4488 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4489 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4490 uint64_t size, end_mark;
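    /*
     * For example, assuming 4 KiB target pages, a 1 GiB ramblock gives
     * nbits = 262144 and local_size = 32 KiB, already a multiple of 8
     * bytes, so the ROUND_UP below leaves it unchanged in that case.
     */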
4492 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4494 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4495 error_report("%s: incorrect state %s", __func__,
4496 MigrationStatus_str(s->state));
4497 return -EINVAL;
4501      * Note: see the comments in ramblock_recv_bitmap_send() on why we
4502      * need the endianness conversion and the padding.
4504     local_size = ROUND_UP(local_size, 8);
4507     /* Add padding */
4507 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4509 size = qemu_get_be64(file);
4511     /* The size of the bitmap should match our ramblock */
4512 if (size != local_size) {
4513 error_report("%s: ramblock '%s' bitmap size mismatch "
4514 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4515 block->idstr, size, local_size);
4516 ret = -EINVAL;
4517 goto out;
4520 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4521 end_mark = qemu_get_be64(file);
4523 ret = qemu_file_get_error(file);
4524 if (ret || size != local_size) {
4525 error_report("%s: read bitmap failed for ramblock '%s': %d"
4526 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4527 __func__, block->idstr, ret, local_size, size);
4528 ret = -EIO;
4529 goto out;
4532 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4533 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4534 __func__, block->idstr, end_mark);
4535 ret = -EINVAL;
4536 goto out;
4540      * Endianness conversion.  We are in postcopy (though paused), so
4541      * the dirty bitmap won't change and we can modify it directly.
4543 bitmap_from_le(block->bmap, le_bitmap, nbits);
4546      * What we received is the "received bitmap".  Invert it to form the
4547      * initial dirty bitmap for this ramblock.
4549 bitmap_complement(block->bmap, block->bmap, nbits);
4551 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4552 ramblock_dirty_bitmap_clear_discarded_pages(block);
4554 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4555 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4558      * We succeeded in syncing the bitmap for the current ramblock.  If this
4559      * is the last one to sync, we need to notify the main send thread.
4561 ram_dirty_bitmap_reload_notify(s);
4563 ret = 0;
4564 out:
4565 g_free(le_bitmap);
4566 return ret;
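/* Prepare for resuming a paused migration: re-sync the dirty bitmaps from
 * the destination and then refresh the RAMState for the resumed stream. */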
4569 static int ram_resume_prepare(MigrationState *s, void *opaque)
4571 RAMState *rs = *(RAMState **)opaque;
4572 int ret;
4574 ret = ram_dirty_bitmap_sync_all(s, rs);
4575 if (ret) {
4576 return ret;
4579 ram_state_resume_prepare(rs, s->to_dst_file);
4581 return 0;
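/* Put an EOS flag on the postcopy preempt channel and flush it, so the
 * receiving side sees a clean end of stream on that channel. */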
4584 void postcopy_preempt_shutdown_file(MigrationState *s)
4586 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4587 qemu_fflush(s->postcopy_qemufile_src);
4590 static SaveVMHandlers savevm_ram_handlers = {
4591 .save_setup = ram_save_setup,
4592 .save_live_iterate = ram_save_iterate,
4593 .save_live_complete_postcopy = ram_save_complete,
4594 .save_live_complete_precopy = ram_save_complete,
4595 .has_postcopy = ram_has_postcopy,
4596 .state_pending_exact = ram_state_pending_exact,
4597 .state_pending_estimate = ram_state_pending_estimate,
4598 .load_state = ram_load,
4599 .save_cleanup = ram_save_cleanup,
4600 .load_setup = ram_load_setup,
4601 .load_cleanup = ram_load_cleanup,
4602 .resume_prepare = ram_resume_prepare,
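/* RAMBlock resize notifier: cancels an in-progress migration on the source
 * (precopy cannot cope with resizes) and, while incoming postcopy has only
 * been advised, discards any newly added range and updates postcopy_length. */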
4605 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4606 size_t old_size, size_t new_size)
4608 PostcopyState ps = postcopy_state_get();
4609 ram_addr_t offset;
4610 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4611 Error *err = NULL;
4613 if (ramblock_is_ignored(rb)) {
4614 return;
4617 if (!migration_is_idle()) {
4619 * Precopy code on the source cannot deal with the size of RAM blocks
4620 * changing at random points in time - especially after sending the
4621 * RAM block sizes in the migration stream, they must no longer change.
4622 * Abort and indicate a proper reason.
4624 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4625 migration_cancel(err);
4626 error_free(err);
4629 switch (ps) {
4630 case POSTCOPY_INCOMING_ADVISE:
4632 * Update what ram_postcopy_incoming_init()->init_range() does at the
4633 * time postcopy was advised. Syncing RAM blocks with the source will
4634 * result in RAM resizes.
4636 if (old_size < new_size) {
4637 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4638 error_report("RAM block '%s' discard of resized RAM failed",
4639 rb->idstr);
4642 rb->postcopy_length = new_size;
4643 break;
4644 case POSTCOPY_INCOMING_NONE:
4645 case POSTCOPY_INCOMING_RUNNING:
4646 case POSTCOPY_INCOMING_END:
4648          * Once our guest is running, postcopy no longer cares about
4649          * resizes.  When growing, the new memory was not available on the
4650          * source, so no handling is needed.
4652 break;
4653 default:
4654 error_report("RAM block '%s' resized during postcopy state: %d",
4655 rb->idstr, ps);
4656 exit(-1);
4660 static RAMBlockNotifier ram_mig_ram_notifier = {
4661 .ram_block_resized = ram_mig_ram_block_resized,
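/* Register RAM migration: initialize the XBZRLE lock, register the "ram"
 * savevm handlers (section version 4) and the RAMBlock resize notifier. */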
4664 void ram_mig_init(void)
4666 qemu_mutex_init(&XBZRLE.lock);
4667 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4668 ram_block_notifier_add(&ram_mig_ram_notifier);