migration: Send requested page directly in rp-return thread
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/madvise.h"
34 #include "qemu/main-loop.h"
35 #include "io/channel-null.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "page_cache.h"
44 #include "qemu/error-report.h"
45 #include "qapi/error.h"
46 #include "qapi/qapi-types-migration.h"
47 #include "qapi/qapi-events-migration.h"
48 #include "qapi/qmp/qerror.h"
49 #include "trace.h"
50 #include "exec/ram_addr.h"
51 #include "exec/target_page.h"
52 #include "qemu/rcu_queue.h"
53 #include "migration/colo.h"
54 #include "block.h"
55 #include "sysemu/cpu-throttle.h"
56 #include "savevm.h"
57 #include "qemu/iov.h"
58 #include "multifd.h"
59 #include "sysemu/runstate.h"
61 #include "hw/boards.h" /* for machine_dump_guest_core() */
63 #if defined(__linux__)
64 #include "qemu/userfaultfd.h"
65 #endif /* defined(__linux__) */
67 /***********************************************************/
68 /* ram save/restore */
70 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
71  * worked for pages that were filled with the same char.  We switched
72  * it to only search for the zero value, and renamed it to avoid
73  * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
76 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
77 #define RAM_SAVE_FLAG_ZERO 0x02
78 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
79 #define RAM_SAVE_FLAG_PAGE 0x08
80 #define RAM_SAVE_FLAG_EOS 0x10
81 #define RAM_SAVE_FLAG_CONTINUE 0x20
82 #define RAM_SAVE_FLAG_XBZRLE 0x40
83 /* 0x80 is reserved in migration.h start with 0x100 next */
84 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
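/*
 * Illustration (not part of the protocol definition itself): these flags
 * ride in the low bits of the page offset written by save_page_header().
 * With 4 KiB target pages every offset is a multiple of 0x1000, so the
 * low bits are free.  A zero page at offset 0x2000 of the block announced
 * by the previous page would have its page header sent as the be64 value
 * 0x2000 | RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_CONTINUE == 0x2022.
 */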
86 XBZRLECacheStats xbzrle_counters;
88 /* used by the search for pages to send */
89 struct PageSearchStatus {
90 /* The migration channel used for a specific host page */
91 QEMUFile *pss_channel;
92 /* Last block from where we have sent data */
93 RAMBlock *last_sent_block;
94 /* Current block being searched */
95 RAMBlock *block;
96 /* Current page to search from */
97 unsigned long page;
98 /* Set once we wrap around */
99 bool complete_round;
101 * [POSTCOPY-ONLY] Whether current page is explicitly requested by
102 * postcopy. When set, the request is "urgent" because the dest QEMU
103 * threads are waiting for us.
105 bool postcopy_requested;
107 * [POSTCOPY-ONLY] The target channel to use to send current page.
109  * Note: This may _not_ match the value of postcopy_requested
110  * above. Imagine the case where the postcopy request is exactly
111  * the page that we're in the middle of sending during precopy. In this
112  * case we'll have postcopy_requested set to true, but the target channel
113  * will be the precopy channel (so that we don't split that specific
114  * page across channels, since the precopy channel already holds part
115  * of that page's data).
117 * Besides that specific use case, postcopy_target_channel should
118 * always be equal to postcopy_requested, because by default we send
119 * postcopy pages via postcopy preempt channel.
121 bool postcopy_target_channel;
122 /* Whether we're sending a host page */
123 bool host_page_sending;
124 /* The start/end of current host page. Invalid if host_page_sending==false */
125 unsigned long host_page_start;
126 unsigned long host_page_end;
128 typedef struct PageSearchStatus PageSearchStatus;
130 /* struct contains XBZRLE cache and a static page
131 used by the compression */
132 static struct {
133 /* buffer used for XBZRLE encoding */
134 uint8_t *encoded_buf;
135 /* buffer for storing page content */
136 uint8_t *current_buf;
137 /* Cache for XBZRLE, Protected by lock. */
138 PageCache *cache;
139 QemuMutex lock;
140 /* it will store a page full of zeros */
141 uint8_t *zero_target_page;
142 /* buffer used for XBZRLE decoding */
143 uint8_t *decoded_buf;
144 } XBZRLE;
146 static void XBZRLE_cache_lock(void)
148 if (migrate_use_xbzrle()) {
149 qemu_mutex_lock(&XBZRLE.lock);
153 static void XBZRLE_cache_unlock(void)
155 if (migrate_use_xbzrle()) {
156 qemu_mutex_unlock(&XBZRLE.lock);
161 * xbzrle_cache_resize: resize the xbzrle cache
163  * This function is called from migrate_params_apply in the main
164  * thread, possibly while a migration is in progress. A running
165  * migration may be using the cache and might finish during this call,
166  * hence changes to the cache are protected by XBZRLE.lock.
168 * Returns 0 for success or -1 for error
170 * @new_size: new cache size
171  * @errp: set to the failure reason if the check fails
173 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
175 PageCache *new_cache;
176 int64_t ret = 0;
178 /* Check for truncation */
179 if (new_size != (size_t)new_size) {
180 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
181 "exceeding address space");
182 return -1;
185 if (new_size == migrate_xbzrle_cache_size()) {
186 /* nothing to do */
187 return 0;
190 XBZRLE_cache_lock();
192 if (XBZRLE.cache != NULL) {
193 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
194 if (!new_cache) {
195 ret = -1;
196 goto out;
199 cache_fini(XBZRLE.cache);
200 XBZRLE.cache = new_cache;
202 out:
203 XBZRLE_cache_unlock();
204 return ret;
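/*
 * Illustrative usage sketch (assumed caller, not code from this file):
 * a parameter-setting path such as migrate_params_apply() would do
 * roughly:
 *
 *     Error *err = NULL;
 *     if (xbzrle_cache_resize(new_size, &err) < 0) {
 *         error_report_err(err);
 *     }
 *
 * e.g. on a 32-bit build a new_size of 8 GiB fails the (size_t)
 * truncation check above and returns -1 with *errp set.
 */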
207 static bool postcopy_preempt_active(void)
209 return migrate_postcopy_preempt() && migration_in_postcopy();
212 bool ramblock_is_ignored(RAMBlock *block)
214 return !qemu_ram_is_migratable(block) ||
215 (migrate_ignore_shared() && qemu_ram_is_shared(block));
218 #undef RAMBLOCK_FOREACH
220 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
222 RAMBlock *block;
223 int ret = 0;
225 RCU_READ_LOCK_GUARD();
227 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
228 ret = func(block, opaque);
229 if (ret) {
230 break;
233 return ret;
236 static void ramblock_recv_map_init(void)
238 RAMBlock *rb;
240 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
241 assert(!rb->receivedmap);
242 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
246 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
248 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
249 rb->receivedmap);
252 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
254 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
257 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
259 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
262 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
263 size_t nr)
265 bitmap_set_atomic(rb->receivedmap,
266 ramblock_recv_bitmap_offset(host_addr, rb),
267 nr);
270 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
273 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
275  * Returns the number of bytes sent (>0) on success, or <0 on error.
277 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
278 const char *block_name)
280 RAMBlock *block = qemu_ram_block_by_name(block_name);
281 unsigned long *le_bitmap, nbits;
282 uint64_t size;
284 if (!block) {
285 error_report("%s: invalid block name: %s", __func__, block_name);
286 return -1;
289 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
292 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
293 * machines we may need 4 more bytes for padding (see below
294  * comment). So extend it a bit beforehand.
296 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
299 * Always use little endian when sending the bitmap. This is
300  * required when the source and destination VMs are not using the
301 * same endianness. (Note: big endian won't work.)
303 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
305 /* Size of the bitmap, in bytes */
306 size = DIV_ROUND_UP(nbits, 8);
309 * size is always aligned to 8 bytes for 64bit machines, but it
310 * may not be true for 32bit machines. We need this padding to
311 * make sure the migration can survive even between 32bit and
312 * 64bit machines.
314 size = ROUND_UP(size, 8);
316 qemu_put_be64(file, size);
317 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
319 * Mark as an end, in case the middle part is screwed up due to
320 * some "mysterious" reason.
322 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
323 qemu_fflush(file);
325 g_free(le_bitmap);
327 if (qemu_file_get_error(file)) {
328 return qemu_file_get_error(file);
331 return size + sizeof(size);
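/*
 * For illustration, the wire layout produced above is:
 *
 *   be64  size                        (bitmap bytes, rounded up to 8)
 *   u8    le_bitmap[size]             (little-endian receive bitmap)
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING (sanity marker)
 *
 * e.g. a block of 100 target pages needs DIV_ROUND_UP(100, 8) = 13
 * bitmap bytes, padded up to size = 16, and the function reports
 * 16 + sizeof(size) = 24 bytes sent.
 */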
335 * An outstanding page request, on the source, having been received
336 * and queued
338 struct RAMSrcPageRequest {
339 RAMBlock *rb;
340 hwaddr offset;
341 hwaddr len;
343 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
346 typedef struct {
348 * Cached ramblock/offset values if preempted. They're only meaningful if
349 * preempted==true below.
351 RAMBlock *ram_block;
352 unsigned long ram_page;
354 * Whether a postcopy preemption just happened. Will be reset after
355 * precopy recovered to background migration.
357 bool preempted;
358 } PostcopyPreemptState;
360 /* State of RAM for migration */
361 struct RAMState {
362 /* QEMUFile used for this migration */
363 QEMUFile *f;
365  * PageSearchStatus structures for the channels when sending pages.
366 * Protected by the bitmap_mutex.
368 PageSearchStatus pss[RAM_CHANNEL_MAX];
369 /* UFFD file descriptor, used in 'write-tracking' migration */
370 int uffdio_fd;
371 /* Last block that we have visited searching for dirty pages */
372 RAMBlock *last_seen_block;
373 /* Last dirty target page we have sent */
374 ram_addr_t last_page;
375 /* last ram version we have seen */
376 uint32_t last_version;
377 /* How many times we have dirty too many pages */
378 int dirty_rate_high_cnt;
379 /* these variables are used for bitmap sync */
380 /* last time we did a full bitmap_sync */
381 int64_t time_last_bitmap_sync;
382 /* bytes transferred at start_time */
383 uint64_t bytes_xfer_prev;
384 /* number of dirty pages since start_time */
385 uint64_t num_dirty_pages_period;
386 /* xbzrle misses since the beginning of the period */
387 uint64_t xbzrle_cache_miss_prev;
388 /* Amount of xbzrle pages since the beginning of the period */
389 uint64_t xbzrle_pages_prev;
390 /* Amount of xbzrle encoded bytes since the beginning of the period */
391 uint64_t xbzrle_bytes_prev;
392 /* Start using XBZRLE (e.g., after the first round). */
393 bool xbzrle_enabled;
394 /* Are we on the last stage of migration */
395 bool last_stage;
396 /* compression statistics since the beginning of the period */
397     /* number of times there was no free thread to compress data */
398 uint64_t compress_thread_busy_prev;
399     /* amount of bytes after compression */
400 uint64_t compressed_size_prev;
401 /* amount of compressed pages */
402 uint64_t compress_pages_prev;
404 /* total handled target pages at the beginning of period */
405 uint64_t target_page_count_prev;
406 /* total handled target pages since start */
407 uint64_t target_page_count;
408 /* number of dirty bits in the bitmap */
409 uint64_t migration_dirty_pages;
411 * Protects:
412 * - dirty/clear bitmap
413 * - migration_dirty_pages
414 * - pss structures
416 QemuMutex bitmap_mutex;
417 /* The RAMBlock used in the last src_page_requests */
418 RAMBlock *last_req_rb;
419 /* Queue of outstanding page requests from the destination */
420 QemuMutex src_page_req_mutex;
421 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
423     /* Postcopy preemption information */
424 PostcopyPreemptState postcopy_preempt_state;
426 * Current channel we're using on src VM. Only valid if postcopy-preempt
427 * is enabled.
429 unsigned int postcopy_channel;
431 typedef struct RAMState RAMState;
433 static RAMState *ram_state;
435 static NotifierWithReturnList precopy_notifier_list;
437 static void postcopy_preempt_reset(RAMState *rs)
439 memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
442 /* Whether postcopy has queued requests? */
443 static bool postcopy_has_request(RAMState *rs)
445 return !QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests);
448 void precopy_infrastructure_init(void)
450 notifier_with_return_list_init(&precopy_notifier_list);
453 void precopy_add_notifier(NotifierWithReturn *n)
455 notifier_with_return_list_add(&precopy_notifier_list, n);
458 void precopy_remove_notifier(NotifierWithReturn *n)
460 notifier_with_return_remove(n);
463 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
465 PrecopyNotifyData pnd;
466 pnd.reason = reason;
467 pnd.errp = errp;
469 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
472 uint64_t ram_bytes_remaining(void)
474 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
479 * NOTE: not all stats in ram_counters are used in reality. See comments
480 * for struct MigrationAtomicStats. The ultimate result of ram migration
481 * counters will be a merged version with both ram_counters and the atomic
482 * fields in ram_atomic_counters.
484 MigrationStats ram_counters;
485 MigrationAtomicStats ram_atomic_counters;
487 void ram_transferred_add(uint64_t bytes)
489 if (runstate_is_running()) {
490 ram_counters.precopy_bytes += bytes;
491 } else if (migration_in_postcopy()) {
492 stat64_add(&ram_atomic_counters.postcopy_bytes, bytes);
493 } else {
494 ram_counters.downtime_bytes += bytes;
496 stat64_add(&ram_atomic_counters.transferred, bytes);
499 void dirty_sync_missed_zero_copy(void)
501 ram_counters.dirty_sync_missed_zero_copy++;
504 CompressionStats compression_counters;
506 struct CompressParam {
507 bool done;
508 bool quit;
509 bool zero_page;
510 QEMUFile *file;
511 QemuMutex mutex;
512 QemuCond cond;
513 RAMBlock *block;
514 ram_addr_t offset;
516 /* internally used fields */
517 z_stream stream;
518 uint8_t *originbuf;
520 typedef struct CompressParam CompressParam;
522 struct DecompressParam {
523 bool done;
524 bool quit;
525 QemuMutex mutex;
526 QemuCond cond;
527 void *des;
528 uint8_t *compbuf;
529 int len;
530 z_stream stream;
532 typedef struct DecompressParam DecompressParam;
534 static CompressParam *comp_param;
535 static QemuThread *compress_threads;
536 /* comp_done_cond is used to wake up the migration thread when
537 * one of the compression threads has finished the compression.
538  * comp_done_lock is used together with comp_done_cond.
540 static QemuMutex comp_done_lock;
541 static QemuCond comp_done_cond;
543 static QEMUFile *decomp_file;
544 static DecompressParam *decomp_param;
545 static QemuThread *decompress_threads;
546 static QemuMutex decomp_done_lock;
547 static QemuCond decomp_done_cond;
549 static int ram_save_host_page_urgent(PageSearchStatus *pss);
551 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
552 ram_addr_t offset, uint8_t *source_buf);
554 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
555 bool postcopy_requested);
557 /* NOTE: page is the PFN not real ram_addr_t. */
558 static void pss_init(PageSearchStatus *pss, RAMBlock *rb, ram_addr_t page)
560 pss->block = rb;
561 pss->page = page;
562 pss->complete_round = false;
566 * Check whether two PSSs are actively sending the same page. Return true
567 * if it is, false otherwise.
569 static bool pss_overlap(PageSearchStatus *pss1, PageSearchStatus *pss2)
571 return pss1->host_page_sending && pss2->host_page_sending &&
572 (pss1->host_page_start == pss2->host_page_start);
575 static void *do_data_compress(void *opaque)
577 CompressParam *param = opaque;
578 RAMBlock *block;
579 ram_addr_t offset;
580 bool zero_page;
582 qemu_mutex_lock(&param->mutex);
583 while (!param->quit) {
584 if (param->block) {
585 block = param->block;
586 offset = param->offset;
587 param->block = NULL;
588 qemu_mutex_unlock(&param->mutex);
590 zero_page = do_compress_ram_page(param->file, &param->stream,
591 block, offset, param->originbuf);
593 qemu_mutex_lock(&comp_done_lock);
594 param->done = true;
595 param->zero_page = zero_page;
596 qemu_cond_signal(&comp_done_cond);
597 qemu_mutex_unlock(&comp_done_lock);
599 qemu_mutex_lock(&param->mutex);
600 } else {
601 qemu_cond_wait(&param->cond, &param->mutex);
604 qemu_mutex_unlock(&param->mutex);
606 return NULL;
609 static void compress_threads_save_cleanup(void)
611 int i, thread_count;
613 if (!migrate_use_compression() || !comp_param) {
614 return;
617 thread_count = migrate_compress_threads();
618 for (i = 0; i < thread_count; i++) {
620          * we use it as an indicator of whether the thread is
621          * properly initialized or not
623 if (!comp_param[i].file) {
624 break;
627 qemu_mutex_lock(&comp_param[i].mutex);
628 comp_param[i].quit = true;
629 qemu_cond_signal(&comp_param[i].cond);
630 qemu_mutex_unlock(&comp_param[i].mutex);
632 qemu_thread_join(compress_threads + i);
633 qemu_mutex_destroy(&comp_param[i].mutex);
634 qemu_cond_destroy(&comp_param[i].cond);
635 deflateEnd(&comp_param[i].stream);
636 g_free(comp_param[i].originbuf);
637 qemu_fclose(comp_param[i].file);
638 comp_param[i].file = NULL;
640 qemu_mutex_destroy(&comp_done_lock);
641 qemu_cond_destroy(&comp_done_cond);
642 g_free(compress_threads);
643 g_free(comp_param);
644 compress_threads = NULL;
645 comp_param = NULL;
648 static int compress_threads_save_setup(void)
650 int i, thread_count;
652 if (!migrate_use_compression()) {
653 return 0;
655 thread_count = migrate_compress_threads();
656 compress_threads = g_new0(QemuThread, thread_count);
657 comp_param = g_new0(CompressParam, thread_count);
658 qemu_cond_init(&comp_done_cond);
659 qemu_mutex_init(&comp_done_lock);
660 for (i = 0; i < thread_count; i++) {
661 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
662 if (!comp_param[i].originbuf) {
663 goto exit;
666 if (deflateInit(&comp_param[i].stream,
667 migrate_compress_level()) != Z_OK) {
668 g_free(comp_param[i].originbuf);
669 goto exit;
672 /* comp_param[i].file is just used as a dummy buffer to save data,
673 * set its ops to empty.
675 comp_param[i].file = qemu_file_new_output(
676 QIO_CHANNEL(qio_channel_null_new()));
677 comp_param[i].done = true;
678 comp_param[i].quit = false;
679 qemu_mutex_init(&comp_param[i].mutex);
680 qemu_cond_init(&comp_param[i].cond);
681 qemu_thread_create(compress_threads + i, "compress",
682 do_data_compress, comp_param + i,
683 QEMU_THREAD_JOINABLE);
685 return 0;
687 exit:
688 compress_threads_save_cleanup();
689 return -1;
693 * save_page_header: write page header to wire
695 * If this is the 1st block, it also writes the block identification
697 * Returns the number of bytes written
699 * @pss: current PSS channel status
700 * @block: block that contains the page we want to send
701  * @offset: offset inside the block for the page;
702  *          the lower bits contain flags
704 static size_t save_page_header(PageSearchStatus *pss, RAMBlock *block,
705 ram_addr_t offset)
707 size_t size, len;
708 bool same_block = (block == pss->last_sent_block);
709 QEMUFile *f = pss->pss_channel;
711 if (same_block) {
712 offset |= RAM_SAVE_FLAG_CONTINUE;
714 qemu_put_be64(f, offset);
715 size = 8;
717 if (!same_block) {
718 len = strlen(block->idstr);
719 qemu_put_byte(f, len);
720 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
721 size += 1 + len;
722 pss->last_sent_block = block;
724 return size;
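/*
 * For illustration, the header written above takes one of two shapes:
 *
 *   same block as the last page:  be64(offset | flags | RAM_SAVE_FLAG_CONTINUE)
 *   new block:                    be64(offset | flags), u8 len, idstr[len]
 *
 * so the returned size is either 8 or 9 + strlen(block->idstr).
 */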
728 * mig_throttle_guest_down: throttle down the guest
730 * Reduce amount of guest cpu execution to hopefully slow down memory
731 * writes. If guest dirty memory rate is reduced below the rate at
732 * which we can transfer pages to the destination then we should be
733 * able to complete migration. Some workloads dirty memory way too
734 * fast and will not effectively converge, even with auto-converge.
736 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
737 uint64_t bytes_dirty_threshold)
739 MigrationState *s = migrate_get_current();
740 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
741 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
742 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
743 int pct_max = s->parameters.max_cpu_throttle;
745 uint64_t throttle_now = cpu_throttle_get_percentage();
746 uint64_t cpu_now, cpu_ideal, throttle_inc;
748 /* We have not started throttling yet. Let's start it. */
749 if (!cpu_throttle_active()) {
750 cpu_throttle_set(pct_initial);
751 } else {
752 /* Throttling already on, just increase the rate */
753 if (!pct_tailslow) {
754 throttle_inc = pct_increment;
755 } else {
756 /* Compute the ideal CPU percentage used by Guest, which may
757 * make the dirty rate match the dirty rate threshold. */
758 cpu_now = 100 - throttle_now;
759 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
760 bytes_dirty_period);
761 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
763 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
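/*
 * Worked example of the tailslow branch above (illustrative numbers):
 * with the throttle currently at 20%, the guest keeps cpu_now = 80%.
 * If bytes_dirty_threshold is half of bytes_dirty_period,
 * cpu_ideal = 80 * 0.5 = 40, the wanted cut is 80 - 40 = 40, clamped
 * to cpu_throttle_increment (say 10), giving a new throttle of 30%
 * (further capped by max_cpu_throttle).
 */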
767 void mig_throttle_counter_reset(void)
769 RAMState *rs = ram_state;
771 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
772 rs->num_dirty_pages_period = 0;
773 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
777 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
779 * @rs: current RAM state
780 * @current_addr: address for the zero page
782 * Update the xbzrle cache to reflect a page that's been sent as all 0.
783 * The important thing is that a stale (not-yet-0'd) page be replaced
784 * by the new data.
785 * As a bonus, if the page wasn't in the cache it gets added so that
786 * when a small write is made into the 0'd page it gets XBZRLE sent.
788 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
790 /* We don't care if this fails to allocate a new cache page
791 * as long as it updated an old one */
792 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
793 ram_counters.dirty_sync_count);
796 #define ENCODING_FLAG_XBZRLE 0x1
799 * save_xbzrle_page: compress and send current page
801 * Returns: 1 means that we wrote the page
802 * 0 means that page is identical to the one already sent
803 * -1 means that xbzrle would be longer than normal
805 * @rs: current RAM state
806 * @pss: current PSS channel
807 * @current_data: pointer to the address of the page contents
808 * @current_addr: addr of the page
809 * @block: block that contains the page we want to send
810 * @offset: offset inside the block for the page
812 static int save_xbzrle_page(RAMState *rs, PageSearchStatus *pss,
813 uint8_t **current_data, ram_addr_t current_addr,
814 RAMBlock *block, ram_addr_t offset)
816 int encoded_len = 0, bytes_xbzrle;
817 uint8_t *prev_cached_page;
818 QEMUFile *file = pss->pss_channel;
820 if (!cache_is_cached(XBZRLE.cache, current_addr,
821 ram_counters.dirty_sync_count)) {
822 xbzrle_counters.cache_miss++;
823 if (!rs->last_stage) {
824 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
825 ram_counters.dirty_sync_count) == -1) {
826 return -1;
827 } else {
828 /* update *current_data when the page has been
829 inserted into cache */
830 *current_data = get_cached_data(XBZRLE.cache, current_addr);
833 return -1;
837 * Reaching here means the page has hit the xbzrle cache, no matter what
838 * encoding result it is (normal encoding, overflow or skipping the page),
839 * count the page as encoded. This is used to calculate the encoding rate.
841 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
842 * 2nd page turns out to be skipped (i.e. no new bytes written to the
843 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
844 * skipped page included. In this way, the encoding rate can tell if the
845 * guest page is good for xbzrle encoding.
847 xbzrle_counters.pages++;
848 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
850 /* save current buffer into memory */
851 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
853 /* XBZRLE encoding (if there is no overflow) */
854 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
855 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
856 TARGET_PAGE_SIZE);
859 * Update the cache contents, so that it corresponds to the data
860 * sent, in all cases except where we skip the page.
862 if (!rs->last_stage && encoded_len != 0) {
863 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
865 * In the case where we couldn't compress, ensure that the caller
866 * sends the data from the cache, since the guest might have
867 * changed the RAM since we copied it.
869 *current_data = prev_cached_page;
872 if (encoded_len == 0) {
873 trace_save_xbzrle_page_skipping();
874 return 0;
875 } else if (encoded_len == -1) {
876 trace_save_xbzrle_page_overflow();
877 xbzrle_counters.overflow++;
878 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
879 return -1;
882 /* Send XBZRLE based compressed page */
883 bytes_xbzrle = save_page_header(pss, block,
884 offset | RAM_SAVE_FLAG_XBZRLE);
885 qemu_put_byte(file, ENCODING_FLAG_XBZRLE);
886 qemu_put_be16(file, encoded_len);
887 qemu_put_buffer(file, XBZRLE.encoded_buf, encoded_len);
888 bytes_xbzrle += encoded_len + 1 + 2;
890 * Like compressed_size (please see update_compress_thread_counts),
891 * the xbzrle encoded bytes don't count the 8 byte header with
892 * RAM_SAVE_FLAG_CONTINUE.
894 xbzrle_counters.bytes += bytes_xbzrle - 8;
895 ram_transferred_add(bytes_xbzrle);
897 return 1;
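/*
 * Byte accounting example for the XBZRLE path above (illustrative): for
 * a RAM_SAVE_FLAG_CONTINUE page whose delta encodes to 100 bytes,
 * bytes_xbzrle = 8 (header) + 1 (ENCODING_FLAG_XBZRLE) + 2 (be16 length)
 * + 100 = 111, and xbzrle_counters.bytes grows by 111 - 8 = 103, i.e.
 * the page header is excluded from the encoding-rate statistics.
 */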
901 * pss_find_next_dirty: find the next dirty page of current ramblock
903 * This function updates pss->page to point to the next dirty page index
904 * within the ramblock to migrate, or the end of ramblock when nothing
905 * found. Note that when pss->host_page_sending==true it means we're
906  * in the middle of sending a host page, so we won't look for dirty pages
907 * outside the host page boundary.
909 * @pss: the current page search status
911 static void pss_find_next_dirty(PageSearchStatus *pss)
913 RAMBlock *rb = pss->block;
914 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
915 unsigned long *bitmap = rb->bmap;
917 if (ramblock_is_ignored(rb)) {
918 /* Points directly to the end, so we know no dirty page */
919 pss->page = size;
920 return;
924      * If we are sending a host page, only look for dirty pages within the
925      * current host page being sent.
927 if (pss->host_page_sending) {
928 assert(pss->host_page_end);
929 size = MIN(size, pss->host_page_end);
932 pss->page = find_next_bit(bitmap, size, pss->page);
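/*
 * Example of the clamp above (illustrative, assuming 4 KiB target pages
 * backed by a 2 MiB hugetlb block): one host page spans 512 target
 * pages, so while host_page_sending is set the bitmap search never
 * walks past host_page_end, i.e. it stays within the 512 target pages
 * of the host page currently being sent.
 */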
935 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
936 unsigned long page)
938 uint8_t shift;
939 hwaddr size, start;
941 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
942 return;
945 shift = rb->clear_bmap_shift;
947 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
948      * can make things easier sometimes since the start address
949      * of the small chunk will always be aligned to 64 pages, so the
950 * bitmap will always be aligned to unsigned long. We should
951 * even be able to remove this restriction but I'm simply
952 * keeping it.
954 assert(shift >= 6);
956 size = 1ULL << (TARGET_PAGE_BITS + shift);
957 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
958 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
959 memory_region_clear_dirty_bitmap(rb->mr, start, size);
962 static void
963 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
964 unsigned long start,
965 unsigned long npages)
967 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
968 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
969 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
972 * Clear pages from start to start + npages - 1, so the end boundary is
973 * exclusive.
975 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
976 migration_clear_memory_region_dirty_bitmap(rb, i);
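/*
 * Worked example for the chunk alignment above (illustrative, assuming
 * clear_bmap_shift == 18, i.e. one clear chunk per 262144 target pages):
 * clearing start = 300000, npages = 10 yields chunk_start = 262144 and
 * chunk_end = 524288, so exactly one chunk (the one containing the
 * requested range) has its dirty bitmap lazily cleared.
 */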
981  * colo_bitmap_find_dirty: find contiguous dirty pages from start
983  * Returns the page offset within the memory region of the start of the contiguous
984  * dirty pages
986 * @rs: current RAM state
987 * @rb: RAMBlock where to search for dirty pages
988 * @start: page where we start the search
989 * @num: the number of contiguous dirty pages
991 static inline
992 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
993 unsigned long start, unsigned long *num)
995 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
996 unsigned long *bitmap = rb->bmap;
997 unsigned long first, next;
999 *num = 0;
1001 if (ramblock_is_ignored(rb)) {
1002 return size;
1005 first = find_next_bit(bitmap, size, start);
1006 if (first >= size) {
1007 return first;
1009 next = find_next_zero_bit(bitmap, size, first + 1);
1010 assert(next >= first);
1011 *num = next - first;
1012 return first;
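/*
 * Illustrative example for the lookup above: if pages 5..8 of the block
 * are dirty and @start is 3, find_next_bit() returns first = 5 and
 * find_next_zero_bit() returns 9, so *num = 4 and the caller can treat
 * [5, 9) as one contiguous dirty run.
 */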
1015 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
1016 RAMBlock *rb,
1017 unsigned long page)
1019 bool ret;
1022 * Clear dirty bitmap if needed. This _must_ be called before we
1023      * send any of the pages in the chunk, because we need to make sure
1024      * we can capture further page content changes when we sync the dirty
1025      * log the next time. So as long as we are going to send any of
1026      * the pages in the chunk, we clear the remote dirty bitmap for all.
1027 * Clearing it earlier won't be a problem, but too late will.
1029 migration_clear_memory_region_dirty_bitmap(rb, page);
1031 ret = test_and_clear_bit(page, rb->bmap);
1032 if (ret) {
1033 rs->migration_dirty_pages--;
1036 return ret;
1039 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
1040 void *opaque)
1042 const hwaddr offset = section->offset_within_region;
1043 const hwaddr size = int128_get64(section->size);
1044 const unsigned long start = offset >> TARGET_PAGE_BITS;
1045 const unsigned long npages = size >> TARGET_PAGE_BITS;
1046 RAMBlock *rb = section->mr->ram_block;
1047 uint64_t *cleared_bits = opaque;
1050 * We don't grab ram_state->bitmap_mutex because we expect to run
1051 * only when starting migration or during postcopy recovery where
1052 * we don't have concurrent access.
1054 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
1055 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
1057 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
1058 bitmap_clear(rb->bmap, start, npages);
1062 * Exclude all dirty pages from migration that fall into a discarded range as
1063 * managed by a RamDiscardManager responsible for the mapped memory region of
1064 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
1066 * Discarded pages ("logically unplugged") have undefined content and must
1067 * not get migrated, because even reading these pages for migration might
1068 * result in undesired behavior.
1070 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
1072 * Note: The result is only stable while migrating (precopy/postcopy).
1074 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
1076 uint64_t cleared_bits = 0;
1078 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
1079 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1080 MemoryRegionSection section = {
1081 .mr = rb->mr,
1082 .offset_within_region = 0,
1083 .size = int128_make64(qemu_ram_get_used_length(rb)),
1086 ram_discard_manager_replay_discarded(rdm, &section,
1087 dirty_bitmap_clear_section,
1088 &cleared_bits);
1090 return cleared_bits;
1094 * Check if a host-page aligned page falls into a discarded range as managed by
1095 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
1097 * Note: The result is only stable while migrating (precopy/postcopy).
1099 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
1101 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1102 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1103 MemoryRegionSection section = {
1104 .mr = rb->mr,
1105 .offset_within_region = start,
1106 .size = int128_make64(qemu_ram_pagesize(rb)),
1109 return !ram_discard_manager_is_populated(rdm, &section);
1111 return false;
1114 /* Called with RCU critical section */
1115 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
1117 uint64_t new_dirty_pages =
1118 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
1120 rs->migration_dirty_pages += new_dirty_pages;
1121 rs->num_dirty_pages_period += new_dirty_pages;
1125 * ram_pagesize_summary: calculate all the pagesizes of a VM
1127 * Returns a summary bitmap of the page sizes of all RAMBlocks
1129 * For VMs with just normal pages this is equivalent to the host page
1130 * size. If it's got some huge pages then it's the OR of all the
1131 * different page sizes.
1133 uint64_t ram_pagesize_summary(void)
1135 RAMBlock *block;
1136 uint64_t summary = 0;
1138 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1139 summary |= block->page_size;
1142 return summary;
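/*
 * Example (illustrative): a guest backed by ordinary 4 KiB pages plus
 * one 2 MiB hugetlb RAMBlock yields summary = 0x1000 | 0x200000 =
 * 0x201000, i.e. the OR of every distinct page size in use.
 */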
1145 uint64_t ram_get_total_transferred_pages(void)
1147 return stat64_get(&ram_atomic_counters.normal) +
1148 stat64_get(&ram_atomic_counters.duplicate) +
1149 compression_counters.pages + xbzrle_counters.pages;
1152 static void migration_update_rates(RAMState *rs, int64_t end_time)
1154 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1155 double compressed_size;
1157 /* calculate period counters */
1158 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1159 / (end_time - rs->time_last_bitmap_sync);
1161 if (!page_count) {
1162 return;
1165 if (migrate_use_xbzrle()) {
1166 double encoded_size, unencoded_size;
1168 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1169 rs->xbzrle_cache_miss_prev) / page_count;
1170 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1171 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1172 TARGET_PAGE_SIZE;
1173 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1174 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1175 xbzrle_counters.encoding_rate = 0;
1176 } else {
1177 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1179 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1180 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1183 if (migrate_use_compression()) {
1184 compression_counters.busy_rate = (double)(compression_counters.busy -
1185 rs->compress_thread_busy_prev) / page_count;
1186 rs->compress_thread_busy_prev = compression_counters.busy;
1188 compressed_size = compression_counters.compressed_size -
1189 rs->compressed_size_prev;
1190 if (compressed_size) {
1191 double uncompressed_size = (compression_counters.pages -
1192 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1194 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1195 compression_counters.compression_rate =
1196 uncompressed_size / compressed_size;
1198 rs->compress_pages_prev = compression_counters.pages;
1199 rs->compressed_size_prev = compression_counters.compressed_size;
1204 static void migration_trigger_throttle(RAMState *rs)
1206 MigrationState *s = migrate_get_current();
1207 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1208 uint64_t bytes_xfer_period =
1209 stat64_get(&ram_atomic_counters.transferred) - rs->bytes_xfer_prev;
1210 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1211 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1213 /* During block migration the auto-converge logic incorrectly detects
1214 * that ram migration makes no progress. Avoid this by disabling the
1215 * throttling logic during the bulk phase of block migration. */
1216 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1217 /* The following detection logic can be refined later. For now:
1218 Check to see if the ratio between dirtied bytes and the approx.
1219 amount of bytes that just got transferred since the last time
1220 we were in this routine reaches the threshold. If that happens
1221 twice, start or increase throttling. */
1223 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1224 (++rs->dirty_rate_high_cnt >= 2)) {
1225 trace_migration_throttle();
1226 rs->dirty_rate_high_cnt = 0;
1227 mig_throttle_guest_down(bytes_dirty_period,
1228 bytes_dirty_threshold);
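/*
 * Worked example for the trigger above (illustrative): with
 * throttle_trigger_threshold = 50 and 1 GiB transferred during the last
 * sync period, bytes_dirty_threshold = 512 MiB; if the guest dirties
 * more than that in two such periods, dirty_rate_high_cnt hits 2 and
 * mig_throttle_guest_down() starts (or tightens) the throttling.
 */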
1233 static void migration_bitmap_sync(RAMState *rs)
1235 RAMBlock *block;
1236 int64_t end_time;
1238 ram_counters.dirty_sync_count++;
1240 if (!rs->time_last_bitmap_sync) {
1241 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1244 trace_migration_bitmap_sync_start();
1245 memory_global_dirty_log_sync();
1247 qemu_mutex_lock(&rs->bitmap_mutex);
1248 WITH_RCU_READ_LOCK_GUARD() {
1249 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1250 ramblock_sync_dirty_bitmap(rs, block);
1252 ram_counters.remaining = ram_bytes_remaining();
1254 qemu_mutex_unlock(&rs->bitmap_mutex);
1256 memory_global_after_dirty_log_sync();
1257 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1259 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1261     /* more than 1 second = 1000 milliseconds */
1262 if (end_time > rs->time_last_bitmap_sync + 1000) {
1263 migration_trigger_throttle(rs);
1265 migration_update_rates(rs, end_time);
1267 rs->target_page_count_prev = rs->target_page_count;
1269 /* reset period counters */
1270 rs->time_last_bitmap_sync = end_time;
1271 rs->num_dirty_pages_period = 0;
1272 rs->bytes_xfer_prev = stat64_get(&ram_atomic_counters.transferred);
1274 if (migrate_use_events()) {
1275 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1279 static void migration_bitmap_sync_precopy(RAMState *rs)
1281 Error *local_err = NULL;
1284 * The current notifier usage is just an optimization to migration, so we
1285 * don't stop the normal migration process in the error case.
1287 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1288 error_report_err(local_err);
1289 local_err = NULL;
1292 migration_bitmap_sync(rs);
1294 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1295 error_report_err(local_err);
1299 void ram_release_page(const char *rbname, uint64_t offset)
1301 if (!migrate_release_ram() || !migration_in_postcopy()) {
1302 return;
1305 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1309 * save_zero_page_to_file: send the zero page to the file
1311 * Returns the size of data written to the file, 0 means the page is not
1312 * a zero page
1314 * @pss: current PSS channel
1315 * @block: block that contains the page we want to send
1316 * @offset: offset inside the block for the page
1318 static int save_zero_page_to_file(PageSearchStatus *pss,
1319 RAMBlock *block, ram_addr_t offset)
1321 uint8_t *p = block->host + offset;
1322 QEMUFile *file = pss->pss_channel;
1323 int len = 0;
1325 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1326 len += save_page_header(pss, block, offset | RAM_SAVE_FLAG_ZERO);
1327 qemu_put_byte(file, 0);
1328 len += 1;
1329 ram_release_page(block->idstr, offset);
1331 return len;
1335 * save_zero_page: send the zero page to the stream
1337 * Returns the number of pages written.
1339 * @pss: current PSS channel
1340 * @block: block that contains the page we want to send
1341 * @offset: offset inside the block for the page
1343 static int save_zero_page(PageSearchStatus *pss, RAMBlock *block,
1344 ram_addr_t offset)
1346 int len = save_zero_page_to_file(pss, block, offset);
1348 if (len) {
1349 stat64_add(&ram_atomic_counters.duplicate, 1);
1350 ram_transferred_add(len);
1351 return 1;
1353 return -1;
1357 * @pages: the number of pages written by the control path,
1358 * < 0 - error
1359 * > 0 - number of pages written
1361  * Return true if the page has been saved, otherwise false is returned.
1363 static bool control_save_page(PageSearchStatus *pss, RAMBlock *block,
1364 ram_addr_t offset, int *pages)
1366 uint64_t bytes_xmit = 0;
1367 int ret;
1369 *pages = -1;
1370 ret = ram_control_save_page(pss->pss_channel, block->offset, offset,
1371 TARGET_PAGE_SIZE, &bytes_xmit);
1372 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1373 return false;
1376 if (bytes_xmit) {
1377 ram_transferred_add(bytes_xmit);
1378 *pages = 1;
1381 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1382 return true;
1385 if (bytes_xmit > 0) {
1386 stat64_add(&ram_atomic_counters.normal, 1);
1387 } else if (bytes_xmit == 0) {
1388 stat64_add(&ram_atomic_counters.duplicate, 1);
1391 return true;
1395 * directly send the page to the stream
1397 * Returns the number of pages written.
1399 * @pss: current PSS channel
1400 * @block: block that contains the page we want to send
1401 * @offset: offset inside the block for the page
1402 * @buf: the page to be sent
1403  * @async: send the page asynchronously
1405 static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
1406 ram_addr_t offset, uint8_t *buf, bool async)
1408 QEMUFile *file = pss->pss_channel;
1410 ram_transferred_add(save_page_header(pss, block,
1411 offset | RAM_SAVE_FLAG_PAGE));
1412 if (async) {
1413 qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
1414 migrate_release_ram() &&
1415 migration_in_postcopy());
1416 } else {
1417 qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
1419 ram_transferred_add(TARGET_PAGE_SIZE);
1420 stat64_add(&ram_atomic_counters.normal, 1);
1421 return 1;
1425 * ram_save_page: send the given page to the stream
1427 * Returns the number of pages written.
1428 * < 0 - error
1429 * >=0 - Number of pages written - this might legally be 0
1430 * if xbzrle noticed the page was the same.
1432 * @rs: current RAM state
1433 * @block: block that contains the page we want to send
1434 * @offset: offset inside the block for the page
1436 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1438 int pages = -1;
1439 uint8_t *p;
1440 bool send_async = true;
1441 RAMBlock *block = pss->block;
1442 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1443 ram_addr_t current_addr = block->offset + offset;
1445 p = block->host + offset;
1446 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1448 XBZRLE_cache_lock();
1449 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1450 pages = save_xbzrle_page(rs, pss, &p, current_addr,
1451 block, offset);
1452 if (!rs->last_stage) {
1453 /* Can't send this cached data async, since the cache page
1454 * might get updated before it gets to the wire
1456 send_async = false;
1460 /* XBZRLE overflow or normal page */
1461 if (pages == -1) {
1462 pages = save_normal_page(pss, block, offset, p, send_async);
1465 XBZRLE_cache_unlock();
1467 return pages;
1470 static int ram_save_multifd_page(QEMUFile *file, RAMBlock *block,
1471 ram_addr_t offset)
1473 if (multifd_queue_page(file, block, offset) < 0) {
1474 return -1;
1476 stat64_add(&ram_atomic_counters.normal, 1);
1478 return 1;
1481 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1482 ram_addr_t offset, uint8_t *source_buf)
1484 RAMState *rs = ram_state;
1485 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
1486 uint8_t *p = block->host + offset;
1487 int ret;
1489 if (save_zero_page_to_file(pss, block, offset)) {
1490 return true;
1493 save_page_header(pss, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1496      * copy it to an internal buffer to avoid it being modified by the VM,
1497      * so that we can catch errors during compression and
1498 * decompression
1500 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1501 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1502 if (ret < 0) {
1503 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1504 error_report("compressed data failed!");
1506 return false;
1509 static void
1510 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1512 ram_transferred_add(bytes_xmit);
1514 if (param->zero_page) {
1515 stat64_add(&ram_atomic_counters.duplicate, 1);
1516 return;
1519 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1520 compression_counters.compressed_size += bytes_xmit - 8;
1521 compression_counters.pages++;
1524 static bool save_page_use_compression(RAMState *rs);
1526 static void flush_compressed_data(RAMState *rs)
1528 MigrationState *ms = migrate_get_current();
1529 int idx, len, thread_count;
1531 if (!save_page_use_compression(rs)) {
1532 return;
1534 thread_count = migrate_compress_threads();
1536 qemu_mutex_lock(&comp_done_lock);
1537 for (idx = 0; idx < thread_count; idx++) {
1538 while (!comp_param[idx].done) {
1539 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1542 qemu_mutex_unlock(&comp_done_lock);
1544 for (idx = 0; idx < thread_count; idx++) {
1545 qemu_mutex_lock(&comp_param[idx].mutex);
1546 if (!comp_param[idx].quit) {
1547 len = qemu_put_qemu_file(ms->to_dst_file, comp_param[idx].file);
1549 * it's safe to fetch zero_page without holding comp_done_lock
1550 * as there is no further request submitted to the thread,
1551              * i.e., the thread should be waiting for a request at this point.
1553 update_compress_thread_counts(&comp_param[idx], len);
1555 qemu_mutex_unlock(&comp_param[idx].mutex);
1559 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1560 ram_addr_t offset)
1562 param->block = block;
1563 param->offset = offset;
1566 static int compress_page_with_multi_thread(RAMBlock *block, ram_addr_t offset)
1568 int idx, thread_count, bytes_xmit = -1, pages = -1;
1569 bool wait = migrate_compress_wait_thread();
1570 MigrationState *ms = migrate_get_current();
1572 thread_count = migrate_compress_threads();
1573 qemu_mutex_lock(&comp_done_lock);
1574 retry:
1575 for (idx = 0; idx < thread_count; idx++) {
1576 if (comp_param[idx].done) {
1577 comp_param[idx].done = false;
1578 bytes_xmit = qemu_put_qemu_file(ms->to_dst_file,
1579 comp_param[idx].file);
1580 qemu_mutex_lock(&comp_param[idx].mutex);
1581 set_compress_params(&comp_param[idx], block, offset);
1582 qemu_cond_signal(&comp_param[idx].cond);
1583 qemu_mutex_unlock(&comp_param[idx].mutex);
1584 pages = 1;
1585 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1586 break;
1591      * wait for a free thread if the user specifies 'compress-wait-thread',
1592      * otherwise we will post the page out in the main thread as a normal page.
1594 if (pages < 0 && wait) {
1595 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1596 goto retry;
1598 qemu_mutex_unlock(&comp_done_lock);
1600 return pages;
1604 * find_dirty_block: find the next dirty page and update any state
1605 * associated with the search process.
1607 * Returns true if a page is found
1609 * @rs: current RAM state
1610 * @pss: data about the state of the current dirty page scan
1611 * @again: set to false if the search has scanned the whole of RAM
1613 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1616 * This is not a postcopy requested page, mark it "not urgent", and use
1617 * precopy channel to send it.
1619 pss->postcopy_requested = false;
1620 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
1622 /* Update pss->page for the next dirty bit in ramblock */
1623 pss_find_next_dirty(pss);
1625 if (pss->complete_round && pss->block == rs->last_seen_block &&
1626 pss->page >= rs->last_page) {
1628 * We've been once around the RAM and haven't found anything.
1629 * Give up.
1631 *again = false;
1632 return false;
1634 if (!offset_in_ramblock(pss->block,
1635 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1636 /* Didn't find anything in this RAM Block */
1637 pss->page = 0;
1638 pss->block = QLIST_NEXT_RCU(pss->block, next);
1639 if (!pss->block) {
1641 * If memory migration starts over, we will meet a dirtied page
1642              * which may still exist in the compression threads' ring, so we
1643 * should flush the compressed data to make sure the new page
1644 * is not overwritten by the old one in the destination.
1646              * Also, if xbzrle is on, stop using the data compression at this
1647 * point. In theory, xbzrle can do better than compression.
1649 flush_compressed_data(rs);
1651 /* Hit the end of the list */
1652 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1653 /* Flag that we've looped */
1654 pss->complete_round = true;
1655 /* After the first round, enable XBZRLE. */
1656 if (migrate_use_xbzrle()) {
1657 rs->xbzrle_enabled = true;
1660 /* Didn't find anything this time, but try again on the new block */
1661 *again = true;
1662 return false;
1663 } else {
1664 /* Can go around again, but... */
1665 *again = true;
1666 /* We've found something so probably don't need to */
1667 return true;
1672 * unqueue_page: gets a page of the queue
1674 * Helper for 'get_queued_page' - gets a page off the queue
1676 * Returns the block of the page (or NULL if none available)
1678 * @rs: current RAM state
1679 * @offset: used to return the offset within the RAMBlock
1681 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1683 struct RAMSrcPageRequest *entry;
1684 RAMBlock *block = NULL;
1686 if (!postcopy_has_request(rs)) {
1687 return NULL;
1690 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1693 * This should _never_ change even after we take the lock, because no one
1694 * should be taking anything off the request list other than us.
1696 assert(postcopy_has_request(rs));
1698 entry = QSIMPLEQ_FIRST(&rs->src_page_requests);
1699 block = entry->rb;
1700 *offset = entry->offset;
1702 if (entry->len > TARGET_PAGE_SIZE) {
1703 entry->len -= TARGET_PAGE_SIZE;
1704 entry->offset += TARGET_PAGE_SIZE;
1705 } else {
1706 memory_region_unref(block->mr);
1707 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1708 g_free(entry);
1709 migration_consume_urgent_request();
1712 return block;
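/*
 * Illustrative example for the queue handling above (assuming 4 KiB
 * target pages): a single postcopy request covering a 2 MiB huge page
 * is queued as one entry with len = 2 MiB; each call hands back the
 * current offset and shrinks the entry by TARGET_PAGE_SIZE, so the
 * entry is only popped after 512 calls, once the whole range has been
 * consumed.
 */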
1715 #if defined(__linux__)
1717 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1718 * is found, return RAM block pointer and page offset
1720 * Returns pointer to the RAMBlock containing faulting page,
1721 * NULL if no write faults are pending
1723 * @rs: current RAM state
1724 * @offset: page offset from the beginning of the block
1726 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1728 struct uffd_msg uffd_msg;
1729 void *page_address;
1730 RAMBlock *block;
1731 int res;
1733 if (!migrate_background_snapshot()) {
1734 return NULL;
1737 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1738 if (res <= 0) {
1739 return NULL;
1742 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1743 block = qemu_ram_block_from_host(page_address, false, offset);
1744 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1745 return block;
1749 * ram_save_release_protection: release UFFD write protection after
1750 * a range of pages has been saved
1752 * @rs: current RAM state
1753 * @pss: page-search-status structure
1754 * @start_page: index of the first page in the range relative to pss->block
1756 * Returns 0 on success, negative value in case of an error
1758 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1759 unsigned long start_page)
1761 int res = 0;
1763 /* Check if page is from UFFD-managed region. */
1764 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1765 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1766 uint64_t run_length = (pss->page - start_page) << TARGET_PAGE_BITS;
1768 /* Flush async buffers before un-protect. */
1769 qemu_fflush(pss->pss_channel);
1770 /* Un-protect memory range. */
1771 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1772 false, false);
1775 return res;
1778 /* ram_write_tracking_available: check if kernel supports required UFFD features
1780 * Returns true if supports, false otherwise
1782 bool ram_write_tracking_available(void)
1784 uint64_t uffd_features;
1785 int res;
1787 res = uffd_query_features(&uffd_features);
1788 return (res == 0 &&
1789 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1792 /* ram_write_tracking_compatible: check if guest configuration is
1793 * compatible with 'write-tracking'
1795 * Returns true if compatible, false otherwise
1797 bool ram_write_tracking_compatible(void)
1799 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1800 int uffd_fd;
1801 RAMBlock *block;
1802 bool ret = false;
1804 /* Open UFFD file descriptor */
1805 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1806 if (uffd_fd < 0) {
1807 return false;
1810 RCU_READ_LOCK_GUARD();
1812 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1813 uint64_t uffd_ioctls;
1815 /* Nothing to do with read-only and MMIO-writable regions */
1816 if (block->mr->readonly || block->mr->rom_device) {
1817 continue;
1819 /* Try to register block memory via UFFD-IO to track writes */
1820 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1821 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1822 goto out;
1824 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1825 goto out;
1828 ret = true;
1830 out:
1831 uffd_close_fd(uffd_fd);
1832 return ret;
1835 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1836 ram_addr_t size)
1839 * We read one byte of each page; this will preallocate page tables if
1840 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1841      * where no page was populated yet. This might require adaptation when
1842 * supporting other mappings, like shmem.
1844 for (; offset < size; offset += block->page_size) {
1845 char tmp = *((char *)block->host + offset);
1847 /* Don't optimize the read out */
1848 asm volatile("" : "+r" (tmp));
1852 static inline int populate_read_section(MemoryRegionSection *section,
1853 void *opaque)
1855 const hwaddr size = int128_get64(section->size);
1856 hwaddr offset = section->offset_within_region;
1857 RAMBlock *block = section->mr->ram_block;
1859 populate_read_range(block, offset, size);
1860 return 0;
1864 * ram_block_populate_read: preallocate page tables and populate pages in the
1865 * RAM block by reading a byte of each page.
1867 * Since it's solely used for userfault_fd WP feature, here we just
1868 * hardcode page size to qemu_real_host_page_size.
1870 * @block: RAM block to populate
1872 static void ram_block_populate_read(RAMBlock *rb)
1875 * Skip populating all pages that fall into a discarded range as managed by
1876 * a RamDiscardManager responsible for the mapped memory region of the
1877 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1878 * must not get populated automatically. We don't have to track
1879 * modifications via userfaultfd WP reliably, because these pages will
1880 * not be part of the migration stream either way -- see
1881 * ramblock_dirty_bitmap_exclude_discarded_pages().
1883 * Note: The result is only stable while migrating (precopy/postcopy).
1885 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1886 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1887 MemoryRegionSection section = {
1888 .mr = rb->mr,
1889 .offset_within_region = 0,
1890 .size = rb->mr->size,
1893 ram_discard_manager_replay_populated(rdm, &section,
1894 populate_read_section, NULL);
1895 } else {
1896 populate_read_range(rb, 0, rb->used_length);
1901 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1903 void ram_write_tracking_prepare(void)
1905 RAMBlock *block;
1907 RCU_READ_LOCK_GUARD();
1909 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1910 /* Nothing to do with read-only and MMIO-writable regions */
1911 if (block->mr->readonly || block->mr->rom_device) {
1912 continue;
1916 * Populate pages of the RAM block before enabling userfault_fd
1917 * write protection.
1919 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1920 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1921 * pages with pte_none() entries in page table.
1923 ram_block_populate_read(block);
1928 * ram_write_tracking_start: start UFFD-WP memory tracking
1930 * Returns 0 for success or negative value in case of error
1932 int ram_write_tracking_start(void)
1934 int uffd_fd;
1935 RAMState *rs = ram_state;
1936 RAMBlock *block;
1938 /* Open UFFD file descriptor */
1939 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1940 if (uffd_fd < 0) {
1941 return uffd_fd;
1943 rs->uffdio_fd = uffd_fd;
1945 RCU_READ_LOCK_GUARD();
1947 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1948 /* Nothing to do with read-only and MMIO-writable regions */
1949 if (block->mr->readonly || block->mr->rom_device) {
1950 continue;
1953 /* Register block memory with UFFD to track writes */
1954 if (uffd_register_memory(rs->uffdio_fd, block->host,
1955 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1956 goto fail;
1958 /* Apply UFFD write protection to the block memory range */
1959 if (uffd_change_protection(rs->uffdio_fd, block->host,
1960 block->max_length, true, false)) {
1961 goto fail;
1963 block->flags |= RAM_UF_WRITEPROTECT;
1964 memory_region_ref(block->mr);
1966 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1967 block->host, block->max_length);
1970 return 0;
1972 fail:
1973 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1975 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1976 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1977 continue;
1980 * In case some memory block failed to be write-protected,
1981 * remove protection and unregister all RAM blocks that succeeded
1983 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1984 false, false);
1985 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1986 /* Cleanup flags and remove reference */
1987 block->flags &= ~RAM_UF_WRITEPROTECT;
1988 memory_region_unref(block->mr);
1991 uffd_close_fd(uffd_fd);
1992 rs->uffdio_fd = -1;
1993 return -1;
1997 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1999 void ram_write_tracking_stop(void)
2001 RAMState *rs = ram_state;
2002 RAMBlock *block;
2004 RCU_READ_LOCK_GUARD();
2006 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2007 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
2008 continue;
2010 /* Remove protection and unregister all affected RAM blocks */
2011 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
2012 false, false);
2013 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
2015 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
2016 block->host, block->max_length);
2018 /* Cleanup flags and remove reference */
2019 block->flags &= ~RAM_UF_WRITEPROTECT;
2020 memory_region_unref(block->mr);
2023 /* Finally close UFFD file descriptor */
2024 uffd_close_fd(rs->uffdio_fd);
2025 rs->uffdio_fd = -1;
2028 #else
2029 /* No target OS support, stubs just fail or ignore */
2031 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
2033 (void) rs;
2034 (void) offset;
2036 return NULL;
2039 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
2040 unsigned long start_page)
2042 (void) rs;
2043 (void) pss;
2044 (void) start_page;
2046 return 0;
2049 bool ram_write_tracking_available(void)
2051 return false;
2054 bool ram_write_tracking_compatible(void)
2056 assert(0);
2057 return false;
2060 int ram_write_tracking_start(void)
2062 assert(0);
2063 return -1;
2066 void ram_write_tracking_stop(void)
2068 assert(0);
2070 #endif /* defined(__linux__) */
2073 * Check whether two addresses/offsets of the ramblock fall onto the same
2074 * host huge page. Returns true if so, false otherwise.
2076 static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
2077 uint64_t addr2)
2079 size_t page_size = qemu_ram_pagesize(rb);
2081 addr1 = ROUND_DOWN(addr1, page_size);
2082 addr2 = ROUND_DOWN(addr2, page_size);
2084 return addr1 == addr2;
2088 * Check whether a previously preempted precopy huge page contains the
2089 * currently requested page. Returns true if so, false otherwise.
2091 * This should really happen very rarely, because it means that while we were
2092 * sending during background migration for postcopy, we were sending exactly
2093 * the page that some vcpu faulted on at the dest node. When it happens, we
2094 * probably don't need to do much but drop the request, because we know that
2095 * right after we restore the precopy stream it'll be serviced. It'll slightly
2096 * affect the order in which postcopy requests are serviced (e.g. it'll be the
2097 * same as if we moved the current request to the end of the queue), but it
2098 * shouldn't be a big deal. The most important thing is that we can _never_
2099 * try to send a partially-sent huge page on the POSTCOPY channel again,
2100 * otherwise that huge page will get "split brain" on two channels (PRECOPY, POSTCOPY).
2102 static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
2103 ram_addr_t offset)
2105 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2107 /* No preemption at all? */
2108 if (!state->preempted) {
2109 return false;
2112 /* Not even the same ramblock? */
2113 if (state->ram_block != block) {
2114 return false;
2117 return offset_on_same_huge_page(block, offset,
2118 state->ram_page << TARGET_PAGE_BITS);
2122 * get_queued_page: unqueue a page from the postcopy requests
2124 * Skips pages that have already been sent (!dirty)
2126 * Returns true if a queued page is found
2128 * @rs: current RAM state
2129 * @pss: data about the state of the current dirty page scan
2131 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
2133 RAMBlock *block;
2134 ram_addr_t offset;
2135 bool dirty;
2137 do {
2138 block = unqueue_page(rs, &offset);
2140 * We're sending this page, and since it's postcopy nothing else
2141 * will dirty it, and we must make sure it doesn't get sent again
2142 * even if this queue request was received after the background
2143 * search already sent it.
2145 if (block) {
2146 unsigned long page;
2148 page = offset >> TARGET_PAGE_BITS;
2149 dirty = test_bit(page, block->bmap);
2150 if (!dirty) {
2151 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
2152 page);
2153 } else {
2154 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
2158 } while (block && !dirty);
2160 if (block) {
2161 /* See comment above postcopy_preempted_contains() */
2162 if (postcopy_preempted_contains(rs, block, offset)) {
2163 trace_postcopy_preempt_hit(block->idstr, offset);
2165 * If what we preempted previously was exactly what we're
2166 * requesting right now, restore the preempted precopy
2167 * immediately, boosting its priority as it's requested by
2168 * postcopy.
2170 postcopy_preempt_restore(rs, pss, true);
2171 return true;
2173 } else {
2175 * Poll write faults too if background snapshot is enabled; that's
2176 * when we have vcpus blocked by the write-protected pages.
2178 block = poll_fault_page(rs, &offset);
2181 if (block) {
2183 * We want the background search to continue from the queued page
2184 * since the guest is likely to want other pages near to the page
2185 * it just requested.
2187 pss->block = block;
2188 pss->page = offset >> TARGET_PAGE_BITS;
2191 * This unqueued page would break the "one round" check, even if it's
2192 * really rare.
2194 pss->complete_round = false;
2195 /* Mark it an urgent request, meanwhile using POSTCOPY channel */
2196 pss->postcopy_requested = true;
2197 pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
2200 return !!block;
2204 * migration_page_queue_free: drop any remaining pages in the ram
2205 * request queue
2207 * It should be empty at the end anyway, but in error cases there may
2208 * be some left; in case any page is left, we drop it.
2211 static void migration_page_queue_free(RAMState *rs)
2213 struct RAMSrcPageRequest *mspr, *next_mspr;
2214 /* This queue should generally be empty - but in the case of a failed
2215 * migration it might have some entries left over.
2217 RCU_READ_LOCK_GUARD();
2218 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2219 memory_region_unref(mspr->rb->mr);
2220 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2221 g_free(mspr);
2226 * ram_save_queue_pages: queue the page for transmission
2228 * A request from postcopy destination for example.
2230 * Returns zero on success or negative on error
2232 * @rbname: Name of the RAMBlock of the request. NULL means the
2233 * same as the last one.
2234 * @start: starting address from the start of the RAMBlock
2235 * @len: length (in bytes) to send
2237 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2239 RAMBlock *ramblock;
2240 RAMState *rs = ram_state;
2242 ram_counters.postcopy_requests++;
2243 RCU_READ_LOCK_GUARD();
2245 if (!rbname) {
2246 /* Reuse last RAMBlock */
2247 ramblock = rs->last_req_rb;
2249 if (!ramblock) {
2251 * Shouldn't happen, we can't reuse the last RAMBlock if
2252 * it's the 1st request.
2254 error_report("ram_save_queue_pages no previous block");
2255 return -1;
2257 } else {
2258 ramblock = qemu_ram_block_by_name(rbname);
2260 if (!ramblock) {
2261 /* We shouldn't be asked for a non-existent RAMBlock */
2262 error_report("ram_save_queue_pages no block '%s'", rbname);
2263 return -1;
2265 rs->last_req_rb = ramblock;
2267 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2268 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2269 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2270 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2271 __func__, start, len, ramblock->used_length);
2272 return -1;
2276 * With postcopy preempt, we send back the page directly in the
2277 * rp-return thread.
2279 if (postcopy_preempt_active()) {
2280 ram_addr_t page_start = start >> TARGET_PAGE_BITS;
2281 size_t page_size = qemu_ram_pagesize(ramblock);
2282 PageSearchStatus *pss = &ram_state->pss[RAM_CHANNEL_POSTCOPY];
2283 int ret = 0;
2285 qemu_mutex_lock(&rs->bitmap_mutex);
2287 pss_init(pss, ramblock, page_start);
2289 * Always use the preempt channel, and make sure it's there. It's
2290 * safe to access without the lock, because when the rp-thread is
2291 * running we should be the only one that operates on the qemufile.
2293 pss->pss_channel = migrate_get_current()->postcopy_qemufile_src;
2294 pss->postcopy_requested = true;
2295 assert(pss->pss_channel);
2298 * It must be one host page or a multiple of the host page size. Just
2299 * assert; if something is wrong we're mostly split-brain anyway.
2301 assert(len % page_size == 0);
2302 while (len) {
2303 if (ram_save_host_page_urgent(pss)) {
2304 error_report("%s: ram_save_host_page_urgent() failed: "
2305 "ramblock=%s, start_addr=0x"RAM_ADDR_FMT,
2306 __func__, ramblock->idstr, start);
2307 ret = -1;
2308 break;
2311 * NOTE: after ram_save_host_page_urgent() succeeded, pss->page
2312 * will automatically be moved and point to the next host page
2313 * we're going to send, so no need to update here.
2315 * Normally QEMU never sends >1 host page in requests, so
2316 * logically we don't even need that as the loop should only
2317 * run once, but just to be consistent.
2319 len -= page_size;
2321 qemu_mutex_unlock(&rs->bitmap_mutex);
2323 return ret;
2326 struct RAMSrcPageRequest *new_entry =
2327 g_new0(struct RAMSrcPageRequest, 1);
2328 new_entry->rb = ramblock;
2329 new_entry->offset = start;
2330 new_entry->len = len;
2332 memory_region_ref(ramblock->mr);
2333 qemu_mutex_lock(&rs->src_page_req_mutex);
2334 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2335 migration_make_urgent_request();
2336 qemu_mutex_unlock(&rs->src_page_req_mutex);
2338 return 0;
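/* Whether to use the compression path for this page: false if compression
 * is disabled, or once xbzrle has taken over after the first round. */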
2341 static bool save_page_use_compression(RAMState *rs)
2343 if (!migrate_use_compression()) {
2344 return false;
2348 * If xbzrle is enabled (e.g., after the first round of migration), stop
2349 * using data compression. In theory, xbzrle can do better than
2350 * compression.
2352 if (rs->xbzrle_enabled) {
2353 return false;
2356 return true;
2360 * Try to compress the page before posting it out. Return true if the page
2361 * has been properly handled by compression; otherwise other paths need
2362 * to handle it.
2364 static bool save_compress_page(RAMState *rs, PageSearchStatus *pss,
2365 RAMBlock *block, ram_addr_t offset)
2367 if (!save_page_use_compression(rs)) {
2368 return false;
2372 * When starting the process of a new block, the first page of
2373 * the block should be sent out before other pages in the same
2374 * block, and all the pages in the last block should have been sent
2375 * out. Keeping this order is important, because the 'cont' flag
2376 * is used to avoid resending the block name.
2378 * We post the first page as a normal page as compression will take
2379 * a lot of CPU resources.
2381 if (block != pss->last_sent_block) {
2382 flush_compressed_data(rs);
2383 return false;
2386 if (compress_page_with_multi_thread(block, offset) > 0) {
2387 return true;
2390 compression_counters.busy++;
2391 return false;
2395 * ram_save_target_page: save one target page
2397 * Returns the number of pages written
2399 * @rs: current RAM state
2400 * @pss: data about the page we want to send
2402 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2404 RAMBlock *block = pss->block;
2405 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2406 int res;
2408 if (control_save_page(pss, block, offset, &res)) {
2409 return res;
2412 if (save_compress_page(rs, pss, block, offset)) {
2413 return 1;
2416 res = save_zero_page(pss, block, offset);
2417 if (res > 0) {
2418 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2419 * page would be stale
2421 if (rs->xbzrle_enabled) {
2422 XBZRLE_cache_lock();
2423 xbzrle_cache_zero_page(rs, block->offset + offset);
2424 XBZRLE_cache_unlock();
2426 return res;
2430 * Do not use multifd in postcopy, as one whole host page should be
2431 * placed atomically. Postcopy requires atomic updates of pages, so even
2432 * if host page size == guest page size the running dest guest may
2433 * still see partially copied pages, which is data corruption.
2435 if (migrate_use_multifd() && !migration_in_postcopy()) {
2436 return ram_save_multifd_page(pss->pss_channel, block, offset);
2439 return ram_save_page(rs, pss);
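/* Whether the precopy stream should be preempted so a pending postcopy
 * request can be serviced; only relevant when sending a huge-page ramblock
 * during postcopy with preemption enabled. */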
2442 static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
2444 MigrationState *ms = migrate_get_current();
2446 /* Eager preempt not enabled? Then never do that. */
2447 if (!migrate_postcopy_preempt()) {
2448 return false;
2451 /* If the user explicitly disabled breaking of huge page, skip */
2452 if (!ms->postcopy_preempt_break_huge) {
2453 return false;
2456 /* If the ramblock we're sending uses small pages, never bother. */
2457 if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
2458 return false;
2461 /* Not in postcopy at all? */
2462 if (!migration_in_postcopy()) {
2463 return false;
2467 * If we're already handling a postcopy request, don't preempt as this page
2468 * has got the same high priority.
2470 if (pss->postcopy_requested) {
2471 return false;
2474 /* If there's postcopy requests, then check it up! */
2475 return postcopy_has_request(rs);
2478 /* Preempt the ongoing precopy send and cache its state for a later restore */
2479 static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
2481 PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
2483 trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
2486 * Time to preempt precopy. Cache the current PSS into the preempt state, so
2487 * that after handling the postcopy pages we can recover to it. We need to
2488 * do so because the dest VM will have part of the precopy huge page kept
2489 * over in its tmp huge page caches; better move on with it when we can.
2491 p_state->ram_block = pss->block;
2492 p_state->ram_page = pss->page;
2493 p_state->preempted = true;
2496 /* Whether we're preempted by a postcopy request during sending a huge page */
2497 static bool postcopy_preempt_triggered(RAMState *rs)
2499 return rs->postcopy_preempt_state.preempted;
2502 static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
2503 bool postcopy_requested)
2505 PostcopyPreemptState *state = &rs->postcopy_preempt_state;
2507 assert(state->preempted);
2509 pss->block = state->ram_block;
2510 pss->page = state->ram_page;
2512 /* Whether this is a postcopy request? */
2513 pss->postcopy_requested = postcopy_requested;
2515 * When restoring a preempted page, the old data resides in PRECOPY
2516 * slow channel, even if postcopy_requested is set. So always use
2517 * PRECOPY channel here.
2519 pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
2521 trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
2523 /* Reset preempt state, most importantly, set preempted==false */
2524 postcopy_preempt_reset(rs);
2527 static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
2529 MigrationState *s = migrate_get_current();
2530 unsigned int channel = pss->postcopy_target_channel;
2531 QEMUFile *next;
2533 if (channel != rs->postcopy_channel) {
2534 if (channel == RAM_CHANNEL_PRECOPY) {
2535 next = s->to_dst_file;
2536 } else {
2537 next = s->postcopy_qemufile_src;
2539 /* Update and cache the current channel */
2540 rs->f = next;
2541 rs->postcopy_channel = channel;
2544 * If channel switched, reset last_sent_block since the old sent block
2545 * may not be on the same channel.
2547 pss->last_sent_block = NULL;
2549 trace_postcopy_preempt_switch_channel(channel);
2552 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2555 /* We need to make sure rs->f always points to the default channel elsewhere */
2556 static void postcopy_preempt_reset_channel(RAMState *rs)
2558 if (postcopy_preempt_active()) {
2559 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2560 rs->f = migrate_get_current()->to_dst_file;
2561 trace_postcopy_preempt_reset_channel();
2565 /* Should be called before sending a host page */
2566 static void pss_host_page_prepare(PageSearchStatus *pss)
2568 /* How many guest pages are there in one host page? */
2569 size_t guest_pfns = qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2571 pss->host_page_sending = true;
2572 pss->host_page_start = ROUND_DOWN(pss->page, guest_pfns);
2573 pss->host_page_end = ROUND_UP(pss->page + 1, guest_pfns);
2577 * Whether the page pointed to by PSS is within the host page being sent.
2578 * Must be called after a previous pss_host_page_prepare().
2580 static bool pss_within_range(PageSearchStatus *pss)
2582 ram_addr_t ram_addr;
2584 assert(pss->host_page_sending);
2586 /* Over host-page boundary? */
2587 if (pss->page >= pss->host_page_end) {
2588 return false;
2591 ram_addr = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2593 return offset_in_ramblock(pss->block, ram_addr);
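/* Mark the end of host-page sending started by pss_host_page_prepare() */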
2596 static void pss_host_page_finish(PageSearchStatus *pss)
2598 pss->host_page_sending = false;
2599 /* This is not needed, but just to reset it */
2600 pss->host_page_start = pss->host_page_end = 0;
2604 * Send an urgent host page specified by `pss'. Needs to be called with
2605 * bitmap_mutex held.
2607 * Returns 0 if saving the host page succeeded, negative otherwise.
2609 static int ram_save_host_page_urgent(PageSearchStatus *pss)
2611 bool page_dirty, sent = false;
2612 RAMState *rs = ram_state;
2613 int ret = 0;
2615 trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
2616 pss_host_page_prepare(pss);
2619 * If precopy is sending the same page, let it be done in precopy, or
2620 * we could send the same page on two channels and neither of them would
2621 * receive the whole page.
2623 if (pss_overlap(pss, &ram_state->pss[RAM_CHANNEL_PRECOPY])) {
2624 trace_postcopy_preempt_hit(pss->block->idstr,
2625 pss->page << TARGET_PAGE_BITS);
2626 return 0;
2629 do {
2630 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2632 if (page_dirty) {
2633 /* Be strict about the return code; it must be 1 (one page sent) */
2634 if (ram_save_target_page(rs, pss) != 1) {
2635 error_report_once("%s: ram_save_target_page failed", __func__);
2636 ret = -1;
2637 goto out;
2639 sent = true;
2641 pss_find_next_dirty(pss);
2642 } while (pss_within_range(pss));
2643 out:
2644 pss_host_page_finish(pss);
2645 /* For urgent requests, flush immediately if sent */
2646 if (sent) {
2647 qemu_fflush(pss->pss_channel);
2649 return ret;
2653 * ram_save_host_page: save a whole host page
2655 * Starting at *offset send pages up to the end of the current host
2656 * page. It's valid for the initial offset to point into the middle of
2657 * a host page in which case the remainder of the hostpage is sent.
2658 * Only dirty target pages are sent. Note that the host page size may
2659 * be a huge page for this block.
2661 * The saving stops at the boundary of the used_length of the block
2662 * if the RAMBlock isn't a multiple of the host page size.
2664 * The caller must hold ram_state.bitmap_mutex when calling this
2665 * function. Note that this function can temporarily release the lock, but
2666 * when the function returns it makes sure the lock is still held.
2668 * Returns the number of pages written or negative on error
2670 * @rs: current RAM state
2671 * @pss: data about the page we want to send
2673 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2675 bool page_dirty, preempt_active = postcopy_preempt_active();
2676 int tmppages, pages = 0;
2677 size_t pagesize_bits =
2678 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2679 unsigned long start_page = pss->page;
2680 int res;
2682 if (ramblock_is_ignored(pss->block)) {
2683 error_report("block %s should not be migrated !", pss->block->idstr);
2684 return 0;
2687 /* Update host page boundary information */
2688 pss_host_page_prepare(pss);
2690 do {
2691 if (postcopy_needs_preempt(rs, pss)) {
2692 postcopy_do_preempt(rs, pss);
2693 break;
2696 page_dirty = migration_bitmap_clear_dirty(rs, pss->block, pss->page);
2698 /* Check if the page is dirty and if so send it */
2699 if (page_dirty) {
2701 * Properly yield the lock only in postcopy preempt mode
2702 * because both migration thread and rp-return thread can
2703 * operate on the bitmaps.
2705 if (preempt_active) {
2706 qemu_mutex_unlock(&rs->bitmap_mutex);
2708 tmppages = ram_save_target_page(rs, pss);
2709 if (tmppages >= 0) {
2710 pages += tmppages;
2712 * Allow rate limiting to happen in the middle of huge pages if
2713 * something is sent in the current iteration.
2715 if (pagesize_bits > 1 && tmppages > 0) {
2716 migration_rate_limit();
2719 if (preempt_active) {
2720 qemu_mutex_lock(&rs->bitmap_mutex);
2722 } else {
2723 tmppages = 0;
2726 if (tmppages < 0) {
2727 pss_host_page_finish(pss);
2728 return tmppages;
2731 pss_find_next_dirty(pss);
2732 } while (pss_within_range(pss));
2734 pss_host_page_finish(pss);
2737 * When in postcopy preempt mode, flush the data as soon as possible for
2738 * postcopy requests, because we've already sent a whole huge page, so the
2739 * dst node should already have enough resources to atomically fill in
2740 * the current missing page.
2742 * More importantly, when using a separate postcopy channel, we must do an
2743 * explicit flush or it won't flush until the buffer is full.
2745 if (migrate_postcopy_preempt() && pss->postcopy_requested) {
2746 qemu_fflush(pss->pss_channel);
2749 res = ram_save_release_protection(rs, pss, start_page);
2750 return (res < 0 ? res : pages);
2754 * ram_find_and_save_block: finds a dirty page and sends it to f
2756 * Called within an RCU critical section.
2758 * Returns the number of pages written where zero means no dirty pages,
2759 * or negative on error
2761 * @rs: current RAM state
2763 * On systems where host-page-size > target-page-size it will send all the
2764 * pages in a host page that are dirty.
2766 static int ram_find_and_save_block(RAMState *rs)
2768 PageSearchStatus *pss = &rs->pss[RAM_CHANNEL_PRECOPY];
2769 int pages = 0;
2770 bool again, found;
2772 /* No dirty page as there is zero RAM */
2773 if (!ram_bytes_total()) {
2774 return pages;
2778 * Always keep last_seen_block/last_page valid during this procedure,
2779 * because find_dirty_block() relies on these values (e.g., we compare
2780 * last_seen_block with pss.block to see whether we searched all the
2781 * ramblocks) to detect the completion of migration. Having a NULL value
2782 * of last_seen_block can conditionally cause the loop below to run forever.
2784 if (!rs->last_seen_block) {
2785 rs->last_seen_block = QLIST_FIRST_RCU(&ram_list.blocks);
2786 rs->last_page = 0;
2789 pss_init(pss, rs->last_seen_block, rs->last_page);
2791 do {
2792 again = true;
2793 found = get_queued_page(rs, pss);
2795 if (!found) {
2797 * Recover previous precopy ramblock/offset if postcopy has
2798 * preempted precopy. Otherwise find the next dirty bit.
2800 if (postcopy_preempt_triggered(rs)) {
2801 postcopy_preempt_restore(rs, pss, false);
2802 found = true;
2803 } else {
2804 /* priority queue empty, so just search for something dirty */
2805 found = find_dirty_block(rs, pss, &again);
2809 if (found) {
2810 /* Update rs->f with correct channel */
2811 if (postcopy_preempt_active()) {
2812 postcopy_preempt_choose_channel(rs, pss);
2814 /* Cache rs->f in pss_channel (TODO: remove rs->f) */
2815 pss->pss_channel = rs->f;
2816 pages = ram_save_host_page(rs, pss);
2818 } while (!pages && again);
2820 rs->last_seen_block = pss->block;
2821 rs->last_page = pss->page;
2823 return pages;
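/* Account for @size bytes of pages: count them as zero (duplicate) or normal
 * pages; for normal pages also add the transferred bytes and credit @f. */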
2826 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2828 uint64_t pages = size / TARGET_PAGE_SIZE;
2830 if (zero) {
2831 stat64_add(&ram_atomic_counters.duplicate, pages);
2832 } else {
2833 stat64_add(&ram_atomic_counters.normal, pages);
2834 ram_transferred_add(size);
2835 qemu_file_credit_transfer(f, size);
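/* Total bytes of RAM to migrate: all migratable blocks when @count_ignored
 * is true, otherwise only the blocks not ignored by migration. */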
2839 static uint64_t ram_bytes_total_common(bool count_ignored)
2841 RAMBlock *block;
2842 uint64_t total = 0;
2844 RCU_READ_LOCK_GUARD();
2846 if (count_ignored) {
2847 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2848 total += block->used_length;
2850 } else {
2851 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2852 total += block->used_length;
2855 return total;
2858 uint64_t ram_bytes_total(void)
2860 return ram_bytes_total_common(false);
2863 static void xbzrle_load_setup(void)
2865 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2868 static void xbzrle_load_cleanup(void)
2870 g_free(XBZRLE.decoded_buf);
2871 XBZRLE.decoded_buf = NULL;
2874 static void ram_state_cleanup(RAMState **rsp)
2876 if (*rsp) {
2877 migration_page_queue_free(*rsp);
2878 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2879 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2880 g_free(*rsp);
2881 *rsp = NULL;
2885 static void xbzrle_cleanup(void)
2887 XBZRLE_cache_lock();
2888 if (XBZRLE.cache) {
2889 cache_fini(XBZRLE.cache);
2890 g_free(XBZRLE.encoded_buf);
2891 g_free(XBZRLE.current_buf);
2892 g_free(XBZRLE.zero_target_page);
2893 XBZRLE.cache = NULL;
2894 XBZRLE.encoded_buf = NULL;
2895 XBZRLE.current_buf = NULL;
2896 XBZRLE.zero_target_page = NULL;
2898 XBZRLE_cache_unlock();
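/* Tear down the save side: stop dirty logging (unless using a background
 * snapshot), free the per-block bitmaps, and clean up xbzrle/compression
 * and RAMState. */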
2901 static void ram_save_cleanup(void *opaque)
2903 RAMState **rsp = opaque;
2904 RAMBlock *block;
2906 /* We don't use dirty log with background snapshots */
2907 if (!migrate_background_snapshot()) {
2908 /* The caller holds the iothread lock or is in a bh, so there is
2909 * no write race against the migration bitmap.
2911 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2913 * do not stop dirty log without starting it, since
2914 * memory_global_dirty_log_stop will assert that
2915 * memory_global_dirty_log_start/stop are used in pairs
2917 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2921 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2922 g_free(block->clear_bmap);
2923 block->clear_bmap = NULL;
2924 g_free(block->bmap);
2925 block->bmap = NULL;
2928 xbzrle_cleanup();
2929 compress_threads_save_cleanup();
2930 ram_state_cleanup(rsp);
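/* Reset the page-search state so the next round starts from scratch */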
2933 static void ram_state_reset(RAMState *rs)
2935 int i;
2937 for (i = 0; i < RAM_CHANNEL_MAX; i++) {
2938 rs->pss[i].last_sent_block = NULL;
2941 rs->last_seen_block = NULL;
2942 rs->last_page = 0;
2943 rs->last_version = ram_list.version;
2944 rs->xbzrle_enabled = false;
2945 postcopy_preempt_reset(rs);
2946 rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
2949 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2951 /* **** functions for postcopy ***** */
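/* Discard the ranges of each RAMBlock whose dirty bit is already clear
 * (i.e. pages that have already been sent), releasing their backing memory. */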
2953 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2955 struct RAMBlock *block;
2957 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2958 unsigned long *bitmap = block->bmap;
2959 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2960 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2962 while (run_start < range) {
2963 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2964 ram_discard_range(block->idstr,
2965 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2966 ((ram_addr_t)(run_end - run_start))
2967 << TARGET_PAGE_BITS);
2968 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2974 * postcopy_send_discard_bm_ram: discard a RAMBlock
2976 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2978 * @ms: current migration state
2979 * @block: RAMBlock to discard
2981 static void postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2983 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2984 unsigned long current;
2985 unsigned long *bitmap = block->bmap;
2987 for (current = 0; current < end; ) {
2988 unsigned long one = find_next_bit(bitmap, end, current);
2989 unsigned long zero, discard_length;
2991 if (one >= end) {
2992 break;
2995 zero = find_next_zero_bit(bitmap, end, one + 1);
2997 if (zero >= end) {
2998 discard_length = end - one;
2999 } else {
3000 discard_length = zero - one;
3002 postcopy_discard_send_range(ms, one, discard_length);
3003 current = one + discard_length;
3007 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block);
3010 * postcopy_each_ram_send_discard: discard all RAMBlocks
3012 * Utility for the outgoing postcopy code.
3013 * Calls postcopy_send_discard_bm_ram for each RAMBlock
3014 * passing it bitmap indexes and name.
3015 * (qemu_ram_foreach_block ends up passing unscaled lengths
3016 * which would mean postcopy code would have to deal with target page)
3018 * @ms: current migration state
3020 static void postcopy_each_ram_send_discard(MigrationState *ms)
3022 struct RAMBlock *block;
3024 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3025 postcopy_discard_send_init(ms, block->idstr);
3028 * Deal with TPS != HPS and huge pages. It discards any partially sent
3029 * host-page size chunks and marks any partially dirty host-page size
3030 * chunks as all dirty. In this case the host-page is the host-page
3031 * for the particular RAMBlock, i.e. it might be a huge page.
3033 postcopy_chunk_hostpages_pass(ms, block);
3036 * Postcopy sends chunks of bitmap over the wire, but it
3037 * just needs indexes at this point, avoids it having
3038 * target page specific code.
3040 postcopy_send_discard_bm_ram(ms, block);
3041 postcopy_discard_send_finish(ms);
3046 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
3048 * Helper for postcopy_each_ram_send_discard; it canonicalizes the dirty
3049 * bitmap of one RAMBlock so that each host page is either fully dirty
3050 * or fully clean.
3052 * Postcopy requires that all target pages in a hostpage are dirty or
3053 * clean, not a mix. This function canonicalizes the bitmaps.
3055 * @ms: current migration state
3056 * @block: block that contains the page we want to canonicalize
3058 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
3060 RAMState *rs = ram_state;
3061 unsigned long *bitmap = block->bmap;
3062 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
3063 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
3064 unsigned long run_start;
3066 if (block->page_size == TARGET_PAGE_SIZE) {
3067 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
3068 return;
3071 /* Find a dirty page */
3072 run_start = find_next_bit(bitmap, pages, 0);
3074 while (run_start < pages) {
3077 * If the start of this run of pages is in the middle of a host
3078 * page, then we need to fixup this host page.
3080 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
3081 /* Find the end of this run */
3082 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
3084 * If the end isn't at the start of a host page, then the
3085 * run doesn't finish at the end of a host page
3086 * and we need to discard.
3090 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
3091 unsigned long page;
3092 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
3093 host_ratio);
3094 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
3096 /* Clean up the bitmap */
3097 for (page = fixup_start_addr;
3098 page < fixup_start_addr + host_ratio; page++) {
3100 * Remark them as dirty, updating the count for any pages
3101 * that weren't previously dirty.
3103 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
3107 /* Find the next dirty page for the next iteration */
3108 run_start = find_next_bit(bitmap, pages, run_start);
3113 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
3115 * Transmit the set of pages to be discarded after precopy to the target;
3116 * these are pages that:
3117 * a) have been previously transmitted but are now dirty again
3118 * b) have never been transmitted; this ensures that
3119 * any pages on the destination that have been mapped by background
3120 * tasks get discarded (transparent huge pages are the specific concern)
3121 * Hopefully this is pretty sparse
3123 * @ms: current migration state
3125 void ram_postcopy_send_discard_bitmap(MigrationState *ms)
3127 RAMState *rs = ram_state;
3129 RCU_READ_LOCK_GUARD();
3131 /* This should be our last sync, the src is now paused */
3132 migration_bitmap_sync(rs);
3134 /* Easiest way to make sure we don't resume in the middle of a host-page */
3135 rs->pss[RAM_CHANNEL_PRECOPY].last_sent_block = NULL;
3136 rs->last_seen_block = NULL;
3137 rs->last_page = 0;
3139 postcopy_each_ram_send_discard(ms);
3141 trace_ram_postcopy_send_discard_bitmap();
3145 * ram_discard_range: discard dirtied pages at the beginning of postcopy
3147 * Returns zero on success
3149 * @rbname: name of the RAMBlock of the request. NULL means the
3150 * same as the last one.
3151 * @start: starting offset (in bytes) within the RAMBlock
3152 * @length: length (in bytes) to discard
3154 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
3156 trace_ram_discard_range(rbname, start, length);
3158 RCU_READ_LOCK_GUARD();
3159 RAMBlock *rb = qemu_ram_block_by_name(rbname);
3161 if (!rb) {
3162 error_report("ram_discard_range: Failed to find block '%s'", rbname);
3163 return -1;
3167 * On source VM, we don't need to update the received bitmap since
3168 * we don't even have one.
3170 if (rb->receivedmap) {
3171 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
3172 length >> qemu_target_page_bits());
3175 return ram_block_discard_range(rb, start, length);
3179 * For every allocation, we will try not to crash the VM if the
3180 * allocation fails.
3182 static int xbzrle_init(void)
3184 Error *local_err = NULL;
3186 if (!migrate_use_xbzrle()) {
3187 return 0;
3190 XBZRLE_cache_lock();
3192 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
3193 if (!XBZRLE.zero_target_page) {
3194 error_report("%s: Error allocating zero page", __func__);
3195 goto err_out;
3198 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
3199 TARGET_PAGE_SIZE, &local_err);
3200 if (!XBZRLE.cache) {
3201 error_report_err(local_err);
3202 goto free_zero_page;
3205 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
3206 if (!XBZRLE.encoded_buf) {
3207 error_report("%s: Error allocating encoded_buf", __func__);
3208 goto free_cache;
3211 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
3212 if (!XBZRLE.current_buf) {
3213 error_report("%s: Error allocating current_buf", __func__);
3214 goto free_encoded_buf;
3217 /* We are all good */
3218 XBZRLE_cache_unlock();
3219 return 0;
3221 free_encoded_buf:
3222 g_free(XBZRLE.encoded_buf);
3223 XBZRLE.encoded_buf = NULL;
3224 free_cache:
3225 cache_fini(XBZRLE.cache);
3226 XBZRLE.cache = NULL;
3227 free_zero_page:
3228 g_free(XBZRLE.zero_target_page);
3229 XBZRLE.zero_target_page = NULL;
3230 err_out:
3231 XBZRLE_cache_unlock();
3232 return -ENOMEM;
3235 static int ram_state_init(RAMState **rsp)
3237 *rsp = g_try_new0(RAMState, 1);
3239 if (!*rsp) {
3240 error_report("%s: Init ramstate fail", __func__);
3241 return -1;
3244 qemu_mutex_init(&(*rsp)->bitmap_mutex);
3245 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
3246 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
3249 * Count the total number of pages used by ram blocks not including any
3250 * gaps due to alignment or unplugs.
3251 * This must match the initial values of the dirty bitmap.
3253 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
3254 ram_state_reset(*rsp);
3256 return 0;
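/* Allocate the dirty and clear bitmaps for every migratable RAMBlock,
 * starting with every page marked dirty. */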
3259 static void ram_list_init_bitmaps(void)
3261 MigrationState *ms = migrate_get_current();
3262 RAMBlock *block;
3263 unsigned long pages;
3264 uint8_t shift;
3266 /* Skip setting bitmap if there is no RAM */
3267 if (ram_bytes_total()) {
3268 shift = ms->clear_bitmap_shift;
3269 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
3270 error_report("clear_bitmap_shift (%u) too big, using "
3271 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
3272 shift = CLEAR_BITMAP_SHIFT_MAX;
3273 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
3274 error_report("clear_bitmap_shift (%u) too small, using "
3275 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
3276 shift = CLEAR_BITMAP_SHIFT_MIN;
3279 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3280 pages = block->max_length >> TARGET_PAGE_BITS;
3282 * The initial dirty bitmap for migration must be set with all
3283 * ones to make sure we'll migrate every guest RAM page to the
3284 * destination.
3285 * Here we set RAMBlock.bmap all to 1 because when we restart a
3286 * new migration after a failed migration,
3287 * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover the whole
3288 * guest memory.
3290 block->bmap = bitmap_new(pages);
3291 bitmap_set(block->bmap, 0, pages);
3292 block->clear_bmap_shift = shift;
3293 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
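/* Exclude discarded ("logically unplugged") pages from the dirty bitmaps
 * and adjust the dirty page count accordingly. */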
3298 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
3300 unsigned long pages;
3301 RAMBlock *rb;
3303 RCU_READ_LOCK_GUARD();
3305 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3306 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
3307 rs->migration_dirty_pages -= pages;
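/* Initialize all migration bitmaps, start dirty logging (unless using a
 * background snapshot) and do the initial bitmap sync. */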
3311 static void ram_init_bitmaps(RAMState *rs)
3313 /* For memory_global_dirty_log_start below. */
3314 qemu_mutex_lock_iothread();
3315 qemu_mutex_lock_ramlist();
3317 WITH_RCU_READ_LOCK_GUARD() {
3318 ram_list_init_bitmaps();
3319 /* We don't use dirty log with background snapshots */
3320 if (!migrate_background_snapshot()) {
3321 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3322 migration_bitmap_sync_precopy(rs);
3325 qemu_mutex_unlock_ramlist();
3326 qemu_mutex_unlock_iothread();
3329 * After an eventual first bitmap sync, fixup the initial bitmap
3330 * containing all 1s to exclude any discarded pages from migration.
3332 migration_bitmap_clear_discarded_pages(rs);
3335 static int ram_init_all(RAMState **rsp)
3337 if (ram_state_init(rsp)) {
3338 return -1;
3341 if (xbzrle_init()) {
3342 ram_state_cleanup(rsp);
3343 return -1;
3346 ram_init_bitmaps(*rsp);
3348 return 0;
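/* Prepare RAMState for a postcopy resume: recount dirty pages from the
 * existing bitmaps and reset the search state. */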
3351 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
3353 RAMBlock *block;
3354 uint64_t pages = 0;
3357 * Postcopy is not using xbzrle/compression, so no need for that.
3358 * Also, since the source is already halted, we don't need to care
3359 * about dirty page logging either.
3362 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3363 pages += bitmap_count_one(block->bmap,
3364 block->used_length >> TARGET_PAGE_BITS);
3367 /* This may not be aligned with current bitmaps. Recalculate. */
3368 rs->migration_dirty_pages = pages;
3370 ram_state_reset(rs);
3372 /* Update RAMState cache of output QEMUFile */
3373 rs->f = out;
3375 trace_ram_state_resume_prepare(pages);
3379 * This function clears bits of the free pages reported by the caller from the
3380 * migration dirty bitmap. @addr is the host address corresponding to the
3381 * start of the contiguous guest free pages, and @len is the total bytes of
3382 * those pages.
3384 void qemu_guest_free_page_hint(void *addr, size_t len)
3386 RAMBlock *block;
3387 ram_addr_t offset;
3388 size_t used_len, start, npages;
3389 MigrationState *s = migrate_get_current();
3391 /* This function is currently expected to be used during live migration */
3392 if (!migration_is_setup_or_active(s->state)) {
3393 return;
3396 for (; len > 0; len -= used_len, addr += used_len) {
3397 block = qemu_ram_block_from_host(addr, false, &offset);
3398 if (unlikely(!block || offset >= block->used_length)) {
3400 * The implementation might not support RAMBlock resize during
3401 * live migration, but it could happen in theory with future
3402 * updates. So we add a check here to capture that case.
3404 error_report_once("%s unexpected error", __func__);
3405 return;
3408 if (len <= block->used_length - offset) {
3409 used_len = len;
3410 } else {
3411 used_len = block->used_length - offset;
3414 start = offset >> TARGET_PAGE_BITS;
3415 npages = used_len >> TARGET_PAGE_BITS;
3417 qemu_mutex_lock(&ram_state->bitmap_mutex);
3419 * The skipped free pages are equivalent to having been sent, from
3420 * clear_bmap's perspective, so clear the bits from the memory region
3421 * bitmap which are initially set. Otherwise those skipped pages will be
3422 * sent in the next round after syncing from the memory region bitmap.
3424 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
3425 ram_state->migration_dirty_pages -=
3426 bitmap_count_one_with_offset(block->bmap, start, npages);
3427 bitmap_clear(block->bmap, start, npages);
3428 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3433 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
3434 * long-running RCU critical section. When rcu-reclaims in the code
3435 * start to become numerous it will be necessary to reduce the
3436 * granularity of these critical sections.
3440 * ram_save_setup: Setup RAM for migration
3442 * Returns zero to indicate success and negative for error
3444 * @f: QEMUFile where to send the data
3445 * @opaque: RAMState pointer
3447 static int ram_save_setup(QEMUFile *f, void *opaque)
3449 RAMState **rsp = opaque;
3450 RAMBlock *block;
3451 int ret;
3453 if (compress_threads_save_setup()) {
3454 return -1;
3457 /* migration has already setup the bitmap, reuse it. */
3458 if (!migration_in_colo_state()) {
3459 if (ram_init_all(rsp) != 0) {
3460 compress_threads_save_cleanup();
3461 return -1;
3464 (*rsp)->f = f;
3466 WITH_RCU_READ_LOCK_GUARD() {
3467 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3469 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3470 qemu_put_byte(f, strlen(block->idstr));
3471 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3472 qemu_put_be64(f, block->used_length);
3473 if (migrate_postcopy_ram() && block->page_size !=
3474 qemu_host_page_size) {
3475 qemu_put_be64(f, block->page_size);
3477 if (migrate_ignore_shared()) {
3478 qemu_put_be64(f, block->mr->addr);
3483 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3484 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3486 ret = multifd_send_sync_main(f);
3487 if (ret < 0) {
3488 return ret;
3491 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3492 qemu_fflush(f);
3494 return 0;
3498 * ram_save_iterate: iterative stage for migration
3500 * Returns zero to indicate success and negative for error
3502 * @f: QEMUFile where to send the data
3503 * @opaque: RAMState pointer
3505 static int ram_save_iterate(QEMUFile *f, void *opaque)
3507 RAMState **temp = opaque;
3508 RAMState *rs = *temp;
3509 int ret = 0;
3510 int i;
3511 int64_t t0;
3512 int done = 0;
3514 if (blk_mig_bulk_active()) {
3515 /* Avoid transferring ram during bulk phase of block migration as
3516 * the bulk phase will usually take a long time and transferring
3517 * ram updates during that time is pointless. */
3518 goto out;
3522 * We'll hold this lock a little bit long, but it's okay for two reasons.
3523 * Firstly, the only other thread that could take it is the one calling
3524 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3525 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3526 * guarantees that we'll at least release it on a regular basis.
3528 qemu_mutex_lock(&rs->bitmap_mutex);
3529 WITH_RCU_READ_LOCK_GUARD() {
3530 if (ram_list.version != rs->last_version) {
3531 ram_state_reset(rs);
3534 /* Read version before ram_list.blocks */
3535 smp_rmb();
3537 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3539 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3540 i = 0;
3541 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3542 postcopy_has_request(rs)) {
3543 int pages;
3545 if (qemu_file_get_error(f)) {
3546 break;
3549 pages = ram_find_and_save_block(rs);
3550 /* no more pages to send */
3551 if (pages == 0) {
3552 done = 1;
3553 break;
3556 if (pages < 0) {
3557 qemu_file_set_error(f, pages);
3558 break;
3561 rs->target_page_count += pages;
3564 * During postcopy, it is necessary to make sure one whole host
3565 * page is sent in one chunk.
3567 if (migrate_postcopy_ram()) {
3568 flush_compressed_data(rs);
3572 * we want to check in the 1st loop, just in case it was the 1st
3573 * time and we had to sync the dirty bitmap.
3574 * qemu_clock_get_ns() is a bit expensive, so we only check once
3575 * every few iterations
3577 if ((i & 63) == 0) {
3578 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3579 1000000;
3580 if (t1 > MAX_WAIT) {
3581 trace_ram_save_iterate_big_wait(t1, i);
3582 break;
3585 i++;
3588 qemu_mutex_unlock(&rs->bitmap_mutex);
3590 postcopy_preempt_reset_channel(rs);
3593 * Must occur before EOS (or any QEMUFile operation)
3594 * because of RDMA protocol.
3596 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3598 out:
3599 if (ret >= 0
3600 && migration_is_setup_or_active(migrate_get_current()->state)) {
3601 ret = multifd_send_sync_main(rs->f);
3602 if (ret < 0) {
3603 return ret;
3606 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3607 qemu_fflush(f);
3608 ram_transferred_add(8);
3610 ret = qemu_file_get_error(f);
3612 if (ret < 0) {
3613 return ret;
3616 return done;
3620 * ram_save_complete: function called to send the remaining amount of ram
3622 * Returns zero to indicate success or negative on error
3624 * Called with iothread lock
3626 * @f: QEMUFile where to send the data
3627 * @opaque: RAMState pointer
3629 static int ram_save_complete(QEMUFile *f, void *opaque)
3631 RAMState **temp = opaque;
3632 RAMState *rs = *temp;
3633 int ret = 0;
3635 rs->last_stage = !migration_in_colo_state();
3637 WITH_RCU_READ_LOCK_GUARD() {
3638 if (!migration_in_postcopy()) {
3639 migration_bitmap_sync_precopy(rs);
3642 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3644 /* try transferring iterative blocks of memory */
3646 /* flush all remaining blocks regardless of rate limiting */
3647 qemu_mutex_lock(&rs->bitmap_mutex);
3648 while (true) {
3649 int pages;
3651 pages = ram_find_and_save_block(rs);
3652 /* no more blocks to send */
3653 if (pages == 0) {
3654 break;
3656 if (pages < 0) {
3657 ret = pages;
3658 break;
3661 qemu_mutex_unlock(&rs->bitmap_mutex);
3663 flush_compressed_data(rs);
3664 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3667 if (ret < 0) {
3668 return ret;
3671 postcopy_preempt_reset_channel(rs);
3673 ret = multifd_send_sync_main(rs->f);
3674 if (ret < 0) {
3675 return ret;
3678 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3679 qemu_fflush(f);
3681 return 0;
3684 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3685 uint64_t *res_precopy_only,
3686 uint64_t *res_compatible,
3687 uint64_t *res_postcopy_only)
3689 RAMState **temp = opaque;
3690 RAMState *rs = *temp;
3691 uint64_t remaining_size;
3693 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3695 if (!migration_in_postcopy() &&
3696 remaining_size < max_size) {
3697 qemu_mutex_lock_iothread();
3698 WITH_RCU_READ_LOCK_GUARD() {
3699 migration_bitmap_sync_precopy(rs);
3701 qemu_mutex_unlock_iothread();
3702 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3705 if (migrate_postcopy_ram()) {
3706 /* We can do postcopy, and all the data is postcopiable */
3707 *res_compatible += remaining_size;
3708 } else {
3709 *res_precopy_only += remaining_size;
3713 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3715 unsigned int xh_len;
3716 int xh_flags;
3717 uint8_t *loaded_data;
3719 /* extract RLE header */
3720 xh_flags = qemu_get_byte(f);
3721 xh_len = qemu_get_be16(f);
3723 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3724 error_report("Failed to load XBZRLE page - wrong compression!");
3725 return -1;
3728 if (xh_len > TARGET_PAGE_SIZE) {
3729 error_report("Failed to load XBZRLE page - len overflow!");
3730 return -1;
3732 loaded_data = XBZRLE.decoded_buf;
3733 /* load data and decode */
3734 /* it can change loaded_data to point to an internal buffer */
3735 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3737 /* decode RLE */
3738 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3739 TARGET_PAGE_SIZE) == -1) {
3740 error_report("Failed to load XBZRLE page - decode error!");
3741 return -1;
3744 return 0;
3748 * ram_block_from_stream: read a RAMBlock id from the migration stream
3750 * Must be called from within an RCU critical section.
3752 * Returns a pointer from within the RCU-protected ram_list.
3754 * @mis: the migration incoming state pointer
3755 * @f: QEMUFile where to read the data from
3756 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3757 * @channel: the channel we're using
3759 static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
3760 QEMUFile *f, int flags,
3761 int channel)
3763 RAMBlock *block = mis->last_recv_block[channel];
3764 char id[256];
3765 uint8_t len;
3767 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3768 if (!block) {
3769 error_report("Ack, bad migration stream!");
3770 return NULL;
3772 return block;
3775 len = qemu_get_byte(f);
3776 qemu_get_buffer(f, (uint8_t *)id, len);
3777 id[len] = 0;
3779 block = qemu_ram_block_by_name(id);
3780 if (!block) {
3781 error_report("Can't find block %s", id);
3782 return NULL;
3785 if (ramblock_is_ignored(block)) {
3786 error_report("block %s should not be migrated !", id);
3787 return NULL;
3790 mis->last_recv_block[channel] = block;
3792 return block;
3795 static inline void *host_from_ram_block_offset(RAMBlock *block,
3796 ram_addr_t offset)
3798 if (!offset_in_ramblock(block, offset)) {
3799 return NULL;
3802 return block->host + offset;
3805 static void *host_page_from_ram_block_offset(RAMBlock *block,
3806 ram_addr_t offset)
3808 /* Note: Explicitly no check against offset_in_ramblock(). */
3809 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3810 block->page_size);
3813 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3814 ram_addr_t offset)
3816 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
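/* Return the address inside the COLO cache corresponding to @offset in
 * @block, optionally recording the page in the dirty bitmap. */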
3819 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3820 ram_addr_t offset, bool record_bitmap)
3822 if (!offset_in_ramblock(block, offset)) {
3823 return NULL;
3825 if (!block->colo_cache) {
3826 error_report("%s: colo_cache is NULL in block :%s",
3827 __func__, block->idstr);
3828 return NULL;
3832 * During a colo checkpoint, we need a bitmap of these migrated pages.
3833 * It helps us decide which pages in the ram cache should be flushed
3834 * into the VM's RAM later.
3836 if (record_bitmap &&
3837 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3838 ram_state->migration_dirty_pages++;
3840 return block->colo_cache + offset;
3844 * ram_handle_compressed: handle the zero page case
3846 * If a page (or a whole RDMA chunk) has been
3847 * determined to be zero, then zap it.
3849 * @host: host address for the zero page
3850 * @ch: what the page is filled with. We only support zero
3851 * @size: size of the zero page
3853 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3855 if (ch != 0 || !buffer_is_zero(host, size)) {
3856 memset(host, ch, size);
3860 /* return the size after decompression, or negative value on error */
3861 static int
3862 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3863 const uint8_t *source, size_t source_len)
3865 int err;
3867 err = inflateReset(stream);
3868 if (err != Z_OK) {
3869 return -1;
3872 stream->avail_in = source_len;
3873 stream->next_in = (uint8_t *)source;
3874 stream->avail_out = dest_len;
3875 stream->next_out = dest;
3877 err = inflate(stream, Z_NO_FLUSH);
3878 if (err != Z_STREAM_END) {
3879 return -1;
3882 return stream->total_out;
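/* Decompression worker thread: wait for work, inflate the compressed buffer
 * into the destination page, then signal completion. */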
3885 static void *do_data_decompress(void *opaque)
3887 DecompressParam *param = opaque;
3888 unsigned long pagesize;
3889 uint8_t *des;
3890 int len, ret;
3892 qemu_mutex_lock(&param->mutex);
3893 while (!param->quit) {
3894 if (param->des) {
3895 des = param->des;
3896 len = param->len;
3897 param->des = 0;
3898 qemu_mutex_unlock(&param->mutex);
3900 pagesize = TARGET_PAGE_SIZE;
3902 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3903 param->compbuf, len);
3904 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3905 error_report("decompress data failed");
3906 qemu_file_set_error(decomp_file, ret);
3909 qemu_mutex_lock(&decomp_done_lock);
3910 param->done = true;
3911 qemu_cond_signal(&decomp_done_cond);
3912 qemu_mutex_unlock(&decomp_done_lock);
3914 qemu_mutex_lock(&param->mutex);
3915 } else {
3916 qemu_cond_wait(&param->cond, &param->mutex);
3919 qemu_mutex_unlock(&param->mutex);
3921 return NULL;
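/* Wait until every decompression thread has finished its current page, then
 * return any error recorded on the decompression file. */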
3924 static int wait_for_decompress_done(void)
3926 int idx, thread_count;
3928 if (!migrate_use_compression()) {
3929 return 0;
3932 thread_count = migrate_decompress_threads();
3933 qemu_mutex_lock(&decomp_done_lock);
3934 for (idx = 0; idx < thread_count; idx++) {
3935 while (!decomp_param[idx].done) {
3936 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3939 qemu_mutex_unlock(&decomp_done_lock);
3940 return qemu_file_get_error(decomp_file);
3943 static void compress_threads_load_cleanup(void)
3945 int i, thread_count;
3947 if (!migrate_use_compression()) {
3948 return;
3950 thread_count = migrate_decompress_threads();
3951 for (i = 0; i < thread_count; i++) {
3953 * we use it as an indicator which shows whether the thread is
3954 * properly init'd or not
3956 if (!decomp_param[i].compbuf) {
3957 break;
3960 qemu_mutex_lock(&decomp_param[i].mutex);
3961 decomp_param[i].quit = true;
3962 qemu_cond_signal(&decomp_param[i].cond);
3963 qemu_mutex_unlock(&decomp_param[i].mutex);
3965 for (i = 0; i < thread_count; i++) {
3966 if (!decomp_param[i].compbuf) {
3967 break;
3970 qemu_thread_join(decompress_threads + i);
3971 qemu_mutex_destroy(&decomp_param[i].mutex);
3972 qemu_cond_destroy(&decomp_param[i].cond);
3973 inflateEnd(&decomp_param[i].stream);
3974 g_free(decomp_param[i].compbuf);
3975 decomp_param[i].compbuf = NULL;
3977 g_free(decompress_threads);
3978 g_free(decomp_param);
3979 decompress_threads = NULL;
3980 decomp_param = NULL;
3981 decomp_file = NULL;
3984 static int compress_threads_load_setup(QEMUFile *f)
3986 int i, thread_count;
3988 if (!migrate_use_compression()) {
3989 return 0;
3992 thread_count = migrate_decompress_threads();
3993 decompress_threads = g_new0(QemuThread, thread_count);
3994 decomp_param = g_new0(DecompressParam, thread_count);
3995 qemu_mutex_init(&decomp_done_lock);
3996 qemu_cond_init(&decomp_done_cond);
3997 decomp_file = f;
3998 for (i = 0; i < thread_count; i++) {
3999 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
4000 goto exit;
4003 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
4004 qemu_mutex_init(&decomp_param[i].mutex);
4005 qemu_cond_init(&decomp_param[i].cond);
4006 decomp_param[i].done = true;
4007 decomp_param[i].quit = false;
4008 qemu_thread_create(decompress_threads + i, "decompress",
4009 do_data_decompress, decomp_param + i,
4010 QEMU_THREAD_JOINABLE);
4012 return 0;
4013 exit:
4014 compress_threads_load_cleanup();
4015 return -1;
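/* Hand the compressed page to an idle decompression thread, waiting for one
 * to become available if all are busy. */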
4018 static void decompress_data_with_multi_threads(QEMUFile *f,
4019 void *host, int len)
4021 int idx, thread_count;
4023 thread_count = migrate_decompress_threads();
4024 QEMU_LOCK_GUARD(&decomp_done_lock);
4025 while (true) {
4026 for (idx = 0; idx < thread_count; idx++) {
4027 if (decomp_param[idx].done) {
4028 decomp_param[idx].done = false;
4029 qemu_mutex_lock(&decomp_param[idx].mutex);
4030 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
4031 decomp_param[idx].des = host;
4032 decomp_param[idx].len = len;
4033 qemu_cond_signal(&decomp_param[idx].cond);
4034 qemu_mutex_unlock(&decomp_param[idx].mutex);
4035 break;
4038 if (idx < thread_count) {
4039 break;
4040 } else {
4041 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
4046 static void colo_init_ram_state(void)
4048 ram_state_init(&ram_state);
4052 * colo cache: this is for the secondary VM; we cache the whole
4053 * memory of the secondary VM. The global lock must be held when
4054 * calling this helper.
4056 int colo_init_ram_cache(void)
4058 RAMBlock *block;
4060 WITH_RCU_READ_LOCK_GUARD() {
4061 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4062 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
4063 NULL, false, false);
4064 if (!block->colo_cache) {
4065 error_report("%s: Can't alloc memory for COLO cache of block %s, "
4066 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
4067 block->used_length);
4068 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4069 if (block->colo_cache) {
4070 qemu_anon_ram_free(block->colo_cache, block->used_length);
4071 block->colo_cache = NULL;
4074 return -errno;
4076 if (!machine_dump_guest_core(current_machine)) {
4077 qemu_madvise(block->colo_cache, block->used_length,
4078 QEMU_MADV_DONTDUMP);
4084 * Record the dirty pages sent by the PVM; we use this dirty bitmap to
4085 * decide which pages in the cache should be flushed into the SVM's RAM.
4086 * Here we use the same name 'ram_bitmap' as for migration.
4088 if (ram_bytes_total()) {
4089 RAMBlock *block;
4091 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4092 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
4093 block->bmap = bitmap_new(pages);
4097 colo_init_ram_state();
4098 return 0;
4101 /* TODO: duplicated with ram_init_bitmaps */
4102 void colo_incoming_start_dirty_log(void)
4104 RAMBlock *block = NULL;
4105 /* For memory_global_dirty_log_start below. */
4106 qemu_mutex_lock_iothread();
4107 qemu_mutex_lock_ramlist();
4109 memory_global_dirty_log_sync();
4110 WITH_RCU_READ_LOCK_GUARD() {
4111 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4112 ramblock_sync_dirty_bitmap(ram_state, block);
4113 /* Discard this dirty bitmap record */
4114 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
4116 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
4118 ram_state->migration_dirty_pages = 0;
4119 qemu_mutex_unlock_ramlist();
4120 qemu_mutex_unlock_iothread();
4123 /* The global lock must be held when calling this helper */
4124 void colo_release_ram_cache(void)
4126 RAMBlock *block;
4128 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
4129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4130 g_free(block->bmap);
4131 block->bmap = NULL;
4134 WITH_RCU_READ_LOCK_GUARD() {
4135 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4136 if (block->colo_cache) {
4137 qemu_anon_ram_free(block->colo_cache, block->used_length);
4138 block->colo_cache = NULL;
4142 ram_state_cleanup(&ram_state);
4146 * ram_load_setup: Set up RAM for the incoming side of migration
4148 * Returns zero to indicate success and negative for error
4150 * @f: QEMUFile where to receive the data
4151 * @opaque: RAMState pointer
4153 static int ram_load_setup(QEMUFile *f, void *opaque)
4155 if (compress_threads_load_setup(f)) {
4156 return -1;
4159 xbzrle_load_setup();
4160 ramblock_recv_map_init();
4162 return 0;
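/*
 * ram_load_cleanup: tear down what ram_load_setup() created.  Write RAM
 * block contents back where needed, clean up xbzrle and decompression
 * state, and free the received-page bitmaps.
 */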
4165 static int ram_load_cleanup(void *opaque)
4167 RAMBlock *rb;
4169 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4170 qemu_ram_block_writeback(rb);
4173 xbzrle_load_cleanup();
4174 compress_threads_load_cleanup();
4176 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4177 g_free(rb->receivedmap);
4178 rb->receivedmap = NULL;
4181 return 0;
4185 * ram_postcopy_incoming_init: allocate postcopy data structures
4187 * Returns 0 for success and negative on error
4189 * @mis: current migration incoming state
4191 * Allocate data structures etc needed by incoming migration with
4192 * postcopy-ram. postcopy-ram's similarly named
4193 * postcopy_ram_incoming_init does the work.
4195 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
4197 return postcopy_ram_incoming_init(mis);
4201 * ram_load_postcopy: load a page in postcopy case
4203 * Returns 0 for success or -errno in case of error
4205 * Called in postcopy mode by ram_load().
4206 * rcu_read_lock is taken prior to this being called.
4208 * @f: QEMUFile to receive the data from
4209 * @channel: the channel to use for loading
4211 int ram_load_postcopy(QEMUFile *f, int channel)
4213 int flags = 0, ret = 0;
4214 bool place_needed = false;
4215 bool matches_target_page_size = false;
4216 MigrationIncomingState *mis = migration_incoming_get_current();
4217 PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
4219 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4220 ram_addr_t addr;
4221 void *page_buffer = NULL;
4222 void *place_source = NULL;
4223 RAMBlock *block = NULL;
4224 uint8_t ch;
4225 int len;
4227 addr = qemu_get_be64(f);
4230 * If there is a qemu file error, we should stop here; "addr"
4231 * may be invalid
4233 ret = qemu_file_get_error(f);
4234 if (ret) {
4235 break;
4238 flags = addr & ~TARGET_PAGE_MASK;
4239 addr &= TARGET_PAGE_MASK;
4241 trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
4242 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4243 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
4244 block = ram_block_from_stream(mis, f, flags, channel);
4245 if (!block) {
4246 ret = -EINVAL;
4247 break;
4251 * Relying on used_length is racy and can result in false positives.
4252 * We might place pages beyond used_length in case RAM was shrunk
4253 * while in postcopy, which is fine - trying to place via
4254 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
4256 if (!block->host || addr >= block->postcopy_length) {
4257 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4258 ret = -EINVAL;
4259 break;
4261 tmp_page->target_pages++;
4262 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
4264 * Postcopy requires that we place whole host pages atomically;
4265 * these may be huge pages for RAMBlocks that are backed by
4266 * hugetlbfs.
4267 * To make it atomic, the data is read into a temporary page
4268 * that's moved into place later.
4269 * The migration protocol uses, possibly smaller, target pages;
4270 * however, the source ensures it always sends all the components
4271 * of a host page in one chunk.
4273 page_buffer = tmp_page->tmp_huge_page +
4274 host_page_offset_from_ram_block_offset(block, addr);
4275 /* If all target pages are zero then we can optimise the placement */
4276 if (tmp_page->target_pages == 1) {
4277 tmp_page->host_addr =
4278 host_page_from_ram_block_offset(block, addr);
4279 } else if (tmp_page->host_addr !=
4280 host_page_from_ram_block_offset(block, addr)) {
4281 /* Not the first target page within the host page */
4282 error_report("Non-same host page detected on channel %d: "
4283 "Target host page %p, received host page %p "
4284 "(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
4285 channel, tmp_page->host_addr,
4286 host_page_from_ram_block_offset(block, addr),
4287 block->idstr, addr, tmp_page->target_pages);
4288 ret = -EINVAL;
4289 break;
4293 * If it's the last part of a host page then we place the host
4294 * page
4296 if (tmp_page->target_pages ==
4297 (block->page_size / TARGET_PAGE_SIZE)) {
4298 place_needed = true;
4300 place_source = tmp_page->tmp_huge_page;
4303 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4304 case RAM_SAVE_FLAG_ZERO:
4305 ch = qemu_get_byte(f);
4307 * We can skip setting page_buffer when this is a zero page
4308 * and (block->page_size == TARGET_PAGE_SIZE).
4310 if (ch || !matches_target_page_size) {
4311 memset(page_buffer, ch, TARGET_PAGE_SIZE);
4313 if (ch) {
4314 tmp_page->all_zero = false;
4316 break;
4318 case RAM_SAVE_FLAG_PAGE:
4319 tmp_page->all_zero = false;
4320 if (!matches_target_page_size) {
4321 /* For huge pages, we always use temporary buffer */
4322 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
4323 } else {
4325 * For small pages that match the target page size, we
4326 * avoid the qemu_file copy. Instead we directly use
4327 * the QEMUFile buffer to place the page. Note: we
4328 * must not do any QEMUFile operation before using that
4329 * buffer, to make sure the buffer is still valid when
4330 * placing the page.
4332 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
4333 TARGET_PAGE_SIZE);
4335 break;
4336 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4337 tmp_page->all_zero = false;
4338 len = qemu_get_be32(f);
4339 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4340 error_report("Invalid compressed data length: %d", len);
4341 ret = -EINVAL;
4342 break;
4344 decompress_data_with_multi_threads(f, page_buffer, len);
4345 break;
4347 case RAM_SAVE_FLAG_EOS:
4348 /* normal exit */
4349 multifd_recv_sync_main();
4350 break;
4351 default:
4352 error_report("Unknown combination of migration flags: 0x%x"
4353 " (postcopy mode)", flags);
4354 ret = -EINVAL;
4355 break;
4358 /* Got the whole host page, wait for decompress before placing. */
4359 if (place_needed) {
4360 ret |= wait_for_decompress_done();
4363 /* Check for any possible file errors */
4364 if (!ret && qemu_file_get_error(f)) {
4365 ret = qemu_file_get_error(f);
4368 if (!ret && place_needed) {
4369 if (tmp_page->all_zero) {
4370 ret = postcopy_place_page_zero(mis, tmp_page->host_addr, block);
4371 } else {
4372 ret = postcopy_place_page(mis, tmp_page->host_addr,
4373 place_source, block);
4375 place_needed = false;
4376 postcopy_temp_page_reset(tmp_page);
4380 return ret;
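/* True from when the source advises postcopy until postcopy has finished */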
4383 static bool postcopy_is_advised(void)
4385 PostcopyState ps = postcopy_state_get();
4386 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
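/* True from the postcopy listening state until postcopy has finished */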
4389 static bool postcopy_is_running(void)
4391 PostcopyState ps = postcopy_state_get();
4392 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
4396 * Flush the contents of the RAM cache into the SVM's memory.
4397 * Only flush pages that have been dirtied by the PVM, the SVM, or both.
4399 void colo_flush_ram_cache(void)
4401 RAMBlock *block = NULL;
4402 void *dst_host;
4403 void *src_host;
4404 unsigned long offset = 0;
4406 memory_global_dirty_log_sync();
4407 WITH_RCU_READ_LOCK_GUARD() {
4408 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4409 ramblock_sync_dirty_bitmap(ram_state, block);
4413 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
4414 WITH_RCU_READ_LOCK_GUARD() {
4415 block = QLIST_FIRST_RCU(&ram_list.blocks);
4417 while (block) {
4418 unsigned long num = 0;
4420 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
4421 if (!offset_in_ramblock(block,
4422 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
4423 offset = 0;
4424 num = 0;
4425 block = QLIST_NEXT_RCU(block, next);
4426 } else {
4427 unsigned long i = 0;
4429 for (i = 0; i < num; i++) {
4430 migration_bitmap_clear_dirty(ram_state, block, offset + i);
4432 dst_host = block->host
4433 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4434 src_host = block->colo_cache
4435 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
4436 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
4437 offset += num;
4441 trace_colo_flush_ram_cache_end();
4445 * ram_load_precopy: load pages in precopy case
4447 * Returns 0 for success or -errno in case of error
4449 * Called in precopy mode by ram_load().
4450 * rcu_read_lock is taken prior to this being called.
4452 * @f: QEMUFile to receive the data from
4454 static int ram_load_precopy(QEMUFile *f)
4456 MigrationIncomingState *mis = migration_incoming_get_current();
4457 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
4458 /* ADVISE is earlier; it shows the source has the postcopy capability enabled */
4459 bool postcopy_advised = postcopy_is_advised();
4460 if (!migrate_use_compression()) {
4461 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
4464 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
4465 ram_addr_t addr, total_ram_bytes;
4466 void *host = NULL, *host_bak = NULL;
4467 uint8_t ch;
4470 * Yield periodically to let the main loop run, but an iteration of
4471 * the main loop is expensive, so only do it every so many iterations
4473 if ((i & 32767) == 0 && qemu_in_coroutine()) {
4474 aio_co_schedule(qemu_get_current_aio_context(),
4475 qemu_coroutine_self());
4476 qemu_coroutine_yield();
4478 i++;
4480 addr = qemu_get_be64(f);
4481 flags = addr & ~TARGET_PAGE_MASK;
4482 addr &= TARGET_PAGE_MASK;
4484 if (flags & invalid_flags) {
4485 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
4486 error_report("Received an unexpected compressed page");
4489 ret = -EINVAL;
4490 break;
4493 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4494 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4495 RAMBlock *block = ram_block_from_stream(mis, f, flags,
4496 RAM_CHANNEL_PRECOPY);
4498 host = host_from_ram_block_offset(block, addr);
4500 * After entering the COLO stage, we should not load pages into the
4501 * SVM's memory directly; we put them into colo_cache first.
4502 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4503 * Previously, we copied all of this memory in the COLO preparation
4504 * stage, during which the VM had to be stopped, a time-consuming process.
4505 * Here we optimize it with a trick: back up every page during the
4506 * migration process while COLO is enabled. Although this affects the
4507 * speed of the migration, it clearly reduces the downtime of backing
4508 * up all of the SVM's memory in the COLO preparation stage.
4510 if (migration_incoming_colo_enabled()) {
4511 if (migration_incoming_in_colo_state()) {
4512 /* In COLO stage, put all pages into cache temporarily */
4513 host = colo_cache_from_block_offset(block, addr, true);
4514 } else {
4516 * In the migration stage but before the COLO stage,
4517 * put all pages into both the cache and the SVM's memory.
4519 host_bak = colo_cache_from_block_offset(block, addr, false);
4522 if (!host) {
4523 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4524 ret = -EINVAL;
4525 break;
4527 if (!migration_incoming_in_colo_state()) {
4528 ramblock_recv_bitmap_set(block, host);
4531 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4534 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4535 case RAM_SAVE_FLAG_MEM_SIZE:
4536 /* Synchronize RAM block list */
4537 total_ram_bytes = addr;
4538 while (!ret && total_ram_bytes) {
4539 RAMBlock *block;
4540 char id[256];
4541 ram_addr_t length;
4543 len = qemu_get_byte(f);
4544 qemu_get_buffer(f, (uint8_t *)id, len);
4545 id[len] = 0;
4546 length = qemu_get_be64(f);
4548 block = qemu_ram_block_by_name(id);
4549 if (block && !qemu_ram_is_migratable(block)) {
4550 error_report("block %s should not be migrated!", id);
4551 ret = -EINVAL;
4552 } else if (block) {
4553 if (length != block->used_length) {
4554 Error *local_err = NULL;
4556 ret = qemu_ram_resize(block, length,
4557 &local_err);
4558 if (local_err) {
4559 error_report_err(local_err);
4562 /* For postcopy we need to check that hugepage sizes match */
4563 if (postcopy_advised && migrate_postcopy_ram() &&
4564 block->page_size != qemu_host_page_size) {
4565 uint64_t remote_page_size = qemu_get_be64(f);
4566 if (remote_page_size != block->page_size) {
4567 error_report("Mismatched RAM page size %s "
4568 "(local) %zd != %" PRId64,
4569 id, block->page_size,
4570 remote_page_size);
4571 ret = -EINVAL;
4574 if (migrate_ignore_shared()) {
4575 hwaddr addr = qemu_get_be64(f);
4576 if (ramblock_is_ignored(block) &&
4577 block->mr->addr != addr) {
4578 error_report("Mismatched GPAs for block %s "
4579 "%" PRId64 "!= %" PRId64,
4580 id, (uint64_t)addr,
4581 (uint64_t)block->mr->addr);
4582 ret = -EINVAL;
4585 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4586 block->idstr);
4587 } else {
4588 error_report("Unknown ramblock \"%s\", cannot "
4589 "accept migration", id);
4590 ret = -EINVAL;
4593 total_ram_bytes -= length;
4595 break;
4597 case RAM_SAVE_FLAG_ZERO:
4598 ch = qemu_get_byte(f);
4599 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4600 break;
4602 case RAM_SAVE_FLAG_PAGE:
4603 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4604 break;
4606 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4607 len = qemu_get_be32(f);
4608 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4609 error_report("Invalid compressed data length: %d", len);
4610 ret = -EINVAL;
4611 break;
4613 decompress_data_with_multi_threads(f, host, len);
4614 break;
4616 case RAM_SAVE_FLAG_XBZRLE:
4617 if (load_xbzrle(f, addr, host) < 0) {
4618 error_report("Failed to decompress XBZRLE page at "
4619 RAM_ADDR_FMT, addr);
4620 ret = -EINVAL;
4621 break;
4623 break;
4624 case RAM_SAVE_FLAG_EOS:
4625 /* normal exit */
4626 multifd_recv_sync_main();
4627 break;
4628 default:
4629 if (flags & RAM_SAVE_FLAG_HOOK) {
4630 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4631 } else {
4632 error_report("Unknown combination of migration flags: 0x%x",
4633 flags);
4634 ret = -EINVAL;
4637 if (!ret) {
4638 ret = qemu_file_get_error(f);
4640 if (!ret && host_bak) {
4641 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4645 ret |= wait_for_decompress_done();
4646 return ret;
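/*
 * ram_load: load_state handler for the "ram" section.  Dispatches to
 * ram_load_postcopy() or ram_load_precopy() under an RCU read lock,
 * depending on whether postcopy is running.
 */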
4649 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4651 int ret = 0;
4652 static uint64_t seq_iter;
4654 * If the system is running in postcopy mode, page insertions into host
4655 * memory must be atomic
4657 bool postcopy_running = postcopy_is_running();
4659 seq_iter++;
4661 if (version_id != 4) {
4662 return -EINVAL;
4666 * This RCU critical section can be very long-running.
4667 * When RCU reclamations in the code start to become numerous,
4668 * it will be necessary to reduce the granularity of this
4669 * critical section.
4671 WITH_RCU_READ_LOCK_GUARD() {
4672 if (postcopy_running) {
4674 * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
4675 * postcopy migration, we have another RAM_CHANNEL_POSTCOPY to
4676 * service fast page faults.
4678 ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
4679 } else {
4680 ret = ram_load_precopy(f);
4683 trace_ram_load_complete(ret, seq_iter);
4685 return ret;
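/*
 * Report whether postcopy is usable for RAM; any pmem (nvdimm) backed
 * RAM block disables it.
 */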
4688 static bool ram_has_postcopy(void *opaque)
4690 RAMBlock *rb;
4691 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4692 if (ramblock_is_pmem(rb)) {
4693 info_report("Block: %s, host: %p is nvdimm memory, postcopy "
4694 "is not supported now!", rb->idstr, rb->host);
4695 return false;
4699 return migrate_postcopy_ram();
4702 /* Sync all the dirty bitmaps with the destination VM. */
4703 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4705 RAMBlock *block;
4706 QEMUFile *file = s->to_dst_file;
4707 int ramblock_count = 0;
4709 trace_ram_dirty_bitmap_sync_start();
4711 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4712 qemu_savevm_send_recv_bitmap(file, block->idstr);
4713 trace_ram_dirty_bitmap_request(block->idstr);
4714 ramblock_count++;
4717 trace_ram_dirty_bitmap_sync_wait();
4719 /* Wait until all the ramblocks' dirty bitmaps are synced */
4720 while (ramblock_count--) {
4721 qemu_sem_wait(&s->rp_state.rp_sem);
4724 trace_ram_dirty_bitmap_sync_complete();
4726 return 0;
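/* Wake ram_dirty_bitmap_sync_all(); posted once per reloaded ramblock */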
4729 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4731 qemu_sem_post(&s->rp_state.rp_sem);
4735 * Read the received bitmap and invert it to form the initial dirty bitmap.
4736 * This is only used when a postcopy migration is paused and we want to
4737 * resume from a middle point.
4739 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4741 int ret = -EINVAL;
4742 /* from_dst_file is always valid because we're within rp_thread */
4743 QEMUFile *file = s->rp_state.from_dst_file;
4744 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4745 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4746 uint64_t size, end_mark;
4748 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4750 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4751 error_report("%s: incorrect state %s", __func__,
4752 MigrationStatus_str(s->state));
4753 return -EINVAL;
4757 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4758 * need the endianness conversion and the padding.
4760 local_size = ROUND_UP(local_size, 8);
4762 /* Add padding */
4763 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4765 size = qemu_get_be64(file);
4767 /* The size of the bitmap should match our ramblock */
4768 if (size != local_size) {
4769 error_report("%s: ramblock '%s' bitmap size mismatch "
4770 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4771 block->idstr, size, local_size);
4772 ret = -EINVAL;
4773 goto out;
4776 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4777 end_mark = qemu_get_be64(file);
4779 ret = qemu_file_get_error(file);
4780 if (ret || size != local_size) {
4781 error_report("%s: read bitmap failed for ramblock '%s': %d"
4782 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4783 __func__, block->idstr, ret, local_size, size);
4784 ret = -EIO;
4785 goto out;
4788 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4789 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4790 __func__, block->idstr, end_mark);
4791 ret = -EINVAL;
4792 goto out;
4796 * Endianness conversion. We are during postcopy (though paused).
4797 * The dirty bitmap won't change. We can directly modify it.
4799 bitmap_from_le(block->bmap, le_bitmap, nbits);
4802 * What we received is the "received bitmap". Invert it to form the
4803 * initial dirty bitmap for this ramblock.
4805 bitmap_complement(block->bmap, block->bmap, nbits);
4807 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4808 ramblock_dirty_bitmap_clear_discarded_pages(block);
4810 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4811 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4814 * We succeeded in syncing the bitmap for the current ramblock. If this
4815 * is the last one to sync, we need to notify the main send thread.
4817 ram_dirty_bitmap_reload_notify(s);
4819 ret = 0;
4820 out:
4821 g_free(le_bitmap);
4822 return ret;
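/*
 * Resume-prepare handler: re-sync the dirty bitmaps with the destination
 * and prepare RAMState on to_dst_file before a paused postcopy migration
 * is resumed.
 */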
4825 static int ram_resume_prepare(MigrationState *s, void *opaque)
4827 RAMState *rs = *(RAMState **)opaque;
4828 int ret;
4830 ret = ram_dirty_bitmap_sync_all(s, rs);
4831 if (ret) {
4832 return ret;
4835 ram_state_resume_prepare(rs, s->to_dst_file);
4837 return 0;
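/* Send an EOS marker down the postcopy preempt channel and flush it */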
4840 void postcopy_preempt_shutdown_file(MigrationState *s)
4842 qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
4843 qemu_fflush(s->postcopy_qemufile_src);
4846 static SaveVMHandlers savevm_ram_handlers = {
4847 .save_setup = ram_save_setup,
4848 .save_live_iterate = ram_save_iterate,
4849 .save_live_complete_postcopy = ram_save_complete,
4850 .save_live_complete_precopy = ram_save_complete,
4851 .has_postcopy = ram_has_postcopy,
4852 .save_live_pending = ram_save_pending,
4853 .load_state = ram_load,
4854 .save_cleanup = ram_save_cleanup,
4855 .load_setup = ram_load_setup,
4856 .load_cleanup = ram_load_cleanup,
4857 .resume_prepare = ram_resume_prepare,
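/*
 * RAM block resize notifier: a resize during an active precopy migration
 * cancels the migration; on the destination it keeps the postcopy
 * bookkeeping (postcopy_length, discarded ranges) consistent.
 */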
4860 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4861 size_t old_size, size_t new_size)
4863 PostcopyState ps = postcopy_state_get();
4864 ram_addr_t offset;
4865 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4866 Error *err = NULL;
4868 if (ramblock_is_ignored(rb)) {
4869 return;
4872 if (!migration_is_idle()) {
4874 * Precopy code on the source cannot deal with the size of RAM blocks
4875 * changing at random points in time; in particular, after the RAM block
4876 * sizes have been sent in the migration stream they must no longer change.
4877 * Abort and indicate a proper reason.
4879 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4880 migration_cancel(err);
4881 error_free(err);
4884 switch (ps) {
4885 case POSTCOPY_INCOMING_ADVISE:
4887 * Update what ram_postcopy_incoming_init()->init_range() does at the
4888 * time postcopy was advised. Syncing RAM blocks with the source will
4889 * result in RAM resizes.
4891 if (old_size < new_size) {
4892 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4893 error_report("RAM block '%s' discard of resized RAM failed",
4894 rb->idstr);
4897 rb->postcopy_length = new_size;
4898 break;
4899 case POSTCOPY_INCOMING_NONE:
4900 case POSTCOPY_INCOMING_RUNNING:
4901 case POSTCOPY_INCOMING_END:
4903 * Once our guest is running, postcopy no longer cares about
4904 * resizes. When growing, the new memory was not available on the
4905 * source, so no handler is needed.
4907 break;
4908 default:
4909 error_report("RAM block '%s' resized during postcopy state: %d",
4910 rb->idstr, ps);
4911 exit(-1);
4915 static RAMBlockNotifier ram_mig_ram_notifier = {
4916 .ram_block_resized = ram_mig_ram_block_resized,
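/*
 * Initialise the XBZRLE lock and register both the "ram" savevm handlers
 * and the RAM block resize notifier.
 */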
4919 void ram_mig_init(void)
4921 qemu_mutex_init(&XBZRLE.lock);
4922 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4923 ram_block_notifier_add(&ram_mig_ram_notifier);