migration/ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
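/*
 * These flags are OR-ed into the low bits of the 8-byte page offset that
 * save_page_header() below writes to the stream: since offsets are
 * target-page aligned, the low bits are free to carry per-page metadata.
 */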
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from migrate_params_apply in the main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock().
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp if the check failed, with reason
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
131 PageCache *new_cache;
132 int64_t ret = 0;
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
146 XBZRLE_cache_lock();
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
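/*
 * Typical call path (assumption, for orientation only): the QMP command
 * 'migrate-set-parameters' with 'xbzrle-cache-size' lands in
 * migrate_params_apply(), which calls xbzrle_cache_resize() above;
 * cache_init() is expected to validate the requested size itself.
 */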
163 bool ramblock_is_ignored(RAMBlock *block)
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 RAMBlock *block;
174 int ret = 0;
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
184 return ret;
187 static void ramblock_recv_map_init(void)
189 RAMBlock *rb;
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 if success with sent bytes, or <0 if error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see the comment
245 * below). So extend it a bit beforehand.
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 * Always use little endian when sending the bitmap. This is
251 * required so that it works even when source and destination VMs
252 * are not using the same endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
265 size = ROUND_UP(size, 8);
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
276 g_free(le_bitmap);
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
282 return size + sizeof(size);
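/*
 * Resulting wire layout for one block's receive bitmap:
 *   [ be64: padded bitmap size in bytes ]
 *   [ size bytes: little-endian bitmap, zero-padded to an 8-byte multiple ]
 *   [ be64: RAMBLOCK_RECV_BITMAP_ENDING sanity marker ]
 */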
286 * An outstanding page request, on the source, having been received
287 * and queued
289 struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 /* State of RAM for migration */
298 struct RAMState {
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
309 /* last ram version we have seen */
310 uint32_t last_version;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
329 /* compression statistics since the beginning of the period */
330 /* amount of count that no free thread to compress data */
331 uint64_t compress_thread_busy_prev;
332 /* amount bytes after compression */
333 uint64_t compressed_size_prev;
334 /* amount of compressed pages */
335 uint64_t compress_pages_prev;
337 /* total handled target pages at the beginning of period */
338 uint64_t target_page_count_prev;
339 /* total handled target pages since start */
340 uint64_t target_page_count;
341 /* number of dirty bits in the bitmap */
342 uint64_t migration_dirty_pages;
343 /* Protects modification of the bitmap and migration dirty pages */
344 QemuMutex bitmap_mutex;
345 /* The RAMBlock used in the last src_page_requests */
346 RAMBlock *last_req_rb;
347 /* Queue of outstanding page requests from the destination */
348 QemuMutex src_page_req_mutex;
349 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
351 typedef struct RAMState RAMState;
353 static RAMState *ram_state;
355 static NotifierWithReturnList precopy_notifier_list;
357 void precopy_infrastructure_init(void)
359 notifier_with_return_list_init(&precopy_notifier_list);
362 void precopy_add_notifier(NotifierWithReturn *n)
364 notifier_with_return_list_add(&precopy_notifier_list, n);
367 void precopy_remove_notifier(NotifierWithReturn *n)
369 notifier_with_return_remove(n);
372 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
374 PrecopyNotifyData pnd;
375 pnd.reason = reason;
376 pnd.errp = errp;
378 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
381 uint64_t ram_bytes_remaining(void)
383 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
387 MigrationStats ram_counters;
389 /* used by the search for pages to send */
390 struct PageSearchStatus {
391 /* Current block being searched */
392 RAMBlock *block;
393 /* Current page to search from */
394 unsigned long page;
395 /* Set once we wrap around */
396 bool complete_round;
398 typedef struct PageSearchStatus PageSearchStatus;
400 CompressionStats compression_counters;
402 struct CompressParam {
403 bool done;
404 bool quit;
405 bool zero_page;
406 QEMUFile *file;
407 QemuMutex mutex;
408 QemuCond cond;
409 RAMBlock *block;
410 ram_addr_t offset;
412 /* internally used fields */
413 z_stream stream;
414 uint8_t *originbuf;
416 typedef struct CompressParam CompressParam;
418 struct DecompressParam {
419 bool done;
420 bool quit;
421 QemuMutex mutex;
422 QemuCond cond;
423 void *des;
424 uint8_t *compbuf;
425 int len;
426 z_stream stream;
428 typedef struct DecompressParam DecompressParam;
430 static CompressParam *comp_param;
431 static QemuThread *compress_threads;
432 /* comp_done_cond is used to wake up the migration thread when
433 * one of the compression threads has finished the compression.
434 * comp_done_lock is used together with comp_done_cond.
436 static QemuMutex comp_done_lock;
437 static QemuCond comp_done_cond;
438 /* The empty QEMUFileOps will be used by file in CompressParam */
439 static const QEMUFileOps empty_ops = { };
441 static QEMUFile *decomp_file;
442 static DecompressParam *decomp_param;
443 static QemuThread *decompress_threads;
444 static QemuMutex decomp_done_lock;
445 static QemuCond decomp_done_cond;
447 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
448 ram_addr_t offset, uint8_t *source_buf);
450 static void *do_data_compress(void *opaque)
452 CompressParam *param = opaque;
453 RAMBlock *block;
454 ram_addr_t offset;
455 bool zero_page;
457 qemu_mutex_lock(&param->mutex);
458 while (!param->quit) {
459 if (param->block) {
460 block = param->block;
461 offset = param->offset;
462 param->block = NULL;
463 qemu_mutex_unlock(&param->mutex);
465 zero_page = do_compress_ram_page(param->file, &param->stream,
466 block, offset, param->originbuf);
468 qemu_mutex_lock(&comp_done_lock);
469 param->done = true;
470 param->zero_page = zero_page;
471 qemu_cond_signal(&comp_done_cond);
472 qemu_mutex_unlock(&comp_done_lock);
474 qemu_mutex_lock(&param->mutex);
475 } else {
476 qemu_cond_wait(&param->cond, &param->mutex);
479 qemu_mutex_unlock(&param->mutex);
481 return NULL;
484 static void compress_threads_save_cleanup(void)
486 int i, thread_count;
488 if (!migrate_use_compression() || !comp_param) {
489 return;
492 thread_count = migrate_compress_threads();
493 for (i = 0; i < thread_count; i++) {
495 * we use it as an indicator which shows if the thread is
496 * properly init'd or not
498 if (!comp_param[i].file) {
499 break;
502 qemu_mutex_lock(&comp_param[i].mutex);
503 comp_param[i].quit = true;
504 qemu_cond_signal(&comp_param[i].cond);
505 qemu_mutex_unlock(&comp_param[i].mutex);
507 qemu_thread_join(compress_threads + i);
508 qemu_mutex_destroy(&comp_param[i].mutex);
509 qemu_cond_destroy(&comp_param[i].cond);
510 deflateEnd(&comp_param[i].stream);
511 g_free(comp_param[i].originbuf);
512 qemu_fclose(comp_param[i].file);
513 comp_param[i].file = NULL;
515 qemu_mutex_destroy(&comp_done_lock);
516 qemu_cond_destroy(&comp_done_cond);
517 g_free(compress_threads);
518 g_free(comp_param);
519 compress_threads = NULL;
520 comp_param = NULL;
523 static int compress_threads_save_setup(void)
525 int i, thread_count;
527 if (!migrate_use_compression()) {
528 return 0;
530 thread_count = migrate_compress_threads();
531 compress_threads = g_new0(QemuThread, thread_count);
532 comp_param = g_new0(CompressParam, thread_count);
533 qemu_cond_init(&comp_done_cond);
534 qemu_mutex_init(&comp_done_lock);
535 for (i = 0; i < thread_count; i++) {
536 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
537 if (!comp_param[i].originbuf) {
538 goto exit;
541 if (deflateInit(&comp_param[i].stream,
542 migrate_compress_level()) != Z_OK) {
543 g_free(comp_param[i].originbuf);
544 goto exit;
547 /* comp_param[i].file is just used as a dummy buffer to save data,
548 * set its ops to empty.
550 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
551 comp_param[i].done = true;
552 comp_param[i].quit = false;
553 qemu_mutex_init(&comp_param[i].mutex);
554 qemu_cond_init(&comp_param[i].cond);
555 qemu_thread_create(compress_threads + i, "compress",
556 do_data_compress, comp_param + i,
557 QEMU_THREAD_JOINABLE);
559 return 0;
561 exit:
562 compress_threads_save_cleanup();
563 return -1;
567 * save_page_header: write page header to wire
569 * If this is the 1st block, it also writes the block identification
571 * Returns the number of bytes written
573 * @f: QEMUFile where to send the data
574 * @block: block that contains the page we want to send
575 * @offset: offset inside the block for the page
576 * in the lower bits, it contains flags
578 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
579 ram_addr_t offset)
581 size_t size, len;
583 if (block == rs->last_sent_block) {
584 offset |= RAM_SAVE_FLAG_CONTINUE;
586 qemu_put_be64(f, offset);
587 size = 8;
589 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
590 len = strlen(block->idstr);
591 qemu_put_byte(f, len);
592 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
593 size += 1 + len;
594 rs->last_sent_block = block;
596 return size;
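/*
 * On the wire this produces:
 *   [ be64: page offset with RAM_SAVE_FLAG_* in the low bits ]
 *   [ u8: strlen(idstr) ][ idstr bytes ]   (only when CONTINUE is not set)
 */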
600 * mig_throttle_guest_down: throttle down the guest
602 * Reduce the amount of guest CPU execution to hopefully slow down memory
603 * writes. If guest dirty memory rate is reduced below the rate at
604 * which we can transfer pages to the destination then we should be
605 * able to complete migration. Some workloads dirty memory way too
606 * fast and will not effectively converge, even with auto-converge.
608 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
609 uint64_t bytes_dirty_threshold)
611 MigrationState *s = migrate_get_current();
612 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
613 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
614 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
615 int pct_max = s->parameters.max_cpu_throttle;
617 uint64_t throttle_now = cpu_throttle_get_percentage();
618 uint64_t cpu_now, cpu_ideal, throttle_inc;
620 /* We have not started throttling yet. Let's start it. */
621 if (!cpu_throttle_active()) {
622 cpu_throttle_set(pct_initial);
623 } else {
624 /* Throttling already on, just increase the rate */
625 if (!pct_tailslow) {
626 throttle_inc = pct_increment;
627 } else {
628 /* Compute the ideal CPU percentage used by Guest, which may
629 * make the dirty rate match the dirty rate threshold. */
630 cpu_now = 100 - throttle_now;
631 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
632 bytes_dirty_period);
633 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
635 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
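/*
 * Worked example for the tailslow path above: with throttle_now = 60 the
 * guest currently gets cpu_now = 40% of CPU time. If the dirty rate is twice
 * the threshold, cpu_ideal = 40 * 0.5 = 20, so throttle_inc =
 * MIN(40 - 20, pct_increment) and the new throttle becomes
 * MIN(60 + throttle_inc, pct_max).
 */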
639 void mig_throttle_counter_reset(void)
641 RAMState *rs = ram_state;
643 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
644 rs->num_dirty_pages_period = 0;
645 rs->bytes_xfer_prev = ram_counters.transferred;
649 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
651 * @rs: current RAM state
652 * @current_addr: address for the zero page
654 * Update the xbzrle cache to reflect a page that's been sent as all 0.
655 * The important thing is that a stale (not-yet-0'd) page be replaced
656 * by the new data.
657 * As a bonus, if the page wasn't in the cache it gets added so that
658 * when a small write is made into the 0'd page it gets XBZRLE sent.
660 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
662 if (!rs->xbzrle_enabled) {
663 return;
666 /* We don't care if this fails to allocate a new cache page
667 * as long as it updated an old one */
668 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
669 ram_counters.dirty_sync_count);
672 #define ENCODING_FLAG_XBZRLE 0x1
675 * save_xbzrle_page: compress and send current page
677 * Returns: 1 means that we wrote the page
678 * 0 means that page is identical to the one already sent
679 * -1 means that xbzrle would be longer than normal
681 * @rs: current RAM state
682 * @current_data: pointer to the address of the page contents
683 * @current_addr: addr of the page
684 * @block: block that contains the page we want to send
685 * @offset: offset inside the block for the page
686 * @last_stage: if we are at the completion stage
688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
689 ram_addr_t current_addr, RAMBlock *block,
690 ram_addr_t offset, bool last_stage)
692 int encoded_len = 0, bytes_xbzrle;
693 uint8_t *prev_cached_page;
695 if (!cache_is_cached(XBZRLE.cache, current_addr,
696 ram_counters.dirty_sync_count)) {
697 xbzrle_counters.cache_miss++;
698 if (!last_stage) {
699 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
700 ram_counters.dirty_sync_count) == -1) {
701 return -1;
702 } else {
703 /* update *current_data when the page has been
704 inserted into cache */
705 *current_data = get_cached_data(XBZRLE.cache, current_addr);
708 return -1;
712 * Reaching here means the page has hit the xbzrle cache, no matter what
713 * encoding result it is (normal encoding, overflow or skipping the page),
714 * count the page as encoded. This is used to calculate the encoding rate.
716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
717 * 2nd page turns out to be skipped (i.e. no new bytes written to the
718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
719 * skipped page included. In this way, the encoding rate can tell if the
720 * guest page is good for xbzrle encoding.
722 xbzrle_counters.pages++;
723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
725 /* save current buffer into memory */
726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
728 /* XBZRLE encoding (if there is no overflow) */
729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
731 TARGET_PAGE_SIZE);
734 * Update the cache contents, so that it corresponds to the data
735 * sent, in all cases except where we skip the page.
737 if (!last_stage && encoded_len != 0) {
738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
740 * In the case where we couldn't compress, ensure that the caller
741 * sends the data from the cache, since the guest might have
742 * changed the RAM since we copied it.
744 *current_data = prev_cached_page;
747 if (encoded_len == 0) {
748 trace_save_xbzrle_page_skipping();
749 return 0;
750 } else if (encoded_len == -1) {
751 trace_save_xbzrle_page_overflow();
752 xbzrle_counters.overflow++;
753 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
754 return -1;
757 /* Send XBZRLE based compressed page */
758 bytes_xbzrle = save_page_header(rs, rs->f, block,
759 offset | RAM_SAVE_FLAG_XBZRLE);
760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
761 qemu_put_be16(rs->f, encoded_len);
762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
763 bytes_xbzrle += encoded_len + 1 + 2;
765 * Like compressed_size (please see update_compress_thread_counts),
766 * the xbzrle encoded bytes don't count the 8 byte header with
767 * RAM_SAVE_FLAG_CONTINUE.
769 xbzrle_counters.bytes += bytes_xbzrle - 8;
770 ram_counters.transferred += bytes_xbzrle;
772 return 1;
776 * migration_bitmap_find_dirty: find the next dirty page from start
778 * Returns the page offset within memory region of the start of a dirty page
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
791 if (ramblock_is_ignored(rb)) {
792 return size;
795 return find_next_bit(bitmap, size, start);
798 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
799 unsigned long page)
801 uint8_t shift;
802 hwaddr size, start;
804 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
805 return;
808 shift = rb->clear_bmap_shift;
810 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
811 * can make things easier sometimes since then start address
812 * of the small chunk will always be 64 pages aligned so the
813 * bitmap will always be aligned to unsigned long. We should
814 * even be able to remove this restriction but I'm simply
815 * keeping it.
817 assert(shift >= 6);
819 size = 1ULL << (TARGET_PAGE_BITS + shift);
820 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
821 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
822 memory_region_clear_dirty_bitmap(rb->mr, start, size);
825 static void
826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
827 unsigned long start,
828 unsigned long npages)
830 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
831 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
832 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
835 * Clear pages from start to start + npages - 1, so the end boundary is
836 * exclusive.
838 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
839 migration_clear_memory_region_dirty_bitmap(rb, i);
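/*
 * Example: with clear_bmap_shift = 6 a chunk covers 64 pages, so clearing
 * start = 70, npages = 10 rounds out to the single chunk starting at page 64
 * and issues one memory_region_clear_dirty_bitmap() call for it.
 */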
844 * colo_bitmap_find_dirty: find contiguous dirty pages from start
846 * Returns the page offset within memory region of the start of the
847 * contiguous dirty pages
849 * @rs: current RAM state
850 * @rb: RAMBlock where to search for dirty pages
851 * @start: page where we start the search
852 * @num: the number of contiguous dirty pages
854 static inline
855 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
856 unsigned long start, unsigned long *num)
858 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
859 unsigned long *bitmap = rb->bmap;
860 unsigned long first, next;
862 *num = 0;
864 if (ramblock_is_ignored(rb)) {
865 return size;
868 first = find_next_bit(bitmap, size, start);
869 if (first >= size) {
870 return first;
872 next = find_next_zero_bit(bitmap, size, first + 1);
873 assert(next >= first);
874 *num = next - first;
875 return first;
878 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
879 RAMBlock *rb,
880 unsigned long page)
882 bool ret;
885 * Clear dirty bitmap if needed. This _must_ be called before we
886 * send any of the page in the chunk because we need to make sure
887 * we can capture further page content changes when we sync dirty
888 * log the next time. So as long as we are going to send any of
889 * the page in the chunk we clear the remote dirty bitmap for all.
890 * Clearing it earlier won't be a problem, but too late will.
892 migration_clear_memory_region_dirty_bitmap(rb, page);
894 ret = test_and_clear_bit(page, rb->bmap);
895 if (ret) {
896 rs->migration_dirty_pages--;
899 return ret;
902 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
903 void *opaque)
905 const hwaddr offset = section->offset_within_region;
906 const hwaddr size = int128_get64(section->size);
907 const unsigned long start = offset >> TARGET_PAGE_BITS;
908 const unsigned long npages = size >> TARGET_PAGE_BITS;
909 RAMBlock *rb = section->mr->ram_block;
910 uint64_t *cleared_bits = opaque;
913 * We don't grab ram_state->bitmap_mutex because we expect to run
914 * only when starting migration or during postcopy recovery where
915 * we don't have concurrent access.
917 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
918 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
920 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
921 bitmap_clear(rb->bmap, start, npages);
925 * Exclude all dirty pages from migration that fall into a discarded range as
926 * managed by a RamDiscardManager responsible for the mapped memory region of
927 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
929 * Discarded pages ("logically unplugged") have undefined content and must
930 * not get migrated, because even reading these pages for migration might
931 * result in undesired behavior.
933 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
935 * Note: The result is only stable while migrating (precopy/postcopy).
937 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
939 uint64_t cleared_bits = 0;
941 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
942 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
943 MemoryRegionSection section = {
944 .mr = rb->mr,
945 .offset_within_region = 0,
946 .size = int128_make64(qemu_ram_get_used_length(rb)),
949 ram_discard_manager_replay_discarded(rdm, &section,
950 dirty_bitmap_clear_section,
951 &cleared_bits);
953 return cleared_bits;
957 * Check if a host-page aligned page falls into a discarded range as managed by
958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
960 * Note: The result is only stable while migrating (precopy/postcopy).
962 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
964 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
965 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
966 MemoryRegionSection section = {
967 .mr = rb->mr,
968 .offset_within_region = start,
969 .size = int128_make64(qemu_ram_pagesize(rb)),
972 return !ram_discard_manager_is_populated(rdm, &section);
974 return false;
977 /* Called with RCU critical section */
978 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
980 uint64_t new_dirty_pages =
981 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
983 rs->migration_dirty_pages += new_dirty_pages;
984 rs->num_dirty_pages_period += new_dirty_pages;
988 * ram_pagesize_summary: calculate all the pagesizes of a VM
990 * Returns a summary bitmap of the page sizes of all RAMBlocks
992 * For VMs with just normal pages this is equivalent to the host page
993 * size. If it's got some huge pages then it's the OR of all the
994 * different page sizes.
996 uint64_t ram_pagesize_summary(void)
998 RAMBlock *block;
999 uint64_t summary = 0;
1001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1002 summary |= block->page_size;
1005 return summary;
1008 uint64_t ram_get_total_transferred_pages(void)
1010 return ram_counters.normal + ram_counters.duplicate +
1011 compression_counters.pages + xbzrle_counters.pages;
1014 static void migration_update_rates(RAMState *rs, int64_t end_time)
1016 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1017 double compressed_size;
1019 /* calculate period counters */
1020 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1021 / (end_time - rs->time_last_bitmap_sync);
1023 if (!page_count) {
1024 return;
1027 if (migrate_use_xbzrle()) {
1028 double encoded_size, unencoded_size;
1030 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1031 rs->xbzrle_cache_miss_prev) / page_count;
1032 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1033 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1034 TARGET_PAGE_SIZE;
1035 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1036 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1037 xbzrle_counters.encoding_rate = 0;
1038 } else {
1039 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1041 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1042 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1045 if (migrate_use_compression()) {
1046 compression_counters.busy_rate = (double)(compression_counters.busy -
1047 rs->compress_thread_busy_prev) / page_count;
1048 rs->compress_thread_busy_prev = compression_counters.busy;
1050 compressed_size = compression_counters.compressed_size -
1051 rs->compressed_size_prev;
1052 if (compressed_size) {
1053 double uncompressed_size = (compression_counters.pages -
1054 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1056 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1057 compression_counters.compression_rate =
1058 uncompressed_size / compressed_size;
1060 rs->compress_pages_prev = compression_counters.pages;
1061 rs->compressed_size_prev = compression_counters.compressed_size;
1066 static void migration_trigger_throttle(RAMState *rs)
1068 MigrationState *s = migrate_get_current();
1069 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1071 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1072 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1073 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1075 /* During block migration the auto-converge logic incorrectly detects
1076 * that ram migration makes no progress. Avoid this by disabling the
1077 * throttling logic during the bulk phase of block migration. */
1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1079 /* The following detection logic can be refined later. For now:
1080 Check to see if the ratio between dirtied bytes and the approx.
1081 amount of bytes that just got transferred since the last time
1082 we were in this routine reaches the threshold. If that happens
1083 twice, start or increase throttling. */
1085 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1086 (++rs->dirty_rate_high_cnt >= 2)) {
1087 trace_migration_throttle();
1088 rs->dirty_rate_high_cnt = 0;
1089 mig_throttle_guest_down(bytes_dirty_period,
1090 bytes_dirty_threshold);
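/*
 * Example (assuming the default throttle-trigger-threshold of 50): if the
 * guest dirties more bytes during a sync period than half of what was
 * transferred in that same period, and this happens in two consecutive
 * periods, throttling is started or ramped up.
 */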
1095 static void migration_bitmap_sync(RAMState *rs)
1097 RAMBlock *block;
1098 int64_t end_time;
1100 ram_counters.dirty_sync_count++;
1102 if (!rs->time_last_bitmap_sync) {
1103 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1106 trace_migration_bitmap_sync_start();
1107 memory_global_dirty_log_sync();
1109 qemu_mutex_lock(&rs->bitmap_mutex);
1110 WITH_RCU_READ_LOCK_GUARD() {
1111 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1112 ramblock_sync_dirty_bitmap(rs, block);
1114 ram_counters.remaining = ram_bytes_remaining();
1116 qemu_mutex_unlock(&rs->bitmap_mutex);
1118 memory_global_after_dirty_log_sync();
1119 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1121 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123 /* more than 1 second = 1000 milliseconds */
1124 if (end_time > rs->time_last_bitmap_sync + 1000) {
1125 migration_trigger_throttle(rs);
1127 migration_update_rates(rs, end_time);
1129 rs->target_page_count_prev = rs->target_page_count;
1131 /* reset period counters */
1132 rs->time_last_bitmap_sync = end_time;
1133 rs->num_dirty_pages_period = 0;
1134 rs->bytes_xfer_prev = ram_counters.transferred;
1136 if (migrate_use_events()) {
1137 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1141 static void migration_bitmap_sync_precopy(RAMState *rs)
1143 Error *local_err = NULL;
1146 * The current notifier usage is just an optimization to migration, so we
1147 * don't stop the normal migration process in the error case.
1149 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1150 error_report_err(local_err);
1151 local_err = NULL;
1154 migration_bitmap_sync(rs);
1156 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1157 error_report_err(local_err);
1162 * save_zero_page_to_file: send the zero page to the file
1164 * Returns the size of data written to the file, 0 means the page is not
1165 * a zero page
1167 * @rs: current RAM state
1168 * @file: the file where the data is saved
1169 * @block: block that contains the page we want to send
1170 * @offset: offset inside the block for the page
1172 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1173 RAMBlock *block, ram_addr_t offset)
1175 uint8_t *p = block->host + offset;
1176 int len = 0;
1178 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1179 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1180 qemu_put_byte(file, 0);
1181 len += 1;
1183 return len;
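/*
 * A zero page therefore costs only the page header plus one 0x00 byte on
 * the wire, instead of a full TARGET_PAGE_SIZE payload.
 */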
1187 * save_zero_page: send the zero page to the stream
1189 * Returns the number of pages written.
1191 * @rs: current RAM state
1192 * @block: block that contains the page we want to send
1193 * @offset: offset inside the block for the page
1195 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1197 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1199 if (len) {
1200 ram_counters.duplicate++;
1201 ram_counters.transferred += len;
1202 return 1;
1204 return -1;
1207 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1209 if (!migrate_release_ram() || !migration_in_postcopy()) {
1210 return;
1213 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1217 * @pages: the number of pages written by the control path,
1218 * < 0 - error
1219 * > 0 - number of pages written
1221 * Return true if the page has been saved, otherwise false is returned.
1223 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1224 int *pages)
1226 uint64_t bytes_xmit = 0;
1227 int ret;
1229 *pages = -1;
1230 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1231 &bytes_xmit);
1232 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1233 return false;
1236 if (bytes_xmit) {
1237 ram_counters.transferred += bytes_xmit;
1238 *pages = 1;
1241 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1242 return true;
1245 if (bytes_xmit > 0) {
1246 ram_counters.normal++;
1247 } else if (bytes_xmit == 0) {
1248 ram_counters.duplicate++;
1251 return true;
1255 * directly send the page to the stream
1257 * Returns the number of pages written.
1259 * @rs: current RAM state
1260 * @block: block that contains the page we want to send
1261 * @offset: offset inside the block for the page
1262 * @buf: the page to be sent
1263 * @async: send the page asynchronously
1265 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1266 uint8_t *buf, bool async)
1268 ram_counters.transferred += save_page_header(rs, rs->f, block,
1269 offset | RAM_SAVE_FLAG_PAGE);
1270 if (async) {
1271 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1272 migrate_release_ram() &
1273 migration_in_postcopy());
1274 } else {
1275 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1277 ram_counters.transferred += TARGET_PAGE_SIZE;
1278 ram_counters.normal++;
1279 return 1;
1283 * ram_save_page: send the given page to the stream
1285 * Returns the number of pages written.
1286 * < 0 - error
1287 * >=0 - Number of pages written - this might legally be 0
1288 * if xbzrle noticed the page was the same.
1290 * @rs: current RAM state
1291 * @block: block that contains the page we want to send
1292 * @offset: offset inside the block for the page
1293 * @last_stage: if we are at the completion stage
1295 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1297 int pages = -1;
1298 uint8_t *p;
1299 bool send_async = true;
1300 RAMBlock *block = pss->block;
1301 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1302 ram_addr_t current_addr = block->offset + offset;
1304 p = block->host + offset;
1305 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1307 XBZRLE_cache_lock();
1308 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1309 pages = save_xbzrle_page(rs, &p, current_addr, block,
1310 offset, last_stage);
1311 if (!last_stage) {
1312 /* Can't send this cached data async, since the cache page
1313 * might get updated before it gets to the wire
1315 send_async = false;
1319 /* XBZRLE overflow or normal page */
1320 if (pages == -1) {
1321 pages = save_normal_page(rs, block, offset, p, send_async);
1324 XBZRLE_cache_unlock();
1326 return pages;
1329 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1330 ram_addr_t offset)
1332 if (multifd_queue_page(rs->f, block, offset) < 0) {
1333 return -1;
1335 ram_counters.normal++;
1337 return 1;
1340 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1341 ram_addr_t offset, uint8_t *source_buf)
1343 RAMState *rs = ram_state;
1344 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1345 bool zero_page = false;
1346 int ret;
1348 if (save_zero_page_to_file(rs, f, block, offset)) {
1349 zero_page = true;
1350 goto exit;
1353 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1356 * copy it to an internal buffer to avoid it being modified by the VM
1357 * so that we can catch any error during compression and
1358 * decompression
1360 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1361 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1362 if (ret < 0) {
1363 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1364 error_report("compressed data failed!");
1365 return false;
1368 exit:
1369 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1370 return zero_page;
1373 static void
1374 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1376 ram_counters.transferred += bytes_xmit;
1378 if (param->zero_page) {
1379 ram_counters.duplicate++;
1380 return;
1383 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1384 compression_counters.compressed_size += bytes_xmit - 8;
1385 compression_counters.pages++;
1388 static bool save_page_use_compression(RAMState *rs);
1390 static void flush_compressed_data(RAMState *rs)
1392 int idx, len, thread_count;
1394 if (!save_page_use_compression(rs)) {
1395 return;
1397 thread_count = migrate_compress_threads();
1399 qemu_mutex_lock(&comp_done_lock);
1400 for (idx = 0; idx < thread_count; idx++) {
1401 while (!comp_param[idx].done) {
1402 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1405 qemu_mutex_unlock(&comp_done_lock);
1407 for (idx = 0; idx < thread_count; idx++) {
1408 qemu_mutex_lock(&comp_param[idx].mutex);
1409 if (!comp_param[idx].quit) {
1410 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1412 * it's safe to fetch zero_page without holding comp_done_lock
1413 * as there is no further request submitted to the thread,
1414 * i.e., the thread should be waiting for a request at this point.
1416 update_compress_thread_counts(&comp_param[idx], len);
1418 qemu_mutex_unlock(&comp_param[idx].mutex);
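/*
 * Note: each worker compresses into its private comp_param[idx].file (a
 * buffer-only QEMUFile created with empty_ops); only the migration thread
 * copies those buffered bytes into rs->f via qemu_put_qemu_file(), so the
 * main stream is never written to concurrently.
 */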
1422 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1423 ram_addr_t offset)
1425 param->block = block;
1426 param->offset = offset;
1429 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1430 ram_addr_t offset)
1432 int idx, thread_count, bytes_xmit = -1, pages = -1;
1433 bool wait = migrate_compress_wait_thread();
1435 thread_count = migrate_compress_threads();
1436 qemu_mutex_lock(&comp_done_lock);
1437 retry:
1438 for (idx = 0; idx < thread_count; idx++) {
1439 if (comp_param[idx].done) {
1440 comp_param[idx].done = false;
1441 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1442 qemu_mutex_lock(&comp_param[idx].mutex);
1443 set_compress_params(&comp_param[idx], block, offset);
1444 qemu_cond_signal(&comp_param[idx].cond);
1445 qemu_mutex_unlock(&comp_param[idx].mutex);
1446 pages = 1;
1447 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1448 break;
1453 * wait for the free thread if the user specifies 'compress-wait-thread',
1454 * otherwise we will post the page out in the main thread as normal page.
1456 if (pages < 0 && wait) {
1457 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1458 goto retry;
1460 qemu_mutex_unlock(&comp_done_lock);
1462 return pages;
1466 * find_dirty_block: find the next dirty page and update any state
1467 * associated with the search process.
1469 * Returns true if a page is found
1471 * @rs: current RAM state
1472 * @pss: data about the state of the current dirty page scan
1473 * @again: set to false if the search has scanned the whole of RAM
1475 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1477 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1478 if (pss->complete_round && pss->block == rs->last_seen_block &&
1479 pss->page >= rs->last_page) {
1481 * We've been once around the RAM and haven't found anything.
1482 * Give up.
1484 *again = false;
1485 return false;
1487 if (!offset_in_ramblock(pss->block,
1488 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1489 /* Didn't find anything in this RAM Block */
1490 pss->page = 0;
1491 pss->block = QLIST_NEXT_RCU(pss->block, next);
1492 if (!pss->block) {
1494 * If memory migration starts over, we will meet a dirtied page
1495 * which may still exist in the compression threads' ring, so we
1496 * should flush the compressed data to make sure the new page
1497 * is not overwritten by the old one in the destination.
1499 * Also, if xbzrle is on, stop using the data compression at this
1500 * point. In theory, xbzrle can do better than compression.
1502 flush_compressed_data(rs);
1504 /* Hit the end of the list */
1505 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1506 /* Flag that we've looped */
1507 pss->complete_round = true;
1508 /* After the first round, enable XBZRLE. */
1509 if (migrate_use_xbzrle()) {
1510 rs->xbzrle_enabled = true;
1513 /* Didn't find anything this time, but try again on the new block */
1514 *again = true;
1515 return false;
1516 } else {
1517 /* Can go around again, but... */
1518 *again = true;
1519 /* We've found something so probably don't need to */
1520 return true;
1525 * unqueue_page: gets a page of the queue
1527 * Helper for 'get_queued_page' - gets a page off the queue
1529 * Returns the block of the page (or NULL if none available)
1531 * @rs: current RAM state
1532 * @offset: used to return the offset within the RAMBlock
1534 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1536 RAMBlock *block = NULL;
1538 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1539 return NULL;
1542 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1543 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1544 struct RAMSrcPageRequest *entry =
1545 QSIMPLEQ_FIRST(&rs->src_page_requests);
1546 block = entry->rb;
1547 *offset = entry->offset;
1549 if (entry->len > TARGET_PAGE_SIZE) {
1550 entry->len -= TARGET_PAGE_SIZE;
1551 entry->offset += TARGET_PAGE_SIZE;
1552 } else {
1553 memory_region_unref(block->mr);
1554 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1555 g_free(entry);
1556 migration_consume_urgent_request();
1560 return block;
1563 #if defined(__linux__)
1565 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1566 * is found, return RAM block pointer and page offset
1568 * Returns pointer to the RAMBlock containing faulting page,
1569 * NULL if no write faults are pending
1571 * @rs: current RAM state
1572 * @offset: page offset from the beginning of the block
1574 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1576 struct uffd_msg uffd_msg;
1577 void *page_address;
1578 RAMBlock *block;
1579 int res;
1581 if (!migrate_background_snapshot()) {
1582 return NULL;
1585 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1586 if (res <= 0) {
1587 return NULL;
1590 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1591 block = qemu_ram_block_from_host(page_address, false, offset);
1592 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1593 return block;
1597 * ram_save_release_protection: release UFFD write protection after
1598 * a range of pages has been saved
1600 * @rs: current RAM state
1601 * @pss: page-search-status structure
1602 * @start_page: index of the first page in the range relative to pss->block
1604 * Returns 0 on success, negative value in case of an error
1606 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1607 unsigned long start_page)
1609 int res = 0;
1611 /* Check if page is from UFFD-managed region. */
1612 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1613 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1614 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1616 /* Flush async buffers before un-protect. */
1617 qemu_fflush(rs->f);
1618 /* Un-protect memory range. */
1619 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1620 false, false);
1623 return res;
1626 /* ram_write_tracking_available: check if kernel supports required UFFD features
1628 * Returns true if supports, false otherwise
1630 bool ram_write_tracking_available(void)
1632 uint64_t uffd_features;
1633 int res;
1635 res = uffd_query_features(&uffd_features);
1636 return (res == 0 &&
1637 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1640 /* ram_write_tracking_compatible: check if guest configuration is
1641 * compatible with 'write-tracking'
1643 * Returns true if compatible, false otherwise
1645 bool ram_write_tracking_compatible(void)
1647 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1648 int uffd_fd;
1649 RAMBlock *block;
1650 bool ret = false;
1652 /* Open UFFD file descriptor */
1653 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1654 if (uffd_fd < 0) {
1655 return false;
1658 RCU_READ_LOCK_GUARD();
1660 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1661 uint64_t uffd_ioctls;
1663 /* Nothing to do with read-only and MMIO-writable regions */
1664 if (block->mr->readonly || block->mr->rom_device) {
1665 continue;
1667 /* Try to register block memory via UFFD-IO to track writes */
1668 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1669 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1670 goto out;
1672 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1673 goto out;
1676 ret = true;
1678 out:
1679 uffd_close_fd(uffd_fd);
1680 return ret;
1683 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1684 ram_addr_t size)
1687 * We read one byte of each page; this will preallocate page tables if
1688 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1689 * where no page was populated yet. This might require adaptation when
1690 * supporting other mappings, like shmem.
1692 for (; offset < size; offset += block->page_size) {
1693 char tmp = *((char *)block->host + offset);
1695 /* Don't optimize the read out */
1696 asm volatile("" : "+r" (tmp));
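/*
 * The empty inline asm above lists tmp as a read-write ("+r") operand, so
 * the compiler must materialize the load; without it the read of each page
 * could be optimized away and no page table entry would get populated.
 */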
1700 static inline int populate_read_section(MemoryRegionSection *section,
1701 void *opaque)
1703 const hwaddr size = int128_get64(section->size);
1704 hwaddr offset = section->offset_within_region;
1705 RAMBlock *block = section->mr->ram_block;
1707 populate_read_range(block, offset, size);
1708 return 0;
1712 * ram_block_populate_read: preallocate page tables and populate pages in the
1713 * RAM block by reading a byte of each page.
1715 * Since it's solely used for userfault_fd WP feature, here we just
1716 * hardcode page size to qemu_real_host_page_size.
1718 * @block: RAM block to populate
1720 static void ram_block_populate_read(RAMBlock *rb)
1723 * Skip populating all pages that fall into a discarded range as managed by
1724 * a RamDiscardManager responsible for the mapped memory region of the
1725 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1726 * must not get populated automatically. We don't have to track
1727 * modifications via userfaultfd WP reliably, because these pages will
1728 * not be part of the migration stream either way -- see
1729 * ramblock_dirty_bitmap_exclude_discarded_pages().
1731 * Note: The result is only stable while migrating (precopy/postcopy).
1733 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1734 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1735 MemoryRegionSection section = {
1736 .mr = rb->mr,
1737 .offset_within_region = 0,
1738 .size = rb->mr->size,
1741 ram_discard_manager_replay_populated(rdm, &section,
1742 populate_read_section, NULL);
1743 } else {
1744 populate_read_range(rb, 0, rb->used_length);
1749 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1751 void ram_write_tracking_prepare(void)
1753 RAMBlock *block;
1755 RCU_READ_LOCK_GUARD();
1757 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1758 /* Nothing to do with read-only and MMIO-writable regions */
1759 if (block->mr->readonly || block->mr->rom_device) {
1760 continue;
1764 * Populate pages of the RAM block before enabling userfault_fd
1765 * write protection.
1767 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1768 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1769 * pages with pte_none() entries in page table.
1771 ram_block_populate_read(block);
1776 * ram_write_tracking_start: start UFFD-WP memory tracking
1778 * Returns 0 for success or negative value in case of error
1780 int ram_write_tracking_start(void)
1782 int uffd_fd;
1783 RAMState *rs = ram_state;
1784 RAMBlock *block;
1786 /* Open UFFD file descriptor */
1787 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1788 if (uffd_fd < 0) {
1789 return uffd_fd;
1791 rs->uffdio_fd = uffd_fd;
1793 RCU_READ_LOCK_GUARD();
1795 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1796 /* Nothing to do with read-only and MMIO-writable regions */
1797 if (block->mr->readonly || block->mr->rom_device) {
1798 continue;
1801 /* Register block memory with UFFD to track writes */
1802 if (uffd_register_memory(rs->uffdio_fd, block->host,
1803 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1804 goto fail;
1806 /* Apply UFFD write protection to the block memory range */
1807 if (uffd_change_protection(rs->uffdio_fd, block->host,
1808 block->max_length, true, false)) {
1809 goto fail;
1811 block->flags |= RAM_UF_WRITEPROTECT;
1812 memory_region_ref(block->mr);
1814 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1815 block->host, block->max_length);
1818 return 0;
1820 fail:
1821 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1823 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1824 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1825 continue;
1828 * In case some memory block failed to be write-protected
1829 * remove protection and unregister all succeeded RAM blocks
1831 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1832 false, false);
1833 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1834 /* Cleanup flags and remove reference */
1835 block->flags &= ~RAM_UF_WRITEPROTECT;
1836 memory_region_unref(block->mr);
1839 uffd_close_fd(uffd_fd);
1840 rs->uffdio_fd = -1;
1841 return -1;
1845 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1847 void ram_write_tracking_stop(void)
1849 RAMState *rs = ram_state;
1850 RAMBlock *block;
1852 RCU_READ_LOCK_GUARD();
1854 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1855 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1856 continue;
1858 /* Remove protection and unregister all affected RAM blocks */
1859 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1860 false, false);
1861 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1863 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1864 block->host, block->max_length);
1866 /* Cleanup flags and remove reference */
1867 block->flags &= ~RAM_UF_WRITEPROTECT;
1868 memory_region_unref(block->mr);
1871 /* Finally close UFFD file descriptor */
1872 uffd_close_fd(rs->uffdio_fd);
1873 rs->uffdio_fd = -1;
1876 #else
1877 /* No target OS support, stubs just fail or ignore */
1879 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1881 (void) rs;
1882 (void) offset;
1884 return NULL;
1887 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1888 unsigned long start_page)
1890 (void) rs;
1891 (void) pss;
1892 (void) start_page;
1894 return 0;
1897 bool ram_write_tracking_available(void)
1899 return false;
1902 bool ram_write_tracking_compatible(void)
1904 assert(0);
1905 return false;
1908 int ram_write_tracking_start(void)
1910 assert(0);
1911 return -1;
1914 void ram_write_tracking_stop(void)
1916 assert(0);
1918 #endif /* defined(__linux__) */
1921 * get_queued_page: unqueue a page from the postcopy requests
1923 * Skips pages that are already sent (!dirty)
1925 * Returns true if a queued page is found
1927 * @rs: current RAM state
1928 * @pss: data about the state of the current dirty page scan
1930 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1932 RAMBlock *block;
1933 ram_addr_t offset;
1934 bool dirty;
1936 do {
1937 block = unqueue_page(rs, &offset);
1939 * We're sending this page, and since it's postcopy nothing else
1940 * will dirty it, and we must make sure it doesn't get sent again
1941 * even if this queue request was received after the background
1942 * search already sent it.
1944 if (block) {
1945 unsigned long page;
1947 page = offset >> TARGET_PAGE_BITS;
1948 dirty = test_bit(page, block->bmap);
1949 if (!dirty) {
1950 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1951 page);
1952 } else {
1953 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1957 } while (block && !dirty);
1959 if (!block) {
1961 * Poll write faults too if background snapshot is enabled; that's
1962 * when vCPUs may have been blocked by write-protected pages.
1964 block = poll_fault_page(rs, &offset);
1967 if (block) {
1969 * We want the background search to continue from the queued page
1970 * since the guest is likely to want other pages near to the page
1971 * it just requested.
1973 pss->block = block;
1974 pss->page = offset >> TARGET_PAGE_BITS;
1977 * This unqueued page would break the "one round" check, even if
1978 * it is really rare.
1980 pss->complete_round = false;
1983 return !!block;
1987 * migration_page_queue_free: drop any remaining pages in the ram
1988 * request queue
1990 * It should be empty at the end anyway, but in error cases there may
1991 * be some left; if any pages remain, we drop them.
1994 static void migration_page_queue_free(RAMState *rs)
1996 struct RAMSrcPageRequest *mspr, *next_mspr;
1997 /* This queue should generally be empty - but in the case of a failed
1998 * migration it might have some entries left over.
2000 RCU_READ_LOCK_GUARD();
2001 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
2002 memory_region_unref(mspr->rb->mr);
2003 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
2004 g_free(mspr);
2009 * ram_save_queue_pages: queue the page for transmission
2011 * A request from the postcopy destination, for example.
2013 * Returns zero on success or negative on error
2015 * @rbname: Name of the RAMBlock of the request. NULL means the
2016 * same as the last one.
2017 * @start: starting address from the start of the RAMBlock
2018 * @len: length (in bytes) to send
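/*
 * Entries queued here are later consumed by the sending side via
 * get_queued_page()/unqueue_page(), so a requested page jumps ahead of the
 * normal dirty-bitmap scan done in ram_find_and_save_block().
 */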
2020 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2022 RAMBlock *ramblock;
2023 RAMState *rs = ram_state;
2025 ram_counters.postcopy_requests++;
2026 RCU_READ_LOCK_GUARD();
2028 if (!rbname) {
2029 /* Reuse last RAMBlock */
2030 ramblock = rs->last_req_rb;
2032 if (!ramblock) {
2034 * Shouldn't happen, we can't reuse the last RAMBlock if
2035 * it's the 1st request.
2037 error_report("ram_save_queue_pages no previous block");
2038 return -1;
2040 } else {
2041 ramblock = qemu_ram_block_by_name(rbname);
2043 if (!ramblock) {
2044 /* We shouldn't be asked for a non-existent RAMBlock */
2045 error_report("ram_save_queue_pages no block '%s'", rbname);
2046 return -1;
2048 rs->last_req_rb = ramblock;
2050 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2051 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2052 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2053 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2054 __func__, start, len, ramblock->used_length);
2055 return -1;
2058 struct RAMSrcPageRequest *new_entry =
2059 g_malloc0(sizeof(struct RAMSrcPageRequest));
2060 new_entry->rb = ramblock;
2061 new_entry->offset = start;
2062 new_entry->len = len;
2064 memory_region_ref(ramblock->mr);
2065 qemu_mutex_lock(&rs->src_page_req_mutex);
2066 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2067 migration_make_urgent_request();
2068 qemu_mutex_unlock(&rs->src_page_req_mutex);
2070 return 0;
2073 static bool save_page_use_compression(RAMState *rs)
2075 if (!migrate_use_compression()) {
2076 return false;
2080 * If xbzrle is enabled (e.g., after the first round of migration), stop
2081 * using data compression. In theory, xbzrle can do better than
2082 * compression.
2084 if (rs->xbzrle_enabled) {
2085 return false;
2088 return true;
2092 * Try to compress the page before posting it out; return true if the page
2093 * has been properly handled by compression, otherwise it needs other
2094 * paths to handle it.
2096 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2098 if (!save_page_use_compression(rs)) {
2099 return false;
2103 * When starting to process a new block, the first page of
2104 * the block should be sent out before other pages in the same
2105 * block, and all the pages in the last block should have been sent
2106 * out. Keeping this order is important, because the 'cont' flag
2107 * is used to avoid resending the block name.
2109 * We post the first page as a normal page as compression will take
2110 * a lot of CPU resources.
2112 if (block != rs->last_sent_block) {
2113 flush_compressed_data(rs);
2114 return false;
2117 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2118 return true;
2121 compression_counters.busy++;
2122 return false;
2126 * ram_save_target_page: save one target page
2128 * Returns the number of pages written
2130 * @rs: current RAM state
2131 * @pss: data about the page we want to send
2132 * @last_stage: if we are at the completion stage
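/*
 * The page is tried in a fixed order below: first the control_save_page()
 * hook, then multi-threaded compression, then zero-page detection, then
 * multifd, and finally the regular/xbzrle path in ram_save_page(); the
 * first method that handles the page wins.
 */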
2134 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
2135 bool last_stage)
2137 RAMBlock *block = pss->block;
2138 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2139 int res;
2141 if (control_save_page(rs, block, offset, &res)) {
2142 return res;
2145 if (save_compress_page(rs, block, offset)) {
2146 return 1;
2149 res = save_zero_page(rs, block, offset);
2150 if (res > 0) {
2151 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2152 * page would be stale
2154 if (!save_page_use_compression(rs)) {
2155 XBZRLE_cache_lock();
2156 xbzrle_cache_zero_page(rs, block->offset + offset);
2157 XBZRLE_cache_unlock();
2159 ram_release_pages(block->idstr, offset, res);
2160 return res;
2164 * Do not use multifd for:
2165 * 1. Compression, as the first page in a new block should be posted out
2166 * before sending the compressed page
2167 * 2. Postcopy, as one whole host page should be placed
2169 if (!save_page_use_compression(rs) && migrate_use_multifd()
2170 && !migration_in_postcopy()) {
2171 return ram_save_multifd_page(rs, block, offset);
2174 return ram_save_page(rs, pss, last_stage);
2178 * ram_save_host_page: save a whole host page
2180 * Starting at *offset send pages up to the end of the current host
2181 * page. It's valid for the initial offset to point into the middle of
2182 * a host page in which case the remainder of the hostpage is sent.
2183 * Only dirty target pages are sent. Note that the host page size may
2184 * be a huge page for this block.
2185 * The saving stops at the boundary of the used_length of the block
2186 * if the RAMBlock isn't a multiple of the host page size.
2188 * Returns the number of pages written or negative on error
2190 * @rs: current RAM state
2192 * @pss: data about the page we want to send
2193 * @last_stage: if we are at the completion stage
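/*
 * Illustrative example: with 2MiB host pages and 4KiB target pages (e.g. a
 * hugetlbfs-backed block on x86_64), pagesize_bits is 512; for
 * pss->page == 1000 the hostpage_boundary computed below is
 * QEMU_ALIGN_UP(1001, 512) == 1024, so the loop sends dirty target pages
 * until it crosses page 1024 (or the end of the block).
 */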
2195 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2196 bool last_stage)
2198 int tmppages, pages = 0;
2199 size_t pagesize_bits =
2200 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2201 unsigned long hostpage_boundary =
2202 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2203 unsigned long start_page = pss->page;
2204 int res;
2206 if (ramblock_is_ignored(pss->block)) {
2207 error_report("block %s should not be migrated !", pss->block->idstr);
2208 return 0;
2211 do {
2212 /* Check if the page is dirty and, if it is, send it */
2213 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2214 tmppages = ram_save_target_page(rs, pss, last_stage);
2215 if (tmppages < 0) {
2216 return tmppages;
2219 pages += tmppages;
2221 * Allow rate limiting to happen in the middle of huge pages if
2222 * something is sent in the current iteration.
2224 if (pagesize_bits > 1 && tmppages > 0) {
2225 migration_rate_limit();
2228 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2229 } while ((pss->page < hostpage_boundary) &&
2230 offset_in_ramblock(pss->block,
2231 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2232 /* The offset we leave with is the min boundary of host page and block */
2233 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2235 res = ram_save_release_protection(rs, pss, start_page);
2236 return (res < 0 ? res : pages);
2240 * ram_find_and_save_block: finds a dirty page and sends it to f
2242 * Called within an RCU critical section.
2244 * Returns the number of pages written where zero means no dirty pages,
2245 * or negative on error
2247 * @rs: current RAM state
2248 * @last_stage: if we are at the completion stage
2250 * On systems where host-page-size > target-page-size it will send all the
2251 * pages in a host page that are dirty.
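/*
 * The loop below first asks get_queued_page() for an explicitly requested
 * page (postcopy requests or write-fault pages); only if the request queue
 * is empty does it fall back to find_dirty_block() for a linear scan, and
 * whatever is found is sent as a whole host page.
 */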
2254 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2256 PageSearchStatus pss;
2257 int pages = 0;
2258 bool again, found;
2260 /* No dirty page as there is zero RAM */
2261 if (!ram_bytes_total()) {
2262 return pages;
2265 pss.block = rs->last_seen_block;
2266 pss.page = rs->last_page;
2267 pss.complete_round = false;
2269 if (!pss.block) {
2270 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2273 do {
2274 again = true;
2275 found = get_queued_page(rs, &pss);
2277 if (!found) {
2278 /* priority queue empty, so just search for something dirty */
2279 found = find_dirty_block(rs, &pss, &again);
2282 if (found) {
2283 pages = ram_save_host_page(rs, &pss, last_stage);
2285 } while (!pages && again);
2287 rs->last_seen_block = pss.block;
2288 rs->last_page = pss.page;
2290 return pages;
2293 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2295 uint64_t pages = size / TARGET_PAGE_SIZE;
2297 if (zero) {
2298 ram_counters.duplicate += pages;
2299 } else {
2300 ram_counters.normal += pages;
2301 ram_counters.transferred += size;
2302 qemu_update_position(f, size);
2306 static uint64_t ram_bytes_total_common(bool count_ignored)
2308 RAMBlock *block;
2309 uint64_t total = 0;
2311 RCU_READ_LOCK_GUARD();
2313 if (count_ignored) {
2314 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2315 total += block->used_length;
2317 } else {
2318 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2319 total += block->used_length;
2322 return total;
2325 uint64_t ram_bytes_total(void)
2327 return ram_bytes_total_common(false);
2330 static void xbzrle_load_setup(void)
2332 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2335 static void xbzrle_load_cleanup(void)
2337 g_free(XBZRLE.decoded_buf);
2338 XBZRLE.decoded_buf = NULL;
2341 static void ram_state_cleanup(RAMState **rsp)
2343 if (*rsp) {
2344 migration_page_queue_free(*rsp);
2345 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2346 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2347 g_free(*rsp);
2348 *rsp = NULL;
2352 static void xbzrle_cleanup(void)
2354 XBZRLE_cache_lock();
2355 if (XBZRLE.cache) {
2356 cache_fini(XBZRLE.cache);
2357 g_free(XBZRLE.encoded_buf);
2358 g_free(XBZRLE.current_buf);
2359 g_free(XBZRLE.zero_target_page);
2360 XBZRLE.cache = NULL;
2361 XBZRLE.encoded_buf = NULL;
2362 XBZRLE.current_buf = NULL;
2363 XBZRLE.zero_target_page = NULL;
2365 XBZRLE_cache_unlock();
2368 static void ram_save_cleanup(void *opaque)
2370 RAMState **rsp = opaque;
2371 RAMBlock *block;
2373 /* We don't use dirty log with background snapshots */
2374 if (!migrate_background_snapshot()) {
2375 /* The caller must hold the iothread lock or be in a BH, so there is
2376 * no write race against the migration bitmap
2378 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2380 * do not stop the dirty log without having started it, since
2381 * memory_global_dirty_log_stop will assert that
2382 * memory_global_dirty_log_start/stop are used in pairs
2384 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2388 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2389 g_free(block->clear_bmap);
2390 block->clear_bmap = NULL;
2391 g_free(block->bmap);
2392 block->bmap = NULL;
2395 xbzrle_cleanup();
2396 compress_threads_save_cleanup();
2397 ram_state_cleanup(rsp);
2400 static void ram_state_reset(RAMState *rs)
2402 rs->last_seen_block = NULL;
2403 rs->last_sent_block = NULL;
2404 rs->last_page = 0;
2405 rs->last_version = ram_list.version;
2406 rs->xbzrle_enabled = false;
2409 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2412 * 'expected' is the value you expect the bitmap mostly to be full
2413 * of; it won't bother printing lines that are all this value.
2414 * If 'todump' is null the migration bitmap is dumped.
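/*
 * Each output line covers up to 128 pages (one character per page) and
 * looks like, e.g.:
 *   0x00000080 : ..11..1.....
 * where '1' marks a set bit and '.' a clear one; lines consisting only of
 * the 'expected' value are suppressed.
 */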
2416 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2417 unsigned long pages)
2419 int64_t cur;
2420 int64_t linelen = 128;
2421 char linebuf[129];
2423 for (cur = 0; cur < pages; cur += linelen) {
2424 int64_t curb;
2425 bool found = false;
2427 * Last line; catch the case where the line length
2428 * is longer than remaining ram
2430 if (cur + linelen > pages) {
2431 linelen = pages - cur;
2433 for (curb = 0; curb < linelen; curb++) {
2434 bool thisbit = test_bit(cur + curb, todump);
2435 linebuf[curb] = thisbit ? '1' : '.';
2436 found = found || (thisbit != expected);
2438 if (found) {
2439 linebuf[curb] = '\0';
2440 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2445 /* **** functions for postcopy ***** */
2447 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2449 struct RAMBlock *block;
2451 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2452 unsigned long *bitmap = block->bmap;
2453 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2454 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2456 while (run_start < range) {
2457 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2458 ram_discard_range(block->idstr,
2459 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2460 ((ram_addr_t)(run_end - run_start))
2461 << TARGET_PAGE_BITS);
2462 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2468 * postcopy_send_discard_bm_ram: discard a RAMBlock
2470 * Returns zero on success
2472 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2474 * @ms: current migration state
2475 * @block: RAMBlock to discard
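/*
 * The dirty bitmap is run-length encoded into (start, length) ranges, e.g.
 * a bitmap of 0011100110 (bit 0 on the left) is sent as the two ranges
 * (2, 3) and (7, 2) via postcopy_discard_send_range().
 */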
2477 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2479 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2480 unsigned long current;
2481 unsigned long *bitmap = block->bmap;
2483 for (current = 0; current < end; ) {
2484 unsigned long one = find_next_bit(bitmap, end, current);
2485 unsigned long zero, discard_length;
2487 if (one >= end) {
2488 break;
2491 zero = find_next_zero_bit(bitmap, end, one + 1);
2493 if (zero >= end) {
2494 discard_length = end - one;
2495 } else {
2496 discard_length = zero - one;
2498 postcopy_discard_send_range(ms, one, discard_length);
2499 current = one + discard_length;
2502 return 0;
2506 * postcopy_each_ram_send_discard: discard all RAMBlocks
2508 * Returns 0 for success or negative for error
2510 * Utility for the outgoing postcopy code.
2511 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2512 * passing it bitmap indexes and name.
2513 * (qemu_ram_foreach_block ends up passing unscaled lengths
2514 * which would mean postcopy code would have to deal with target page)
2516 * @ms: current migration state
2518 static int postcopy_each_ram_send_discard(MigrationState *ms)
2520 struct RAMBlock *block;
2521 int ret;
2523 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2524 postcopy_discard_send_init(ms, block->idstr);
2527 * Postcopy sends chunks of bitmap over the wire, but it
2528 * just needs indexes at this point, avoids it having
2529 * target page specific code.
2531 ret = postcopy_send_discard_bm_ram(ms, block);
2532 postcopy_discard_send_finish(ms);
2533 if (ret) {
2534 return ret;
2538 return 0;
2542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2544 * Helper for postcopy_chunk_hostpages; it canonicalizes the dirty
2545 * bitmap at host-page granularity.
2548 * Postcopy requires that all target pages in a host page are dirty or
2549 * clean, not a mix. This helper canonicalizes the bitmap accordingly.
2551 * @ms: current migration state
2552 * @block: block that contains the page we want to canonicalize
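/*
 * Illustrative example: with host_ratio == 4 (e.g. 16KiB host pages and
 * 4KiB target pages), a dirty run covering target pages 5..9 straddles the
 * host pages [4..7] and [8..11]; both host pages are re-marked fully dirty
 * below so that postcopy never has to deal with a partially dirty host page.
 */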
2554 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2556 RAMState *rs = ram_state;
2557 unsigned long *bitmap = block->bmap;
2558 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2559 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2560 unsigned long run_start;
2562 if (block->page_size == TARGET_PAGE_SIZE) {
2563 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2564 return;
2567 /* Find a dirty page */
2568 run_start = find_next_bit(bitmap, pages, 0);
2570 while (run_start < pages) {
2573 * If the start of this run of pages is in the middle of a host
2574 * page, then we need to fixup this host page.
2576 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2577 /* Find the end of this run */
2578 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2580 * If the end isn't at the start of a host page, then the
2581 * run doesn't finish at the end of a host page
2582 * and we need to discard.
2586 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2587 unsigned long page;
2588 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2589 host_ratio);
2590 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2592 /* Clean up the bitmap */
2593 for (page = fixup_start_addr;
2594 page < fixup_start_addr + host_ratio; page++) {
2596 * Remark them as dirty, updating the count for any pages
2597 * that weren't previously dirty.
2599 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2603 /* Find the next dirty page for the next iteration */
2604 run_start = find_next_bit(bitmap, pages, run_start);
2609 * postcopy_chunk_hostpages: discard any partially sent host page
2611 * Utility for the outgoing postcopy code.
2613 * Discard any partially sent host-page size chunks, mark any partially
2614 * dirty host-page size chunks as all dirty. In this case the host-page
2615 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2617 * Returns zero on success
2619 * @ms: current migration state
2620 * @block: block we want to work with
2622 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2624 postcopy_discard_send_init(ms, block->idstr);
2627 * Ensure that all partially dirty host pages are made fully dirty.
2629 postcopy_chunk_hostpages_pass(ms, block);
2631 postcopy_discard_send_finish(ms);
2632 return 0;
2636 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2638 * Returns zero on success
2640 * Transmit the set of pages to be discarded after precopy to the target;
2641 * these are pages that:
2642 * a) Have been previously transmitted but are now dirty again
2643 * b) Have never been transmitted; this ensures that
2644 * any pages on the destination that have been mapped by background
2645 * tasks get discarded (transparent huge pages are the specific concern)
2646 * Hopefully this is pretty sparse
2648 * @ms: current migration state
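/*
 * The sequence below is: one final bitmap sync while the source is paused,
 * a reset of the scan position so we never resume in the middle of a host
 * page, per-block canonicalization of partially dirty host pages, and then
 * the actual transmission via postcopy_each_ram_send_discard().
 */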
2650 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2652 RAMState *rs = ram_state;
2653 RAMBlock *block;
2654 int ret;
2656 RCU_READ_LOCK_GUARD();
2658 /* This should be our last sync, the src is now paused */
2659 migration_bitmap_sync(rs);
2661 /* Easiest way to make sure we don't resume in the middle of a host-page */
2662 rs->last_seen_block = NULL;
2663 rs->last_sent_block = NULL;
2664 rs->last_page = 0;
2666 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2667 /* Deal with TPS != HPS and huge pages */
2668 ret = postcopy_chunk_hostpages(ms, block);
2669 if (ret) {
2670 return ret;
2673 #ifdef DEBUG_POSTCOPY
2674 ram_debug_dump_bitmap(block->bmap, true,
2675 block->used_length >> TARGET_PAGE_BITS);
2676 #endif
2678 trace_ram_postcopy_send_discard_bitmap();
2680 return postcopy_each_ram_send_discard(ms);
2684 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2686 * Returns zero on success
2688 * @rbname: name of the RAMBlock of the request. NULL means the
2689 * same as the last one.
2690 * @start: RAMBlock starting page
2691 * @length: RAMBlock size
2693 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2695 trace_ram_discard_range(rbname, start, length);
2697 RCU_READ_LOCK_GUARD();
2698 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2700 if (!rb) {
2701 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2702 return -1;
2706 * On source VM, we don't need to update the received bitmap since
2707 * we don't even have one.
2709 if (rb->receivedmap) {
2710 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2711 length >> qemu_target_page_bits());
2714 return ram_block_discard_range(rb, start, length);
2718 * For every allocation, we will try not to crash the VM if the
2719 * allocation fails.
2721 static int xbzrle_init(void)
2723 Error *local_err = NULL;
2725 if (!migrate_use_xbzrle()) {
2726 return 0;
2729 XBZRLE_cache_lock();
2731 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2732 if (!XBZRLE.zero_target_page) {
2733 error_report("%s: Error allocating zero page", __func__);
2734 goto err_out;
2737 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2738 TARGET_PAGE_SIZE, &local_err);
2739 if (!XBZRLE.cache) {
2740 error_report_err(local_err);
2741 goto free_zero_page;
2744 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2745 if (!XBZRLE.encoded_buf) {
2746 error_report("%s: Error allocating encoded_buf", __func__);
2747 goto free_cache;
2750 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2751 if (!XBZRLE.current_buf) {
2752 error_report("%s: Error allocating current_buf", __func__);
2753 goto free_encoded_buf;
2756 /* We are all good */
2757 XBZRLE_cache_unlock();
2758 return 0;
2760 free_encoded_buf:
2761 g_free(XBZRLE.encoded_buf);
2762 XBZRLE.encoded_buf = NULL;
2763 free_cache:
2764 cache_fini(XBZRLE.cache);
2765 XBZRLE.cache = NULL;
2766 free_zero_page:
2767 g_free(XBZRLE.zero_target_page);
2768 XBZRLE.zero_target_page = NULL;
2769 err_out:
2770 XBZRLE_cache_unlock();
2771 return -ENOMEM;
2774 static int ram_state_init(RAMState **rsp)
2776 *rsp = g_try_new0(RAMState, 1);
2778 if (!*rsp) {
2779 error_report("%s: Init ramstate fail", __func__);
2780 return -1;
2783 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2784 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2785 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2788 * Count the total number of pages used by ram blocks, not including any
2789 * gaps due to alignment or unplugs.
2790 * This must match the initial values of the dirty bitmap.
2792 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2793 ram_state_reset(*rsp);
2795 return 0;
2798 static void ram_list_init_bitmaps(void)
2800 MigrationState *ms = migrate_get_current();
2801 RAMBlock *block;
2802 unsigned long pages;
2803 uint8_t shift;
2805 /* Skip setting bitmap if there is no RAM */
2806 if (ram_bytes_total()) {
2807 shift = ms->clear_bitmap_shift;
2808 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2809 error_report("clear_bitmap_shift (%u) too big, using "
2810 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2811 shift = CLEAR_BITMAP_SHIFT_MAX;
2812 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2813 error_report("clear_bitmap_shift (%u) too small, using "
2814 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2815 shift = CLEAR_BITMAP_SHIFT_MIN;
2818 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2819 pages = block->max_length >> TARGET_PAGE_BITS;
2821 * The initial dirty bitmap for migration must be set with all
2822 * ones to make sure we'll migrate every guest RAM page to the
2823 * destination.
2824 * Here we set RAMBlock.bmap all to 1 because when we restart a
2825 * new migration after a failed one, ram_list.
2826 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2827 * guest memory.
2829 block->bmap = bitmap_new(pages);
2830 bitmap_set(block->bmap, 0, pages);
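/*
 * clear_bmap implements deferred dirty-log clearing: each bit covers
 * (1 << shift) target pages, and a set bit means that chunk still needs a
 * memory-region dirty-bitmap clear (see
 * migration_clear_memory_region_dirty_bitmap_range()) before its pages go out.
 */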
2831 block->clear_bmap_shift = shift;
2832 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2837 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2839 unsigned long pages;
2840 RAMBlock *rb;
2842 RCU_READ_LOCK_GUARD();
2844 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2845 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2846 rs->migration_dirty_pages -= pages;
2850 static void ram_init_bitmaps(RAMState *rs)
2852 /* For memory_global_dirty_log_start below. */
2853 qemu_mutex_lock_iothread();
2854 qemu_mutex_lock_ramlist();
2856 WITH_RCU_READ_LOCK_GUARD() {
2857 ram_list_init_bitmaps();
2858 /* We don't use dirty log with background snapshots */
2859 if (!migrate_background_snapshot()) {
2860 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2861 migration_bitmap_sync_precopy(rs);
2864 qemu_mutex_unlock_ramlist();
2865 qemu_mutex_unlock_iothread();
2868 * After an eventual first bitmap sync, fixup the initial bitmap
2869 * containing all 1s to exclude any discarded pages from migration.
2871 migration_bitmap_clear_discarded_pages(rs);
2874 static int ram_init_all(RAMState **rsp)
2876 if (ram_state_init(rsp)) {
2877 return -1;
2880 if (xbzrle_init()) {
2881 ram_state_cleanup(rsp);
2882 return -1;
2885 ram_init_bitmaps(*rsp);
2887 return 0;
2890 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2892 RAMBlock *block;
2893 uint64_t pages = 0;
2896 * Postcopy is not using xbzrle/compression, so no need for that.
2897 * Also, since the source is already halted, we don't need to care
2898 * about dirty page logging either.
2901 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2902 pages += bitmap_count_one(block->bmap,
2903 block->used_length >> TARGET_PAGE_BITS);
2906 /* This may not be aligned with current bitmaps. Recalculate. */
2907 rs->migration_dirty_pages = pages;
2909 ram_state_reset(rs);
2911 /* Update RAMState cache of output QEMUFile */
2912 rs->f = out;
2914 trace_ram_state_resume_prepare(pages);
2918 * This function clears bits of the free pages reported by the caller from the
2919 * migration dirty bitmap. @addr is the host address corresponding to the
2920 * start of the contiguous guest free pages, and @len is the total bytes of
2921 * those pages.
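/*
 * The hinted range may span several RAMBlocks; the loop below clips each
 * chunk to its containing block (used_len) and then, under bitmap_mutex,
 * clears both the deferred memory-region bitmap and block->bmap for the
 * covered pages, adjusting migration_dirty_pages accordingly.
 */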
2923 void qemu_guest_free_page_hint(void *addr, size_t len)
2925 RAMBlock *block;
2926 ram_addr_t offset;
2927 size_t used_len, start, npages;
2928 MigrationState *s = migrate_get_current();
2930 /* This function is currently expected to be used during live migration */
2931 if (!migration_is_setup_or_active(s->state)) {
2932 return;
2935 for (; len > 0; len -= used_len, addr += used_len) {
2936 block = qemu_ram_block_from_host(addr, false, &offset);
2937 if (unlikely(!block || offset >= block->used_length)) {
2939 * The implementation might not support RAMBlock resize during
2940 * live migration, but it could happen in theory with future
2941 * updates. So we add a check here to capture that case.
2943 error_report_once("%s unexpected error", __func__);
2944 return;
2947 if (len <= block->used_length - offset) {
2948 used_len = len;
2949 } else {
2950 used_len = block->used_length - offset;
2953 start = offset >> TARGET_PAGE_BITS;
2954 npages = used_len >> TARGET_PAGE_BITS;
2956 qemu_mutex_lock(&ram_state->bitmap_mutex);
2958 * The skipped free pages are equivalent to having been sent from clear_bmap's
2959 * perspective, so clear the bits from the memory region bitmap which
2960 * are initially set. Otherwise those skipped pages will be sent in
2961 * the next round after syncing from the memory region bitmap.
2963 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2964 ram_state->migration_dirty_pages -=
2965 bitmap_count_one_with_offset(block->bmap, start, npages);
2966 bitmap_clear(block->bmap, start, npages);
2967 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2972 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2973 * long-running RCU critical section. When RCU reclaims in the code
2974 * start to become numerous it will be necessary to reduce the
2975 * granularity of these critical sections.
2979 * ram_save_setup: Setup RAM for migration
2981 * Returns zero to indicate success and negative for error
2983 * @f: QEMUFile where to send the data
2984 * @opaque: RAMState pointer
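/*
 * The setup stream written below is: the total RAM size tagged with
 * RAM_SAVE_FLAG_MEM_SIZE, then for each migratable block its idstr length,
 * idstr and used_length, optionally its page size (when postcopy is enabled
 * and the block's page size differs from the host page size) and optionally
 * its GPA (when ignore-shared is enabled), terminated by RAM_SAVE_FLAG_EOS.
 */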
2986 static int ram_save_setup(QEMUFile *f, void *opaque)
2988 RAMState **rsp = opaque;
2989 RAMBlock *block;
2991 if (compress_threads_save_setup()) {
2992 return -1;
2995 /* migration has already setup the bitmap, reuse it. */
2996 if (!migration_in_colo_state()) {
2997 if (ram_init_all(rsp) != 0) {
2998 compress_threads_save_cleanup();
2999 return -1;
3002 (*rsp)->f = f;
3004 WITH_RCU_READ_LOCK_GUARD() {
3005 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
3007 RAMBLOCK_FOREACH_MIGRATABLE(block) {
3008 qemu_put_byte(f, strlen(block->idstr));
3009 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
3010 qemu_put_be64(f, block->used_length);
3011 if (migrate_postcopy_ram() && block->page_size !=
3012 qemu_host_page_size) {
3013 qemu_put_be64(f, block->page_size);
3015 if (migrate_ignore_shared()) {
3016 qemu_put_be64(f, block->mr->addr);
3021 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3022 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3024 multifd_send_sync_main(f);
3025 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3026 qemu_fflush(f);
3028 return 0;
3032 * ram_save_iterate: iterative stage for migration
3034 * Returns zero to indicate success and negative for error
3036 * @f: QEMUFile where to send the data
3037 * @opaque: RAMState pointer
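/*
 * Return value convention: 1 when there is currently nothing left to send,
 * 0 when the iteration stopped early (e.g. rate limiting or MAX_WAIT), and
 * a negative value when a stream error was detected.
 */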
3039 static int ram_save_iterate(QEMUFile *f, void *opaque)
3041 RAMState **temp = opaque;
3042 RAMState *rs = *temp;
3043 int ret = 0;
3044 int i;
3045 int64_t t0;
3046 int done = 0;
3048 if (blk_mig_bulk_active()) {
3049 /* Avoid transferring ram during bulk phase of block migration as
3050 * the bulk phase will usually take a long time and transferring
3051 * ram updates during that time is pointless. */
3052 goto out;
3056 * We'll hold this lock for a while, but it's okay for two reasons.
3057 * Firstly, the only other thread that may take it is whoever calls
3058 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3059 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3060 * guarantees that we'll at least release it on a regular basis.
3062 qemu_mutex_lock(&rs->bitmap_mutex);
3063 WITH_RCU_READ_LOCK_GUARD() {
3064 if (ram_list.version != rs->last_version) {
3065 ram_state_reset(rs);
3068 /* Read version before ram_list.blocks */
3069 smp_rmb();
3071 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3073 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3074 i = 0;
3075 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3076 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3077 int pages;
3079 if (qemu_file_get_error(f)) {
3080 break;
3083 pages = ram_find_and_save_block(rs, false);
3084 /* no more pages to send */
3085 if (pages == 0) {
3086 done = 1;
3087 break;
3090 if (pages < 0) {
3091 qemu_file_set_error(f, pages);
3092 break;
3095 rs->target_page_count += pages;
3098 * During postcopy, it is necessary to make sure one whole host
3099 * page is sent in one chunk.
3101 if (migrate_postcopy_ram()) {
3102 flush_compressed_data(rs);
3106 * We want to check in the 1st loop, just in case it was the 1st
3107 * time and we had to sync the dirty bitmap.
3108 * qemu_clock_get_ns() is a bit expensive, so we only check once
3109 * every few iterations
3111 if ((i & 63) == 0) {
3112 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3113 1000000;
3114 if (t1 > MAX_WAIT) {
3115 trace_ram_save_iterate_big_wait(t1, i);
3116 break;
3119 i++;
3122 qemu_mutex_unlock(&rs->bitmap_mutex);
3125 * Must occur before EOS (or any QEMUFile operation)
3126 * because of RDMA protocol.
3128 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3130 out:
3131 if (ret >= 0
3132 && migration_is_setup_or_active(migrate_get_current()->state)) {
3133 multifd_send_sync_main(rs->f);
3134 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3135 qemu_fflush(f);
3136 ram_counters.transferred += 8;
3138 ret = qemu_file_get_error(f);
3140 if (ret < 0) {
3141 return ret;
3144 return done;
3148 * ram_save_complete: function called to send the remaining amount of ram
3150 * Returns zero to indicate success or negative on error
3152 * Called with iothread lock
3154 * @f: QEMUFile where to send the data
3155 * @opaque: RAMState pointer
3157 static int ram_save_complete(QEMUFile *f, void *opaque)
3159 RAMState **temp = opaque;
3160 RAMState *rs = *temp;
3161 int ret = 0;
3163 WITH_RCU_READ_LOCK_GUARD() {
3164 if (!migration_in_postcopy()) {
3165 migration_bitmap_sync_precopy(rs);
3168 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3170 /* try transferring iterative blocks of memory */
3172 /* flush all remaining blocks regardless of rate limiting */
3173 while (true) {
3174 int pages;
3176 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
3178 /* no more blocks to send */
3178 if (pages == 0) {
3179 break;
3181 if (pages < 0) {
3182 ret = pages;
3183 break;
3187 flush_compressed_data(rs);
3188 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3191 if (ret >= 0) {
3192 multifd_send_sync_main(rs->f);
3193 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3194 qemu_fflush(f);
3197 return ret;
3200 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3201 uint64_t *res_precopy_only,
3202 uint64_t *res_compatible,
3203 uint64_t *res_postcopy_only)
3205 RAMState **temp = opaque;
3206 RAMState *rs = *temp;
3207 uint64_t remaining_size;
3209 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3211 if (!migration_in_postcopy() &&
3212 remaining_size < max_size) {
3213 qemu_mutex_lock_iothread();
3214 WITH_RCU_READ_LOCK_GUARD() {
3215 migration_bitmap_sync_precopy(rs);
3217 qemu_mutex_unlock_iothread();
3218 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3221 if (migrate_postcopy_ram()) {
3222 /* We can do postcopy, and all the data is postcopiable */
3223 *res_compatible += remaining_size;
3224 } else {
3225 *res_precopy_only += remaining_size;
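/*
 * Wire format consumed by load_xbzrle() below: a one-byte encoding flag
 * (must be ENCODING_FLAG_XBZRLE), a big-endian 16-bit encoded length, then
 * the encoded bytes, which are applied as a delta on top of the current
 * contents of the target page at 'host'.
 */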
3229 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3231 unsigned int xh_len;
3232 int xh_flags;
3233 uint8_t *loaded_data;
3235 /* extract RLE header */
3236 xh_flags = qemu_get_byte(f);
3237 xh_len = qemu_get_be16(f);
3239 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3240 error_report("Failed to load XBZRLE page - wrong compression!");
3241 return -1;
3244 if (xh_len > TARGET_PAGE_SIZE) {
3245 error_report("Failed to load XBZRLE page - len overflow!");
3246 return -1;
3248 loaded_data = XBZRLE.decoded_buf;
3249 /* load data and decode */
3250 /* it can change loaded_data to point to an internal buffer */
3251 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3253 /* decode RLE */
3254 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3255 TARGET_PAGE_SIZE) == -1) {
3256 error_report("Failed to load XBZRLE page - decode error!");
3257 return -1;
3260 return 0;
3264 * ram_block_from_stream: read a RAMBlock id from the migration stream
3266 * Must be called from within a rcu critical section.
3268 * Returns a pointer from within the RCU-protected ram_list.
3270 * @f: QEMUFile where to read the data from
3271 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3273 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3275 static RAMBlock *block;
3276 char id[256];
3277 uint8_t len;
3279 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3280 if (!block) {
3281 error_report("Ack, bad migration stream!");
3282 return NULL;
3284 return block;
3287 len = qemu_get_byte(f);
3288 qemu_get_buffer(f, (uint8_t *)id, len);
3289 id[len] = 0;
3291 block = qemu_ram_block_by_name(id);
3292 if (!block) {
3293 error_report("Can't find block %s", id);
3294 return NULL;
3297 if (ramblock_is_ignored(block)) {
3298 error_report("block %s should not be migrated !", id);
3299 return NULL;
3302 return block;
3305 static inline void *host_from_ram_block_offset(RAMBlock *block,
3306 ram_addr_t offset)
3308 if (!offset_in_ramblock(block, offset)) {
3309 return NULL;
3312 return block->host + offset;
3315 static void *host_page_from_ram_block_offset(RAMBlock *block,
3316 ram_addr_t offset)
3318 /* Note: Explicitly no check against offset_in_ramblock(). */
3319 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3320 block->page_size);
3323 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3324 ram_addr_t offset)
3326 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3329 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3330 ram_addr_t offset, bool record_bitmap)
3332 if (!offset_in_ramblock(block, offset)) {
3333 return NULL;
3335 if (!block->colo_cache) {
3336 error_report("%s: colo_cache is NULL in block :%s",
3337 __func__, block->idstr);
3338 return NULL;
3342 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3343 * It helps us decide which pages in the RAM cache should be flushed
3344 * into the VM's RAM later.
3346 if (record_bitmap &&
3347 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3348 ram_state->migration_dirty_pages++;
3350 return block->colo_cache + offset;
3354 * ram_handle_compressed: handle the zero page case
3356 * If a page (or a whole RDMA chunk) has been
3357 * determined to be zero, then zap it.
3359 * @host: host address for the zero page
3360 * @ch: what the page is filled from. We only support zero
3361 * @size: size of the zero page
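/*
 * The memset below is skipped when the page already reads as zero, which
 * avoids needlessly dirtying (and, for anonymous memory, allocating) the
 * destination page.
 */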
3363 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3365 if (ch != 0 || !buffer_is_zero(host, size)) {
3366 memset(host, ch, size);
3370 /* return the size after decompression, or negative value on error */
3371 static int
3372 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3373 const uint8_t *source, size_t source_len)
3375 int err;
3377 err = inflateReset(stream);
3378 if (err != Z_OK) {
3379 return -1;
3382 stream->avail_in = source_len;
3383 stream->next_in = (uint8_t *)source;
3384 stream->avail_out = dest_len;
3385 stream->next_out = dest;
3387 err = inflate(stream, Z_NO_FLUSH);
3388 if (err != Z_STREAM_END) {
3389 return -1;
3392 return stream->total_out;
3395 static void *do_data_decompress(void *opaque)
3397 DecompressParam *param = opaque;
3398 unsigned long pagesize;
3399 uint8_t *des;
3400 int len, ret;
3402 qemu_mutex_lock(&param->mutex);
3403 while (!param->quit) {
3404 if (param->des) {
3405 des = param->des;
3406 len = param->len;
3407 param->des = 0;
3408 qemu_mutex_unlock(&param->mutex);
3410 pagesize = TARGET_PAGE_SIZE;
3412 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3413 param->compbuf, len);
3414 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3415 error_report("decompress data failed");
3416 qemu_file_set_error(decomp_file, ret);
3419 qemu_mutex_lock(&decomp_done_lock);
3420 param->done = true;
3421 qemu_cond_signal(&decomp_done_cond);
3422 qemu_mutex_unlock(&decomp_done_lock);
3424 qemu_mutex_lock(&param->mutex);
3425 } else {
3426 qemu_cond_wait(&param->cond, &param->mutex);
3429 qemu_mutex_unlock(&param->mutex);
3431 return NULL;
3434 static int wait_for_decompress_done(void)
3436 int idx, thread_count;
3438 if (!migrate_use_compression()) {
3439 return 0;
3442 thread_count = migrate_decompress_threads();
3443 qemu_mutex_lock(&decomp_done_lock);
3444 for (idx = 0; idx < thread_count; idx++) {
3445 while (!decomp_param[idx].done) {
3446 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3449 qemu_mutex_unlock(&decomp_done_lock);
3450 return qemu_file_get_error(decomp_file);
3453 static void compress_threads_load_cleanup(void)
3455 int i, thread_count;
3457 if (!migrate_use_compression()) {
3458 return;
3460 thread_count = migrate_decompress_threads();
3461 for (i = 0; i < thread_count; i++) {
3463 * we use it as an indicator of whether the thread is
3464 * properly initialized or not
3466 if (!decomp_param[i].compbuf) {
3467 break;
3470 qemu_mutex_lock(&decomp_param[i].mutex);
3471 decomp_param[i].quit = true;
3472 qemu_cond_signal(&decomp_param[i].cond);
3473 qemu_mutex_unlock(&decomp_param[i].mutex);
3475 for (i = 0; i < thread_count; i++) {
3476 if (!decomp_param[i].compbuf) {
3477 break;
3480 qemu_thread_join(decompress_threads + i);
3481 qemu_mutex_destroy(&decomp_param[i].mutex);
3482 qemu_cond_destroy(&decomp_param[i].cond);
3483 inflateEnd(&decomp_param[i].stream);
3484 g_free(decomp_param[i].compbuf);
3485 decomp_param[i].compbuf = NULL;
3487 g_free(decompress_threads);
3488 g_free(decomp_param);
3489 decompress_threads = NULL;
3490 decomp_param = NULL;
3491 decomp_file = NULL;
3494 static int compress_threads_load_setup(QEMUFile *f)
3496 int i, thread_count;
3498 if (!migrate_use_compression()) {
3499 return 0;
3502 thread_count = migrate_decompress_threads();
3503 decompress_threads = g_new0(QemuThread, thread_count);
3504 decomp_param = g_new0(DecompressParam, thread_count);
3505 qemu_mutex_init(&decomp_done_lock);
3506 qemu_cond_init(&decomp_done_cond);
3507 decomp_file = f;
3508 for (i = 0; i < thread_count; i++) {
3509 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3510 goto exit;
3513 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3514 qemu_mutex_init(&decomp_param[i].mutex);
3515 qemu_cond_init(&decomp_param[i].cond);
3516 decomp_param[i].done = true;
3517 decomp_param[i].quit = false;
3518 qemu_thread_create(decompress_threads + i, "decompress",
3519 do_data_decompress, decomp_param + i,
3520 QEMU_THREAD_JOINABLE);
3522 return 0;
3523 exit:
3524 compress_threads_load_cleanup();
3525 return -1;
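/*
 * Hand one compressed page to an idle decompression worker: the compressed
 * bytes are copied into that worker's compbuf and its condition variable is
 * signalled; if every worker is busy, wait on decomp_done_cond until one
 * finishes.
 */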
3528 static void decompress_data_with_multi_threads(QEMUFile *f,
3529 void *host, int len)
3531 int idx, thread_count;
3533 thread_count = migrate_decompress_threads();
3534 QEMU_LOCK_GUARD(&decomp_done_lock);
3535 while (true) {
3536 for (idx = 0; idx < thread_count; idx++) {
3537 if (decomp_param[idx].done) {
3538 decomp_param[idx].done = false;
3539 qemu_mutex_lock(&decomp_param[idx].mutex);
3540 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3541 decomp_param[idx].des = host;
3542 decomp_param[idx].len = len;
3543 qemu_cond_signal(&decomp_param[idx].cond);
3544 qemu_mutex_unlock(&decomp_param[idx].mutex);
3545 break;
3548 if (idx < thread_count) {
3549 break;
3550 } else {
3551 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3556 static void colo_init_ram_state(void)
3558 ram_state_init(&ram_state);
3562 * COLO cache: this is for the secondary VM; we cache the whole
3563 * memory of the secondary VM. The global lock must be held
3564 * to call this helper.
3566 int colo_init_ram_cache(void)
3568 RAMBlock *block;
3570 WITH_RCU_READ_LOCK_GUARD() {
3571 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3572 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3573 NULL, false, false);
3574 if (!block->colo_cache) {
3575 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3576 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3577 block->used_length);
3578 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3579 if (block->colo_cache) {
3580 qemu_anon_ram_free(block->colo_cache, block->used_length);
3581 block->colo_cache = NULL;
3584 return -errno;
3586 if (!machine_dump_guest_core(current_machine)) {
3587 qemu_madvise(block->colo_cache, block->used_length,
3588 QEMU_MADV_DONTDUMP);
3594 * Record the dirty pages that were sent by the PVM; we use this dirty bitmap
3595 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3596 * we use the same name 'ram_bitmap' as for migration.
3598 if (ram_bytes_total()) {
3599 RAMBlock *block;
3601 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3602 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3603 block->bmap = bitmap_new(pages);
3607 colo_init_ram_state();
3608 return 0;
3611 /* TODO: duplicated with ram_init_bitmaps */
3612 void colo_incoming_start_dirty_log(void)
3614 RAMBlock *block = NULL;
3615 /* For memory_global_dirty_log_start below. */
3616 qemu_mutex_lock_iothread();
3617 qemu_mutex_lock_ramlist();
3619 memory_global_dirty_log_sync();
3620 WITH_RCU_READ_LOCK_GUARD() {
3621 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3622 ramblock_sync_dirty_bitmap(ram_state, block);
3623 /* Discard this dirty bitmap record */
3624 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3626 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3628 ram_state->migration_dirty_pages = 0;
3629 qemu_mutex_unlock_ramlist();
3630 qemu_mutex_unlock_iothread();
3633 /* The global lock must be held to call this helper */
3634 void colo_release_ram_cache(void)
3636 RAMBlock *block;
3638 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3639 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3640 g_free(block->bmap);
3641 block->bmap = NULL;
3644 WITH_RCU_READ_LOCK_GUARD() {
3645 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3646 if (block->colo_cache) {
3647 qemu_anon_ram_free(block->colo_cache, block->used_length);
3648 block->colo_cache = NULL;
3652 ram_state_cleanup(&ram_state);
3656 * ram_load_setup: Setup RAM for migration incoming side
3658 * Returns zero to indicate success and negative for error
3660 * @f: QEMUFile where to receive the data
3661 * @opaque: RAMState pointer
3663 static int ram_load_setup(QEMUFile *f, void *opaque)
3665 if (compress_threads_load_setup(f)) {
3666 return -1;
3669 xbzrle_load_setup();
3670 ramblock_recv_map_init();
3672 return 0;
3675 static int ram_load_cleanup(void *opaque)
3677 RAMBlock *rb;
3679 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3680 qemu_ram_block_writeback(rb);
3683 xbzrle_load_cleanup();
3684 compress_threads_load_cleanup();
3686 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3687 g_free(rb->receivedmap);
3688 rb->receivedmap = NULL;
3691 return 0;
3695 * ram_postcopy_incoming_init: allocate postcopy data structures
3697 * Returns 0 for success and negative if there was an error
3699 * @mis: current migration incoming state
3701 * Allocate data structures etc. needed by incoming migration with
3702 * postcopy-ram. postcopy-ram's similarly named
3703 * postcopy_ram_incoming_init does the work.
3705 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3707 return postcopy_ram_incoming_init(mis);
3711 * ram_load_postcopy: load a page in postcopy case
3713 * Returns 0 for success or -errno in case of error
3715 * Called in postcopy mode by ram_load().
3716 * rcu_read_lock is taken prior to this being called.
3718 * @f: QEMUFile to receive the data from
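/*
 * Target pages are accumulated in mis->postcopy_tmp_page until all
 * (block->page_size / TARGET_PAGE_SIZE) constituents of a host page have
 * arrived; only then is the host page placed atomically with
 * postcopy_place_page() or postcopy_place_page_zero().
 */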
3720 static int ram_load_postcopy(QEMUFile *f)
3722 int flags = 0, ret = 0;
3723 bool place_needed = false;
3724 bool matches_target_page_size = false;
3725 MigrationIncomingState *mis = migration_incoming_get_current();
3726 /* Temporary page that is later 'placed' */
3727 void *postcopy_host_page = mis->postcopy_tmp_page;
3728 void *host_page = NULL;
3729 bool all_zero = true;
3730 int target_pages = 0;
3732 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3733 ram_addr_t addr;
3734 void *page_buffer = NULL;
3735 void *place_source = NULL;
3736 RAMBlock *block = NULL;
3737 uint8_t ch;
3738 int len;
3740 addr = qemu_get_be64(f);
3743 * If there is a QEMUFile error, we should stop here, as "addr"
3744 * may be invalid
3746 ret = qemu_file_get_error(f);
3747 if (ret) {
3748 break;
3751 flags = addr & ~TARGET_PAGE_MASK;
3752 addr &= TARGET_PAGE_MASK;
3754 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3755 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3756 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3757 block = ram_block_from_stream(f, flags);
3758 if (!block) {
3759 ret = -EINVAL;
3760 break;
3764 * Relying on used_length is racy and can result in false positives.
3765 * We might place pages beyond used_length in case RAM was shrunk
3766 * while in postcopy, which is fine - trying to place via
3767 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3769 if (!block->host || addr >= block->postcopy_length) {
3770 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3771 ret = -EINVAL;
3772 break;
3774 target_pages++;
3775 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3777 * Postcopy requires that we place whole host pages atomically;
3778 * these may be huge pages for RAMBlocks that are backed by
3779 * hugetlbfs.
3780 * To make it atomic, the data is read into a temporary page
3781 * that's moved into place later.
3782 * The migration protocol uses, possibly smaller, target-pages
3783 * however the source ensures it always sends all the components
3784 * of a host page in one chunk.
3786 page_buffer = postcopy_host_page +
3787 host_page_offset_from_ram_block_offset(block, addr);
3788 /* If all TP are zero then we can optimise the place */
3789 if (target_pages == 1) {
3790 host_page = host_page_from_ram_block_offset(block, addr);
3791 } else if (host_page != host_page_from_ram_block_offset(block,
3792 addr)) {
3793 /* not the 1st TP within the HP */
3794 error_report("Non-same host page %p/%p", host_page,
3795 host_page_from_ram_block_offset(block, addr));
3796 ret = -EINVAL;
3797 break;
3801 * If it's the last part of a host page then we place the host
3802 * page
3804 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3805 place_needed = true;
3807 place_source = postcopy_host_page;
3810 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3811 case RAM_SAVE_FLAG_ZERO:
3812 ch = qemu_get_byte(f);
3814 * We can skip setting page_buffer when
3815 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3817 if (ch || !matches_target_page_size) {
3818 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3820 if (ch) {
3821 all_zero = false;
3823 break;
3825 case RAM_SAVE_FLAG_PAGE:
3826 all_zero = false;
3827 if (!matches_target_page_size) {
3828 /* For huge pages, we always use temporary buffer */
3829 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3830 } else {
3832 * For small pages that match the target page size, we
3833 * avoid the qemu_file copy. Instead we directly use
3834 * the buffer of QEMUFile to place the page. Note: we
3835 * cannot do any QEMUFile operation before using that
3836 * buffer to make sure the buffer is valid when
3837 * placing the page.
3839 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3840 TARGET_PAGE_SIZE);
3842 break;
3843 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3844 all_zero = false;
3845 len = qemu_get_be32(f);
3846 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3847 error_report("Invalid compressed data length: %d", len);
3848 ret = -EINVAL;
3849 break;
3851 decompress_data_with_multi_threads(f, page_buffer, len);
3852 break;
3854 case RAM_SAVE_FLAG_EOS:
3855 /* normal exit */
3856 multifd_recv_sync_main();
3857 break;
3858 default:
3859 error_report("Unknown combination of migration flags: 0x%x"
3860 " (postcopy mode)", flags);
3861 ret = -EINVAL;
3862 break;
3865 /* Got the whole host page, wait for decompress before placing. */
3866 if (place_needed) {
3867 ret |= wait_for_decompress_done();
3870 /* Detect for any possible file errors */
3871 if (!ret && qemu_file_get_error(f)) {
3872 ret = qemu_file_get_error(f);
3875 if (!ret && place_needed) {
3876 if (all_zero) {
3877 ret = postcopy_place_page_zero(mis, host_page, block);
3878 } else {
3879 ret = postcopy_place_page(mis, host_page, place_source,
3880 block);
3882 place_needed = false;
3883 target_pages = 0;
3884 /* Assume we have a zero page until we detect something different */
3885 all_zero = true;
3889 return ret;
3892 static bool postcopy_is_advised(void)
3894 PostcopyState ps = postcopy_state_get();
3895 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3898 static bool postcopy_is_running(void)
3900 PostcopyState ps = postcopy_state_get();
3901 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3905 * Flush the content of the RAM cache into the SVM's memory.
3906 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
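/*
 * After syncing the dirty bitmap, the loop below walks each block with
 * colo_bitmap_find_dirty(), which returns a run of 'num' contiguous dirty
 * pages; those bits are cleared and the pages are copied from colo_cache
 * back into the block's host memory in a single memcpy.
 */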
3908 void colo_flush_ram_cache(void)
3910 RAMBlock *block = NULL;
3911 void *dst_host;
3912 void *src_host;
3913 unsigned long offset = 0;
3915 memory_global_dirty_log_sync();
3916 WITH_RCU_READ_LOCK_GUARD() {
3917 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3918 ramblock_sync_dirty_bitmap(ram_state, block);
3922 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3923 WITH_RCU_READ_LOCK_GUARD() {
3924 block = QLIST_FIRST_RCU(&ram_list.blocks);
3926 while (block) {
3927 unsigned long num = 0;
3929 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3930 if (!offset_in_ramblock(block,
3931 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3932 offset = 0;
3933 num = 0;
3934 block = QLIST_NEXT_RCU(block, next);
3935 } else {
3936 unsigned long i = 0;
3938 for (i = 0; i < num; i++) {
3939 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3941 dst_host = block->host
3942 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3943 src_host = block->colo_cache
3944 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3945 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3946 offset += num;
3950 trace_colo_flush_ram_cache_end();
3954 * ram_load_precopy: load pages in precopy case
3956 * Returns 0 for success or -errno in case of error
3958 * Called in precopy mode by ram_load().
3959 * rcu_read_lock is taken prior to this being called.
3961 * @f: QEMUFile to receive the data from
3963 static int ram_load_precopy(QEMUFile *f)
3965 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3966 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3967 bool postcopy_advised = postcopy_is_advised();
3968 if (!migrate_use_compression()) {
3969 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3972 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3973 ram_addr_t addr, total_ram_bytes;
3974 void *host = NULL, *host_bak = NULL;
3975 uint8_t ch;
3978 * Yield periodically to let the main loop run, but an iteration of
3979 * the main loop is expensive, so only do it every so many iterations
3981 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3982 aio_co_schedule(qemu_get_current_aio_context(),
3983 qemu_coroutine_self());
3984 qemu_coroutine_yield();
3986 i++;
3988 addr = qemu_get_be64(f);
3989 flags = addr & ~TARGET_PAGE_MASK;
3990 addr &= TARGET_PAGE_MASK;
3992 if (flags & invalid_flags) {
3993 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3994 error_report("Received an unexpected compressed page");
3997 ret = -EINVAL;
3998 break;
4001 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
4002 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
4003 RAMBlock *block = ram_block_from_stream(f, flags);
4005 host = host_from_ram_block_offset(block, addr);
4007 * After entering the COLO stage, we should not load pages
4008 * into the SVM's memory directly; we put them into colo_cache first.
4009 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
4010 * Previously, we copied all this memory in the COLO preparation stage,
4011 * during which the VM had to be stopped, a time-consuming process.
4012 * Here we optimize it with a trick: back up every page during the
4013 * migration process while COLO is enabled. This affects the
4014 * speed of the migration, but it clearly reduces the downtime of
4015 * backing up all of the SVM's memory in the COLO preparation stage.
4017 if (migration_incoming_colo_enabled()) {
4018 if (migration_incoming_in_colo_state()) {
4019 /* In COLO stage, put all pages into cache temporarily */
4020 host = colo_cache_from_block_offset(block, addr, true);
4021 } else {
4023 * In the migration stage but before the COLO stage,
4024 * put all pages into both the cache and the SVM's memory.
4026 host_bak = colo_cache_from_block_offset(block, addr, false);
4029 if (!host) {
4030 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4031 ret = -EINVAL;
4032 break;
4034 if (!migration_incoming_in_colo_state()) {
4035 ramblock_recv_bitmap_set(block, host);
4038 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4041 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4042 case RAM_SAVE_FLAG_MEM_SIZE:
4043 /* Synchronize RAM block list */
4044 total_ram_bytes = addr;
4045 while (!ret && total_ram_bytes) {
4046 RAMBlock *block;
4047 char id[256];
4048 ram_addr_t length;
4050 len = qemu_get_byte(f);
4051 qemu_get_buffer(f, (uint8_t *)id, len);
4052 id[len] = 0;
4053 length = qemu_get_be64(f);
4055 block = qemu_ram_block_by_name(id);
4056 if (block && !qemu_ram_is_migratable(block)) {
4057 error_report("block %s should not be migrated !", id);
4058 ret = -EINVAL;
4059 } else if (block) {
4060 if (length != block->used_length) {
4061 Error *local_err = NULL;
4063 ret = qemu_ram_resize(block, length,
4064 &local_err);
4065 if (local_err) {
4066 error_report_err(local_err);
4069 /* For postcopy we need to check hugepage sizes match */
4070 if (postcopy_advised && migrate_postcopy_ram() &&
4071 block->page_size != qemu_host_page_size) {
4072 uint64_t remote_page_size = qemu_get_be64(f);
4073 if (remote_page_size != block->page_size) {
4074 error_report("Mismatched RAM page size %s "
4075 "(local) %zd != %" PRId64,
4076 id, block->page_size,
4077 remote_page_size);
4078 ret = -EINVAL;
4081 if (migrate_ignore_shared()) {
4082 hwaddr addr = qemu_get_be64(f);
4083 if (ramblock_is_ignored(block) &&
4084 block->mr->addr != addr) {
4085 error_report("Mismatched GPAs for block %s "
4086 "%" PRId64 "!= %" PRId64,
4087 id, (uint64_t)addr,
4088 (uint64_t)block->mr->addr);
4089 ret = -EINVAL;
4092 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4093 block->idstr);
4094 } else {
4095 error_report("Unknown ramblock \"%s\", cannot "
4096 "accept migration", id);
4097 ret = -EINVAL;
4100 total_ram_bytes -= length;
4102 break;
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

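/*
 * The .load_state handler for the "ram" section: dispatch to the postcopy
 * or precopy load path depending on whether postcopy is currently running.
 */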
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If the system is running in postcopy mode, page inserts into host
     * memory must be atomic.
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * If RCU reclaims in this code become numerous, it will be
     * necessary to reduce the granularity of this critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

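/*
 * The .has_postcopy handler: postcopy is refused if any non-ignored RAM
 * block is backed by persistent memory (pmem); otherwise it follows the
 * postcopy-ram capability.
 */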
static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;
    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmaps with the destination VM. */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmaps have been synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

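/*
 * Wake up ram_dirty_bitmap_sync_all(), which waits on rp_sem once per
 * ramblock whose received bitmap has been reloaded.
 */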
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    /* from_dst_file is always valid because we're within rp_thread */
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match that of our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is the "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    /* Clear dirty bits of discarded ranges that we don't want to migrate. */
    ramblock_dirty_bitmap_clear_discarded_pages(block);

    /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

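/*
 * The .resume_prepare handler: before a paused postcopy migration resumes,
 * pull the received bitmaps back from the destination and let
 * ram_state_resume_prepare() rebuild the dirty-page accounting.
 */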
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

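/* Live migration callbacks for RAM, registered in ram_mig_init(). */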
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

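/*
 * RAMBlockNotifier callback: a RAM block was resized. Cancel an in-progress
 * precopy on the source, and keep the incoming postcopy state consistent
 * with the new size.
 */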
static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
                                      size_t old_size, size_t new_size)
{
    PostcopyState ps = postcopy_state_get();
    ram_addr_t offset;
    RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
    Error *err = NULL;

    if (ramblock_is_ignored(rb)) {
        return;
    }

    if (!migration_is_idle()) {
        /*
         * Precopy code on the source cannot deal with the size of RAM blocks
         * changing at random points in time - especially after sending the
         * RAM block sizes in the migration stream, they must no longer change.
         * Abort and indicate a proper reason.
         */
        error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
        migration_cancel(err);
        error_free(err);
    }

    switch (ps) {
    case POSTCOPY_INCOMING_ADVISE:
        /*
         * Update what ram_postcopy_incoming_init()->init_range() does at the
         * time postcopy was advised. Syncing RAM blocks with the source will
         * result in RAM resizes.
         */
        if (old_size < new_size) {
            if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
                error_report("RAM block '%s' discard of resized RAM failed",
                             rb->idstr);
            }
        }
        rb->postcopy_length = new_size;
        break;
    case POSTCOPY_INCOMING_NONE:
    case POSTCOPY_INCOMING_RUNNING:
    case POSTCOPY_INCOMING_END:
        /*
         * Once our guest is running, postcopy no longer cares about
         * resizes. When growing, the new memory was not available on the
         * source, so no handling is needed.
         */
        break;
    default:
        error_report("RAM block '%s' resized during postcopy state: %d",
                     rb->idstr, ps);
        exit(-1);
    }
}

static RAMBlockNotifier ram_mig_ram_notifier = {
    .ram_block_resized = ram_mig_ram_block_resized,
};

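/* Register the RAM migration handlers and the RAM block resize notifier. */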
void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
    ram_block_notifier_add(&ram_mig_ram_notifier);
}