multifd: Make zstd use iov's
[qemu/armbru.git] / migration / ram.c
blob e9dcd3ca4ed84506ddbe1b45fa7752b65c22d9ba
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
84 XBZRLECacheStats xbzrle_counters;
86 /* struct contains the XBZRLE cache and a static page
87 used by the compression */
88 static struct {
89 /* buffer used for XBZRLE encoding */
90 uint8_t *encoded_buf;
91 /* buffer for storing page content */
92 uint8_t *current_buf;
93 /* Cache for XBZRLE, Protected by lock. */
94 PageCache *cache;
95 QemuMutex lock;
96 /* it will store a page full of zeros */
97 uint8_t *zero_target_page;
98 /* buffer used for XBZRLE decoding */
99 uint8_t *decoded_buf;
100 } XBZRLE;
102 static void XBZRLE_cache_lock(void)
104 if (migrate_use_xbzrle()) {
105 qemu_mutex_lock(&XBZRLE.lock);
109 static void XBZRLE_cache_unlock(void)
111 if (migrate_use_xbzrle()) {
112 qemu_mutex_unlock(&XBZRLE.lock);
117 * xbzrle_cache_resize: resize the xbzrle cache
119 * This function is called from migrate_params_apply in the main
120 * thread, possibly while a migration is in progress. A running
121 * migration may be using the cache and might finish during this call,
122 * hence changes to the cache are protected by XBZRLE.lock.
124 * Returns 0 for success or -1 for error
126 * @new_size: new cache size
127 * @errp: set *errp with the reason if the check fails
129 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
131 PageCache *new_cache;
132 int64_t ret = 0;
134 /* Check for truncation */
135 if (new_size != (size_t)new_size) {
136 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
137 "exceeding address space");
138 return -1;
141 if (new_size == migrate_xbzrle_cache_size()) {
142 /* nothing to do */
143 return 0;
146 XBZRLE_cache_lock();
148 if (XBZRLE.cache != NULL) {
149 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
150 if (!new_cache) {
151 ret = -1;
152 goto out;
155 cache_fini(XBZRLE.cache);
156 XBZRLE.cache = new_cache;
158 out:
159 XBZRLE_cache_unlock();
160 return ret;
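/*
 * Example of the truncation check above (hypothetical host): where size_t
 * is 32 bits, a requested cache of 6 GiB does not survive the cast to
 * size_t, so new_size != (size_t)new_size and the resize is rejected
 * with "exceeding address space".
 */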
163 bool ramblock_is_ignored(RAMBlock *block)
165 return !qemu_ram_is_migratable(block) ||
166 (migrate_ignore_shared() && qemu_ram_is_shared(block));
169 #undef RAMBLOCK_FOREACH
171 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
173 RAMBlock *block;
174 int ret = 0;
176 RCU_READ_LOCK_GUARD();
178 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
179 ret = func(block, opaque);
180 if (ret) {
181 break;
184 return ret;
187 static void ramblock_recv_map_init(void)
189 RAMBlock *rb;
191 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
192 assert(!rb->receivedmap);
193 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
197 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
199 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
200 rb->receivedmap);
203 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
205 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
208 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
210 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
213 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
214 size_t nr)
216 bitmap_set_atomic(rb->receivedmap,
217 ramblock_recv_bitmap_offset(host_addr, rb),
218 nr);
221 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
224 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
226 * Returns >0 if success with sent bytes, or <0 if error.
228 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
229 const char *block_name)
231 RAMBlock *block = qemu_ram_block_by_name(block_name);
232 unsigned long *le_bitmap, nbits;
233 uint64_t size;
235 if (!block) {
236 error_report("%s: invalid block name: %s", __func__, block_name);
237 return -1;
240 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
243 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
244 * machines we may need 4 more bytes for padding (see below
245 * comment). So extend it a bit beforehand.
247 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
250 * Always use little endian when sending the bitmap. This is
251 * required so that it works even when source and destination VMs
252 * do not use the same endianness. (Note: big endian won't work.)
254 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
256 /* Size of the bitmap, in bytes */
257 size = DIV_ROUND_UP(nbits, 8);
260 * size is always aligned to 8 bytes for 64bit machines, but it
261 * may not be true for 32bit machines. We need this padding to
262 * make sure the migration can survive even between 32bit and
263 * 64bit machines.
265 size = ROUND_UP(size, 8);
267 qemu_put_be64(file, size);
268 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
270 * Mark as an end, in case the middle part is screwed up due to
271 * some "mysterious" reason.
273 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
274 qemu_fflush(file);
276 g_free(le_bitmap);
278 if (qemu_file_get_error(file)) {
279 return qemu_file_get_error(file);
282 return size + sizeof(size);
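/*
 * Sketch of the resulting stream layout, as implied by the code above
 * (numbers are illustrative):
 *
 *   be64  size              bitmap size in bytes, rounded up to 8
 *   bytes le_bitmap[size]   receivedmap in little-endian bit order
 *   be64  RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdef)
 *
 * E.g. a block of 100 target pages needs DIV_ROUND_UP(100, 8) = 13
 * bitmap bytes, padded by ROUND_UP(13, 8) to 16 bytes on the wire.
 */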
286 * An outstanding page request, on the source, having been received
287 * and queued
289 struct RAMSrcPageRequest {
290 RAMBlock *rb;
291 hwaddr offset;
292 hwaddr len;
294 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
297 /* State of RAM for migration */
298 struct RAMState {
299 /* QEMUFile used for this migration */
300 QEMUFile *f;
301 /* UFFD file descriptor, used in 'write-tracking' migration */
302 int uffdio_fd;
303 /* Last block that we have visited searching for dirty pages */
304 RAMBlock *last_seen_block;
305 /* Last block from where we have sent data */
306 RAMBlock *last_sent_block;
307 /* Last dirty target page we have sent */
308 ram_addr_t last_page;
309 /* last ram version we have seen */
310 uint32_t last_version;
311 /* How many times we have dirty too many pages */
312 int dirty_rate_high_cnt;
313 /* these variables are used for bitmap sync */
314 /* last time we did a full bitmap_sync */
315 int64_t time_last_bitmap_sync;
316 /* bytes transferred at start_time */
317 uint64_t bytes_xfer_prev;
318 /* number of dirty pages since start_time */
319 uint64_t num_dirty_pages_period;
320 /* xbzrle misses since the beginning of the period */
321 uint64_t xbzrle_cache_miss_prev;
322 /* Amount of xbzrle pages since the beginning of the period */
323 uint64_t xbzrle_pages_prev;
324 /* Amount of xbzrle encoded bytes since the beginning of the period */
325 uint64_t xbzrle_bytes_prev;
326 /* Start using XBZRLE (e.g., after the first round). */
327 bool xbzrle_enabled;
328 /* Are we on the last stage of migration */
329 bool last_stage;
330 /* compression statistics since the beginning of the period */
331 /* amount of count that no free thread to compress data */
332 uint64_t compress_thread_busy_prev;
333 /* amount bytes after compression */
334 uint64_t compressed_size_prev;
335 /* amount of compressed pages */
336 uint64_t compress_pages_prev;
338 /* total handled target pages at the beginning of period */
339 uint64_t target_page_count_prev;
340 /* total handled target pages since start */
341 uint64_t target_page_count;
342 /* number of dirty bits in the bitmap */
343 uint64_t migration_dirty_pages;
344 /* Protects modification of the bitmap and migration dirty pages */
345 QemuMutex bitmap_mutex;
346 /* The RAMBlock used in the last src_page_requests */
347 RAMBlock *last_req_rb;
348 /* Queue of outstanding page requests from the destination */
349 QemuMutex src_page_req_mutex;
350 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
352 typedef struct RAMState RAMState;
354 static RAMState *ram_state;
356 static NotifierWithReturnList precopy_notifier_list;
358 void precopy_infrastructure_init(void)
360 notifier_with_return_list_init(&precopy_notifier_list);
363 void precopy_add_notifier(NotifierWithReturn *n)
365 notifier_with_return_list_add(&precopy_notifier_list, n);
368 void precopy_remove_notifier(NotifierWithReturn *n)
370 notifier_with_return_remove(n);
373 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
375 PrecopyNotifyData pnd;
376 pnd.reason = reason;
377 pnd.errp = errp;
379 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
382 uint64_t ram_bytes_remaining(void)
384 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
388 MigrationStats ram_counters;
390 /* used by the search for pages to send */
391 struct PageSearchStatus {
392 /* Current block being searched */
393 RAMBlock *block;
394 /* Current page to search from */
395 unsigned long page;
396 /* Set once we wrap around */
397 bool complete_round;
399 typedef struct PageSearchStatus PageSearchStatus;
401 CompressionStats compression_counters;
403 struct CompressParam {
404 bool done;
405 bool quit;
406 bool zero_page;
407 QEMUFile *file;
408 QemuMutex mutex;
409 QemuCond cond;
410 RAMBlock *block;
411 ram_addr_t offset;
413 /* internally used fields */
414 z_stream stream;
415 uint8_t *originbuf;
417 typedef struct CompressParam CompressParam;
419 struct DecompressParam {
420 bool done;
421 bool quit;
422 QemuMutex mutex;
423 QemuCond cond;
424 void *des;
425 uint8_t *compbuf;
426 int len;
427 z_stream stream;
429 typedef struct DecompressParam DecompressParam;
431 static CompressParam *comp_param;
432 static QemuThread *compress_threads;
433 /* comp_done_cond is used to wake up the migration thread when
434 * one of the compression threads has finished the compression.
435 * comp_done_lock is used to co-work with comp_done_cond.
437 static QemuMutex comp_done_lock;
438 static QemuCond comp_done_cond;
439 /* The empty QEMUFileOps will be used by file in CompressParam */
440 static const QEMUFileOps empty_ops = { };
442 static QEMUFile *decomp_file;
443 static DecompressParam *decomp_param;
444 static QemuThread *decompress_threads;
445 static QemuMutex decomp_done_lock;
446 static QemuCond decomp_done_cond;
448 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
449 ram_addr_t offset, uint8_t *source_buf);
451 static void *do_data_compress(void *opaque)
453 CompressParam *param = opaque;
454 RAMBlock *block;
455 ram_addr_t offset;
456 bool zero_page;
458 qemu_mutex_lock(&param->mutex);
459 while (!param->quit) {
460 if (param->block) {
461 block = param->block;
462 offset = param->offset;
463 param->block = NULL;
464 qemu_mutex_unlock(&param->mutex);
466 zero_page = do_compress_ram_page(param->file, &param->stream,
467 block, offset, param->originbuf);
469 qemu_mutex_lock(&comp_done_lock);
470 param->done = true;
471 param->zero_page = zero_page;
472 qemu_cond_signal(&comp_done_cond);
473 qemu_mutex_unlock(&comp_done_lock);
475 qemu_mutex_lock(&param->mutex);
476 } else {
477 qemu_cond_wait(&param->cond, &param->mutex);
480 qemu_mutex_unlock(&param->mutex);
482 return NULL;
485 static void compress_threads_save_cleanup(void)
487 int i, thread_count;
489 if (!migrate_use_compression() || !comp_param) {
490 return;
493 thread_count = migrate_compress_threads();
494 for (i = 0; i < thread_count; i++) {
496 * we use it as an indicator of whether the thread is
497 * properly initialized or not
499 if (!comp_param[i].file) {
500 break;
503 qemu_mutex_lock(&comp_param[i].mutex);
504 comp_param[i].quit = true;
505 qemu_cond_signal(&comp_param[i].cond);
506 qemu_mutex_unlock(&comp_param[i].mutex);
508 qemu_thread_join(compress_threads + i);
509 qemu_mutex_destroy(&comp_param[i].mutex);
510 qemu_cond_destroy(&comp_param[i].cond);
511 deflateEnd(&comp_param[i].stream);
512 g_free(comp_param[i].originbuf);
513 qemu_fclose(comp_param[i].file);
514 comp_param[i].file = NULL;
516 qemu_mutex_destroy(&comp_done_lock);
517 qemu_cond_destroy(&comp_done_cond);
518 g_free(compress_threads);
519 g_free(comp_param);
520 compress_threads = NULL;
521 comp_param = NULL;
524 static int compress_threads_save_setup(void)
526 int i, thread_count;
528 if (!migrate_use_compression()) {
529 return 0;
531 thread_count = migrate_compress_threads();
532 compress_threads = g_new0(QemuThread, thread_count);
533 comp_param = g_new0(CompressParam, thread_count);
534 qemu_cond_init(&comp_done_cond);
535 qemu_mutex_init(&comp_done_lock);
536 for (i = 0; i < thread_count; i++) {
537 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
538 if (!comp_param[i].originbuf) {
539 goto exit;
542 if (deflateInit(&comp_param[i].stream,
543 migrate_compress_level()) != Z_OK) {
544 g_free(comp_param[i].originbuf);
545 goto exit;
548 /* comp_param[i].file is just used as a dummy buffer to save data,
549 * set its ops to empty.
551 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
552 comp_param[i].done = true;
553 comp_param[i].quit = false;
554 qemu_mutex_init(&comp_param[i].mutex);
555 qemu_cond_init(&comp_param[i].cond);
556 qemu_thread_create(compress_threads + i, "compress",
557 do_data_compress, comp_param + i,
558 QEMU_THREAD_JOINABLE);
560 return 0;
562 exit:
563 compress_threads_save_cleanup();
564 return -1;
568 * save_page_header: write page header to wire
570 * If this is the 1st block, it also writes the block identification
572 * Returns the number of bytes written
574 * @f: QEMUFile where to send the data
575 * @block: block that contains the page we want to send
576 * @offset: offset inside the block for the page
577 * in the lower bits, it contains flags
579 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
580 ram_addr_t offset)
582 size_t size, len;
584 if (block == rs->last_sent_block) {
585 offset |= RAM_SAVE_FLAG_CONTINUE;
587 qemu_put_be64(f, offset);
588 size = 8;
590 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
591 len = strlen(block->idstr);
592 qemu_put_byte(f, len);
593 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
594 size += 1 + len;
595 rs->last_sent_block = block;
597 return size;
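/*
 * Resulting header layout on the wire, following the calls above
 * (an illustrative sketch, not an additional format definition):
 *
 *   be64  offset | flags    RAM_SAVE_FLAG_CONTINUE is set when the block
 *                           is the same as the last one sent
 *   u8    len               only if CONTINUE is not set
 *   bytes idstr[len]        only if CONTINUE is not set
 *
 * So the returned size is either 8, or 8 + 1 + strlen(block->idstr).
 */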
601 * mig_throttle_guest_down: throttle down the guest
603 * Reduce the amount of guest CPU execution to hopefully slow down memory
604 * writes. If guest dirty memory rate is reduced below the rate at
605 * which we can transfer pages to the destination then we should be
606 * able to complete migration. Some workloads dirty memory way too
607 * fast and will not effectively converge, even with auto-converge.
609 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
610 uint64_t bytes_dirty_threshold)
612 MigrationState *s = migrate_get_current();
613 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
614 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
615 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
616 int pct_max = s->parameters.max_cpu_throttle;
618 uint64_t throttle_now = cpu_throttle_get_percentage();
619 uint64_t cpu_now, cpu_ideal, throttle_inc;
621 /* We have not started throttling yet. Let's start it. */
622 if (!cpu_throttle_active()) {
623 cpu_throttle_set(pct_initial);
624 } else {
625 /* Throttling already on, just increase the rate */
626 if (!pct_tailslow) {
627 throttle_inc = pct_increment;
628 } else {
629 /* Compute the ideal CPU percentage used by the guest, which would
630 * make the dirty rate match the dirty rate threshold. */
631 cpu_now = 100 - throttle_now;
632 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
633 bytes_dirty_period);
634 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
636 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
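/*
 * Worked example for the tail-slow path above (numbers are hypothetical):
 * with throttle_now = 20% the guest currently gets cpu_now = 80%.
 * If bytes_dirty_threshold / bytes_dirty_period = 0.5, the ideal guest
 * share is cpu_ideal = 80 * 0.5 = 40%, so throttle_inc =
 * MIN(80 - 40, pct_increment) and the new throttle becomes
 * MIN(20 + throttle_inc, pct_max).
 */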
640 void mig_throttle_counter_reset(void)
642 RAMState *rs = ram_state;
644 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
645 rs->num_dirty_pages_period = 0;
646 rs->bytes_xfer_prev = ram_counters.transferred;
650 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
652 * @rs: current RAM state
653 * @current_addr: address for the zero page
655 * Update the xbzrle cache to reflect a page that's been sent as all 0.
656 * The important thing is that a stale (not-yet-0'd) page be replaced
657 * by the new data.
658 * As a bonus, if the page wasn't in the cache it gets added so that
659 * when a small write is made into the 0'd page it gets XBZRLE sent.
661 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
663 if (!rs->xbzrle_enabled) {
664 return;
667 /* We don't care if this fails to allocate a new cache page
668 * as long as it updated an old one */
669 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
670 ram_counters.dirty_sync_count);
673 #define ENCODING_FLAG_XBZRLE 0x1
676 * save_xbzrle_page: compress and send current page
678 * Returns: 1 means that we wrote the page
679 * 0 means that page is identical to the one already sent
680 * -1 means that xbzrle would be longer than normal
682 * @rs: current RAM state
683 * @current_data: pointer to the address of the page contents
684 * @current_addr: addr of the page
685 * @block: block that contains the page we want to send
686 * @offset: offset inside the block for the page
688 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
689 ram_addr_t current_addr, RAMBlock *block,
690 ram_addr_t offset)
692 int encoded_len = 0, bytes_xbzrle;
693 uint8_t *prev_cached_page;
695 if (!cache_is_cached(XBZRLE.cache, current_addr,
696 ram_counters.dirty_sync_count)) {
697 xbzrle_counters.cache_miss++;
698 if (!rs->last_stage) {
699 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
700 ram_counters.dirty_sync_count) == -1) {
701 return -1;
702 } else {
703 /* update *current_data when the page has been
704 inserted into cache */
705 *current_data = get_cached_data(XBZRLE.cache, current_addr);
708 return -1;
712 * Reaching here means the page has hit the xbzrle cache, no matter what
713 * encoding result it is (normal encoding, overflow or skipping the page),
714 * count the page as encoded. This is used to calculate the encoding rate.
716 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
717 * 2nd page turns out to be skipped (i.e. no new bytes written to the
718 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
719 * skipped page included. In this way, the encoding rate can tell if the
720 * guest page is good for xbzrle encoding.
722 xbzrle_counters.pages++;
723 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
725 /* save current buffer into memory */
726 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
728 /* XBZRLE encoding (if there is no overflow) */
729 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
730 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
731 TARGET_PAGE_SIZE);
734 * Update the cache contents, so that it corresponds to the data
735 * sent, in all cases except where we skip the page.
737 if (!rs->last_stage && encoded_len != 0) {
738 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
740 * In the case where we couldn't compress, ensure that the caller
741 * sends the data from the cache, since the guest might have
742 * changed the RAM since we copied it.
744 *current_data = prev_cached_page;
747 if (encoded_len == 0) {
748 trace_save_xbzrle_page_skipping();
749 return 0;
750 } else if (encoded_len == -1) {
751 trace_save_xbzrle_page_overflow();
752 xbzrle_counters.overflow++;
753 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
754 return -1;
757 /* Send XBZRLE based compressed page */
758 bytes_xbzrle = save_page_header(rs, rs->f, block,
759 offset | RAM_SAVE_FLAG_XBZRLE);
760 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
761 qemu_put_be16(rs->f, encoded_len);
762 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
763 bytes_xbzrle += encoded_len + 1 + 2;
765 * Like compressed_size (please see update_compress_thread_counts),
766 * the xbzrle encoded bytes don't count the 8 byte header with
767 * RAM_SAVE_FLAG_CONTINUE.
769 xbzrle_counters.bytes += bytes_xbzrle - 8;
770 ram_counters.transferred += bytes_xbzrle;
772 return 1;
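/*
 * Wire layout of an XBZRLE page as emitted above (illustrative sketch):
 *
 *   save_page_header(..., offset | RAM_SAVE_FLAG_XBZRLE)  8 bytes (+ idstr)
 *   u8    ENCODING_FLAG_XBZRLE                            1 byte
 *   be16  encoded_len                                     2 bytes
 *   bytes encoded_buf[encoded_len]
 *
 * Hence bytes_xbzrle = header + 1 + 2 + encoded_len, while
 * xbzrle_counters.bytes deliberately excludes the 8 byte header.
 */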
776 * migration_bitmap_find_dirty: find the next dirty page from start
778 * Returns the page offset within memory region of the start of a dirty page
780 * @rs: current RAM state
781 * @rb: RAMBlock where to search for dirty pages
782 * @start: page where we start the search
784 static inline
785 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
786 unsigned long start)
788 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
789 unsigned long *bitmap = rb->bmap;
791 if (ramblock_is_ignored(rb)) {
792 return size;
795 return find_next_bit(bitmap, size, start);
798 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
799 unsigned long page)
801 uint8_t shift;
802 hwaddr size, start;
804 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
805 return;
808 shift = rb->clear_bmap_shift;
810 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this. It
811 * can make things easier sometimes since the start address
812 * of the small chunk will always be aligned to 64 pages, so the
813 * bitmap will always be aligned to unsigned long. We should
814 * even be able to remove this restriction but I'm simply
815 * keeping it.
817 assert(shift >= 6);
819 size = 1ULL << (TARGET_PAGE_BITS + shift);
820 start = QEMU_ALIGN_DOWN((ram_addr_t)page << TARGET_PAGE_BITS, size);
821 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
822 memory_region_clear_dirty_bitmap(rb->mr, start, size);
825 static void
826 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
827 unsigned long start,
828 unsigned long npages)
830 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
831 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
832 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
835 * Clear pages from start to start + npages - 1, so the end boundary is
836 * exclusive.
838 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
839 migration_clear_memory_region_dirty_bitmap(rb, i);
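/*
 * Illustration with made-up numbers: if clear_bmap_shift is 18, each
 * chunk covers 1 << 18 pages. Clearing start = 300000, npages = 10
 * aligns down to chunk_start = 262144 and up to chunk_end = 524288,
 * i.e. exactly one chunk is cleared via the helper above.
 */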
844 * colo_bitmap_find_dirty: find contiguous dirty pages from start
846 * Returns the page offset within memory region of the start of the contiguous
847 * dirty pages
849 * @rs: current RAM state
850 * @rb: RAMBlock where to search for dirty pages
851 * @start: page where we start the search
852 * @num: the number of contiguous dirty pages
854 static inline
855 unsigned long colo_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
856 unsigned long start, unsigned long *num)
858 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
859 unsigned long *bitmap = rb->bmap;
860 unsigned long first, next;
862 *num = 0;
864 if (ramblock_is_ignored(rb)) {
865 return size;
868 first = find_next_bit(bitmap, size, start);
869 if (first >= size) {
870 return first;
872 next = find_next_zero_bit(bitmap, size, first + 1);
873 assert(next >= first);
874 *num = next - first;
875 return first;
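/*
 * Example (hypothetical bitmap): with dirty bits set for pages 5, 6 and 7
 * and start = 2, find_next_bit() returns first = 5 and
 * find_next_zero_bit() returns 8, so *num = 3 contiguous dirty pages.
 */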
878 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
879 RAMBlock *rb,
880 unsigned long page)
882 bool ret;
885 * Clear dirty bitmap if needed. This _must_ be called before we
886 * send any of the pages in the chunk, because we need to make sure
887 * we can capture further page content changes when we sync the dirty
888 * log the next time. So as long as we are going to send any of
889 * the pages in the chunk, we clear the remote dirty bitmap for all.
890 * Clearing it earlier won't be a problem, but doing it too late will.
892 migration_clear_memory_region_dirty_bitmap(rb, page);
894 ret = test_and_clear_bit(page, rb->bmap);
895 if (ret) {
896 rs->migration_dirty_pages--;
899 return ret;
902 static void dirty_bitmap_clear_section(MemoryRegionSection *section,
903 void *opaque)
905 const hwaddr offset = section->offset_within_region;
906 const hwaddr size = int128_get64(section->size);
907 const unsigned long start = offset >> TARGET_PAGE_BITS;
908 const unsigned long npages = size >> TARGET_PAGE_BITS;
909 RAMBlock *rb = section->mr->ram_block;
910 uint64_t *cleared_bits = opaque;
913 * We don't grab ram_state->bitmap_mutex because we expect to run
914 * only when starting migration or during postcopy recovery where
915 * we don't have concurrent access.
917 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
918 migration_clear_memory_region_dirty_bitmap_range(rb, start, npages);
920 *cleared_bits += bitmap_count_one_with_offset(rb->bmap, start, npages);
921 bitmap_clear(rb->bmap, start, npages);
925 * Exclude all dirty pages from migration that fall into a discarded range as
926 * managed by a RamDiscardManager responsible for the mapped memory region of
927 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
929 * Discarded pages ("logically unplugged") have undefined content and must
930 * not get migrated, because even reading these pages for migration might
931 * result in undesired behavior.
933 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
935 * Note: The result is only stable while migrating (precopy/postcopy).
937 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock *rb)
939 uint64_t cleared_bits = 0;
941 if (rb->mr && rb->bmap && memory_region_has_ram_discard_manager(rb->mr)) {
942 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
943 MemoryRegionSection section = {
944 .mr = rb->mr,
945 .offset_within_region = 0,
946 .size = int128_make64(qemu_ram_get_used_length(rb)),
949 ram_discard_manager_replay_discarded(rdm, &section,
950 dirty_bitmap_clear_section,
951 &cleared_bits);
953 return cleared_bits;
957 * Check if a host-page aligned page falls into a discarded range as managed by
958 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
960 * Note: The result is only stable while migrating (precopy/postcopy).
962 bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start)
964 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
965 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
966 MemoryRegionSection section = {
967 .mr = rb->mr,
968 .offset_within_region = start,
969 .size = int128_make64(qemu_ram_pagesize(rb)),
972 return !ram_discard_manager_is_populated(rdm, &section);
974 return false;
977 /* Called with RCU critical section */
978 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
980 uint64_t new_dirty_pages =
981 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
983 rs->migration_dirty_pages += new_dirty_pages;
984 rs->num_dirty_pages_period += new_dirty_pages;
988 * ram_pagesize_summary: calculate all the pagesizes of a VM
990 * Returns a summary bitmap of the page sizes of all RAMBlocks
992 * For VMs with just normal pages this is equivalent to the host page
993 * size. If it's got some huge pages then it's the OR of all the
994 * different page sizes.
996 uint64_t ram_pagesize_summary(void)
998 RAMBlock *block;
999 uint64_t summary = 0;
1001 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1002 summary |= block->page_size;
1005 return summary;
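/*
 * Example (hypothetical guest): ordinary 4 KiB RAMBlocks plus one
 * 2 MiB hugepage-backed block yield summary = 0x1000 | 0x200000
 * = 0x201000.
 */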
1008 uint64_t ram_get_total_transferred_pages(void)
1010 return ram_counters.normal + ram_counters.duplicate +
1011 compression_counters.pages + xbzrle_counters.pages;
1014 static void migration_update_rates(RAMState *rs, int64_t end_time)
1016 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
1017 double compressed_size;
1019 /* calculate period counters */
1020 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
1021 / (end_time - rs->time_last_bitmap_sync);
1023 if (!page_count) {
1024 return;
1027 if (migrate_use_xbzrle()) {
1028 double encoded_size, unencoded_size;
1030 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
1031 rs->xbzrle_cache_miss_prev) / page_count;
1032 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
1033 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
1034 TARGET_PAGE_SIZE;
1035 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
1036 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
1037 xbzrle_counters.encoding_rate = 0;
1038 } else {
1039 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
1041 rs->xbzrle_pages_prev = xbzrle_counters.pages;
1042 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
1045 if (migrate_use_compression()) {
1046 compression_counters.busy_rate = (double)(compression_counters.busy -
1047 rs->compress_thread_busy_prev) / page_count;
1048 rs->compress_thread_busy_prev = compression_counters.busy;
1050 compressed_size = compression_counters.compressed_size -
1051 rs->compressed_size_prev;
1052 if (compressed_size) {
1053 double uncompressed_size = (compression_counters.pages -
1054 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
1056 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1057 compression_counters.compression_rate =
1058 uncompressed_size / compressed_size;
1060 rs->compress_pages_prev = compression_counters.pages;
1061 rs->compressed_size_prev = compression_counters.compressed_size;
1066 static void migration_trigger_throttle(RAMState *rs)
1068 MigrationState *s = migrate_get_current();
1069 uint64_t threshold = s->parameters.throttle_trigger_threshold;
1071 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
1072 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
1073 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
1075 /* During block migration the auto-converge logic incorrectly detects
1076 * that ram migration makes no progress. Avoid this by disabling the
1077 * throttling logic during the bulk phase of block migration. */
1078 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1079 /* The following detection logic can be refined later. For now:
1080 Check to see if the ratio between dirtied bytes and the approx.
1081 amount of bytes that just got transferred since the last time
1082 we were in this routine reaches the threshold. If that happens
1083 twice, start or increase throttling. */
1085 if ((bytes_dirty_period > bytes_dirty_threshold) &&
1086 (++rs->dirty_rate_high_cnt >= 2)) {
1087 trace_migration_throttle();
1088 rs->dirty_rate_high_cnt = 0;
1089 mig_throttle_guest_down(bytes_dirty_period,
1090 bytes_dirty_threshold);
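/*
 * Worked example (hypothetical numbers): with throttle_trigger_threshold
 * = 50 and 1 GiB transferred during the period, bytes_dirty_threshold is
 * 512 MiB. If the guest dirtied more than that in the same period for
 * two consecutive syncs, mig_throttle_guest_down() is invoked.
 */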
1095 static void migration_bitmap_sync(RAMState *rs)
1097 RAMBlock *block;
1098 int64_t end_time;
1100 ram_counters.dirty_sync_count++;
1102 if (!rs->time_last_bitmap_sync) {
1103 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1106 trace_migration_bitmap_sync_start();
1107 memory_global_dirty_log_sync();
1109 qemu_mutex_lock(&rs->bitmap_mutex);
1110 WITH_RCU_READ_LOCK_GUARD() {
1111 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1112 ramblock_sync_dirty_bitmap(rs, block);
1114 ram_counters.remaining = ram_bytes_remaining();
1116 qemu_mutex_unlock(&rs->bitmap_mutex);
1118 memory_global_after_dirty_log_sync();
1119 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1121 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1123 /* more than 1 second = 1000 milliseconds */
1124 if (end_time > rs->time_last_bitmap_sync + 1000) {
1125 migration_trigger_throttle(rs);
1127 migration_update_rates(rs, end_time);
1129 rs->target_page_count_prev = rs->target_page_count;
1131 /* reset period counters */
1132 rs->time_last_bitmap_sync = end_time;
1133 rs->num_dirty_pages_period = 0;
1134 rs->bytes_xfer_prev = ram_counters.transferred;
1136 if (migrate_use_events()) {
1137 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1141 static void migration_bitmap_sync_precopy(RAMState *rs)
1143 Error *local_err = NULL;
1146 * The current notifier usage is just an optimization to migration, so we
1147 * don't stop the normal migration process in the error case.
1149 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1150 error_report_err(local_err);
1151 local_err = NULL;
1154 migration_bitmap_sync(rs);
1156 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1157 error_report_err(local_err);
1161 static void ram_release_page(const char *rbname, uint64_t offset)
1163 if (!migrate_release_ram() || !migration_in_postcopy()) {
1164 return;
1167 ram_discard_range(rbname, offset, TARGET_PAGE_SIZE);
1171 * save_zero_page_to_file: send the zero page to the file
1173 * Returns the size of data written to the file, 0 means the page is not
1174 * a zero page
1176 * @rs: current RAM state
1177 * @file: the file where the data is saved
1178 * @block: block that contains the page we want to send
1179 * @offset: offset inside the block for the page
1181 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1182 RAMBlock *block, ram_addr_t offset)
1184 uint8_t *p = block->host + offset;
1185 int len = 0;
1187 if (buffer_is_zero(p, TARGET_PAGE_SIZE)) {
1188 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1189 qemu_put_byte(file, 0);
1190 len += 1;
1191 ram_release_page(block->idstr, offset);
1193 return len;
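/*
 * A zero page therefore occupies only the page header plus a single
 * zero byte on the wire (sketch of the calls above):
 *
 *   be64  offset | RAM_SAVE_FLAG_ZERO   (+ idstr on the first page of a block)
 *   u8    0
 */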
1197 * save_zero_page: send the zero page to the stream
1199 * Returns the number of pages written.
1201 * @rs: current RAM state
1202 * @block: block that contains the page we want to send
1203 * @offset: offset inside the block for the page
1205 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1207 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1209 if (len) {
1210 ram_counters.duplicate++;
1211 ram_counters.transferred += len;
1212 return 1;
1214 return -1;
1218 * @pages: the number of pages written by the control path,
1219 * < 0 - error
1220 * > 0 - number of pages written
1222 * Return true if the page has been saved, otherwise false is returned.
1224 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1225 int *pages)
1227 uint64_t bytes_xmit = 0;
1228 int ret;
1230 *pages = -1;
1231 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1232 &bytes_xmit);
1233 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1234 return false;
1237 if (bytes_xmit) {
1238 ram_counters.transferred += bytes_xmit;
1239 *pages = 1;
1242 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1243 return true;
1246 if (bytes_xmit > 0) {
1247 ram_counters.normal++;
1248 } else if (bytes_xmit == 0) {
1249 ram_counters.duplicate++;
1252 return true;
1256 * directly send the page to the stream
1258 * Returns the number of pages written.
1260 * @rs: current RAM state
1261 * @block: block that contains the page we want to send
1262 * @offset: offset inside the block for the page
1263 * @buf: the page to be sent
1264 * @async: send the page asynchronously
1266 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1267 uint8_t *buf, bool async)
1269 ram_counters.transferred += save_page_header(rs, rs->f, block,
1270 offset | RAM_SAVE_FLAG_PAGE);
1271 if (async) {
1272 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1273 migrate_release_ram() &
1274 migration_in_postcopy());
1275 } else {
1276 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1278 ram_counters.transferred += TARGET_PAGE_SIZE;
1279 ram_counters.normal++;
1280 return 1;
1284 * ram_save_page: send the given page to the stream
1286 * Returns the number of pages written.
1287 * < 0 - error
1288 * >=0 - Number of pages written - this might legally be 0
1289 * if xbzrle noticed the page was the same.
1291 * @rs: current RAM state
1292 * @block: block that contains the page we want to send
1293 * @offset: offset inside the block for the page
1295 static int ram_save_page(RAMState *rs, PageSearchStatus *pss)
1297 int pages = -1;
1298 uint8_t *p;
1299 bool send_async = true;
1300 RAMBlock *block = pss->block;
1301 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1302 ram_addr_t current_addr = block->offset + offset;
1304 p = block->host + offset;
1305 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1307 XBZRLE_cache_lock();
1308 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1309 pages = save_xbzrle_page(rs, &p, current_addr, block,
1310 offset);
1311 if (!rs->last_stage) {
1312 /* Can't send this cached data async, since the cache page
1313 * might get updated before it gets to the wire
1315 send_async = false;
1319 /* XBZRLE overflow or normal page */
1320 if (pages == -1) {
1321 pages = save_normal_page(rs, block, offset, p, send_async);
1324 XBZRLE_cache_unlock();
1326 return pages;
1329 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1330 ram_addr_t offset)
1332 if (multifd_queue_page(rs->f, block, offset) < 0) {
1333 return -1;
1335 ram_counters.normal++;
1337 return 1;
1340 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1341 ram_addr_t offset, uint8_t *source_buf)
1343 RAMState *rs = ram_state;
1344 uint8_t *p = block->host + offset;
1345 int ret;
1347 if (save_zero_page_to_file(rs, f, block, offset)) {
1348 return true;
1351 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1354 * copy it to an internal buffer to avoid it being modified by the VM,
1355 * so that we can catch errors during compression and
1356 * decompression
1358 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1359 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1360 if (ret < 0) {
1361 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1362 error_report("compressed data failed!");
1364 return false;
1367 static void
1368 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1370 ram_counters.transferred += bytes_xmit;
1372 if (param->zero_page) {
1373 ram_counters.duplicate++;
1374 return;
1377 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1378 compression_counters.compressed_size += bytes_xmit - 8;
1379 compression_counters.pages++;
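/*
 * Accounting sketch: bytes_xmit covers the 8 byte page header (with
 * RAM_SAVE_FLAG_CONTINUE) plus the zlib output, so compressed_size only
 * accumulates bytes_xmit - 8, i.e. the compressed payload itself.
 */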
1382 static bool save_page_use_compression(RAMState *rs);
1384 static void flush_compressed_data(RAMState *rs)
1386 int idx, len, thread_count;
1388 if (!save_page_use_compression(rs)) {
1389 return;
1391 thread_count = migrate_compress_threads();
1393 qemu_mutex_lock(&comp_done_lock);
1394 for (idx = 0; idx < thread_count; idx++) {
1395 while (!comp_param[idx].done) {
1396 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1399 qemu_mutex_unlock(&comp_done_lock);
1401 for (idx = 0; idx < thread_count; idx++) {
1402 qemu_mutex_lock(&comp_param[idx].mutex);
1403 if (!comp_param[idx].quit) {
1404 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1406 * it's safe to fetch zero_page without holding comp_done_lock
1407 * as there is no further request submitted to the thread,
1408 * i.e., the thread should be waiting for a request at this point.
1410 update_compress_thread_counts(&comp_param[idx], len);
1412 qemu_mutex_unlock(&comp_param[idx].mutex);
1416 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1417 ram_addr_t offset)
1419 param->block = block;
1420 param->offset = offset;
1423 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1424 ram_addr_t offset)
1426 int idx, thread_count, bytes_xmit = -1, pages = -1;
1427 bool wait = migrate_compress_wait_thread();
1429 thread_count = migrate_compress_threads();
1430 qemu_mutex_lock(&comp_done_lock);
1431 retry:
1432 for (idx = 0; idx < thread_count; idx++) {
1433 if (comp_param[idx].done) {
1434 comp_param[idx].done = false;
1435 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1436 qemu_mutex_lock(&comp_param[idx].mutex);
1437 set_compress_params(&comp_param[idx], block, offset);
1438 qemu_cond_signal(&comp_param[idx].cond);
1439 qemu_mutex_unlock(&comp_param[idx].mutex);
1440 pages = 1;
1441 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1442 break;
1447 * wait for the free thread if the user specifies 'compress-wait-thread',
1448 * otherwise we will post the page out in the main thread as a normal page.
1450 if (pages < 0 && wait) {
1451 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1452 goto retry;
1454 qemu_mutex_unlock(&comp_done_lock);
1456 return pages;
1460 * find_dirty_block: find the next dirty page and update any state
1461 * associated with the search process.
1463 * Returns true if a page is found
1465 * @rs: current RAM state
1466 * @pss: data about the state of the current dirty page scan
1467 * @again: set to false if the search has scanned the whole of RAM
1469 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1471 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1472 if (pss->complete_round && pss->block == rs->last_seen_block &&
1473 pss->page >= rs->last_page) {
1475 * We've been once around the RAM and haven't found anything.
1476 * Give up.
1478 *again = false;
1479 return false;
1481 if (!offset_in_ramblock(pss->block,
1482 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1483 /* Didn't find anything in this RAM Block */
1484 pss->page = 0;
1485 pss->block = QLIST_NEXT_RCU(pss->block, next);
1486 if (!pss->block) {
1488 * If memory migration starts over, we will meet a dirtied page
1489 * which may still exist in the compression threads' ring, so we
1490 * should flush the compressed data to make sure the new page
1491 * is not overwritten by the old one in the destination.
1493 * Also, if xbzrle is on, stop using the data compression at this
1494 * point. In theory, xbzrle can do better than compression.
1496 flush_compressed_data(rs);
1498 /* Hit the end of the list */
1499 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1500 /* Flag that we've looped */
1501 pss->complete_round = true;
1502 /* After the first round, enable XBZRLE. */
1503 if (migrate_use_xbzrle()) {
1504 rs->xbzrle_enabled = true;
1507 /* Didn't find anything this time, but try again on the new block */
1508 *again = true;
1509 return false;
1510 } else {
1511 /* Can go around again, but... */
1512 *again = true;
1513 /* We've found something so probably don't need to */
1514 return true;
1519 * unqueue_page: gets a page off the queue
1521 * Helper for 'get_queued_page' - gets a page off the queue
1523 * Returns the block of the page (or NULL if none available)
1525 * @rs: current RAM state
1526 * @offset: used to return the offset within the RAMBlock
1528 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1530 RAMBlock *block = NULL;
1532 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1533 return NULL;
1536 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1537 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1538 struct RAMSrcPageRequest *entry =
1539 QSIMPLEQ_FIRST(&rs->src_page_requests);
1540 block = entry->rb;
1541 *offset = entry->offset;
1543 if (entry->len > TARGET_PAGE_SIZE) {
1544 entry->len -= TARGET_PAGE_SIZE;
1545 entry->offset += TARGET_PAGE_SIZE;
1546 } else {
1547 memory_region_unref(block->mr);
1548 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1549 g_free(entry);
1550 migration_consume_urgent_request();
1554 return block;
1557 #if defined(__linux__)
1559 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1560 * is found, return RAM block pointer and page offset
1562 * Returns pointer to the RAMBlock containing faulting page,
1563 * NULL if no write faults are pending
1565 * @rs: current RAM state
1566 * @offset: page offset from the beginning of the block
1568 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1570 struct uffd_msg uffd_msg;
1571 void *page_address;
1572 RAMBlock *block;
1573 int res;
1575 if (!migrate_background_snapshot()) {
1576 return NULL;
1579 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1580 if (res <= 0) {
1581 return NULL;
1584 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1585 block = qemu_ram_block_from_host(page_address, false, offset);
1586 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1587 return block;
1591 * ram_save_release_protection: release UFFD write protection after
1592 * a range of pages has been saved
1594 * @rs: current RAM state
1595 * @pss: page-search-status structure
1596 * @start_page: index of the first page in the range relative to pss->block
1598 * Returns 0 on success, negative value in case of an error
1600 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1601 unsigned long start_page)
1603 int res = 0;
1605 /* Check if page is from UFFD-managed region. */
1606 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1607 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1608 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1610 /* Flush async buffers before un-protect. */
1611 qemu_fflush(rs->f);
1612 /* Un-protect memory range. */
1613 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1614 false, false);
1617 return res;
1620 /* ram_write_tracking_available: check if kernel supports required UFFD features
1622 * Returns true if supports, false otherwise
1624 bool ram_write_tracking_available(void)
1626 uint64_t uffd_features;
1627 int res;
1629 res = uffd_query_features(&uffd_features);
1630 return (res == 0 &&
1631 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1634 /* ram_write_tracking_compatible: check if guest configuration is
1635 * compatible with 'write-tracking'
1637 * Returns true if compatible, false otherwise
1639 bool ram_write_tracking_compatible(void)
1641 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1642 int uffd_fd;
1643 RAMBlock *block;
1644 bool ret = false;
1646 /* Open UFFD file descriptor */
1647 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1648 if (uffd_fd < 0) {
1649 return false;
1652 RCU_READ_LOCK_GUARD();
1654 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1655 uint64_t uffd_ioctls;
1657 /* Nothing to do with read-only and MMIO-writable regions */
1658 if (block->mr->readonly || block->mr->rom_device) {
1659 continue;
1661 /* Try to register block memory via UFFD-IO to track writes */
1662 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1663 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1664 goto out;
1666 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1667 goto out;
1670 ret = true;
1672 out:
1673 uffd_close_fd(uffd_fd);
1674 return ret;
1677 static inline void populate_read_range(RAMBlock *block, ram_addr_t offset,
1678 ram_addr_t size)
1681 * We read one byte of each page; this will preallocate page tables if
1682 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1683 * where no page was populated yet. This might require adaptation when
1684 * supporting other mappings, like shmem.
1686 for (; offset < size; offset += block->page_size) {
1687 char tmp = *((char *)block->host + offset);
1689 /* Don't optimize the read out */
1690 asm volatile("" : "+r" (tmp));
1694 static inline int populate_read_section(MemoryRegionSection *section,
1695 void *opaque)
1697 const hwaddr size = int128_get64(section->size);
1698 hwaddr offset = section->offset_within_region;
1699 RAMBlock *block = section->mr->ram_block;
1701 populate_read_range(block, offset, size);
1702 return 0;
1706 * ram_block_populate_read: preallocate page tables and populate pages in the
1707 * RAM block by reading a byte of each page.
1709 * Since it's solely used for the userfault_fd WP feature, here we just
1710 * hardcode the page size to qemu_real_host_page_size.
1712 * @block: RAM block to populate
1714 static void ram_block_populate_read(RAMBlock *rb)
1717 * Skip populating all pages that fall into a discarded range as managed by
1718 * a RamDiscardManager responsible for the mapped memory region of the
1719 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1720 * must not get populated automatically. We don't have to track
1721 * modifications via userfaultfd WP reliably, because these pages will
1722 * not be part of the migration stream either way -- see
1723 * ramblock_dirty_bitmap_exclude_discarded_pages().
1725 * Note: The result is only stable while migrating (precopy/postcopy).
1727 if (rb->mr && memory_region_has_ram_discard_manager(rb->mr)) {
1728 RamDiscardManager *rdm = memory_region_get_ram_discard_manager(rb->mr);
1729 MemoryRegionSection section = {
1730 .mr = rb->mr,
1731 .offset_within_region = 0,
1732 .size = rb->mr->size,
1735 ram_discard_manager_replay_populated(rdm, &section,
1736 populate_read_section, NULL);
1737 } else {
1738 populate_read_range(rb, 0, rb->used_length);
1743 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1745 void ram_write_tracking_prepare(void)
1747 RAMBlock *block;
1749 RCU_READ_LOCK_GUARD();
1751 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1752 /* Nothing to do with read-only and MMIO-writable regions */
1753 if (block->mr->readonly || block->mr->rom_device) {
1754 continue;
1758 * Populate pages of the RAM block before enabling userfault_fd
1759 * write protection.
1761 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1762 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1763 * pages with pte_none() entries in page table.
1765 ram_block_populate_read(block);
1770 * ram_write_tracking_start: start UFFD-WP memory tracking
1772 * Returns 0 for success or negative value in case of error
1774 int ram_write_tracking_start(void)
1776 int uffd_fd;
1777 RAMState *rs = ram_state;
1778 RAMBlock *block;
1780 /* Open UFFD file descriptor */
1781 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1782 if (uffd_fd < 0) {
1783 return uffd_fd;
1785 rs->uffdio_fd = uffd_fd;
1787 RCU_READ_LOCK_GUARD();
1789 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1790 /* Nothing to do with read-only and MMIO-writable regions */
1791 if (block->mr->readonly || block->mr->rom_device) {
1792 continue;
1795 /* Register block memory with UFFD to track writes */
1796 if (uffd_register_memory(rs->uffdio_fd, block->host,
1797 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1798 goto fail;
1800 /* Apply UFFD write protection to the block memory range */
1801 if (uffd_change_protection(rs->uffdio_fd, block->host,
1802 block->max_length, true, false)) {
1803 goto fail;
1805 block->flags |= RAM_UF_WRITEPROTECT;
1806 memory_region_ref(block->mr);
1808 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1809 block->host, block->max_length);
1812 return 0;
1814 fail:
1815 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1817 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1818 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1819 continue;
1822 * In case some memory block failed to be write-protected,
1823 * remove protection and unregister all RAM blocks that succeeded
1825 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1826 false, false);
1827 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1828 /* Cleanup flags and remove reference */
1829 block->flags &= ~RAM_UF_WRITEPROTECT;
1830 memory_region_unref(block->mr);
1833 uffd_close_fd(uffd_fd);
1834 rs->uffdio_fd = -1;
1835 return -1;
1839 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1841 void ram_write_tracking_stop(void)
1843 RAMState *rs = ram_state;
1844 RAMBlock *block;
1846 RCU_READ_LOCK_GUARD();
1848 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1849 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1850 continue;
1852 /* Remove protection and unregister all affected RAM blocks */
1853 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1854 false, false);
1855 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1857 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1858 block->host, block->max_length);
1860 /* Cleanup flags and remove reference */
1861 block->flags &= ~RAM_UF_WRITEPROTECT;
1862 memory_region_unref(block->mr);
1865 /* Finally close UFFD file descriptor */
1866 uffd_close_fd(rs->uffdio_fd);
1867 rs->uffdio_fd = -1;
1870 #else
1871 /* No target OS support, stubs just fail or ignore */
1873 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1875 (void) rs;
1876 (void) offset;
1878 return NULL;
1881 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1882 unsigned long start_page)
1884 (void) rs;
1885 (void) pss;
1886 (void) start_page;
1888 return 0;
1891 bool ram_write_tracking_available(void)
1893 return false;
1896 bool ram_write_tracking_compatible(void)
1898 assert(0);
1899 return false;
1902 int ram_write_tracking_start(void)
1904 assert(0);
1905 return -1;
1908 void ram_write_tracking_stop(void)
1910 assert(0);
1912 #endif /* defined(__linux__) */
1915 * get_queued_page: unqueue a page from the postcopy requests
1917 * Skips pages that are already sent (!dirty)
1919 * Returns true if a queued page is found
1921 * @rs: current RAM state
1922 * @pss: data about the state of the current dirty page scan
1924 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1926 RAMBlock *block;
1927 ram_addr_t offset;
1928 bool dirty;
1930 do {
1931 block = unqueue_page(rs, &offset);
1933 * We're sending this page, and since it's postcopy nothing else
1934 * will dirty it, and we must make sure it doesn't get sent again
1935 * even if this queue request was received after the background
1936 * search already sent it.
1938 if (block) {
1939 unsigned long page;
1941 page = offset >> TARGET_PAGE_BITS;
1942 dirty = test_bit(page, block->bmap);
1943 if (!dirty) {
1944 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1945 page);
1946 } else {
1947 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1951 } while (block && !dirty);
1953 if (!block) {
1955 * Poll write faults too if background snapshot is enabled; that's
1956 * when vCPUs may get blocked by write-protected pages.
1958 block = poll_fault_page(rs, &offset);
1961 if (block) {
1963 * We want the background search to continue from the queued page
1964 * since the guest is likely to want other pages near to the page
1965 * it just requested.
1967 pss->block = block;
1968 pss->page = offset >> TARGET_PAGE_BITS;
1971 * This unqueued page would break the "one round" check, even if
1972 * it is really rare.
1974 pss->complete_round = false;
1977 return !!block;
1981 * migration_page_queue_free: drop any remaining pages in the ram
1982 * request queue
1984 * It should be empty at the end anyway, but in error cases there may
1985 * be some left. If any page is left, we drop it.
1988 static void migration_page_queue_free(RAMState *rs)
1990 struct RAMSrcPageRequest *mspr, *next_mspr;
1991 /* This queue generally should be empty - but in the case of a failed
1992 * migration might have some droppings in.
1994 RCU_READ_LOCK_GUARD();
1995 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1996 memory_region_unref(mspr->rb->mr);
1997 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1998 g_free(mspr);
2003 * ram_save_queue_pages: queue the page for transmission
2005 * A request from the postcopy destination, for example.
2007 * Returns zero on success or negative on error
2009 * @rbname: Name of the RAMBlock of the request. NULL means the
2010 * same as the last one.
2011 * @start: starting address from the start of the RAMBlock
2012 * @len: length (in bytes) to send
2014 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
2016 RAMBlock *ramblock;
2017 RAMState *rs = ram_state;
2019 ram_counters.postcopy_requests++;
2020 RCU_READ_LOCK_GUARD();
2022 if (!rbname) {
2023 /* Reuse last RAMBlock */
2024 ramblock = rs->last_req_rb;
2026 if (!ramblock) {
2028 * Shouldn't happen, we can't reuse the last RAMBlock if
2029 * it's the 1st request.
2031 error_report("ram_save_queue_pages no previous block");
2032 return -1;
2034 } else {
2035 ramblock = qemu_ram_block_by_name(rbname);
2037 if (!ramblock) {
2038 /* We shouldn't be asked for a non-existent RAMBlock */
2039 error_report("ram_save_queue_pages no block '%s'", rbname);
2040 return -1;
2042 rs->last_req_rb = ramblock;
2044 trace_ram_save_queue_pages(ramblock->idstr, start, len);
2045 if (!offset_in_ramblock(ramblock, start + len - 1)) {
2046 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
2047 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
2048 __func__, start, len, ramblock->used_length);
2049 return -1;
2052 struct RAMSrcPageRequest *new_entry =
2053 g_malloc0(sizeof(struct RAMSrcPageRequest));
2054 new_entry->rb = ramblock;
2055 new_entry->offset = start;
2056 new_entry->len = len;
2058 memory_region_ref(ramblock->mr);
2059 qemu_mutex_lock(&rs->src_page_req_mutex);
2060 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
2061 migration_make_urgent_request();
2062 qemu_mutex_unlock(&rs->src_page_req_mutex);
2064 return 0;
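/*
 * Usage sketch only (hypothetical values, error handling trimmed): roughly
 * how a postcopy page request for one faulting target page of a block named
 * "pc.ram" could be queued; 'fault_addr' is an illustrative name, not a
 * variable defined in this file.
 */
#if 0
    if (ram_save_queue_pages("pc.ram", fault_addr & TARGET_PAGE_MASK,
                             TARGET_PAGE_SIZE)) {
        /* unknown block or out-of-range request */
    }
#endif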
2067 static bool save_page_use_compression(RAMState *rs)
2069 if (!migrate_use_compression()) {
2070 return false;
2074 * If xbzrle is enabled (e.g., after first round of migration), stop
2075 * using the data compression. In theory, xbzrle can do better than
2076 * compression.
2078 if (rs->xbzrle_enabled) {
2079 return false;
2082 return true;
2086 * try to compress the page before posting it out, return true if the page
2087 * has been properly handled by compression, otherwise needs other
2088 * paths to handle it
2090 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
2092 if (!save_page_use_compression(rs)) {
2093 return false;
2097 * When starting the process of a new block, the first page of
2098 * the block should be sent out before other pages in the same
2099 * block, and all the pages in the last block should have been sent
2100 * out. Keeping this order is important, because the 'cont' flag
2101 * is used to avoid resending the block name.
2103 * We post the first page as a normal page, as compression will take
2104 * much CPU resource.
2106 if (block != rs->last_sent_block) {
2107 flush_compressed_data(rs);
2108 return false;
2111 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
2112 return true;
2115 compression_counters.busy++;
2116 return false;
2120 * ram_save_target_page: save one target page
2122 * Returns the number of pages written
2124 * @rs: current RAM state
2125 * @pss: data about the page we want to send
2127 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
2129 RAMBlock *block = pss->block;
2130 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
2131 int res;
2133 if (control_save_page(rs, block, offset, &res)) {
2134 return res;
2137 if (save_compress_page(rs, block, offset)) {
2138 return 1;
2141 res = save_zero_page(rs, block, offset);
2142 if (res > 0) {
2143 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2144 * page would be stale
2146 if (!save_page_use_compression(rs)) {
2147 XBZRLE_cache_lock();
2148 xbzrle_cache_zero_page(rs, block->offset + offset);
2149 XBZRLE_cache_unlock();
2151 return res;
2155 * Do not use multifd for:
2156 * 1. Compression, as the first page in a new block should be posted out
2157 * before sending the compressed page
2158 * 2. Postcopy, as one whole host page should be placed
2160 if (!save_page_use_compression(rs) && migrate_use_multifd()
2161 && !migration_in_postcopy()) {
2162 return ram_save_multifd_page(rs, block, offset);
2165 return ram_save_page(rs, pss);
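/*
 * Summary of the dispatch order above: the RDMA control path is tried
 * first, then the compression threads, then the zero-page shortcut, then
 * multifd (when the compression path is not in use and we are not in
 * postcopy), and finally the plain/xbzrle path in ram_save_page().
 */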
2169 * ram_save_host_page: save a whole host page
2171 * Starting at *offset send pages up to the end of the current host
2172 * page. It's valid for the initial offset to point into the middle of
2173 * a host page in which case the remainder of the hostpage is sent.
2174 * Only dirty target pages are sent. Note that the host page size may
2175 * be a huge page for this block.
2176 * The saving stops at the boundary of the used_length of the block
2177 * if the RAMBlock isn't a multiple of the host page size.
2179 * Returns the number of pages written or negative on error
2181 * @rs: current RAM state
2182 * @ms: current migration state
2183 * @pss: data about the page we want to send
2185 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
2187 int tmppages, pages = 0;
2188 size_t pagesize_bits =
2189 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2190 unsigned long hostpage_boundary =
2191 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2192 unsigned long start_page = pss->page;
2193 int res;
2195 if (ramblock_is_ignored(pss->block)) {
2196 error_report("block %s should not be migrated !", pss->block->idstr);
2197 return 0;
2200 do {
2201 /* Check if the page is dirty and, if it is, send it */
2202 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2203 tmppages = ram_save_target_page(rs, pss);
2204 if (tmppages < 0) {
2205 return tmppages;
2208 pages += tmppages;
2210 * Allow rate limiting to happen in the middle of huge pages if
2211 * something is sent in the current iteration.
2213 if (pagesize_bits > 1 && tmppages > 0) {
2214 migration_rate_limit();
2217 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2218 } while ((pss->page < hostpage_boundary) &&
2219 offset_in_ramblock(pss->block,
2220 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2221 /* The offset we leave with is the min boundary of host page and block */
2222 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2224 res = ram_save_release_protection(rs, pss, start_page);
2225 return (res < 0 ? res : pages);
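/*
 * Worked example for the boundary arithmetic above, assuming 4KiB target
 * pages and a 2MiB hugepage-backed block: pagesize_bits = 2MiB / 4KiB = 512,
 * so a scan starting at pss->page = 700 gets
 * hostpage_boundary = QEMU_ALIGN_UP(701, 512) = 1024, i.e. it stops at the
 * end of the second huge page of the block.
 */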
2229 * ram_find_and_save_block: finds a dirty page and sends it to f
2231 * Called within an RCU critical section.
2233 * Returns the number of pages written where zero means no dirty pages,
2234 * or negative on error
2236 * @rs: current RAM state
2238 * On systems where host-page-size > target-page-size it will send all the
2239 * pages in a host page that are dirty.
2241 static int ram_find_and_save_block(RAMState *rs)
2243 PageSearchStatus pss;
2244 int pages = 0;
2245 bool again, found;
2247 /* No dirty page as there is zero RAM */
2248 if (!ram_bytes_total()) {
2249 return pages;
2252 pss.block = rs->last_seen_block;
2253 pss.page = rs->last_page;
2254 pss.complete_round = false;
2256 if (!pss.block) {
2257 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2260 do {
2261 again = true;
2262 found = get_queued_page(rs, &pss);
2264 if (!found) {
2265 /* priority queue empty, so just search for something dirty */
2266 found = find_dirty_block(rs, &pss, &again);
2269 if (found) {
2270 pages = ram_save_host_page(rs, &pss);
2272 } while (!pages && again);
2274 rs->last_seen_block = pss.block;
2275 rs->last_page = pss.page;
2277 return pages;
2280 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2282 uint64_t pages = size / TARGET_PAGE_SIZE;
2284 if (zero) {
2285 ram_counters.duplicate += pages;
2286 } else {
2287 ram_counters.normal += pages;
2288 ram_counters.transferred += size;
2289 qemu_update_position(f, size);
2293 static uint64_t ram_bytes_total_common(bool count_ignored)
2295 RAMBlock *block;
2296 uint64_t total = 0;
2298 RCU_READ_LOCK_GUARD();
2300 if (count_ignored) {
2301 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2302 total += block->used_length;
2304 } else {
2305 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2306 total += block->used_length;
2309 return total;
2312 uint64_t ram_bytes_total(void)
2314 return ram_bytes_total_common(false);
2317 static void xbzrle_load_setup(void)
2319 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2322 static void xbzrle_load_cleanup(void)
2324 g_free(XBZRLE.decoded_buf);
2325 XBZRLE.decoded_buf = NULL;
2328 static void ram_state_cleanup(RAMState **rsp)
2330 if (*rsp) {
2331 migration_page_queue_free(*rsp);
2332 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2333 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2334 g_free(*rsp);
2335 *rsp = NULL;
2339 static void xbzrle_cleanup(void)
2341 XBZRLE_cache_lock();
2342 if (XBZRLE.cache) {
2343 cache_fini(XBZRLE.cache);
2344 g_free(XBZRLE.encoded_buf);
2345 g_free(XBZRLE.current_buf);
2346 g_free(XBZRLE.zero_target_page);
2347 XBZRLE.cache = NULL;
2348 XBZRLE.encoded_buf = NULL;
2349 XBZRLE.current_buf = NULL;
2350 XBZRLE.zero_target_page = NULL;
2352 XBZRLE_cache_unlock();
2355 static void ram_save_cleanup(void *opaque)
2357 RAMState **rsp = opaque;
2358 RAMBlock *block;
2360 /* We don't use dirty log with background snapshots */
2361 if (!migrate_background_snapshot()) {
2362 /* The caller holds the iothread lock or is in a BH, so there is
2363 * no writing race against the migration bitmap
2365 if (global_dirty_tracking & GLOBAL_DIRTY_MIGRATION) {
2367 * do not stop dirty log without starting it, since
2368 * memory_global_dirty_log_stop will assert that
2369 * memory_global_dirty_log_start/stop are used in pairs
2371 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
2375 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2376 g_free(block->clear_bmap);
2377 block->clear_bmap = NULL;
2378 g_free(block->bmap);
2379 block->bmap = NULL;
2382 xbzrle_cleanup();
2383 compress_threads_save_cleanup();
2384 ram_state_cleanup(rsp);
2387 static void ram_state_reset(RAMState *rs)
2389 rs->last_seen_block = NULL;
2390 rs->last_sent_block = NULL;
2391 rs->last_page = 0;
2392 rs->last_version = ram_list.version;
2393 rs->xbzrle_enabled = false;
2396 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2399 * 'expected' is the value you expect the bitmap mostly to be full
2400 * of; it won't bother printing lines that are all this value.
2401 * If 'todump' is null the migration bitmap is dumped.
2403 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2404 unsigned long pages)
2406 int64_t cur;
2407 int64_t linelen = 128;
2408 char linebuf[129];
2410 for (cur = 0; cur < pages; cur += linelen) {
2411 int64_t curb;
2412 bool found = false;
2414 * Last line; catch the case where the line length
2415 * is longer than remaining ram
2417 if (cur + linelen > pages) {
2418 linelen = pages - cur;
2420 for (curb = 0; curb < linelen; curb++) {
2421 bool thisbit = test_bit(cur + curb, todump);
2422 linebuf[curb] = thisbit ? '1' : '.';
2423 found = found || (thisbit != expected);
2425 if (found) {
2426 linebuf[curb] = '\0';
2427 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
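/*
 * Example of the dump format produced above (with expected == false): a
 * line is printed only when at least one bit differs from 'expected', e.g.
 *
 *   0x00000080 : ....1111111111..................
 *
 * where '1' is a set bit, '.' a clear one, and the hex value is the index
 * of the first page on that line.
 */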
2432 /* **** functions for postcopy ***** */
2434 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2436 struct RAMBlock *block;
2438 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2439 unsigned long *bitmap = block->bmap;
2440 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2441 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2443 while (run_start < range) {
2444 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2445 ram_discard_range(block->idstr,
2446 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2447 ((ram_addr_t)(run_end - run_start))
2448 << TARGET_PAGE_BITS);
2449 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2455 * postcopy_send_discard_bm_ram: discard a RAMBlock
2457 * Returns zero on success
2459 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2461 * @ms: current migration state
2462 * @block: RAMBlock to discard
2464 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2466 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2467 unsigned long current;
2468 unsigned long *bitmap = block->bmap;
2470 for (current = 0; current < end; ) {
2471 unsigned long one = find_next_bit(bitmap, end, current);
2472 unsigned long zero, discard_length;
2474 if (one >= end) {
2475 break;
2478 zero = find_next_zero_bit(bitmap, end, one + 1);
2480 if (zero >= end) {
2481 discard_length = end - one;
2482 } else {
2483 discard_length = zero - one;
2485 postcopy_discard_send_range(ms, one, discard_length);
2486 current = one + discard_length;
2489 return 0;
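/*
 * Example of the run extraction above: for a bitmap where only pages 2-5
 * are dirty, the loop finds one = 2 and zero = 6, and emits a single
 * postcopy_discard_send_range(ms, 2, 4) covering the whole run before
 * continuing the scan at page 6.
 */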
2493 * postcopy_each_ram_send_discard: discard all RAMBlocks
2495 * Returns 0 for success or negative for error
2497 * Utility for the outgoing postcopy code.
2498 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2499 * passing it bitmap indexes and name.
2500 * (qemu_ram_foreach_block ends up passing unscaled lengths
2501 * which would mean postcopy code would have to deal with target page)
2503 * @ms: current migration state
2505 static int postcopy_each_ram_send_discard(MigrationState *ms)
2507 struct RAMBlock *block;
2508 int ret;
2510 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2511 postcopy_discard_send_init(ms, block->idstr);
2514 * Postcopy sends chunks of bitmap over the wire, but it
2515 * just needs indexes at this point, which avoids it having
2516 * target-page-specific code.
2518 ret = postcopy_send_discard_bm_ram(ms, block);
2519 postcopy_discard_send_finish(ms);
2520 if (ret) {
2521 return ret;
2525 return 0;
2529 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2531 * Helper for postcopy_chunk_hostpages; it's called twice to
2532 * canonicalize the two bitmaps, which are similar, but one is
2533 * inverted.
2535 * Postcopy requires that all target pages in a hostpage are dirty or
2536 * clean, not a mix. This function canonicalizes the bitmaps.
2538 * @ms: current migration state
2539 * @block: block that contains the page we want to canonicalize
2541 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2543 RAMState *rs = ram_state;
2544 unsigned long *bitmap = block->bmap;
2545 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2546 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2547 unsigned long run_start;
2549 if (block->page_size == TARGET_PAGE_SIZE) {
2550 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2551 return;
2554 /* Find a dirty page */
2555 run_start = find_next_bit(bitmap, pages, 0);
2557 while (run_start < pages) {
2560 * If the start of this run of pages is in the middle of a host
2561 * page, then we need to fixup this host page.
2563 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2564 /* Find the end of this run */
2565 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2567 * If the end isn't at the start of a host page, then the
2568 * run doesn't finish at the end of a host page
2569 * and we need to discard.
2573 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2574 unsigned long page;
2575 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2576 host_ratio);
2577 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2579 /* Clean up the bitmap */
2580 for (page = fixup_start_addr;
2581 page < fixup_start_addr + host_ratio; page++) {
2583 * Remark them as dirty, updating the count for any pages
2584 * that weren't previously dirty.
2586 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2590 /* Find the next dirty page for the next iteration */
2591 run_start = find_next_bit(bitmap, pages, run_start);
2596 * postcopy_chunk_hostpages: discard any partially sent host page
2598 * Utility for the outgoing postcopy code.
2600 * Discard any partially sent host-page size chunks, mark any partially
2601 * dirty host-page size chunks as all dirty. In this case the host-page
2602 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2604 * Returns zero on success
2606 * @ms: current migration state
2607 * @block: block we want to work with
2609 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2611 postcopy_discard_send_init(ms, block->idstr);
2614 * Ensure that all partially dirty host pages are made fully dirty.
2616 postcopy_chunk_hostpages_pass(ms, block);
2618 postcopy_discard_send_finish(ms);
2619 return 0;
2623 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2625 * Returns zero on success
2627 * Transmit the set of pages to be discarded after precopy to the target.
2628 * These are pages that:
2629 * a) Have been previously transmitted but are now dirty again
2630 * b) Pages that have never been transmitted, this ensures that
2631 * any pages on the destination that have been mapped by background
2632 * tasks get discarded (transparent huge pages is the specific concern)
2633 * Hopefully this is pretty sparse
2635 * @ms: current migration state
2637 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2639 RAMState *rs = ram_state;
2640 RAMBlock *block;
2641 int ret;
2643 RCU_READ_LOCK_GUARD();
2645 /* This should be our last sync, the src is now paused */
2646 migration_bitmap_sync(rs);
2648 /* Easiest way to make sure we don't resume in the middle of a host-page */
2649 rs->last_seen_block = NULL;
2650 rs->last_sent_block = NULL;
2651 rs->last_page = 0;
2653 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2654 /* Deal with TPS != HPS and huge pages */
2655 ret = postcopy_chunk_hostpages(ms, block);
2656 if (ret) {
2657 return ret;
2660 #ifdef DEBUG_POSTCOPY
2661 ram_debug_dump_bitmap(block->bmap, true,
2662 block->used_length >> TARGET_PAGE_BITS);
2663 #endif
2665 trace_ram_postcopy_send_discard_bitmap();
2667 return postcopy_each_ram_send_discard(ms);
2671 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2673 * Returns zero on success
2675 * @rbname: name of the RAMBlock of the request. NULL means the
2676 * same as the last one.
2677 * @start: RAMBlock starting page
2678 * @length: RAMBlock size
2680 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2682 trace_ram_discard_range(rbname, start, length);
2684 RCU_READ_LOCK_GUARD();
2685 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2687 if (!rb) {
2688 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2689 return -1;
2693 * On source VM, we don't need to update the received bitmap since
2694 * we don't even have one.
2696 if (rb->receivedmap) {
2697 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2698 length >> qemu_target_page_bits());
2701 return ram_block_discard_range(rb, start, length);
2705 * For every allocation, we will try not to crash the VM if the
2706 * allocation fails.
2708 static int xbzrle_init(void)
2710 Error *local_err = NULL;
2712 if (!migrate_use_xbzrle()) {
2713 return 0;
2716 XBZRLE_cache_lock();
2718 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2719 if (!XBZRLE.zero_target_page) {
2720 error_report("%s: Error allocating zero page", __func__);
2721 goto err_out;
2724 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2725 TARGET_PAGE_SIZE, &local_err);
2726 if (!XBZRLE.cache) {
2727 error_report_err(local_err);
2728 goto free_zero_page;
2731 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2732 if (!XBZRLE.encoded_buf) {
2733 error_report("%s: Error allocating encoded_buf", __func__);
2734 goto free_cache;
2737 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2738 if (!XBZRLE.current_buf) {
2739 error_report("%s: Error allocating current_buf", __func__);
2740 goto free_encoded_buf;
2743 /* We are all good */
2744 XBZRLE_cache_unlock();
2745 return 0;
2747 free_encoded_buf:
2748 g_free(XBZRLE.encoded_buf);
2749 XBZRLE.encoded_buf = NULL;
2750 free_cache:
2751 cache_fini(XBZRLE.cache);
2752 XBZRLE.cache = NULL;
2753 free_zero_page:
2754 g_free(XBZRLE.zero_target_page);
2755 XBZRLE.zero_target_page = NULL;
2756 err_out:
2757 XBZRLE_cache_unlock();
2758 return -ENOMEM;
2761 static int ram_state_init(RAMState **rsp)
2763 *rsp = g_try_new0(RAMState, 1);
2765 if (!*rsp) {
2766 error_report("%s: Init ramstate fail", __func__);
2767 return -1;
2770 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2771 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2772 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2775 * Count the total number of pages used by ram blocks not including any
2776 * gaps due to alignment or unplugs.
2777 * This must match with the initial values of dirty bitmap.
2779 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2780 ram_state_reset(*rsp);
2782 return 0;
2785 static void ram_list_init_bitmaps(void)
2787 MigrationState *ms = migrate_get_current();
2788 RAMBlock *block;
2789 unsigned long pages;
2790 uint8_t shift;
2792 /* Skip setting bitmap if there is no RAM */
2793 if (ram_bytes_total()) {
2794 shift = ms->clear_bitmap_shift;
2795 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2796 error_report("clear_bitmap_shift (%u) too big, using "
2797 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2798 shift = CLEAR_BITMAP_SHIFT_MAX;
2799 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2800 error_report("clear_bitmap_shift (%u) too small, using "
2801 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2802 shift = CLEAR_BITMAP_SHIFT_MIN;
2805 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2806 pages = block->max_length >> TARGET_PAGE_BITS;
2808 * The initial dirty bitmap for migration must be set with all
2809 * ones to make sure we'll migrate every guest RAM page to
2810 * destination.
2811 * Here we set RAMBlock.bmap all to 1 because when restarting a
2812 * new migration after a failed one, ram_list.
2813 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2814 * guest memory.
2816 block->bmap = bitmap_new(pages);
2817 bitmap_set(block->bmap, 0, pages);
2818 block->clear_bmap_shift = shift;
2819 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2824 static void migration_bitmap_clear_discarded_pages(RAMState *rs)
2826 unsigned long pages;
2827 RAMBlock *rb;
2829 RCU_READ_LOCK_GUARD();
2831 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
2832 pages = ramblock_dirty_bitmap_clear_discarded_pages(rb);
2833 rs->migration_dirty_pages -= pages;
2837 static void ram_init_bitmaps(RAMState *rs)
2839 /* For memory_global_dirty_log_start below. */
2840 qemu_mutex_lock_iothread();
2841 qemu_mutex_lock_ramlist();
2843 WITH_RCU_READ_LOCK_GUARD() {
2844 ram_list_init_bitmaps();
2845 /* We don't use dirty log with background snapshots */
2846 if (!migrate_background_snapshot()) {
2847 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
2848 migration_bitmap_sync_precopy(rs);
2851 qemu_mutex_unlock_ramlist();
2852 qemu_mutex_unlock_iothread();
2855 * After an eventual first bitmap sync, fixup the initial bitmap
2856 * containing all 1s to exclude any discarded pages from migration.
2858 migration_bitmap_clear_discarded_pages(rs);
2861 static int ram_init_all(RAMState **rsp)
2863 if (ram_state_init(rsp)) {
2864 return -1;
2867 if (xbzrle_init()) {
2868 ram_state_cleanup(rsp);
2869 return -1;
2872 ram_init_bitmaps(*rsp);
2874 return 0;
2877 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2879 RAMBlock *block;
2880 uint64_t pages = 0;
2883 * Postcopy is not using xbzrle/compression, so no need for that.
2884 * Also, since the source is already halted, we don't need to care
2885 * about dirty page logging either.
2888 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2889 pages += bitmap_count_one(block->bmap,
2890 block->used_length >> TARGET_PAGE_BITS);
2893 /* This may not be aligned with current bitmaps. Recalculate. */
2894 rs->migration_dirty_pages = pages;
2896 ram_state_reset(rs);
2898 /* Update RAMState cache of output QEMUFile */
2899 rs->f = out;
2901 trace_ram_state_resume_prepare(pages);
2905 * This function clears bits of the free pages reported by the caller from the
2906 * migration dirty bitmap. @addr is the host address corresponding to the
2907 * start of the continuous guest free pages, and @len is the total bytes of
2908 * those pages.
2910 void qemu_guest_free_page_hint(void *addr, size_t len)
2912 RAMBlock *block;
2913 ram_addr_t offset;
2914 size_t used_len, start, npages;
2915 MigrationState *s = migrate_get_current();
2917 /* This function is currently expected to be used during live migration */
2918 if (!migration_is_setup_or_active(s->state)) {
2919 return;
2922 for (; len > 0; len -= used_len, addr += used_len) {
2923 block = qemu_ram_block_from_host(addr, false, &offset);
2924 if (unlikely(!block || offset >= block->used_length)) {
2926 * The implementation might not support RAMBlock resize during
2927 * live migration, but it could happen in theory with future
2928 * updates. So we add a check here to capture that case.
2930 error_report_once("%s unexpected error", __func__);
2931 return;
2934 if (len <= block->used_length - offset) {
2935 used_len = len;
2936 } else {
2937 used_len = block->used_length - offset;
2940 start = offset >> TARGET_PAGE_BITS;
2941 npages = used_len >> TARGET_PAGE_BITS;
2943 qemu_mutex_lock(&ram_state->bitmap_mutex);
2945 * The skipped free pages are equivalent to having been sent from clear_bmap's
2946 * perspective, so clear the bits from the memory region bitmap which
2947 * are initially set. Otherwise those skipped pages will be sent in
2948 * the next round after syncing from the memory region bitmap.
2950 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2951 ram_state->migration_dirty_pages -=
2952 bitmap_count_one_with_offset(block->bmap, start, npages);
2953 bitmap_clear(block->bmap, start, npages);
2954 qemu_mutex_unlock(&ram_state->bitmap_mutex);
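/*
 * Usage sketch only (the real caller is the virtio-balloon free page hint
 * path): 'free_range_host_addr' is an illustrative name, not something
 * defined in this file.
 */
#if 0
    qemu_guest_free_page_hint(free_range_host_addr, 64 * TARGET_PAGE_SIZE);
#endif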
2959 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2960 * long-running RCU critical section. When RCU reclaims in the code
2961 * start to become numerous it will be necessary to reduce the
2962 * granularity of these critical sections.
2966 * ram_save_setup: Setup RAM for migration
2968 * Returns zero to indicate success and negative for error
2970 * @f: QEMUFile where to send the data
2971 * @opaque: RAMState pointer
2973 static int ram_save_setup(QEMUFile *f, void *opaque)
2975 RAMState **rsp = opaque;
2976 RAMBlock *block;
2978 if (compress_threads_save_setup()) {
2979 return -1;
2982 /* migration has already setup the bitmap, reuse it. */
2983 if (!migration_in_colo_state()) {
2984 if (ram_init_all(rsp) != 0) {
2985 compress_threads_save_cleanup();
2986 return -1;
2989 (*rsp)->f = f;
2991 WITH_RCU_READ_LOCK_GUARD() {
2992 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2994 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2995 qemu_put_byte(f, strlen(block->idstr));
2996 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2997 qemu_put_be64(f, block->used_length);
2998 if (migrate_postcopy_ram() && block->page_size !=
2999 qemu_host_page_size) {
3000 qemu_put_be64(f, block->page_size);
3002 if (migrate_ignore_shared()) {
3003 qemu_put_be64(f, block->mr->addr);
3008 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
3009 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
3011 multifd_send_sync_main(f);
3012 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3013 qemu_fflush(f);
3015 return 0;
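/*
 * Rough shape of what ram_save_setup() has just put on the wire (optional
 * fields depend on the negotiated capabilities):
 *
 *   be64  total ram bytes | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *     u8    strlen(idstr)
 *     bytes idstr
 *     be64  used_length
 *     be64  page_size      (only with postcopy-ram and non-host page size)
 *     be64  mr->addr       (only with ignore-shared)
 *   be64  RAM_SAVE_FLAG_EOS
 */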
3019 * ram_save_iterate: iterative stage for migration
3021 * Returns zero to indicate success and negative for error
3023 * @f: QEMUFile where to send the data
3024 * @opaque: RAMState pointer
3026 static int ram_save_iterate(QEMUFile *f, void *opaque)
3028 RAMState **temp = opaque;
3029 RAMState *rs = *temp;
3030 int ret = 0;
3031 int i;
3032 int64_t t0;
3033 int done = 0;
3035 if (blk_mig_bulk_active()) {
3036 /* Avoid transferring ram during bulk phase of block migration as
3037 * the bulk phase will usually take a long time and transferring
3038 * ram updates during that time is pointless. */
3039 goto out;
3043 * We'll hold this lock for a while, but it's okay for two reasons.
3044 * Firstly, the only possible other thread to take it is the one that calls
3045 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3046 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3047 * guarantees that we'll at least release it on a regular basis.
3049 qemu_mutex_lock(&rs->bitmap_mutex);
3050 WITH_RCU_READ_LOCK_GUARD() {
3051 if (ram_list.version != rs->last_version) {
3052 ram_state_reset(rs);
3055 /* Read version before ram_list.blocks */
3056 smp_rmb();
3058 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
3060 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
3061 i = 0;
3062 while ((ret = qemu_file_rate_limit(f)) == 0 ||
3063 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
3064 int pages;
3066 if (qemu_file_get_error(f)) {
3067 break;
3070 pages = ram_find_and_save_block(rs);
3071 /* no more pages to send */
3072 if (pages == 0) {
3073 done = 1;
3074 break;
3077 if (pages < 0) {
3078 qemu_file_set_error(f, pages);
3079 break;
3082 rs->target_page_count += pages;
3085 * During postcopy, it is necessary to make sure one whole host
3086 * page is sent in one chunk.
3088 if (migrate_postcopy_ram()) {
3089 flush_compressed_data(rs);
3093 * we want to check in the 1st loop, just in case it was the 1st
3094 * time and we had to sync the dirty bitmap.
3095 * qemu_clock_get_ns() is a bit expensive, so we only check every
3096 * few iterations
3098 if ((i & 63) == 0) {
3099 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
3100 1000000;
3101 if (t1 > MAX_WAIT) {
3102 trace_ram_save_iterate_big_wait(t1, i);
3103 break;
3106 i++;
3109 qemu_mutex_unlock(&rs->bitmap_mutex);
3112 * Must occur before EOS (or any QEMUFile operation)
3113 * because of RDMA protocol.
3115 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
3117 out:
3118 if (ret >= 0
3119 && migration_is_setup_or_active(migrate_get_current()->state)) {
3120 multifd_send_sync_main(rs->f);
3121 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3122 qemu_fflush(f);
3123 ram_counters.transferred += 8;
3125 ret = qemu_file_get_error(f);
3127 if (ret < 0) {
3128 return ret;
3131 return done;
3135 * ram_save_complete: function called to send the remaining amount of ram
3137 * Returns zero to indicate success or negative on error
3139 * Called with the iothread lock held
3141 * @f: QEMUFile where to send the data
3142 * @opaque: RAMState pointer
3144 static int ram_save_complete(QEMUFile *f, void *opaque)
3146 RAMState **temp = opaque;
3147 RAMState *rs = *temp;
3148 int ret = 0;
3150 rs->last_stage = !migration_in_colo_state();
3152 WITH_RCU_READ_LOCK_GUARD() {
3153 if (!migration_in_postcopy()) {
3154 migration_bitmap_sync_precopy(rs);
3157 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
3159 /* try transferring iterative blocks of memory */
3161 /* flush all remaining blocks regardless of rate limiting */
3162 while (true) {
3163 int pages;
3165 pages = ram_find_and_save_block(rs);
3167 /* no more blocks to send */
3167 if (pages == 0) {
3168 break;
3170 if (pages < 0) {
3171 ret = pages;
3172 break;
3176 flush_compressed_data(rs);
3177 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3180 if (ret >= 0) {
3181 multifd_send_sync_main(rs->f);
3182 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3183 qemu_fflush(f);
3186 return ret;
3189 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3190 uint64_t *res_precopy_only,
3191 uint64_t *res_compatible,
3192 uint64_t *res_postcopy_only)
3194 RAMState **temp = opaque;
3195 RAMState *rs = *temp;
3196 uint64_t remaining_size;
3198 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3200 if (!migration_in_postcopy() &&
3201 remaining_size < max_size) {
3202 qemu_mutex_lock_iothread();
3203 WITH_RCU_READ_LOCK_GUARD() {
3204 migration_bitmap_sync_precopy(rs);
3206 qemu_mutex_unlock_iothread();
3207 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3210 if (migrate_postcopy_ram()) {
3211 /* We can do postcopy, and all the data is postcopiable */
3212 *res_compatible += remaining_size;
3213 } else {
3214 *res_precopy_only += remaining_size;
3218 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3220 unsigned int xh_len;
3221 int xh_flags;
3222 uint8_t *loaded_data;
3224 /* extract RLE header */
3225 xh_flags = qemu_get_byte(f);
3226 xh_len = qemu_get_be16(f);
3228 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3229 error_report("Failed to load XBZRLE page - wrong compression!");
3230 return -1;
3233 if (xh_len > TARGET_PAGE_SIZE) {
3234 error_report("Failed to load XBZRLE page - len overflow!");
3235 return -1;
3237 loaded_data = XBZRLE.decoded_buf;
3238 /* load data and decode */
3239 /* it can change loaded_data to point to an internal buffer */
3240 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3242 /* decode RLE */
3243 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3244 TARGET_PAGE_SIZE) == -1) {
3245 error_report("Failed to load XBZRLE page - decode error!");
3246 return -1;
3249 return 0;
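/*
 * On-the-wire layout consumed by load_xbzrle() above, following the page
 * header:
 *
 *   u8    xh_flags   -- must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len     -- encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE delta, decoded against the current
 *         contents of 'host'
 */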
3253 * ram_block_from_stream: read a RAMBlock id from the migration stream
3255 * Must be called from within an RCU critical section.
3257 * Returns a pointer from within the RCU-protected ram_list.
3259 * @f: QEMUFile where to read the data from
3260 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3262 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3264 static RAMBlock *block;
3265 char id[256];
3266 uint8_t len;
3268 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3269 if (!block) {
3270 error_report("Ack, bad migration stream!");
3271 return NULL;
3273 return block;
3276 len = qemu_get_byte(f);
3277 qemu_get_buffer(f, (uint8_t *)id, len);
3278 id[len] = 0;
3280 block = qemu_ram_block_by_name(id);
3281 if (!block) {
3282 error_report("Can't find block %s", id);
3283 return NULL;
3286 if (ramblock_is_ignored(block)) {
3287 error_report("block %s should not be migrated !", id);
3288 return NULL;
3291 return block;
3294 static inline void *host_from_ram_block_offset(RAMBlock *block,
3295 ram_addr_t offset)
3297 if (!offset_in_ramblock(block, offset)) {
3298 return NULL;
3301 return block->host + offset;
3304 static void *host_page_from_ram_block_offset(RAMBlock *block,
3305 ram_addr_t offset)
3307 /* Note: Explicitly no check against offset_in_ramblock(). */
3308 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3309 block->page_size);
3312 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3313 ram_addr_t offset)
3315 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3318 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3319 ram_addr_t offset, bool record_bitmap)
3321 if (!offset_in_ramblock(block, offset)) {
3322 return NULL;
3324 if (!block->colo_cache) {
3325 error_report("%s: colo_cache is NULL in block :%s",
3326 __func__, block->idstr);
3327 return NULL;
3331 * During colo checkpoint, we need bitmap of these migrated pages.
3332 * It help us to decide which pages in ram cache should be flushed
3333 * into VM's RAM later.
3335 if (record_bitmap &&
3336 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3337 ram_state->migration_dirty_pages++;
3339 return block->colo_cache + offset;
3343 * ram_handle_compressed: handle the zero page case
3345 * If a page (or a whole RDMA chunk) has been
3346 * determined to be zero, then zap it.
3348 * @host: host address for the zero page
3349 * @ch: what the page is filled with; we only support zero
3350 * @size: size of the zero page
3352 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3354 if (ch != 0 || !buffer_is_zero(host, size)) {
3355 memset(host, ch, size);
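/*
 * Note on the check above: for ch == 0 the memset is skipped whenever the
 * page already reads as zero, which avoids touching (and thus allocating or
 * dirtying) pages the destination never wrote to.
 */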
3359 /* return the size after decompression, or negative value on error */
3360 static int
3361 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3362 const uint8_t *source, size_t source_len)
3364 int err;
3366 err = inflateReset(stream);
3367 if (err != Z_OK) {
3368 return -1;
3371 stream->avail_in = source_len;
3372 stream->next_in = (uint8_t *)source;
3373 stream->avail_out = dest_len;
3374 stream->next_out = dest;
3376 err = inflate(stream, Z_NO_FLUSH);
3377 if (err != Z_STREAM_END) {
3378 return -1;
3381 return stream->total_out;
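/*
 * The z_stream passed in above is set up once per decompress thread with
 * inflateInit() in compress_threads_load_setup() and recycled here with
 * inflateReset() for every page, so no per-page zlib allocation is needed.
 */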
3384 static void *do_data_decompress(void *opaque)
3386 DecompressParam *param = opaque;
3387 unsigned long pagesize;
3388 uint8_t *des;
3389 int len, ret;
3391 qemu_mutex_lock(&param->mutex);
3392 while (!param->quit) {
3393 if (param->des) {
3394 des = param->des;
3395 len = param->len;
3396 param->des = 0;
3397 qemu_mutex_unlock(&param->mutex);
3399 pagesize = TARGET_PAGE_SIZE;
3401 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3402 param->compbuf, len);
3403 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3404 error_report("decompress data failed");
3405 qemu_file_set_error(decomp_file, ret);
3408 qemu_mutex_lock(&decomp_done_lock);
3409 param->done = true;
3410 qemu_cond_signal(&decomp_done_cond);
3411 qemu_mutex_unlock(&decomp_done_lock);
3413 qemu_mutex_lock(&param->mutex);
3414 } else {
3415 qemu_cond_wait(&param->cond, &param->mutex);
3418 qemu_mutex_unlock(&param->mutex);
3420 return NULL;
3423 static int wait_for_decompress_done(void)
3425 int idx, thread_count;
3427 if (!migrate_use_compression()) {
3428 return 0;
3431 thread_count = migrate_decompress_threads();
3432 qemu_mutex_lock(&decomp_done_lock);
3433 for (idx = 0; idx < thread_count; idx++) {
3434 while (!decomp_param[idx].done) {
3435 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3438 qemu_mutex_unlock(&decomp_done_lock);
3439 return qemu_file_get_error(decomp_file);
3442 static void compress_threads_load_cleanup(void)
3444 int i, thread_count;
3446 if (!migrate_use_compression()) {
3447 return;
3449 thread_count = migrate_decompress_threads();
3450 for (i = 0; i < thread_count; i++) {
3452 * we use it as an indicator of whether the thread is
3453 * properly initialized or not
3455 if (!decomp_param[i].compbuf) {
3456 break;
3459 qemu_mutex_lock(&decomp_param[i].mutex);
3460 decomp_param[i].quit = true;
3461 qemu_cond_signal(&decomp_param[i].cond);
3462 qemu_mutex_unlock(&decomp_param[i].mutex);
3464 for (i = 0; i < thread_count; i++) {
3465 if (!decomp_param[i].compbuf) {
3466 break;
3469 qemu_thread_join(decompress_threads + i);
3470 qemu_mutex_destroy(&decomp_param[i].mutex);
3471 qemu_cond_destroy(&decomp_param[i].cond);
3472 inflateEnd(&decomp_param[i].stream);
3473 g_free(decomp_param[i].compbuf);
3474 decomp_param[i].compbuf = NULL;
3476 g_free(decompress_threads);
3477 g_free(decomp_param);
3478 decompress_threads = NULL;
3479 decomp_param = NULL;
3480 decomp_file = NULL;
3483 static int compress_threads_load_setup(QEMUFile *f)
3485 int i, thread_count;
3487 if (!migrate_use_compression()) {
3488 return 0;
3491 thread_count = migrate_decompress_threads();
3492 decompress_threads = g_new0(QemuThread, thread_count);
3493 decomp_param = g_new0(DecompressParam, thread_count);
3494 qemu_mutex_init(&decomp_done_lock);
3495 qemu_cond_init(&decomp_done_cond);
3496 decomp_file = f;
3497 for (i = 0; i < thread_count; i++) {
3498 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3499 goto exit;
3502 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3503 qemu_mutex_init(&decomp_param[i].mutex);
3504 qemu_cond_init(&decomp_param[i].cond);
3505 decomp_param[i].done = true;
3506 decomp_param[i].quit = false;
3507 qemu_thread_create(decompress_threads + i, "decompress",
3508 do_data_decompress, decomp_param + i,
3509 QEMU_THREAD_JOINABLE);
3511 return 0;
3512 exit:
3513 compress_threads_load_cleanup();
3514 return -1;
3517 static void decompress_data_with_multi_threads(QEMUFile *f,
3518 void *host, int len)
3520 int idx, thread_count;
3522 thread_count = migrate_decompress_threads();
3523 QEMU_LOCK_GUARD(&decomp_done_lock);
3524 while (true) {
3525 for (idx = 0; idx < thread_count; idx++) {
3526 if (decomp_param[idx].done) {
3527 decomp_param[idx].done = false;
3528 qemu_mutex_lock(&decomp_param[idx].mutex);
3529 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3530 decomp_param[idx].des = host;
3531 decomp_param[idx].len = len;
3532 qemu_cond_signal(&decomp_param[idx].cond);
3533 qemu_mutex_unlock(&decomp_param[idx].mutex);
3534 break;
3537 if (idx < thread_count) {
3538 break;
3539 } else {
3540 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
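/*
 * Hand-off protocol used above: 'done' is protected by decomp_done_lock and
 * flips to false here and back to true in do_data_decompress(); the
 * destination buffer and length are handed over under the per-thread mutex
 * and cleared by the worker once consumed.
 */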
3545 static void colo_init_ram_state(void)
3547 ram_state_init(&ram_state);
3551 * colo cache: this is for the secondary VM; we cache the whole
3552 * memory of the secondary VM. The global lock must be held
3553 * to call this helper.
3555 int colo_init_ram_cache(void)
3557 RAMBlock *block;
3559 WITH_RCU_READ_LOCK_GUARD() {
3560 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3561 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3562 NULL, false, false);
3563 if (!block->colo_cache) {
3564 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3565 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3566 block->used_length);
3567 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3568 if (block->colo_cache) {
3569 qemu_anon_ram_free(block->colo_cache, block->used_length);
3570 block->colo_cache = NULL;
3573 return -errno;
3575 if (!machine_dump_guest_core(current_machine)) {
3576 qemu_madvise(block->colo_cache, block->used_length,
3577 QEMU_MADV_DONTDUMP);
3583 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3584 * to decide which pages in the cache should be flushed into the SVM's RAM.
3585 * Here we use the same name 'ram_bitmap' as for migration.
3587 if (ram_bytes_total()) {
3588 RAMBlock *block;
3590 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3591 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3592 block->bmap = bitmap_new(pages);
3596 colo_init_ram_state();
3597 return 0;
3600 /* TODO: duplicated with ram_init_bitmaps */
3601 void colo_incoming_start_dirty_log(void)
3603 RAMBlock *block = NULL;
3604 /* For memory_global_dirty_log_start below. */
3605 qemu_mutex_lock_iothread();
3606 qemu_mutex_lock_ramlist();
3608 memory_global_dirty_log_sync();
3609 WITH_RCU_READ_LOCK_GUARD() {
3610 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3611 ramblock_sync_dirty_bitmap(ram_state, block);
3612 /* Discard this dirty bitmap record */
3613 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3615 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION);
3617 ram_state->migration_dirty_pages = 0;
3618 qemu_mutex_unlock_ramlist();
3619 qemu_mutex_unlock_iothread();
3622 /* The global lock must be held to call this helper */
3623 void colo_release_ram_cache(void)
3625 RAMBlock *block;
3627 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION);
3628 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3629 g_free(block->bmap);
3630 block->bmap = NULL;
3633 WITH_RCU_READ_LOCK_GUARD() {
3634 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3635 if (block->colo_cache) {
3636 qemu_anon_ram_free(block->colo_cache, block->used_length);
3637 block->colo_cache = NULL;
3641 ram_state_cleanup(&ram_state);
3645 * ram_load_setup: Setup RAM for migration incoming side
3647 * Returns zero to indicate success and negative for error
3649 * @f: QEMUFile where to receive the data
3650 * @opaque: RAMState pointer
3652 static int ram_load_setup(QEMUFile *f, void *opaque)
3654 if (compress_threads_load_setup(f)) {
3655 return -1;
3658 xbzrle_load_setup();
3659 ramblock_recv_map_init();
3661 return 0;
3664 static int ram_load_cleanup(void *opaque)
3666 RAMBlock *rb;
3668 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3669 qemu_ram_block_writeback(rb);
3672 xbzrle_load_cleanup();
3673 compress_threads_load_cleanup();
3675 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3676 g_free(rb->receivedmap);
3677 rb->receivedmap = NULL;
3680 return 0;
3684 * ram_postcopy_incoming_init: allocate postcopy data structures
3686 * Returns 0 for success and negative if there was an error
3688 * @mis: current migration incoming state
3690 * Allocate data structures etc needed by incoming migration with
3691 * postcopy-ram. postcopy-ram's similarly named
3692 * postcopy_ram_incoming_init does the work.
3694 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3696 return postcopy_ram_incoming_init(mis);
3700 * ram_load_postcopy: load a page in postcopy case
3702 * Returns 0 for success or -errno in case of error
3704 * Called in postcopy mode by ram_load().
3705 * rcu_read_lock is taken prior to this being called.
3707 * @f: QEMUFile where to send the data
3709 static int ram_load_postcopy(QEMUFile *f)
3711 int flags = 0, ret = 0;
3712 bool place_needed = false;
3713 bool matches_target_page_size = false;
3714 MigrationIncomingState *mis = migration_incoming_get_current();
3715 /* Temporary page that is later 'placed' */
3716 void *postcopy_host_page = mis->postcopy_tmp_page;
3717 void *host_page = NULL;
3718 bool all_zero = true;
3719 int target_pages = 0;
3721 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3722 ram_addr_t addr;
3723 void *page_buffer = NULL;
3724 void *place_source = NULL;
3725 RAMBlock *block = NULL;
3726 uint8_t ch;
3727 int len;
3729 addr = qemu_get_be64(f);
3732 * If qemu file error, we should stop here, and then "addr"
3733 * may be invalid
3735 ret = qemu_file_get_error(f);
3736 if (ret) {
3737 break;
3740 flags = addr & ~TARGET_PAGE_MASK;
3741 addr &= TARGET_PAGE_MASK;
3743 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3744 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3745 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3746 block = ram_block_from_stream(f, flags);
3747 if (!block) {
3748 ret = -EINVAL;
3749 break;
3753 * Relying on used_length is racy and can result in false positives.
3754 * We might place pages beyond used_length in case RAM was shrunk
3755 * while in postcopy, which is fine - trying to place via
3756 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3758 if (!block->host || addr >= block->postcopy_length) {
3759 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3760 ret = -EINVAL;
3761 break;
3763 target_pages++;
3764 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3766 * Postcopy requires that we place whole host pages atomically;
3767 * these may be huge pages for RAMBlocks that are backed by
3768 * hugetlbfs.
3769 * To make it atomic, the data is read into a temporary page
3770 * that's moved into place later.
3771 * The migration protocol uses possibly smaller target pages;
3772 * however, the source ensures it always sends all the components
3773 * of a host page in one chunk.
3775 page_buffer = postcopy_host_page +
3776 host_page_offset_from_ram_block_offset(block, addr);
3777 /* If all TP are zero then we can optimise the place */
3778 if (target_pages == 1) {
3779 host_page = host_page_from_ram_block_offset(block, addr);
3780 } else if (host_page != host_page_from_ram_block_offset(block,
3781 addr)) {
3782 /* not the 1st TP within the HP */
3783 error_report("Non-same host page %p/%p", host_page,
3784 host_page_from_ram_block_offset(block, addr));
3785 ret = -EINVAL;
3786 break;
3790 * If it's the last part of a host page then we place the host
3791 * page
3793 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3794 place_needed = true;
3796 place_source = postcopy_host_page;
3799 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3800 case RAM_SAVE_FLAG_ZERO:
3801 ch = qemu_get_byte(f);
3803 * We can skip setting page_buffer when
3804 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3806 if (ch || !matches_target_page_size) {
3807 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3809 if (ch) {
3810 all_zero = false;
3812 break;
3814 case RAM_SAVE_FLAG_PAGE:
3815 all_zero = false;
3816 if (!matches_target_page_size) {
3817 /* For huge pages, we always use temporary buffer */
3818 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3819 } else {
3821 * For small pages that match the target page size, we
3822 * avoid the qemu_file copy. Instead we directly use
3823 * the buffer of QEMUFile to place the page. Note: we
3824 * cannot do any QEMUFile operation before using that
3825 * buffer to make sure the buffer is valid when
3826 * placing the page.
3828 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3829 TARGET_PAGE_SIZE);
3831 break;
3832 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3833 all_zero = false;
3834 len = qemu_get_be32(f);
3835 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3836 error_report("Invalid compressed data length: %d", len);
3837 ret = -EINVAL;
3838 break;
3840 decompress_data_with_multi_threads(f, page_buffer, len);
3841 break;
3843 case RAM_SAVE_FLAG_EOS:
3844 /* normal exit */
3845 multifd_recv_sync_main();
3846 break;
3847 default:
3848 error_report("Unknown combination of migration flags: 0x%x"
3849 " (postcopy mode)", flags);
3850 ret = -EINVAL;
3851 break;
3854 /* Got the whole host page, wait for decompress before placing. */
3855 if (place_needed) {
3856 ret |= wait_for_decompress_done();
3859 /* Detect for any possible file errors */
3860 if (!ret && qemu_file_get_error(f)) {
3861 ret = qemu_file_get_error(f);
3864 if (!ret && place_needed) {
3865 if (all_zero) {
3866 ret = postcopy_place_page_zero(mis, host_page, block);
3867 } else {
3868 ret = postcopy_place_page(mis, host_page, place_source,
3869 block);
3871 place_needed = false;
3872 target_pages = 0;
3873 /* Assume we have a zero page until we detect something different */
3874 all_zero = true;
3878 return ret;
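/*
 * Worked example for the placement logic above: with a 2MiB hugepage-backed
 * block and 4KiB target pages, block->page_size / TARGET_PAGE_SIZE == 512,
 * so the temporary host page is only placed (copied or zero-placed) into
 * the guest after all 512 target pages of that host page have arrived.
 */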
3881 static bool postcopy_is_advised(void)
3883 PostcopyState ps = postcopy_state_get();
3884 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3887 static bool postcopy_is_running(void)
3889 PostcopyState ps = postcopy_state_get();
3890 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3894 * Flush content of RAM cache into SVM's memory.
3895 * Only flush the pages that are dirtied by the PVM, the SVM, or both.
3897 void colo_flush_ram_cache(void)
3899 RAMBlock *block = NULL;
3900 void *dst_host;
3901 void *src_host;
3902 unsigned long offset = 0;
3904 memory_global_dirty_log_sync();
3905 WITH_RCU_READ_LOCK_GUARD() {
3906 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3907 ramblock_sync_dirty_bitmap(ram_state, block);
3911 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3912 WITH_RCU_READ_LOCK_GUARD() {
3913 block = QLIST_FIRST_RCU(&ram_list.blocks);
3915 while (block) {
3916 unsigned long num = 0;
3918 offset = colo_bitmap_find_dirty(ram_state, block, offset, &num);
3919 if (!offset_in_ramblock(block,
3920 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3921 offset = 0;
3922 num = 0;
3923 block = QLIST_NEXT_RCU(block, next);
3924 } else {
3925 unsigned long i = 0;
3927 for (i = 0; i < num; i++) {
3928 migration_bitmap_clear_dirty(ram_state, block, offset + i);
3930 dst_host = block->host
3931 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3932 src_host = block->colo_cache
3933 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3934 memcpy(dst_host, src_host, TARGET_PAGE_SIZE * num);
3935 offset += num;
3939 trace_colo_flush_ram_cache_end();
3943 * ram_load_precopy: load pages in precopy case
3945 * Returns 0 for success or -errno in case of error
3947 * Called in precopy mode by ram_load().
3948 * rcu_read_lock is taken prior to this being called.
3950 * @f: QEMUFile where to send the data
3952 static int ram_load_precopy(QEMUFile *f)
3954 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3955 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3956 bool postcopy_advised = postcopy_is_advised();
3957 if (!migrate_use_compression()) {
3958 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3961 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3962 ram_addr_t addr, total_ram_bytes;
3963 void *host = NULL, *host_bak = NULL;
3964 uint8_t ch;
3967 * Yield periodically to let main loop run, but an iteration of
3968 * the main loop is expensive, so only do it every few iterations
3970 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3971 aio_co_schedule(qemu_get_current_aio_context(),
3972 qemu_coroutine_self());
3973 qemu_coroutine_yield();
3975 i++;
3977 addr = qemu_get_be64(f);
3978 flags = addr & ~TARGET_PAGE_MASK;
3979 addr &= TARGET_PAGE_MASK;
3981 if (flags & invalid_flags) {
3982 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3983 error_report("Received an unexpected compressed page");
3986 ret = -EINVAL;
3987 break;
3990 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3991 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3992 RAMBlock *block = ram_block_from_stream(f, flags);
3994 host = host_from_ram_block_offset(block, addr);
3996 * After going into the COLO stage, we should not load pages
3997 * into the SVM's memory directly; we put them into colo_cache first.
3998 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3999 * Previously, we copied all this memory in the COLO preparation stage,
4000 * which required stopping the VM, a time-consuming process.
4001 * Here we optimize it by backing up every page during the migration
4002 * process while COLO is enabled. Although this affects migration
4003 * speed, it clearly reduces the downtime compared to backing up
4004 * all of the SVM's memory in the COLO preparation stage.
4006 if (migration_incoming_colo_enabled()) {
4007 if (migration_incoming_in_colo_state()) {
4008 /* In COLO stage, put all pages into cache temporarily */
4009 host = colo_cache_from_block_offset(block, addr, true);
4010 } else {
4012 * In migration stage but before COLO stage,
4013 * Put all pages into both cache and SVM's memory.
4015 host_bak = colo_cache_from_block_offset(block, addr, false);
4018 if (!host) {
4019 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
4020 ret = -EINVAL;
4021 break;
4023 if (!migration_incoming_in_colo_state()) {
4024 ramblock_recv_bitmap_set(block, host);
4027 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
4030 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
4031 case RAM_SAVE_FLAG_MEM_SIZE:
4032 /* Synchronize RAM block list */
4033 total_ram_bytes = addr;
4034 while (!ret && total_ram_bytes) {
4035 RAMBlock *block;
4036 char id[256];
4037 ram_addr_t length;
4039 len = qemu_get_byte(f);
4040 qemu_get_buffer(f, (uint8_t *)id, len);
4041 id[len] = 0;
4042 length = qemu_get_be64(f);
4044 block = qemu_ram_block_by_name(id);
4045 if (block && !qemu_ram_is_migratable(block)) {
4046 error_report("block %s should not be migrated !", id);
4047 ret = -EINVAL;
4048 } else if (block) {
4049 if (length != block->used_length) {
4050 Error *local_err = NULL;
4052 ret = qemu_ram_resize(block, length,
4053 &local_err);
4054 if (local_err) {
4055 error_report_err(local_err);
4058 /* For postcopy we need to check hugepage sizes match */
4059 if (postcopy_advised && migrate_postcopy_ram() &&
4060 block->page_size != qemu_host_page_size) {
4061 uint64_t remote_page_size = qemu_get_be64(f);
4062 if (remote_page_size != block->page_size) {
4063 error_report("Mismatched RAM page size %s "
4064 "(local) %zd != %" PRId64,
4065 id, block->page_size,
4066 remote_page_size);
4067 ret = -EINVAL;
4070 if (migrate_ignore_shared()) {
4071 hwaddr addr = qemu_get_be64(f);
4072 if (ramblock_is_ignored(block) &&
4073 block->mr->addr != addr) {
4074 error_report("Mismatched GPAs for block %s "
4075 "%" PRId64 "!= %" PRId64,
4076 id, (uint64_t)addr,
4077 (uint64_t)block->mr->addr);
4078 ret = -EINVAL;
4081 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
4082 block->idstr);
4083 } else {
4084 error_report("Unknown ramblock \"%s\", cannot "
4085 "accept migration", id);
4086 ret = -EINVAL;
4089 total_ram_bytes -= length;
4091 break;
4093 case RAM_SAVE_FLAG_ZERO:
4094 ch = qemu_get_byte(f);
4095 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
4096 break;
4098 case RAM_SAVE_FLAG_PAGE:
4099 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
4100 break;
4102 case RAM_SAVE_FLAG_COMPRESS_PAGE:
4103 len = qemu_get_be32(f);
4104 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
4105 error_report("Invalid compressed data length: %d", len);
4106 ret = -EINVAL;
4107 break;
4109 decompress_data_with_multi_threads(f, host, len);
4110 break;
4112 case RAM_SAVE_FLAG_XBZRLE:
4113 if (load_xbzrle(f, addr, host) < 0) {
4114 error_report("Failed to decompress XBZRLE page at "
4115 RAM_ADDR_FMT, addr);
4116 ret = -EINVAL;
4117 break;
4119 break;
4120 case RAM_SAVE_FLAG_EOS:
4121 /* normal exit */
4122 multifd_recv_sync_main();
4123 break;
4124 default:
4125 if (flags & RAM_SAVE_FLAG_HOOK) {
4126 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
4127 } else {
4128 error_report("Unknown combination of migration flags: 0x%x",
4129 flags);
4130 ret = -EINVAL;
4133 if (!ret) {
4134 ret = qemu_file_get_error(f);
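/*
 * When COLO is enabled but not yet in the COLO stage, the page was loaded
 * straight into the SVM's memory (host); mirror it into the colo_cache
 * backup (host_bak) so the cache stays consistent with the SVM.
 */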
4136 if (!ret && host_bak) {
4137 memcpy(host_bak, host, TARGET_PAGE_SIZE);
4141 ret |= wait_for_decompress_done();
4142 return ret;
4145 static int ram_load(QEMUFile *f, void *opaque, int version_id)
4147 int ret = 0;
4148 static uint64_t seq_iter;
4150 * If the system is running in postcopy mode, page inserts into host
4151 * memory must be atomic.
4153 bool postcopy_running = postcopy_is_running();
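/*
 * "Atomic" because during postcopy the guest is already running on the
 * destination, so a page has to become visible in a single step (on Linux
 * it is typically placed via userfaultfd); that is why the separate
 * ram_load_postcopy() path below exists.
 */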
4155 seq_iter++;
4157 if (version_id != 4) {
4158 return -EINVAL;
4162 * This RCU critical section can be very long-running.
4163 * Once RCU reclaims in the code become numerous, it will be
4164 * necessary to reduce the granularity of this critical
4165 * section.
4167 WITH_RCU_READ_LOCK_GUARD() {
4168 if (postcopy_running) {
4169 ret = ram_load_postcopy(f);
4170 } else {
4171 ret = ram_load_precopy(f);
4174 trace_ram_load_complete(ret, seq_iter);
4176 return ret;
4179 static bool ram_has_postcopy(void *opaque)
4181 RAMBlock *rb;
4182 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4183 if (ramblock_is_pmem(rb)) {
4184 info_report("Block: %s, host: %p is an nvdimm memory, postcopy "
4185 "is not supported now!", rb->idstr, rb->host);
4186 return false;
4190 return migrate_postcopy_ram();
4193 /* Sync all the dirty bitmaps with the destination VM. */
4194 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4196 RAMBlock *block;
4197 QEMUFile *file = s->to_dst_file;
4198 int ramblock_count = 0;
4200 trace_ram_dirty_bitmap_sync_start();
4202 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4203 qemu_savevm_send_recv_bitmap(file, block->idstr);
4204 trace_ram_dirty_bitmap_request(block->idstr);
4205 ramblock_count++;
4208 trace_ram_dirty_bitmap_sync_wait();
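/*
 * Each qemu_savevm_send_recv_bitmap() above is answered by the destination;
 * the return-path thread processes each reply in ram_dirty_bitmap_reload(),
 * which posts rp_sem once per ramblock (see
 * ram_dirty_bitmap_reload_notify() below).  That is what the loop below
 * waits for.
 */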
4210 /* Wait until all the ramblocks' dirty bitmaps are synced */
4211 while (ramblock_count--) {
4212 qemu_sem_wait(&s->rp_state.rp_sem);
4215 trace_ram_dirty_bitmap_sync_complete();
4217 return 0;
4220 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4222 qemu_sem_post(&s->rp_state.rp_sem);
4226 * Read the received bitmap and invert it to form the initial dirty bitmap.
4227 * This is only used when a postcopy migration is paused and we want
4228 * to resume it from the point where it stopped.
4230 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4232 int ret = -EINVAL;
4233 /* from_dst_file is always valid because we're within rp_thread */
4234 QEMUFile *file = s->rp_state.from_dst_file;
4235 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4236 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4237 uint64_t size, end_mark;
4239 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4241 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4242 error_report("%s: incorrect state %s", __func__,
4243 MigrationStatus_str(s->state));
4244 return -EINVAL;
4248 * Note: see the comments in ramblock_recv_bitmap_send() on why we
4249 * need the endianness conversion and the padding.
4251 local_size = ROUND_UP(local_size, 8);
4253 /* Add padding */
4254 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
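/*
 * Worked example of the sizing above (illustrative, 4 KiB target pages):
 * a 2 GiB ramblock has nbits = 524288, so local_size =
 * DIV_ROUND_UP(524288, 8) = 65536 bytes; a 4100-page block would give
 * 513 bytes, rounded up to 520 so the little-endian bitmap is always
 * transferred in whole 64-bit words, with bitmap_new() allocating one
 * extra long of slack for that padding.
 */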
4256 size = qemu_get_be64(file);
4258 /* The size of the bitmap should match that of our ramblock */
4259 if (size != local_size) {
4260 error_report("%s: ramblock '%s' bitmap size mismatch "
4261 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4262 block->idstr, size, local_size);
4263 ret = -EINVAL;
4264 goto out;
4267 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4268 end_mark = qemu_get_be64(file);
4270 ret = qemu_file_get_error(file);
4271 if (ret || size != local_size) {
4272 error_report("%s: read bitmap failed for ramblock '%s': %d"
4273 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4274 __func__, block->idstr, ret, local_size, size);
4275 ret = -EIO;
4276 goto out;
4279 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4280 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4281 __func__, block->idstr, end_mark);
4282 ret = -EINVAL;
4283 goto out;
4287 * Endianness conversion.  We are in the middle of postcopy (though
4288 * paused), so the dirty bitmap won't change and we can modify it directly.
4290 bitmap_from_le(block->bmap, le_bitmap, nbits);
4293 * What we received is the "received bitmap".  Invert it to form the
4294 * initial dirty bitmap for this ramblock.
4296 bitmap_complement(block->bmap, block->bmap, nbits);
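/*
 * Illustration: if the destination received pages 0 and 1 of a 4-page
 * block (received bitmap 0b0011), the complement 0b1100 marks pages 2 and
 * 3 dirty, so only the pages that never arrived get re-sent after the
 * postcopy recovery.
 */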
4298 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4299 ramblock_dirty_bitmap_clear_discarded_pages(block);
4301 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4302 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4305 * We succeeded in syncing the bitmap for the current ramblock.  If this
4306 * is the last one to sync, we need to notify the main send thread.
4308 ram_dirty_bitmap_reload_notify(s);
4310 ret = 0;
4311 out:
4312 g_free(le_bitmap);
4313 return ret;
4316 static int ram_resume_prepare(MigrationState *s, void *opaque)
4318 RAMState *rs = *(RAMState **)opaque;
4319 int ret;
4321 ret = ram_dirty_bitmap_sync_all(s, rs);
4322 if (ret) {
4323 return ret;
4326 ram_state_resume_prepare(rs, s->to_dst_file);
4328 return 0;
4331 static SaveVMHandlers savevm_ram_handlers = {
4332 .save_setup = ram_save_setup,
4333 .save_live_iterate = ram_save_iterate,
4334 .save_live_complete_postcopy = ram_save_complete,
4335 .save_live_complete_precopy = ram_save_complete,
4336 .has_postcopy = ram_has_postcopy,
4337 .save_live_pending = ram_save_pending,
4338 .load_state = ram_load,
4339 .save_cleanup = ram_save_cleanup,
4340 .load_setup = ram_load_setup,
4341 .load_cleanup = ram_load_cleanup,
4342 .resume_prepare = ram_resume_prepare,
4345 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4346 size_t old_size, size_t new_size)
4348 PostcopyState ps = postcopy_state_get();
4349 ram_addr_t offset;
4350 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4351 Error *err = NULL;
4353 if (ramblock_is_ignored(rb)) {
4354 return;
4357 if (!migration_is_idle()) {
4359 * Precopy code on the source cannot deal with the size of RAM blocks
4360 * changing at random points in time - especially after sending the
4361 * RAM block sizes in the migration stream, they must no longer change.
4362 * Abort and indicate a proper reason.
4364 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4365 migration_cancel(err);
4366 error_free(err);
4369 switch (ps) {
4370 case POSTCOPY_INCOMING_ADVISE:
4372 * Update what ram_postcopy_incoming_init()->init_range() does at the
4373 * time postcopy was advised. Syncing RAM blocks with the source will
4374 * result in RAM resizes.
4376 if (old_size < new_size) {
4377 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4378 error_report("RAM block '%s' discard of resized RAM failed",
4379 rb->idstr);
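/*
 * Track the grown size in postcopy_length so the incoming postcopy code
 * operates on the resized block rather than the size it saw when postcopy
 * was advised (see the comment above about init_range()).
 */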
4382 rb->postcopy_length = new_size;
4383 break;
4384 case POSTCOPY_INCOMING_NONE:
4385 case POSTCOPY_INCOMING_RUNNING:
4386 case POSTCOPY_INCOMING_END:
4388 * Once our guest is running, postcopy no longer cares about
4389 * resizes.  When growing, the new memory was not available on the
4390 * source, so no handler is needed.
4392 break;
4393 default:
4394 error_report("RAM block '%s' resized during postcopy state: %d",
4395 rb->idstr, ps);
4396 exit(-1);
4400 static RAMBlockNotifier ram_mig_ram_notifier = {
4401 .ram_block_resized = ram_mig_ram_block_resized,
4404 void ram_mig_init(void)
4406 qemu_mutex_init(&XBZRLE.lock);
4407 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4408 ram_block_notifier_add(&ram_mig_ram_notifier);
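/*
 * Note: the section version 4 registered with register_savevm_live() here
 * must match the check in ram_load() above; an incoming "ram" stream with
 * any other version is rejected with -EINVAL.
 */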