[qemu.git] / migration / ram.c
blob bb908822d57078c53814c01bd061e9388e44d2e3
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
63 /***********************************************************/
64 /* ram save/restore */
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
67 * worked for pages that were filled with the same char. We switched
68 * it to only search for the zero value, and renamed it to avoid
69 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
72 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO 0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE 0x08
76 #define RAM_SAVE_FLAG_EOS 0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE 0x40
79 /* 0x80 is reserved in migration.h; start with 0x100 next */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
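/*
 * For reference (illustrative sketch, not part of the original code): the
 * flags above are OR-ed into the low bits of the page offset before it is
 * written as the 8-byte header word, roughly:
 *
 *     ram_addr_t header = offset | RAM_SAVE_FLAG_PAGE;  // a normal page
 *     header |= RAM_SAVE_FLAG_CONTINUE;     // same block as the last page
 *     qemu_put_be64(f, header);
 *
 * This works because offsets are target-page aligned, so the low bits are
 * free to carry the flags; see save_page_header() below.
 */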
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
84 return buffer_is_zero(p, size);
87 XBZRLECacheStats xbzrle_counters;
89 /* this struct contains the XBZRLE cache and a static page
90 used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
105 static void XBZRLE_cache_lock(void)
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
112 static void XBZRLE_cache_unlock(void)
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
120 * xbzrle_cache_resize: resize the xbzrle cache
122 * This function is called from migrate_params_apply in the main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock.
127 * Returns 0 for success or -1 for error
129 * @new_size: new cache size
130 * @errp: set *errp with the reason if the check fails
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
134 PageCache *new_cache;
135 int64_t ret = 0;
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
149 XBZRLE_cache_lock();
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
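/*
 * Illustrative usage sketch (hypothetical caller, not upstream code):
 *
 *     Error *err = NULL;
 *     if (xbzrle_cache_resize(new_size, &err) < 0) {
 *         error_report_err(err);
 *     }
 */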
166 bool ramblock_is_ignored(RAMBlock *block)
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 #undef RAMBLOCK_FOREACH
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
176 RAMBlock *block;
177 int ret = 0;
179 RCU_READ_LOCK_GUARD();
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
187 return ret;
190 static void ramblock_recv_map_init(void)
192 RAMBlock *rb;
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
229 * Returns >0 if success with sent bytes, or <0 if error.
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
243 nbits = block->postcopy_length >> TARGET_PAGE_BITS;
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see the comment
248 * below). So extend it a bit beforehand.
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253 * Always use little endian when sending the bitmap. This is
254 * required so that migration works even when source and destination
255 * VMs do not use the same endianness. (Note: big endian won't work.)
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
268 size = ROUND_UP(size, 8);
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
273 * Mark the end, in case the middle part got corrupted for some
274 * "mysterious" reason.
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
279 g_free(le_bitmap);
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
285 return size + sizeof(size);
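/*
 * Resulting wire layout (for reference):
 *
 *     be64  size                    -- bitmap size in bytes, rounded up to 8
 *     u8[]  le_bitmap[size]         -- received-page bitmap, little endian
 *     be64  RAMBLOCK_RECV_BITMAP_ENDING
 *
 * The returned byte count covers the bitmap plus the leading size field.
 */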
289 * An outstanding page request, on the source, having been received
290 * and queued
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have dirty too many pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
332 /* compression statistics since the beginning of the period */
333 /* number of times there was no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* amount of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* amount of compressed pages */
338 uint64_t compress_pages_prev;
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 typedef struct RAMState RAMState;
356 static RAMState *ram_state;
358 static NotifierWithReturnList precopy_notifier_list;
360 void precopy_infrastructure_init(void)
362 notifier_with_return_list_init(&precopy_notifier_list);
365 void precopy_add_notifier(NotifierWithReturn *n)
367 notifier_with_return_list_add(&precopy_notifier_list, n);
370 void precopy_remove_notifier(NotifierWithReturn *n)
372 notifier_with_return_remove(n);
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 uint64_t ram_bytes_remaining(void)
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
390 MigrationStats ram_counters;
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
401 typedef struct PageSearchStatus PageSearchStatus;
403 CompressionStats compression_counters;
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
419 typedef struct CompressParam CompressParam;
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
431 typedef struct DecompressParam DecompressParam;
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437 * comp_done_lock is used together with comp_done_cond.
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
453 static void *do_data_compress(void *opaque)
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
482 qemu_mutex_unlock(&param->mutex);
484 return NULL;
487 static void compress_threads_save_cleanup(void)
489 int i, thread_count;
491 if (!migrate_use_compression() || !comp_param) {
492 return;
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
498 * we use it as an indicator of whether the thread is
499 * properly initialized or not
501 if (!comp_param[i].file) {
502 break;
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
526 static int compress_threads_save_setup(void)
528 int i, thread_count;
530 if (!migrate_use_compression()) {
531 return 0;
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops, false);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
562 return 0;
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
570 * save_page_header: write page header to wire
572 * If this is the 1st block, it also writes the block identification
574 * Returns the number of bytes written
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page;
579 * the lower bits contain flags
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
584 size_t size, len;
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
589 qemu_put_be64(f, offset);
590 size = 8;
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
599 return size;
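/*
 * Resulting header layout (for reference):
 *
 *     be64  offset | flags        -- always present
 *     u8    strlen(idstr)         -- only if RAM_SAVE_FLAG_CONTINUE is clear
 *     u8[]  idstr                 -- block name, not NUL terminated
 *
 * so a page in the same block as the previous one costs 8 header bytes,
 * otherwise 8 + 1 + strlen(idstr).
 */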
603 * mig_throttle_guest_down: throttle down the guest
605 * Reduce amount of guest cpu execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by Guest, which may
632 * make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
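/*
 * Worked example (illustrative): with the throttle currently at 40%,
 * cpu_now is 60. If bytes_dirty_period is twice bytes_dirty_threshold,
 * cpu_ideal is 60 * 0.5 = 30, so tailslow raises the throttle by
 * MIN(60 - 30, pct_increment) rather than by the full fixed increment.
 */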
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
656 if (!rs->xbzrle_enabled) {
657 return;
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
666 #define ENCODING_FLAG_XBZRLE 0x1
669 * save_xbzrle_page: compress and send current page
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
702 return -1;
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
738 *current_data = prev_cached_page;
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
766 return 1;
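/*
 * Wire layout of an XBZRLE page (for reference):
 *
 *     <page header with RAM_SAVE_FLAG_XBZRLE>
 *     u8    ENCODING_FLAG_XBZRLE
 *     be16  encoded_len
 *     u8[]  encoded_buf[encoded_len]
 */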
770 * migration_bitmap_find_dirty: find the next dirty page from start
772 * Returns the page offset within memory region of the start of a dirty page
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
785 if (ramblock_is_ignored(rb)) {
786 return size;
789 return find_next_bit(bitmap, size, start);
792 static void migration_clear_memory_region_dirty_bitmap(RAMBlock *rb,
793 unsigned long page)
795 uint8_t shift;
796 hwaddr size, start;
798 if (!rb->clear_bmap || !clear_bmap_test_and_clear(rb, page)) {
799 return;
802 shift = rb->clear_bmap_shift;
804 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
805 * can make things easier sometimes since then the start address
806 * of the small chunk will always be aligned to 64 pages, so the
807 * bitmap will always be aligned to an unsigned long. We should
808 * even be able to remove this restriction but I'm simply
809 * keeping it.
811 assert(shift >= 6);
813 size = 1ULL << (TARGET_PAGE_BITS + shift);
814 start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
815 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
816 memory_region_clear_dirty_bitmap(rb->mr, start, size);
819 static void
820 migration_clear_memory_region_dirty_bitmap_range(RAMBlock *rb,
821 unsigned long start,
822 unsigned long npages)
824 unsigned long i, chunk_pages = 1UL << rb->clear_bmap_shift;
825 unsigned long chunk_start = QEMU_ALIGN_DOWN(start, chunk_pages);
826 unsigned long chunk_end = QEMU_ALIGN_UP(start + npages, chunk_pages);
829 * Clear pages from start to start + npages - 1, so the end boundary is
830 * exclusive.
832 for (i = chunk_start; i < chunk_end; i += chunk_pages) {
833 migration_clear_memory_region_dirty_bitmap(rb, i);
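/*
 * Worked example (illustrative): with clear_bmap_shift == 18 and 4K
 * target pages, chunk_pages is 1 << 18, i.e. a 1GB chunk. Clearing pages
 * [start, start + npages) therefore issues at most one
 * memory_region_clear_dirty_bitmap() call per 1GB-aligned chunk that
 * overlaps the range (already-cleared chunks are skipped).
 */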
837 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
838 RAMBlock *rb,
839 unsigned long page)
841 bool ret;
844 * Clear the dirty bitmap if needed. This _must_ be called before we
845 * send any page in the chunk because we need to make sure we can
846 * capture further page content changes when we sync the dirty log
847 * the next time. So as long as we are going to send any page in
848 * the chunk we clear the remote dirty bitmap for all of them.
849 * Clearing it earlier won't be a problem, but clearing it too late will.
851 migration_clear_memory_region_dirty_bitmap(rb, page);
853 ret = test_and_clear_bit(page, rb->bmap);
854 if (ret) {
855 rs->migration_dirty_pages--;
858 return ret;
861 /* Called with RCU critical section */
862 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
864 uint64_t new_dirty_pages =
865 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
867 rs->migration_dirty_pages += new_dirty_pages;
868 rs->num_dirty_pages_period += new_dirty_pages;
872 * ram_pagesize_summary: calculate all the pagesizes of a VM
874 * Returns a summary bitmap of the page sizes of all RAMBlocks
876 * For VMs with just normal pages this is equivalent to the host page
877 * size. If it's got some huge pages then it's the OR of all the
878 * different page sizes.
880 uint64_t ram_pagesize_summary(void)
882 RAMBlock *block;
883 uint64_t summary = 0;
885 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
886 summary |= block->page_size;
889 return summary;
892 uint64_t ram_get_total_transferred_pages(void)
894 return ram_counters.normal + ram_counters.duplicate +
895 compression_counters.pages + xbzrle_counters.pages;
898 static void migration_update_rates(RAMState *rs, int64_t end_time)
900 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
901 double compressed_size;
903 /* calculate period counters */
904 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
905 / (end_time - rs->time_last_bitmap_sync);
907 if (!page_count) {
908 return;
911 if (migrate_use_xbzrle()) {
912 double encoded_size, unencoded_size;
914 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
915 rs->xbzrle_cache_miss_prev) / page_count;
916 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
917 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
918 TARGET_PAGE_SIZE;
919 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
920 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
921 xbzrle_counters.encoding_rate = 0;
922 } else {
923 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
925 rs->xbzrle_pages_prev = xbzrle_counters.pages;
926 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
929 if (migrate_use_compression()) {
930 compression_counters.busy_rate = (double)(compression_counters.busy -
931 rs->compress_thread_busy_prev) / page_count;
932 rs->compress_thread_busy_prev = compression_counters.busy;
934 compressed_size = compression_counters.compressed_size -
935 rs->compressed_size_prev;
936 if (compressed_size) {
937 double uncompressed_size = (compression_counters.pages -
938 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
940 /* Compression-Ratio = Uncompressed-size / Compressed-size */
941 compression_counters.compression_rate =
942 uncompressed_size / compressed_size;
944 rs->compress_pages_prev = compression_counters.pages;
945 rs->compressed_size_prev = compression_counters.compressed_size;
950 static void migration_trigger_throttle(RAMState *rs)
952 MigrationState *s = migrate_get_current();
953 uint64_t threshold = s->parameters.throttle_trigger_threshold;
955 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
956 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
957 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
959 /* During block migration the auto-converge logic incorrectly detects
960 * that ram migration makes no progress. Avoid this by disabling the
961 * throttling logic during the bulk phase of block migration. */
962 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
963 /* The following detection logic can be refined later. For now:
964 Check to see if the ratio between dirtied bytes and the approx.
965 amount of bytes that just got transferred since the last time
966 we were in this routine reaches the threshold. If that happens
967 twice, start or increase throttling. */
969 if ((bytes_dirty_period > bytes_dirty_threshold) &&
970 (++rs->dirty_rate_high_cnt >= 2)) {
971 trace_migration_throttle();
972 rs->dirty_rate_high_cnt = 0;
973 mig_throttle_guest_down(bytes_dirty_period,
974 bytes_dirty_threshold);
979 static void migration_bitmap_sync(RAMState *rs)
981 RAMBlock *block;
982 int64_t end_time;
984 ram_counters.dirty_sync_count++;
986 if (!rs->time_last_bitmap_sync) {
987 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
990 trace_migration_bitmap_sync_start();
991 memory_global_dirty_log_sync();
993 qemu_mutex_lock(&rs->bitmap_mutex);
994 WITH_RCU_READ_LOCK_GUARD() {
995 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
996 ramblock_sync_dirty_bitmap(rs, block);
998 ram_counters.remaining = ram_bytes_remaining();
1000 qemu_mutex_unlock(&rs->bitmap_mutex);
1002 memory_global_after_dirty_log_sync();
1003 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1005 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1007 /* more than 1 second = 1000 milliseconds */
1008 if (end_time > rs->time_last_bitmap_sync + 1000) {
1009 migration_trigger_throttle(rs);
1011 migration_update_rates(rs, end_time);
1013 rs->target_page_count_prev = rs->target_page_count;
1015 /* reset period counters */
1016 rs->time_last_bitmap_sync = end_time;
1017 rs->num_dirty_pages_period = 0;
1018 rs->bytes_xfer_prev = ram_counters.transferred;
1020 if (migrate_use_events()) {
1021 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1025 static void migration_bitmap_sync_precopy(RAMState *rs)
1027 Error *local_err = NULL;
1030 * The current notifier usage is just an optimization for migration, so we
1031 * don't stop the normal migration process in the error case.
1033 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1034 error_report_err(local_err);
1035 local_err = NULL;
1038 migration_bitmap_sync(rs);
1040 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1041 error_report_err(local_err);
1046 * save_zero_page_to_file: send the zero page to the file
1048 * Returns the size of data written to the file, 0 means the page is not
1049 * a zero page
1051 * @rs: current RAM state
1052 * @file: the file where the data is saved
1053 * @block: block that contains the page we want to send
1054 * @offset: offset inside the block for the page
1056 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1057 RAMBlock *block, ram_addr_t offset)
1059 uint8_t *p = block->host + offset;
1060 int len = 0;
1062 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1063 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1064 qemu_put_byte(file, 0);
1065 len += 1;
1067 return len;
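/*
 * A zero page on the wire (for reference) is just the page header with
 * RAM_SAVE_FLAG_ZERO followed by a single byte holding the fill value,
 * which is always 0 nowadays.
 */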
1071 * save_zero_page: send the zero page to the stream
1073 * Returns the number of pages written.
1075 * @rs: current RAM state
1076 * @block: block that contains the page we want to send
1077 * @offset: offset inside the block for the page
1079 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1081 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1083 if (len) {
1084 ram_counters.duplicate++;
1085 ram_counters.transferred += len;
1086 return 1;
1088 return -1;
1091 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1093 if (!migrate_release_ram() || !migration_in_postcopy()) {
1094 return;
1097 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1101 * @pages: the number of pages written by the control path,
1102 * < 0 - error
1103 * > 0 - number of pages written
1105 * Return true if the page has been saved, otherwise false is returned.
1107 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1108 int *pages)
1110 uint64_t bytes_xmit = 0;
1111 int ret;
1113 *pages = -1;
1114 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1115 &bytes_xmit);
1116 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1117 return false;
1120 if (bytes_xmit) {
1121 ram_counters.transferred += bytes_xmit;
1122 *pages = 1;
1125 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1126 return true;
1129 if (bytes_xmit > 0) {
1130 ram_counters.normal++;
1131 } else if (bytes_xmit == 0) {
1132 ram_counters.duplicate++;
1135 return true;
1139 * directly send the page to the stream
1141 * Returns the number of pages written.
1143 * @rs: current RAM state
1144 * @block: block that contains the page we want to send
1145 * @offset: offset inside the block for the page
1146 * @buf: the page to be sent
1147 * @async: send the page asynchronously
1149 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1150 uint8_t *buf, bool async)
1152 ram_counters.transferred += save_page_header(rs, rs->f, block,
1153 offset | RAM_SAVE_FLAG_PAGE);
1154 if (async) {
1155 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1156 migrate_release_ram() &
1157 migration_in_postcopy());
1158 } else {
1159 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1161 ram_counters.transferred += TARGET_PAGE_SIZE;
1162 ram_counters.normal++;
1163 return 1;
1167 * ram_save_page: send the given page to the stream
1169 * Returns the number of pages written.
1170 * < 0 - error
1171 * >=0 - Number of pages written - this might legally be 0
1172 * if xbzrle noticed the page was the same.
1174 * @rs: current RAM state
1175 * @block: block that contains the page we want to send
1176 * @offset: offset inside the block for the page
1177 * @last_stage: if we are at the completion stage
1179 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1181 int pages = -1;
1182 uint8_t *p;
1183 bool send_async = true;
1184 RAMBlock *block = pss->block;
1185 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1186 ram_addr_t current_addr = block->offset + offset;
1188 p = block->host + offset;
1189 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1191 XBZRLE_cache_lock();
1192 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1193 pages = save_xbzrle_page(rs, &p, current_addr, block,
1194 offset, last_stage);
1195 if (!last_stage) {
1196 /* Can't send this cached data async, since the cache page
1197 * might get updated before it gets to the wire
1199 send_async = false;
1203 /* XBZRLE overflow or normal page */
1204 if (pages == -1) {
1205 pages = save_normal_page(rs, block, offset, p, send_async);
1208 XBZRLE_cache_unlock();
1210 return pages;
1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1214 ram_addr_t offset)
1216 if (multifd_queue_page(rs->f, block, offset) < 0) {
1217 return -1;
1219 ram_counters.normal++;
1221 return 1;
1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1225 ram_addr_t offset, uint8_t *source_buf)
1227 RAMState *rs = ram_state;
1228 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1229 bool zero_page = false;
1230 int ret;
1232 if (save_zero_page_to_file(rs, f, block, offset)) {
1233 zero_page = true;
1234 goto exit;
1237 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1240 * copy it to an internal buffer to avoid it being modified by the VM
1241 * so that we can catch errors during compression and
1242 * decompression
1244 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1245 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1246 if (ret < 0) {
1247 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1248 error_report("compressed data failed!");
1249 return false;
1252 exit:
1253 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1254 return zero_page;
1257 static void
1258 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1260 ram_counters.transferred += bytes_xmit;
1262 if (param->zero_page) {
1263 ram_counters.duplicate++;
1264 return;
1267 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268 compression_counters.compressed_size += bytes_xmit - 8;
1269 compression_counters.pages++;
1272 static bool save_page_use_compression(RAMState *rs);
1274 static void flush_compressed_data(RAMState *rs)
1276 int idx, len, thread_count;
1278 if (!save_page_use_compression(rs)) {
1279 return;
1281 thread_count = migrate_compress_threads();
1283 qemu_mutex_lock(&comp_done_lock);
1284 for (idx = 0; idx < thread_count; idx++) {
1285 while (!comp_param[idx].done) {
1286 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1289 qemu_mutex_unlock(&comp_done_lock);
1291 for (idx = 0; idx < thread_count; idx++) {
1292 qemu_mutex_lock(&comp_param[idx].mutex);
1293 if (!comp_param[idx].quit) {
1294 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1296 * it's safe to fetch zero_page without holding comp_done_lock
1297 * as there is no further request submitted to the thread,
1298 * i.e., the thread should be waiting for a request at this point.
1300 update_compress_thread_counts(&comp_param[idx], len);
1302 qemu_mutex_unlock(&comp_param[idx].mutex);
1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1307 ram_addr_t offset)
1309 param->block = block;
1310 param->offset = offset;
1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1314 ram_addr_t offset)
1316 int idx, thread_count, bytes_xmit = -1, pages = -1;
1317 bool wait = migrate_compress_wait_thread();
1319 thread_count = migrate_compress_threads();
1320 qemu_mutex_lock(&comp_done_lock);
1321 retry:
1322 for (idx = 0; idx < thread_count; idx++) {
1323 if (comp_param[idx].done) {
1324 comp_param[idx].done = false;
1325 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1326 qemu_mutex_lock(&comp_param[idx].mutex);
1327 set_compress_params(&comp_param[idx], block, offset);
1328 qemu_cond_signal(&comp_param[idx].cond);
1329 qemu_mutex_unlock(&comp_param[idx].mutex);
1330 pages = 1;
1331 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1332 break;
1337 * wait for a free thread if the user specifies 'compress-wait-thread',
1338 * otherwise we will post the page out in the main thread as a normal page.
1340 if (pages < 0 && wait) {
1341 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1342 goto retry;
1344 qemu_mutex_unlock(&comp_done_lock);
1346 return pages;
1350 * find_dirty_block: find the next dirty page and update any state
1351 * associated with the search process.
1353 * Returns true if a page is found
1355 * @rs: current RAM state
1356 * @pss: data about the state of the current dirty page scan
1357 * @again: set to false if the search has scanned the whole of RAM
1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1361 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1362 if (pss->complete_round && pss->block == rs->last_seen_block &&
1363 pss->page >= rs->last_page) {
1365 * We've been once around the RAM and haven't found anything.
1366 * Give up.
1368 *again = false;
1369 return false;
1371 if (!offset_in_ramblock(pss->block,
1372 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS)) {
1373 /* Didn't find anything in this RAM Block */
1374 pss->page = 0;
1375 pss->block = QLIST_NEXT_RCU(pss->block, next);
1376 if (!pss->block) {
1378 * If memory migration starts over, we will meet a dirtied page
1379 * which may still exist in the compression threads' ring, so we
1380 * should flush the compressed data to make sure the new page
1381 * is not overwritten by the old one at the destination.
1383 * Also, if xbzrle is on, stop using data compression at this
1384 * point. In theory, xbzrle can do better than compression.
1386 flush_compressed_data(rs);
1388 /* Hit the end of the list */
1389 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1390 /* Flag that we've looped */
1391 pss->complete_round = true;
1392 /* After the first round, enable XBZRLE. */
1393 if (migrate_use_xbzrle()) {
1394 rs->xbzrle_enabled = true;
1397 /* Didn't find anything this time, but try again on the new block */
1398 *again = true;
1399 return false;
1400 } else {
1401 /* Can go around again, but... */
1402 *again = true;
1403 /* We've found something so probably don't need to */
1404 return true;
1409 * unqueue_page: gets a page off the queue
1411 * Helper for 'get_queued_page' - gets a page off the queue
1413 * Returns the block of the page (or NULL if none available)
1415 * @rs: current RAM state
1416 * @offset: used to return the offset within the RAMBlock
1418 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1420 RAMBlock *block = NULL;
1422 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1423 return NULL;
1426 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1427 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1428 struct RAMSrcPageRequest *entry =
1429 QSIMPLEQ_FIRST(&rs->src_page_requests);
1430 block = entry->rb;
1431 *offset = entry->offset;
1433 if (entry->len > TARGET_PAGE_SIZE) {
1434 entry->len -= TARGET_PAGE_SIZE;
1435 entry->offset += TARGET_PAGE_SIZE;
1436 } else {
1437 memory_region_unref(block->mr);
1438 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1439 g_free(entry);
1440 migration_consume_urgent_request();
1444 return block;
1447 #if defined(__linux__)
1449 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1450 * is found, return RAM block pointer and page offset
1452 * Returns pointer to the RAMBlock containing faulting page,
1453 * NULL if no write faults are pending
1455 * @rs: current RAM state
1456 * @offset: page offset from the beginning of the block
1458 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1460 struct uffd_msg uffd_msg;
1461 void *page_address;
1462 RAMBlock *block;
1463 int res;
1465 if (!migrate_background_snapshot()) {
1466 return NULL;
1469 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1470 if (res <= 0) {
1471 return NULL;
1474 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1475 block = qemu_ram_block_from_host(page_address, false, offset);
1476 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1477 return block;
1481 * ram_save_release_protection: release UFFD write protection after
1482 * a range of pages has been saved
1484 * @rs: current RAM state
1485 * @pss: page-search-status structure
1486 * @start_page: index of the first page in the range relative to pss->block
1488 * Returns 0 on success, negative value in case of an error
1490 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1491 unsigned long start_page)
1493 int res = 0;
1495 /* Check if page is from UFFD-managed region. */
1496 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1497 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1498 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1500 /* Flush async buffers before un-protect. */
1501 qemu_fflush(rs->f);
1502 /* Un-protect memory range. */
1503 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1504 false, false);
1507 return res;
1510 /* ram_write_tracking_available: check if kernel supports required UFFD features
1512 * Returns true if supported, false otherwise
1514 bool ram_write_tracking_available(void)
1516 uint64_t uffd_features;
1517 int res;
1519 res = uffd_query_features(&uffd_features);
1520 return (res == 0 &&
1521 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1524 /* ram_write_tracking_compatible: check if guest configuration is
1525 * compatible with 'write-tracking'
1527 * Returns true if compatible, false otherwise
1529 bool ram_write_tracking_compatible(void)
1531 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1532 int uffd_fd;
1533 RAMBlock *block;
1534 bool ret = false;
1536 /* Open UFFD file descriptor */
1537 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1538 if (uffd_fd < 0) {
1539 return false;
1542 RCU_READ_LOCK_GUARD();
1544 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1545 uint64_t uffd_ioctls;
1547 /* Nothing to do with read-only and MMIO-writable regions */
1548 if (block->mr->readonly || block->mr->rom_device) {
1549 continue;
1551 /* Try to register block memory via UFFD-IO to track writes */
1552 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1553 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1554 goto out;
1556 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1557 goto out;
1560 ret = true;
1562 out:
1563 uffd_close_fd(uffd_fd);
1564 return ret;
1568 * ram_block_populate_pages: populate memory in the RAM block by reading
1569 * an integer from the beginning of each page.
1571 * Since it's solely used for the userfault_fd WP feature, here we just
1572 * hardcode the page size to qemu_real_host_page_size.
1574 * @block: RAM block to populate
1576 static void ram_block_populate_pages(RAMBlock *block)
1578 char *ptr = (char *) block->host;
1580 for (ram_addr_t offset = 0; offset < block->used_length;
1581 offset += qemu_real_host_page_size) {
1582 char tmp = *(ptr + offset);
1584 /* Don't optimize the read out */
1585 asm volatile("" : "+r" (tmp));
1590 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1592 void ram_write_tracking_prepare(void)
1594 RAMBlock *block;
1596 RCU_READ_LOCK_GUARD();
1598 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1599 /* Nothing to do with read-only and MMIO-writable regions */
1600 if (block->mr->readonly || block->mr->rom_device) {
1601 continue;
1605 * Populate pages of the RAM block before enabling userfault_fd
1606 * write protection.
1608 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1609 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1610 * pages with pte_none() entries in page table.
1612 ram_block_populate_pages(block);
1617 * ram_write_tracking_start: start UFFD-WP memory tracking
1619 * Returns 0 for success or negative value in case of error
1621 int ram_write_tracking_start(void)
1623 int uffd_fd;
1624 RAMState *rs = ram_state;
1625 RAMBlock *block;
1627 /* Open UFFD file descriptor */
1628 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1629 if (uffd_fd < 0) {
1630 return uffd_fd;
1632 rs->uffdio_fd = uffd_fd;
1634 RCU_READ_LOCK_GUARD();
1636 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1637 /* Nothing to do with read-only and MMIO-writable regions */
1638 if (block->mr->readonly || block->mr->rom_device) {
1639 continue;
1642 /* Register block memory with UFFD to track writes */
1643 if (uffd_register_memory(rs->uffdio_fd, block->host,
1644 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1645 goto fail;
1647 /* Apply UFFD write protection to the block memory range */
1648 if (uffd_change_protection(rs->uffdio_fd, block->host,
1649 block->max_length, true, false)) {
1650 goto fail;
1652 block->flags |= RAM_UF_WRITEPROTECT;
1653 memory_region_ref(block->mr);
1655 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1656 block->host, block->max_length);
1659 return 0;
1661 fail:
1662 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1664 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1665 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1666 continue;
1669 * In case some memory block failed to be write-protected,
1670 * remove protection and unregister all RAM blocks that succeeded
1672 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1673 false, false);
1674 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1675 /* Cleanup flags and remove reference */
1676 block->flags &= ~RAM_UF_WRITEPROTECT;
1677 memory_region_unref(block->mr);
1680 uffd_close_fd(uffd_fd);
1681 rs->uffdio_fd = -1;
1682 return -1;
1686 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1688 void ram_write_tracking_stop(void)
1690 RAMState *rs = ram_state;
1691 RAMBlock *block;
1693 RCU_READ_LOCK_GUARD();
1695 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1696 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1697 continue;
1699 /* Remove protection and unregister all affected RAM blocks */
1700 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1701 false, false);
1702 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1704 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1705 block->host, block->max_length);
1707 /* Cleanup flags and remove reference */
1708 block->flags &= ~RAM_UF_WRITEPROTECT;
1709 memory_region_unref(block->mr);
1712 /* Finally close UFFD file descriptor */
1713 uffd_close_fd(rs->uffdio_fd);
1714 rs->uffdio_fd = -1;
1717 #else
1718 /* No target OS support, stubs just fail or ignore */
1720 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1722 (void) rs;
1723 (void) offset;
1725 return NULL;
1728 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1729 unsigned long start_page)
1731 (void) rs;
1732 (void) pss;
1733 (void) start_page;
1735 return 0;
1738 bool ram_write_tracking_available(void)
1740 return false;
1743 bool ram_write_tracking_compatible(void)
1745 assert(0);
1746 return false;
1749 int ram_write_tracking_start(void)
1751 assert(0);
1752 return -1;
1755 void ram_write_tracking_stop(void)
1757 assert(0);
1759 #endif /* defined(__linux__) */
1762 * get_queued_page: unqueue a page from the postcopy requests
1764 * Skips pages that are already sent (!dirty)
1766 * Returns true if a queued page is found
1768 * @rs: current RAM state
1769 * @pss: data about the state of the current dirty page scan
1771 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1773 RAMBlock *block;
1774 ram_addr_t offset;
1775 bool dirty;
1777 do {
1778 block = unqueue_page(rs, &offset);
1780 * We're sending this page, and since it's postcopy nothing else
1781 * will dirty it, and we must make sure it doesn't get sent again
1782 * even if this queue request was received after the background
1783 * search already sent it.
1785 if (block) {
1786 unsigned long page;
1788 page = offset >> TARGET_PAGE_BITS;
1789 dirty = test_bit(page, block->bmap);
1790 if (!dirty) {
1791 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1792 page);
1793 } else {
1794 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1798 } while (block && !dirty);
1800 if (!block) {
1802 * Poll write faults too if background snapshot is enabled; that's
1803 * when vCPUs may be blocked by write-protected pages.
1805 block = poll_fault_page(rs, &offset);
1808 if (block) {
1810 * We want the background search to continue from the queued page
1811 * since the guest is likely to want other pages near to the page
1812 * it just requested.
1814 pss->block = block;
1815 pss->page = offset >> TARGET_PAGE_BITS;
1818 * This unqueued page would break the "one round" check, even if it
1819 * is really rare.
1821 pss->complete_round = false;
1824 return !!block;
1828 * migration_page_queue_free: drop any remaining pages in the ram
1829 * request queue
1831 * It should be empty at the end anyway, but in error cases there may
1832 * be some left. In case any page is left, we drop it.
1835 static void migration_page_queue_free(RAMState *rs)
1837 struct RAMSrcPageRequest *mspr, *next_mspr;
1838 /* This queue generally should be empty - but in the case of a failed
1839 * migration might have some droppings in.
1841 RCU_READ_LOCK_GUARD();
1842 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1843 memory_region_unref(mspr->rb->mr);
1844 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1845 g_free(mspr);
1850 * ram_save_queue_pages: queue the page for transmission
1852 * A request from postcopy destination for example.
1854 * Returns zero on success or negative on error
1856 * @rbname: Name of the RAMBlock of the request. NULL means the
1857 * same as the last one.
1858 * @start: starting address from the start of the RAMBlock
1859 * @len: length (in bytes) to send
1861 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1863 RAMBlock *ramblock;
1864 RAMState *rs = ram_state;
1866 ram_counters.postcopy_requests++;
1867 RCU_READ_LOCK_GUARD();
1869 if (!rbname) {
1870 /* Reuse last RAMBlock */
1871 ramblock = rs->last_req_rb;
1873 if (!ramblock) {
1875 * Shouldn't happen, we can't reuse the last RAMBlock if
1876 * it's the 1st request.
1878 error_report("ram_save_queue_pages no previous block");
1879 return -1;
1881 } else {
1882 ramblock = qemu_ram_block_by_name(rbname);
1884 if (!ramblock) {
1885 /* We shouldn't be asked for a non-existent RAMBlock */
1886 error_report("ram_save_queue_pages no block '%s'", rbname);
1887 return -1;
1889 rs->last_req_rb = ramblock;
1891 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1892 if (!offset_in_ramblock(ramblock, start + len - 1)) {
1893 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1894 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1895 __func__, start, len, ramblock->used_length);
1896 return -1;
1899 struct RAMSrcPageRequest *new_entry =
1900 g_malloc0(sizeof(struct RAMSrcPageRequest));
1901 new_entry->rb = ramblock;
1902 new_entry->offset = start;
1903 new_entry->len = len;
1905 memory_region_ref(ramblock->mr);
1906 qemu_mutex_lock(&rs->src_page_req_mutex);
1907 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1908 migration_make_urgent_request();
1909 qemu_mutex_unlock(&rs->src_page_req_mutex);
1911 return 0;
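/*
 * Illustrative usage sketch (hypothetical caller; "pc.ram" is just an
 * example block name): a postcopy request for one target page at 'addr'
 * would be queued roughly as
 *
 *     ram_save_queue_pages("pc.ram", addr, TARGET_PAGE_SIZE);
 *
 * Passing rbname == NULL reuses the block of the previous request.
 */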
1914 static bool save_page_use_compression(RAMState *rs)
1916 if (!migrate_use_compression()) {
1917 return false;
1921 * If xbzrle is enabled (e.g., after first round of migration), stop
1922 * using the data compression. In theory, xbzrle can do better than
1923 * compression.
1925 if (rs->xbzrle_enabled) {
1926 return false;
1929 return true;
1933 * try to compress the page before posting it out, return true if the page
1934 * has been properly handled by compression, otherwise it needs other
1935 * paths to handle it
1937 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1939 if (!save_page_use_compression(rs)) {
1940 return false;
1944 * When starting the process of a new block, the first page of
1945 * the block should be sent out before other pages in the same
1946 * block, and all the pages in the last block should have been sent
1947 * out. Keeping this order is important, because the 'cont' flag
1948 * is used to avoid resending the block name.
1950 * We post the first page as a normal page as compression will take
1951 * a lot of CPU resources.
1953 if (block != rs->last_sent_block) {
1954 flush_compressed_data(rs);
1955 return false;
1958 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1959 return true;
1962 compression_counters.busy++;
1963 return false;
1967 * ram_save_target_page: save one target page
1969 * Returns the number of pages written
1971 * @rs: current RAM state
1972 * @pss: data about the page we want to send
1973 * @last_stage: if we are at the completion stage
1975 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1976 bool last_stage)
1978 RAMBlock *block = pss->block;
1979 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1980 int res;
1982 if (control_save_page(rs, block, offset, &res)) {
1983 return res;
1986 if (save_compress_page(rs, block, offset)) {
1987 return 1;
1990 res = save_zero_page(rs, block, offset);
1991 if (res > 0) {
1992 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1993 * page would be stale
1995 if (!save_page_use_compression(rs)) {
1996 XBZRLE_cache_lock();
1997 xbzrle_cache_zero_page(rs, block->offset + offset);
1998 XBZRLE_cache_unlock();
2000 ram_release_pages(block->idstr, offset, res);
2001 return res;
2005 * Do not use multifd for:
2006 * 1. Compression, as the first page in the new block should be posted out
2007 * before sending the compressed page
2008 * 2. Postcopy, as one whole host page should be placed
2010 if (!save_page_use_compression(rs) && migrate_use_multifd()
2011 && !migration_in_postcopy()) {
2012 return ram_save_multifd_page(rs, block, offset);
2015 return ram_save_page(rs, pss, last_stage);
2019 * ram_save_host_page: save a whole host page
2021 * Starting at *offset send pages up to the end of the current host
2022 * page. It's valid for the initial offset to point into the middle of
2023 * a host page in which case the remainder of the hostpage is sent.
2024 * Only dirty target pages are sent. Note that the host page size may
2025 * be a huge page for this block.
2026 * The saving stops at the boundary of the used_length of the block
2027 * if the RAMBlock isn't a multiple of the host page size.
2029 * Returns the number of pages written or negative on error
2031 * @rs: current RAM state
2032 * @ms: current migration state
2033 * @pss: data about the page we want to send
2034 * @last_stage: if we are at the completion stage
2036 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2037 bool last_stage)
2039 int tmppages, pages = 0;
2040 size_t pagesize_bits =
2041 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2042 unsigned long hostpage_boundary =
2043 QEMU_ALIGN_UP(pss->page + 1, pagesize_bits);
2044 unsigned long start_page = pss->page;
2045 int res;
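/*
 * The loop below walks the dirty target pages of one host page:
 * pagesize_bits is the number of target pages per host page and
 * hostpage_boundary is the first target-page index past the host
 * page that contains the starting pss->page.
 */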
2047 if (ramblock_is_ignored(pss->block)) {
2048 error_report("block %s should not be migrated !", pss->block->idstr);
2049 return 0;
2052 do {
2053 /* Check if the page is dirty and, if so, send it */
2054 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2055 tmppages = ram_save_target_page(rs, pss, last_stage);
2056 if (tmppages < 0) {
2057 return tmppages;
2060 pages += tmppages;
2062 * Allow rate limiting to happen in the middle of huge pages if
2063 * something is sent in the current iteration.
2065 if (pagesize_bits > 1 && tmppages > 0) {
2066 migration_rate_limit();
2069 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
2070 } while ((pss->page < hostpage_boundary) &&
2071 offset_in_ramblock(pss->block,
2072 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2073 /* The offset we leave with is the smaller of the host-page boundary and the block boundary */
2074 pss->page = MIN(pss->page, hostpage_boundary) - 1;
2076 res = ram_save_release_protection(rs, pss, start_page);
2077 return (res < 0 ? res : pages);
2081 * ram_find_and_save_block: finds a dirty page and sends it to f
2083 * Called within an RCU critical section.
2085 * Returns the number of pages written where zero means no dirty pages,
2086 * or negative on error
2088 * @rs: current RAM state
2089 * @last_stage: if we are at the completion stage
2091 * On systems where host-page-size > target-page-size it will send all the
2092 * pages in a host page that are dirty.
2095 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2097 PageSearchStatus pss;
2098 int pages = 0;
2099 bool again, found;
2101 /* No dirty page as there is zero RAM */
2102 if (!ram_bytes_total()) {
2103 return pages;
2106 pss.block = rs->last_seen_block;
2107 pss.page = rs->last_page;
2108 pss.complete_round = false;
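/*
 * Resume the search where the previous call left off; pages queued by
 * the destination (postcopy page requests) are serviced before the
 * linear scan for dirty pages.
 */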
2110 if (!pss.block) {
2111 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2114 do {
2115 again = true;
2116 found = get_queued_page(rs, &pss);
2118 if (!found) {
2119 /* priority queue empty, so just search for something dirty */
2120 found = find_dirty_block(rs, &pss, &again);
2123 if (found) {
2124 pages = ram_save_host_page(rs, &pss, last_stage);
2126 } while (!pages && again);
2128 rs->last_seen_block = pss.block;
2129 rs->last_page = pss.page;
2131 return pages;
2134 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2136 uint64_t pages = size / TARGET_PAGE_SIZE;
2138 if (zero) {
2139 ram_counters.duplicate += pages;
2140 } else {
2141 ram_counters.normal += pages;
2142 ram_counters.transferred += size;
2143 qemu_update_position(f, size);
2147 static uint64_t ram_bytes_total_common(bool count_ignored)
2149 RAMBlock *block;
2150 uint64_t total = 0;
2152 RCU_READ_LOCK_GUARD();
2154 if (count_ignored) {
2155 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2156 total += block->used_length;
2158 } else {
2159 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2160 total += block->used_length;
2163 return total;
2166 uint64_t ram_bytes_total(void)
2168 return ram_bytes_total_common(false);
2171 static void xbzrle_load_setup(void)
2173 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2176 static void xbzrle_load_cleanup(void)
2178 g_free(XBZRLE.decoded_buf);
2179 XBZRLE.decoded_buf = NULL;
2182 static void ram_state_cleanup(RAMState **rsp)
2184 if (*rsp) {
2185 migration_page_queue_free(*rsp);
2186 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2187 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2188 g_free(*rsp);
2189 *rsp = NULL;
2193 static void xbzrle_cleanup(void)
2195 XBZRLE_cache_lock();
2196 if (XBZRLE.cache) {
2197 cache_fini(XBZRLE.cache);
2198 g_free(XBZRLE.encoded_buf);
2199 g_free(XBZRLE.current_buf);
2200 g_free(XBZRLE.zero_target_page);
2201 XBZRLE.cache = NULL;
2202 XBZRLE.encoded_buf = NULL;
2203 XBZRLE.current_buf = NULL;
2204 XBZRLE.zero_target_page = NULL;
2206 XBZRLE_cache_unlock();
2209 static void ram_save_cleanup(void *opaque)
2211 RAMState **rsp = opaque;
2212 RAMBlock *block;
2214 /* We don't use dirty log with background snapshots */
2215 if (!migrate_background_snapshot()) {
2216 /* the caller holds the iothread lock or is in a bh, so there is
2217 * no write race against the migration bitmap
2219 memory_global_dirty_log_stop();
2222 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2223 g_free(block->clear_bmap);
2224 block->clear_bmap = NULL;
2225 g_free(block->bmap);
2226 block->bmap = NULL;
2229 xbzrle_cleanup();
2230 compress_threads_save_cleanup();
2231 ram_state_cleanup(rsp);
2234 static void ram_state_reset(RAMState *rs)
2236 rs->last_seen_block = NULL;
2237 rs->last_sent_block = NULL;
2238 rs->last_page = 0;
2239 rs->last_version = ram_list.version;
2240 rs->xbzrle_enabled = false;
2243 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2246 * 'expected' is the value you expect the bitmap mostly to be full
2247 * of; it won't bother printing lines that are all this value.
2248 * If 'todump' is null the migration bitmap is dumped.
2250 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2251 unsigned long pages)
2253 int64_t cur;
2254 int64_t linelen = 128;
2255 char linebuf[129];
2257 for (cur = 0; cur < pages; cur += linelen) {
2258 int64_t curb;
2259 bool found = false;
2261 * Last line; catch the case where the line length
2262 * is longer than remaining ram
2264 if (cur + linelen > pages) {
2265 linelen = pages - cur;
2267 for (curb = 0; curb < linelen; curb++) {
2268 bool thisbit = test_bit(cur + curb, todump);
2269 linebuf[curb] = thisbit ? '1' : '.';
2270 found = found || (thisbit != expected);
2272 if (found) {
2273 linebuf[curb] = '\0';
2274 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
2279 /* **** functions for postcopy ***** */
2281 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2283 struct RAMBlock *block;
2285 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2286 unsigned long *bitmap = block->bmap;
2287 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2288 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2290 while (run_start < range) {
2291 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2292 ram_discard_range(block->idstr,
2293 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2294 ((ram_addr_t)(run_end - run_start))
2295 << TARGET_PAGE_BITS);
2296 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2302 * postcopy_send_discard_bm_ram: discard a RAMBlock
2304 * Returns zero on success
2306 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2308 * @ms: current migration state
2309 * @block: RAMBlock to discard
2311 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2313 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2314 unsigned long current;
2315 unsigned long *bitmap = block->bmap;
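/*
 * Walk the dirty bitmap as runs of set bits; each run is sent to the
 * destination as a single (start page, length) discard range.
 */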
2317 for (current = 0; current < end; ) {
2318 unsigned long one = find_next_bit(bitmap, end, current);
2319 unsigned long zero, discard_length;
2321 if (one >= end) {
2322 break;
2325 zero = find_next_zero_bit(bitmap, end, one + 1);
2327 if (zero >= end) {
2328 discard_length = end - one;
2329 } else {
2330 discard_length = zero - one;
2332 postcopy_discard_send_range(ms, one, discard_length);
2333 current = one + discard_length;
2336 return 0;
2340 * postcopy_each_ram_send_discard: discard all RAMBlocks
2342 * Returns 0 for success or negative for error
2344 * Utility for the outgoing postcopy code.
2345 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2346 * passing it bitmap indexes and name.
2347 * (qemu_ram_foreach_block ends up passing unscaled lengths
2348 * which would mean postcopy code would have to deal with target page)
2350 * @ms: current migration state
2352 static int postcopy_each_ram_send_discard(MigrationState *ms)
2354 struct RAMBlock *block;
2355 int ret;
2357 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2358 postcopy_discard_send_init(ms, block->idstr);
2361 * Postcopy sends chunks of bitmap over the wire, but it
2362 * just needs indexes at this point, avoids it having
2363 * target page specific code.
2365 ret = postcopy_send_discard_bm_ram(ms, block);
2366 postcopy_discard_send_finish(ms);
2367 if (ret) {
2368 return ret;
2372 return 0;
2376 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2378 * Helper for postcopy_chunk_hostpages; it's called twice to
2379 * canonicalize the two bitmaps, that are similar, but one is
2380 * inverted.
2382 * Postcopy requires that all target pages in a hostpage are dirty or
2383 * clean, not a mix. This function canonicalizes the bitmaps.
2385 * @ms: current migration state
2386 * @block: block that contains the page we want to canonicalize
2388 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2390 RAMState *rs = ram_state;
2391 unsigned long *bitmap = block->bmap;
2392 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2393 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2394 unsigned long run_start;
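/*
 * host_ratio is the number of target pages per host page; any run of
 * dirty bits that starts or ends part-way through a host page gets the
 * whole host page re-marked as dirty below.
 */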
2396 if (block->page_size == TARGET_PAGE_SIZE) {
2397 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2398 return;
2401 /* Find a dirty page */
2402 run_start = find_next_bit(bitmap, pages, 0);
2404 while (run_start < pages) {
2407 * If the start of this run of pages is in the middle of a host
2408 * page, then we need to fixup this host page.
2410 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2411 /* Find the end of this run */
2412 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2414 * If the end isn't at the start of a host page, then the
2415 * run doesn't finish at the end of a host page
2416 * and we need to discard.
2420 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2421 unsigned long page;
2422 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2423 host_ratio);
2424 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2426 /* Clean up the bitmap */
2427 for (page = fixup_start_addr;
2428 page < fixup_start_addr + host_ratio; page++) {
2430 * Remark them as dirty, updating the count for any pages
2431 * that weren't previously dirty.
2433 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2437 /* Find the next dirty page for the next iteration */
2438 run_start = find_next_bit(bitmap, pages, run_start);
2443 * postcopy_chunk_hostpages: discard any partially sent host page
2445 * Utility for the outgoing postcopy code.
2447 * Discard any partially sent host-page size chunks, mark any partially
2448 * dirty host-page size chunks as all dirty. In this case the host-page
2449 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2451 * Returns zero on success
2453 * @ms: current migration state
2454 * @block: block we want to work with
2456 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2458 postcopy_discard_send_init(ms, block->idstr);
2461 * Ensure that all partially dirty host pages are made fully dirty.
2463 postcopy_chunk_hostpages_pass(ms, block);
2465 postcopy_discard_send_finish(ms);
2466 return 0;
2470 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2472 * Returns zero on success
2474 * Transmit the set of pages to be discarded after precopy to the target;
2475 * these are pages that:
2476 * a) Have been previously transmitted but are now dirty again
2477 * b) Pages that have never been transmitted, this ensures that
2478 * any pages on the destination that have been mapped by background
2479 * tasks get discarded (transparent huge pages are the specific concern)
2480 * Hopefully this is pretty sparse
2482 * @ms: current migration state
2484 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2486 RAMState *rs = ram_state;
2487 RAMBlock *block;
2488 int ret;
2490 RCU_READ_LOCK_GUARD();
2492 /* This should be our last sync, the src is now paused */
2493 migration_bitmap_sync(rs);
2495 /* Easiest way to make sure we don't resume in the middle of a host-page */
2496 rs->last_seen_block = NULL;
2497 rs->last_sent_block = NULL;
2498 rs->last_page = 0;
2500 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2501 /* Deal with TPS != HPS and huge pages */
2502 ret = postcopy_chunk_hostpages(ms, block);
2503 if (ret) {
2504 return ret;
2507 #ifdef DEBUG_POSTCOPY
2508 ram_debug_dump_bitmap(block->bmap, true,
2509 block->used_length >> TARGET_PAGE_BITS);
2510 #endif
2512 trace_ram_postcopy_send_discard_bitmap();
2514 return postcopy_each_ram_send_discard(ms);
2518 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2520 * Returns zero on success
2522 * @rbname: name of the RAMBlock of the request. NULL means the
2523 * same as the last one.
2524 * @start: RAMBlock starting page
2525 * @length: RAMBlock size
2527 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2529 trace_ram_discard_range(rbname, start, length);
2531 RCU_READ_LOCK_GUARD();
2532 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2534 if (!rb) {
2535 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2536 return -1;
2540 * On source VM, we don't need to update the received bitmap since
2541 * we don't even have one.
2543 if (rb->receivedmap) {
2544 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2545 length >> qemu_target_page_bits());
2548 return ram_block_discard_range(rb, start, length);
2552 * For every allocation, we try not to crash the VM if the
2553 * allocation fails.
2555 static int xbzrle_init(void)
2557 Error *local_err = NULL;
2559 if (!migrate_use_xbzrle()) {
2560 return 0;
2563 XBZRLE_cache_lock();
2565 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2566 if (!XBZRLE.zero_target_page) {
2567 error_report("%s: Error allocating zero page", __func__);
2568 goto err_out;
2571 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2572 TARGET_PAGE_SIZE, &local_err);
2573 if (!XBZRLE.cache) {
2574 error_report_err(local_err);
2575 goto free_zero_page;
2578 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2579 if (!XBZRLE.encoded_buf) {
2580 error_report("%s: Error allocating encoded_buf", __func__);
2581 goto free_cache;
2584 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2585 if (!XBZRLE.current_buf) {
2586 error_report("%s: Error allocating current_buf", __func__);
2587 goto free_encoded_buf;
2590 /* We are all good */
2591 XBZRLE_cache_unlock();
2592 return 0;
2594 free_encoded_buf:
2595 g_free(XBZRLE.encoded_buf);
2596 XBZRLE.encoded_buf = NULL;
2597 free_cache:
2598 cache_fini(XBZRLE.cache);
2599 XBZRLE.cache = NULL;
2600 free_zero_page:
2601 g_free(XBZRLE.zero_target_page);
2602 XBZRLE.zero_target_page = NULL;
2603 err_out:
2604 XBZRLE_cache_unlock();
2605 return -ENOMEM;
2608 static int ram_state_init(RAMState **rsp)
2610 *rsp = g_try_new0(RAMState, 1);
2612 if (!*rsp) {
2613 error_report("%s: Init ramstate fail", __func__);
2614 return -1;
2617 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2618 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2619 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2622 * Count the total number of pages used by ram blocks not including any
2623 * gaps due to alignment or unplugs.
2624 * This must match the initial value of the dirty bitmap.
2626 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2627 ram_state_reset(*rsp);
2629 return 0;
2632 static void ram_list_init_bitmaps(void)
2634 MigrationState *ms = migrate_get_current();
2635 RAMBlock *block;
2636 unsigned long pages;
2637 uint8_t shift;
2639 /* Skip setting bitmap if there is no RAM */
2640 if (ram_bytes_total()) {
2641 shift = ms->clear_bitmap_shift;
2642 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2643 error_report("clear_bitmap_shift (%u) too big, using "
2644 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2645 shift = CLEAR_BITMAP_SHIFT_MAX;
2646 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2647 error_report("clear_bitmap_shift (%u) too small, using "
2648 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2649 shift = CLEAR_BITMAP_SHIFT_MIN;
2652 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2653 pages = block->max_length >> TARGET_PAGE_BITS;
2655 * The initial dirty bitmap for migration must be set with all
2656 * ones to make sure we'll migrate every guest RAM page to
2657 * destination.
2658 * Here we set RAMBlock.bmap to all ones because, when restarting
2659 * a new migration after a failed one, ram_list.
2660 * dirty_memory[DIRTY_MEMORY_MIGRATION] may not include the whole
2661 * guest memory.
2663 block->bmap = bitmap_new(pages);
2664 bitmap_set(block->bmap, 0, pages);
2665 block->clear_bmap_shift = shift;
2666 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
2671 static void ram_init_bitmaps(RAMState *rs)
2673 /* For memory_global_dirty_log_start below. */
2674 qemu_mutex_lock_iothread();
2675 qemu_mutex_lock_ramlist();
2677 WITH_RCU_READ_LOCK_GUARD() {
2678 ram_list_init_bitmaps();
2679 /* We don't use dirty log with background snapshots */
2680 if (!migrate_background_snapshot()) {
2681 memory_global_dirty_log_start();
2682 migration_bitmap_sync_precopy(rs);
2685 qemu_mutex_unlock_ramlist();
2686 qemu_mutex_unlock_iothread();
2689 static int ram_init_all(RAMState **rsp)
2691 if (ram_state_init(rsp)) {
2692 return -1;
2695 if (xbzrle_init()) {
2696 ram_state_cleanup(rsp);
2697 return -1;
2700 ram_init_bitmaps(*rsp);
2702 return 0;
2705 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2707 RAMBlock *block;
2708 uint64_t pages = 0;
2711 * Postcopy is not using xbzrle/compression, so no need for that.
2712 * Also, since the source is already halted, we don't need to care
2713 * about dirty page logging either.
2716 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2717 pages += bitmap_count_one(block->bmap,
2718 block->used_length >> TARGET_PAGE_BITS);
2721 /* This may not be aligned with current bitmaps. Recalculate. */
2722 rs->migration_dirty_pages = pages;
2724 ram_state_reset(rs);
2726 /* Update RAMState cache of output QEMUFile */
2727 rs->f = out;
2729 trace_ram_state_resume_prepare(pages);
2733 * This function clears bits of the free pages reported by the caller from the
2734 * migration dirty bitmap. @addr is the host address corresponding to the
2735 * start of the continuous guest free pages, and @len is the total bytes of
2736 * those pages.
2738 void qemu_guest_free_page_hint(void *addr, size_t len)
2740 RAMBlock *block;
2741 ram_addr_t offset;
2742 size_t used_len, start, npages;
2743 MigrationState *s = migrate_get_current();
2745 /* This function is currently expected to be used during live migration */
2746 if (!migration_is_setup_or_active(s->state)) {
2747 return;
2750 for (; len > 0; len -= used_len, addr += used_len) {
2751 block = qemu_ram_block_from_host(addr, false, &offset);
2752 if (unlikely(!block || offset >= block->used_length)) {
2754 * The implementation might not support RAMBlock resize during
2755 * live migration, but it could happen in theory with future
2756 * updates. So we add a check here to capture that case.
2758 error_report_once("%s unexpected error", __func__);
2759 return;
2762 if (len <= block->used_length - offset) {
2763 used_len = len;
2764 } else {
2765 used_len = block->used_length - offset;
2768 start = offset >> TARGET_PAGE_BITS;
2769 npages = used_len >> TARGET_PAGE_BITS;
2771 qemu_mutex_lock(&ram_state->bitmap_mutex);
2773 * The skipped free pages are equivalent to having been sent, from clear_bmap's
2774 * perspective, so clear the bits from the memory region bitmap which
2775 * are initially set. Otherwise those skipped pages will be sent in
2776 * the next round after syncing from the memory region bitmap.
2778 migration_clear_memory_region_dirty_bitmap_range(block, start, npages);
2779 ram_state->migration_dirty_pages -=
2780 bitmap_count_one_with_offset(block->bmap, start, npages);
2781 bitmap_clear(block->bmap, start, npages);
2782 qemu_mutex_unlock(&ram_state->bitmap_mutex);
2787 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2788 * a long-running RCU critical section. When RCU reclaims in the code
2789 * start to become numerous, it will be necessary to reduce the
2790 * granularity of these critical sections.
2794 * ram_save_setup: Setup RAM for migration
2796 * Returns zero to indicate success and negative for error
2798 * @f: QEMUFile where to send the data
2799 * @opaque: RAMState pointer
2801 static int ram_save_setup(QEMUFile *f, void *opaque)
2803 RAMState **rsp = opaque;
2804 RAMBlock *block;
2806 if (compress_threads_save_setup()) {
2807 return -1;
2810 /* migration has already set up the bitmap, reuse it. */
2811 if (!migration_in_colo_state()) {
2812 if (ram_init_all(rsp) != 0) {
2813 compress_threads_save_cleanup();
2814 return -1;
2817 (*rsp)->f = f;
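/*
 * The setup section written below is: a be64 of the total RAM size with
 * RAM_SAVE_FLAG_MEM_SIZE or'ed in, then one (idstr, used_length) record
 * per migratable RAMBlock (plus the block's page size and GPA when the
 * relevant capabilities require them), and finally an EOS flag.
 */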
2819 WITH_RCU_READ_LOCK_GUARD() {
2820 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2822 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2823 qemu_put_byte(f, strlen(block->idstr));
2824 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2825 qemu_put_be64(f, block->used_length);
2826 if (migrate_postcopy_ram() && block->page_size !=
2827 qemu_host_page_size) {
2828 qemu_put_be64(f, block->page_size);
2830 if (migrate_ignore_shared()) {
2831 qemu_put_be64(f, block->mr->addr);
2836 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2837 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2839 multifd_send_sync_main(f);
2840 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2841 qemu_fflush(f);
2843 return 0;
2847 * ram_save_iterate: iterative stage for migration
2849 * Returns zero to indicate success and negative for error
2851 * @f: QEMUFile where to send the data
2852 * @opaque: RAMState pointer
2854 static int ram_save_iterate(QEMUFile *f, void *opaque)
2856 RAMState **temp = opaque;
2857 RAMState *rs = *temp;
2858 int ret = 0;
2859 int i;
2860 int64_t t0;
2861 int done = 0;
2863 if (blk_mig_bulk_active()) {
2864 /* Avoid transferring ram during bulk phase of block migration as
2865 * the bulk phase will usually take a long time and transferring
2866 * ram updates during that time is pointless. */
2867 goto out;
2871 * We'll hold this lock for a while, but it's okay for two reasons.
2872 * Firstly, the only other thread that can take it is the one calling
2873 * qemu_guest_free_page_hint(), which should be rare; secondly, see
2874 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
2875 * guarantees that we release it on a regular basis.
2877 qemu_mutex_lock(&rs->bitmap_mutex);
2878 WITH_RCU_READ_LOCK_GUARD() {
2879 if (ram_list.version != rs->last_version) {
2880 ram_state_reset(rs);
2883 /* Read version before ram_list.blocks */
2884 smp_rmb();
2886 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2888 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2889 i = 0;
2890 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2891 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2892 int pages;
2894 if (qemu_file_get_error(f)) {
2895 break;
2898 pages = ram_find_and_save_block(rs, false);
2899 /* no more pages to send */
2900 if (pages == 0) {
2901 done = 1;
2902 break;
2905 if (pages < 0) {
2906 qemu_file_set_error(f, pages);
2907 break;
2910 rs->target_page_count += pages;
2913 * During postcopy, it is necessary to make sure one whole host
2914 * page is sent in one chunk.
2916 if (migrate_postcopy_ram()) {
2917 flush_compressed_data(rs);
2921 * we want to check in the 1st loop, just in case it was the 1st
2922 * time and we had to sync the dirty bitmap.
2923 * qemu_clock_get_ns() is a bit expensive, so we only check every
2924 * few iterations
2926 if ((i & 63) == 0) {
2927 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2928 1000000;
2929 if (t1 > MAX_WAIT) {
2930 trace_ram_save_iterate_big_wait(t1, i);
2931 break;
2934 i++;
2937 qemu_mutex_unlock(&rs->bitmap_mutex);
2940 * Must occur before EOS (or any QEMUFile operation)
2941 * because of RDMA protocol.
2943 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2945 out:
2946 if (ret >= 0
2947 && migration_is_setup_or_active(migrate_get_current()->state)) {
2948 multifd_send_sync_main(rs->f);
2949 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2950 qemu_fflush(f);
2951 ram_counters.transferred += 8;
2953 ret = qemu_file_get_error(f);
2955 if (ret < 0) {
2956 return ret;
2959 return done;
2963 * ram_save_complete: function called to send the remaining amount of ram
2965 * Returns zero to indicate success or negative on error
2967 * Called with iothread lock
2969 * @f: QEMUFile where to send the data
2970 * @opaque: RAMState pointer
2972 static int ram_save_complete(QEMUFile *f, void *opaque)
2974 RAMState **temp = opaque;
2975 RAMState *rs = *temp;
2976 int ret = 0;
2978 WITH_RCU_READ_LOCK_GUARD() {
2979 if (!migration_in_postcopy()) {
2980 migration_bitmap_sync_precopy(rs);
2983 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2985 /* try transferring iterative blocks of memory */
2987 /* flush all remaining blocks regardless of rate limiting */
2988 while (true) {
2989 int pages;
2991 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2993 /* no more blocks to send */
2993 if (pages == 0) {
2994 break;
2996 if (pages < 0) {
2997 ret = pages;
2998 break;
3002 flush_compressed_data(rs);
3003 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
3006 if (ret >= 0) {
3007 multifd_send_sync_main(rs->f);
3008 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
3009 qemu_fflush(f);
3012 return ret;
3015 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
3016 uint64_t *res_precopy_only,
3017 uint64_t *res_compatible,
3018 uint64_t *res_postcopy_only)
3020 RAMState **temp = opaque;
3021 RAMState *rs = *temp;
3022 uint64_t remaining_size;
3024 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3026 if (!migration_in_postcopy() &&
3027 remaining_size < max_size) {
3028 qemu_mutex_lock_iothread();
3029 WITH_RCU_READ_LOCK_GUARD() {
3030 migration_bitmap_sync_precopy(rs);
3032 qemu_mutex_unlock_iothread();
3033 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
3036 if (migrate_postcopy_ram()) {
3037 /* We can do postcopy, and all the data is postcopiable */
3038 *res_compatible += remaining_size;
3039 } else {
3040 *res_precopy_only += remaining_size;
3044 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3046 unsigned int xh_len;
3047 int xh_flags;
3048 uint8_t *loaded_data;
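/*
 * An XBZRLE page on the wire is a one-byte encoding flag, a be16 length
 * and that many bytes of encoded data; the decoder applies it as a
 * delta on top of the existing contents of @host.
 */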
3050 /* extract RLE header */
3051 xh_flags = qemu_get_byte(f);
3052 xh_len = qemu_get_be16(f);
3054 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3055 error_report("Failed to load XBZRLE page - wrong compression!");
3056 return -1;
3059 if (xh_len > TARGET_PAGE_SIZE) {
3060 error_report("Failed to load XBZRLE page - len overflow!");
3061 return -1;
3063 loaded_data = XBZRLE.decoded_buf;
3064 /* load data and decode */
3065 /* it can change loaded_data to point to an internal buffer */
3066 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3068 /* decode RLE */
3069 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3070 TARGET_PAGE_SIZE) == -1) {
3071 error_report("Failed to load XBZRLE page - decode error!");
3072 return -1;
3075 return 0;
3079 * ram_block_from_stream: read a RAMBlock id from the migration stream
3081 * Must be called from within a rcu critical section.
3083 * Returns a pointer from within the RCU-protected ram_list.
3085 * @f: QEMUFile where to read the data from
3086 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3088 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3090 static RAMBlock *block;
3091 char id[256];
3092 uint8_t len;
3094 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3095 if (!block) {
3096 error_report("Ack, bad migration stream!");
3097 return NULL;
3099 return block;
3102 len = qemu_get_byte(f);
3103 qemu_get_buffer(f, (uint8_t *)id, len);
3104 id[len] = 0;
3106 block = qemu_ram_block_by_name(id);
3107 if (!block) {
3108 error_report("Can't find block %s", id);
3109 return NULL;
3112 if (ramblock_is_ignored(block)) {
3113 error_report("block %s should not be migrated !", id);
3114 return NULL;
3117 return block;
3120 static inline void *host_from_ram_block_offset(RAMBlock *block,
3121 ram_addr_t offset)
3123 if (!offset_in_ramblock(block, offset)) {
3124 return NULL;
3127 return block->host + offset;
3130 static void *host_page_from_ram_block_offset(RAMBlock *block,
3131 ram_addr_t offset)
3133 /* Note: Explicitly no check against offset_in_ramblock(). */
3134 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block->host + offset),
3135 block->page_size);
3138 static ram_addr_t host_page_offset_from_ram_block_offset(RAMBlock *block,
3139 ram_addr_t offset)
3141 return ((uintptr_t)block->host + offset) & (block->page_size - 1);
3144 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3145 ram_addr_t offset, bool record_bitmap)
3147 if (!offset_in_ramblock(block, offset)) {
3148 return NULL;
3150 if (!block->colo_cache) {
3151 error_report("%s: colo_cache is NULL in block :%s",
3152 __func__, block->idstr);
3153 return NULL;
3157 * During a COLO checkpoint, we need a bitmap of these migrated pages.
3158 * It helps us decide which pages in the ram cache should be flushed
3159 * into the VM's RAM later.
3161 if (record_bitmap &&
3162 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3163 ram_state->migration_dirty_pages++;
3165 return block->colo_cache + offset;
3169 * ram_handle_compressed: handle the zero page case
3171 * If a page (or a whole RDMA chunk) has been
3172 * determined to be zero, then zap it.
3174 * @host: host address for the zero page
3175 * @ch: what the page is filled from. We only support zero
3176 * @size: size of the zero page
3178 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3180 if (ch != 0 || !is_zero_range(host, size)) {
3181 memset(host, ch, size);
3185 /* return the size after decompression, or negative value on error */
3186 static int
3187 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3188 const uint8_t *source, size_t source_len)
3190 int err;
3192 err = inflateReset(stream);
3193 if (err != Z_OK) {
3194 return -1;
3197 stream->avail_in = source_len;
3198 stream->next_in = (uint8_t *)source;
3199 stream->avail_out = dest_len;
3200 stream->next_out = dest;
3202 err = inflate(stream, Z_NO_FLUSH);
3203 if (err != Z_STREAM_END) {
3204 return -1;
3207 return stream->total_out;
3210 static void *do_data_decompress(void *opaque)
3212 DecompressParam *param = opaque;
3213 unsigned long pagesize;
3214 uint8_t *des;
3215 int len, ret;
3217 qemu_mutex_lock(&param->mutex);
3218 while (!param->quit) {
3219 if (param->des) {
3220 des = param->des;
3221 len = param->len;
3222 param->des = 0;
3223 qemu_mutex_unlock(&param->mutex);
3225 pagesize = TARGET_PAGE_SIZE;
3227 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3228 param->compbuf, len);
3229 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3230 error_report("decompress data failed");
3231 qemu_file_set_error(decomp_file, ret);
3234 qemu_mutex_lock(&decomp_done_lock);
3235 param->done = true;
3236 qemu_cond_signal(&decomp_done_cond);
3237 qemu_mutex_unlock(&decomp_done_lock);
3239 qemu_mutex_lock(&param->mutex);
3240 } else {
3241 qemu_cond_wait(&param->cond, &param->mutex);
3244 qemu_mutex_unlock(&param->mutex);
3246 return NULL;
3249 static int wait_for_decompress_done(void)
3251 int idx, thread_count;
3253 if (!migrate_use_compression()) {
3254 return 0;
3257 thread_count = migrate_decompress_threads();
3258 qemu_mutex_lock(&decomp_done_lock);
3259 for (idx = 0; idx < thread_count; idx++) {
3260 while (!decomp_param[idx].done) {
3261 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3264 qemu_mutex_unlock(&decomp_done_lock);
3265 return qemu_file_get_error(decomp_file);
3268 static void compress_threads_load_cleanup(void)
3270 int i, thread_count;
3272 if (!migrate_use_compression()) {
3273 return;
3275 thread_count = migrate_decompress_threads();
3276 for (i = 0; i < thread_count; i++) {
3278 * we use it as an indicator of whether the thread has been
3279 * properly initialized or not
3281 if (!decomp_param[i].compbuf) {
3282 break;
3285 qemu_mutex_lock(&decomp_param[i].mutex);
3286 decomp_param[i].quit = true;
3287 qemu_cond_signal(&decomp_param[i].cond);
3288 qemu_mutex_unlock(&decomp_param[i].mutex);
3290 for (i = 0; i < thread_count; i++) {
3291 if (!decomp_param[i].compbuf) {
3292 break;
3295 qemu_thread_join(decompress_threads + i);
3296 qemu_mutex_destroy(&decomp_param[i].mutex);
3297 qemu_cond_destroy(&decomp_param[i].cond);
3298 inflateEnd(&decomp_param[i].stream);
3299 g_free(decomp_param[i].compbuf);
3300 decomp_param[i].compbuf = NULL;
3302 g_free(decompress_threads);
3303 g_free(decomp_param);
3304 decompress_threads = NULL;
3305 decomp_param = NULL;
3306 decomp_file = NULL;
3309 static int compress_threads_load_setup(QEMUFile *f)
3311 int i, thread_count;
3313 if (!migrate_use_compression()) {
3314 return 0;
3317 thread_count = migrate_decompress_threads();
3318 decompress_threads = g_new0(QemuThread, thread_count);
3319 decomp_param = g_new0(DecompressParam, thread_count);
3320 qemu_mutex_init(&decomp_done_lock);
3321 qemu_cond_init(&decomp_done_cond);
3322 decomp_file = f;
3323 for (i = 0; i < thread_count; i++) {
3324 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3325 goto exit;
3328 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3329 qemu_mutex_init(&decomp_param[i].mutex);
3330 qemu_cond_init(&decomp_param[i].cond);
3331 decomp_param[i].done = true;
3332 decomp_param[i].quit = false;
3333 qemu_thread_create(decompress_threads + i, "decompress",
3334 do_data_decompress, decomp_param + i,
3335 QEMU_THREAD_JOINABLE);
3337 return 0;
3338 exit:
3339 compress_threads_load_cleanup();
3340 return -1;
3343 static void decompress_data_with_multi_threads(QEMUFile *f,
3344 void *host, int len)
3346 int idx, thread_count;
3348 thread_count = migrate_decompress_threads();
3349 QEMU_LOCK_GUARD(&decomp_done_lock);
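/*
 * Find an idle decompression thread, hand it the compressed buffer and
 * wake it up; if all threads are busy, wait on decomp_done_cond until
 * one of them finishes.
 */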
3350 while (true) {
3351 for (idx = 0; idx < thread_count; idx++) {
3352 if (decomp_param[idx].done) {
3353 decomp_param[idx].done = false;
3354 qemu_mutex_lock(&decomp_param[idx].mutex);
3355 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3356 decomp_param[idx].des = host;
3357 decomp_param[idx].len = len;
3358 qemu_cond_signal(&decomp_param[idx].cond);
3359 qemu_mutex_unlock(&decomp_param[idx].mutex);
3360 break;
3363 if (idx < thread_count) {
3364 break;
3365 } else {
3366 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3371 static void colo_init_ram_state(void)
3373 ram_state_init(&ram_state);
3377 * COLO cache: this is for the secondary VM. We cache the whole
3378 * memory of the secondary VM; the global lock must be held
3379 * to call this helper.
3381 int colo_init_ram_cache(void)
3383 RAMBlock *block;
3385 WITH_RCU_READ_LOCK_GUARD() {
3386 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3387 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3388 NULL, false, false);
3389 if (!block->colo_cache) {
3390 error_report("%s: Can't alloc memory for COLO cache of block %s, "
3391 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3392 block->used_length);
3393 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3394 if (block->colo_cache) {
3395 qemu_anon_ram_free(block->colo_cache, block->used_length);
3396 block->colo_cache = NULL;
3399 return -errno;
3405 * Record the dirty pages sent by the PVM; we use this dirty bitmap
3406 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3407 * we use the same name 'ram_bitmap' as for migration.
3409 if (ram_bytes_total()) {
3410 RAMBlock *block;
3412 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3413 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3414 block->bmap = bitmap_new(pages);
3418 colo_init_ram_state();
3419 return 0;
3422 /* TODO: duplicated with ram_init_bitmaps */
3423 void colo_incoming_start_dirty_log(void)
3425 RAMBlock *block = NULL;
3426 /* For memory_global_dirty_log_start below. */
3427 qemu_mutex_lock_iothread();
3428 qemu_mutex_lock_ramlist();
3430 memory_global_dirty_log_sync();
3431 WITH_RCU_READ_LOCK_GUARD() {
3432 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3433 ramblock_sync_dirty_bitmap(ram_state, block);
3434 /* Discard this dirty bitmap record */
3435 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3437 memory_global_dirty_log_start();
3439 ram_state->migration_dirty_pages = 0;
3440 qemu_mutex_unlock_ramlist();
3441 qemu_mutex_unlock_iothread();
3444 /* The global lock must be held to call this helper */
3445 void colo_release_ram_cache(void)
3447 RAMBlock *block;
3449 memory_global_dirty_log_stop();
3450 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3451 g_free(block->bmap);
3452 block->bmap = NULL;
3455 WITH_RCU_READ_LOCK_GUARD() {
3456 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3457 if (block->colo_cache) {
3458 qemu_anon_ram_free(block->colo_cache, block->used_length);
3459 block->colo_cache = NULL;
3463 ram_state_cleanup(&ram_state);
3467 * ram_load_setup: Setup RAM for migration incoming side
3469 * Returns zero to indicate success and negative for error
3471 * @f: QEMUFile where to receive the data
3472 * @opaque: RAMState pointer
3474 static int ram_load_setup(QEMUFile *f, void *opaque)
3476 if (compress_threads_load_setup(f)) {
3477 return -1;
3480 xbzrle_load_setup();
3481 ramblock_recv_map_init();
3483 return 0;
3486 static int ram_load_cleanup(void *opaque)
3488 RAMBlock *rb;
3490 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3491 qemu_ram_block_writeback(rb);
3494 xbzrle_load_cleanup();
3495 compress_threads_load_cleanup();
3497 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3498 g_free(rb->receivedmap);
3499 rb->receivedmap = NULL;
3502 return 0;
3506 * ram_postcopy_incoming_init: allocate postcopy data structures
3508 * Returns 0 for success and negative if there was one error
3510 * @mis: current migration incoming state
3512 * Allocate data structures etc needed by incoming migration with
3513 * postcopy-ram. postcopy-ram's similarly named
3514 * postcopy_ram_incoming_init does the work.
3516 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3518 return postcopy_ram_incoming_init(mis);
3522 * ram_load_postcopy: load a page in postcopy case
3524 * Returns 0 for success or -errno in case of error
3526 * Called in postcopy mode by ram_load().
3527 * rcu_read_lock is taken prior to this being called.
3529 * @f: QEMUFile where to send the data
3531 static int ram_load_postcopy(QEMUFile *f)
3533 int flags = 0, ret = 0;
3534 bool place_needed = false;
3535 bool matches_target_page_size = false;
3536 MigrationIncomingState *mis = migration_incoming_get_current();
3537 /* Temporary page that is later 'placed' */
3538 void *postcopy_host_page = mis->postcopy_tmp_page;
3539 void *host_page = NULL;
3540 bool all_zero = true;
3541 int target_pages = 0;
3543 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3544 ram_addr_t addr;
3545 void *page_buffer = NULL;
3546 void *place_source = NULL;
3547 RAMBlock *block = NULL;
3548 uint8_t ch;
3549 int len;
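/*
 * Each chunk starts with a be64 header: the bits below the target page
 * size carry RAM_SAVE_FLAG_* flags and the remaining bits are the page
 * offset within its RAMBlock.
 */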
3551 addr = qemu_get_be64(f);
3554 * If there is a QEMUFile error, we should stop here; "addr"
3555 * may be invalid
3557 ret = qemu_file_get_error(f);
3558 if (ret) {
3559 break;
3562 flags = addr & ~TARGET_PAGE_MASK;
3563 addr &= TARGET_PAGE_MASK;
3565 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3566 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3567 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3568 block = ram_block_from_stream(f, flags);
3569 if (!block) {
3570 ret = -EINVAL;
3571 break;
3575 * Relying on used_length is racy and can result in false positives.
3576 * We might place pages beyond used_length in case RAM was shrunk
3577 * while in postcopy, which is fine - trying to place via
3578 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3580 if (!block->host || addr >= block->postcopy_length) {
3581 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3582 ret = -EINVAL;
3583 break;
3585 target_pages++;
3586 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3588 * Postcopy requires that we place whole host pages atomically;
3589 * these may be huge pages for RAMBlocks that are backed by
3590 * hugetlbfs.
3591 * To make it atomic, the data is read into a temporary page
3592 * that's moved into place later.
3593 * The migration protocol uses, possibly smaller, target pages;
3594 * however, the source ensures it always sends all the components
3595 * of a host page in one chunk.
3597 page_buffer = postcopy_host_page +
3598 host_page_offset_from_ram_block_offset(block, addr);
3599 /* If all target pages are zero then we can optimise the placement */
3600 if (target_pages == 1) {
3601 host_page = host_page_from_ram_block_offset(block, addr);
3602 } else if (host_page != host_page_from_ram_block_offset(block,
3603 addr)) {
3604 /* not the 1st TP within the HP */
3605 error_report("Non-same host page %p/%p", host_page,
3606 host_page_from_ram_block_offset(block, addr));
3607 ret = -EINVAL;
3608 break;
3612 * If it's the last part of a host page then we place the host
3613 * page
3615 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3616 place_needed = true;
3618 place_source = postcopy_host_page;
3621 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3622 case RAM_SAVE_FLAG_ZERO:
3623 ch = qemu_get_byte(f);
3625 * We can skip setting page_buffer when
3626 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3628 if (ch || !matches_target_page_size) {
3629 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3631 if (ch) {
3632 all_zero = false;
3634 break;
3636 case RAM_SAVE_FLAG_PAGE:
3637 all_zero = false;
3638 if (!matches_target_page_size) {
3639 /* For huge pages, we always use temporary buffer */
3640 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3641 } else {
3643 * For small pages that match the target page size, we
3644 * avoid the qemu_file copy. Instead we directly use
3645 * the buffer of QEMUFile to place the page. Note: we
3646 * cannot do any QEMUFile operation before using that
3647 * buffer to make sure the buffer is valid when
3648 * placing the page.
3650 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3651 TARGET_PAGE_SIZE);
3653 break;
3654 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3655 all_zero = false;
3656 len = qemu_get_be32(f);
3657 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3658 error_report("Invalid compressed data length: %d", len);
3659 ret = -EINVAL;
3660 break;
3662 decompress_data_with_multi_threads(f, page_buffer, len);
3663 break;
3665 case RAM_SAVE_FLAG_EOS:
3666 /* normal exit */
3667 multifd_recv_sync_main();
3668 break;
3669 default:
3670 error_report("Unknown combination of migration flags: 0x%x"
3671 " (postcopy mode)", flags);
3672 ret = -EINVAL;
3673 break;
3676 /* Got the whole host page, wait for decompress before placing. */
3677 if (place_needed) {
3678 ret |= wait_for_decompress_done();
3681 /* Detect for any possible file errors */
3682 if (!ret && qemu_file_get_error(f)) {
3683 ret = qemu_file_get_error(f);
3686 if (!ret && place_needed) {
3687 if (all_zero) {
3688 ret = postcopy_place_page_zero(mis, host_page, block);
3689 } else {
3690 ret = postcopy_place_page(mis, host_page, place_source,
3691 block);
3693 place_needed = false;
3694 target_pages = 0;
3695 /* Assume we have a zero page until we detect something different */
3696 all_zero = true;
3700 return ret;
3703 static bool postcopy_is_advised(void)
3705 PostcopyState ps = postcopy_state_get();
3706 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3709 static bool postcopy_is_running(void)
3711 PostcopyState ps = postcopy_state_get();
3712 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3716 * Flush content of RAM cache into SVM's memory.
3717 * Only flush the pages that were dirtied by the PVM, the SVM, or both.
3719 void colo_flush_ram_cache(void)
3721 RAMBlock *block = NULL;
3722 void *dst_host;
3723 void *src_host;
3724 unsigned long offset = 0;
3726 memory_global_dirty_log_sync();
3727 qemu_mutex_lock(&ram_state->bitmap_mutex);
3728 WITH_RCU_READ_LOCK_GUARD() {
3729 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3730 ramblock_sync_dirty_bitmap(ram_state, block);
3734 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3735 WITH_RCU_READ_LOCK_GUARD() {
3736 block = QLIST_FIRST_RCU(&ram_list.blocks);
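/*
 * Walk every RAMBlock's dirty bitmap; each dirty target page is copied
 * from block->colo_cache back into the SVM's memory at block->host.
 */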
3738 while (block) {
3739 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3741 if (!offset_in_ramblock(block,
3742 ((ram_addr_t)offset) << TARGET_PAGE_BITS)) {
3743 offset = 0;
3744 block = QLIST_NEXT_RCU(block, next);
3745 } else {
3746 migration_bitmap_clear_dirty(ram_state, block, offset);
3747 dst_host = block->host
3748 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3749 src_host = block->colo_cache
3750 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3751 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3755 trace_colo_flush_ram_cache_end();
3756 qemu_mutex_unlock(&ram_state->bitmap_mutex);
3760 * ram_load_precopy: load pages in precopy case
3762 * Returns 0 for success or -errno in case of error
3764 * Called in precopy mode by ram_load().
3765 * rcu_read_lock is taken prior to this being called.
3767 * @f: QEMUFile where to send the data
3769 static int ram_load_precopy(QEMUFile *f)
3771 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3772 /* ADVISE comes earlier; it shows the source has the postcopy capability enabled */
3773 bool postcopy_advised = postcopy_is_advised();
3774 if (!migrate_use_compression()) {
3775 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3778 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3779 ram_addr_t addr, total_ram_bytes;
3780 void *host = NULL, *host_bak = NULL;
3781 uint8_t ch;
3784 * Yield periodically to let the main loop run, but an iteration of
3785 * the main loop is expensive, so only do it every few iterations
3787 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3788 aio_co_schedule(qemu_get_current_aio_context(),
3789 qemu_coroutine_self());
3790 qemu_coroutine_yield();
3792 i++;
3794 addr = qemu_get_be64(f);
3795 flags = addr & ~TARGET_PAGE_MASK;
3796 addr &= TARGET_PAGE_MASK;
3798 if (flags & invalid_flags) {
3799 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3800 error_report("Received an unexpected compressed page");
3803 ret = -EINVAL;
3804 break;
3807 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3808 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3809 RAMBlock *block = ram_block_from_stream(f, flags);
3811 host = host_from_ram_block_offset(block, addr);
3813 * After entering the COLO stage, we should not load pages
3814 * into the SVM's memory directly; we put them into colo_cache first.
3815 * NOTE: We need to keep a copy of the SVM's RAM in colo_cache.
3816 * Previously, we copied all this memory in the COLO preparation stage
3817 * while the VM had to be stopped, which is time-consuming.
3818 * Here we optimize it by backing up every page during the migration
3819 * process while COLO is enabled. Although this affects migration
3820 * speed, it clearly reduces the downtime compared to backing up
3821 * all of the SVM's memory in the COLO preparation stage.
3823 if (migration_incoming_colo_enabled()) {
3824 if (migration_incoming_in_colo_state()) {
3825 /* In COLO stage, put all pages into cache temporarily */
3826 host = colo_cache_from_block_offset(block, addr, true);
3827 } else {
3829 * In the migration stage but before the COLO stage,
3830 * put all pages into both the cache and the SVM's memory.
3832 host_bak = colo_cache_from_block_offset(block, addr, false);
3835 if (!host) {
3836 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3837 ret = -EINVAL;
3838 break;
3840 if (!migration_incoming_in_colo_state()) {
3841 ramblock_recv_bitmap_set(block, host);
3844 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3847 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3848 case RAM_SAVE_FLAG_MEM_SIZE:
3849 /* Synchronize RAM block list */
3850 total_ram_bytes = addr;
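/*
 * For MEM_SIZE the header's address field carries the total RAM size;
 * it is followed by one (id string, be64 length) record per RAMBlock,
 * each checked against the local RAM layout below (plus a page size
 * and GPA when the relevant capabilities require them).
 */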
3851 while (!ret && total_ram_bytes) {
3852 RAMBlock *block;
3853 char id[256];
3854 ram_addr_t length;
3856 len = qemu_get_byte(f);
3857 qemu_get_buffer(f, (uint8_t *)id, len);
3858 id[len] = 0;
3859 length = qemu_get_be64(f);
3861 block = qemu_ram_block_by_name(id);
3862 if (block && !qemu_ram_is_migratable(block)) {
3863 error_report("block %s should not be migrated !", id);
3864 ret = -EINVAL;
3865 } else if (block) {
3866 if (length != block->used_length) {
3867 Error *local_err = NULL;
3869 ret = qemu_ram_resize(block, length,
3870 &local_err);
3871 if (local_err) {
3872 error_report_err(local_err);
3875 /* For postcopy we need to check hugepage sizes match */
3876 if (postcopy_advised && migrate_postcopy_ram() &&
3877 block->page_size != qemu_host_page_size) {
3878 uint64_t remote_page_size = qemu_get_be64(f);
3879 if (remote_page_size != block->page_size) {
3880 error_report("Mismatched RAM page size %s "
3881 "(local) %zd != %" PRId64,
3882 id, block->page_size,
3883 remote_page_size);
3884 ret = -EINVAL;
3887 if (migrate_ignore_shared()) {
3888 hwaddr addr = qemu_get_be64(f);
3889 if (ramblock_is_ignored(block) &&
3890 block->mr->addr != addr) {
3891 error_report("Mismatched GPAs for block %s "
3892 "%" PRId64 "!= %" PRId64,
3893 id, (uint64_t)addr,
3894 (uint64_t)block->mr->addr);
3895 ret = -EINVAL;
3898 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3899 block->idstr);
3900 } else {
3901 error_report("Unknown ramblock \"%s\", cannot "
3902 "accept migration", id);
3903 ret = -EINVAL;
3906 total_ram_bytes -= length;
3908 break;
3910 case RAM_SAVE_FLAG_ZERO:
3911 ch = qemu_get_byte(f);
3912 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3913 break;
3915 case RAM_SAVE_FLAG_PAGE:
3916 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3917 break;
3919 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3920 len = qemu_get_be32(f);
3921 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3922 error_report("Invalid compressed data length: %d", len);
3923 ret = -EINVAL;
3924 break;
3926 decompress_data_with_multi_threads(f, host, len);
3927 break;
3929 case RAM_SAVE_FLAG_XBZRLE:
3930 if (load_xbzrle(f, addr, host) < 0) {
3931 error_report("Failed to decompress XBZRLE page at "
3932 RAM_ADDR_FMT, addr);
3933 ret = -EINVAL;
3934 break;
3936 break;
3937 case RAM_SAVE_FLAG_EOS:
3938 /* normal exit */
3939 multifd_recv_sync_main();
3940 break;
3941 default:
3942 if (flags & RAM_SAVE_FLAG_HOOK) {
3943 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3944 } else {
3945 error_report("Unknown combination of migration flags: 0x%x",
3946 flags);
3947 ret = -EINVAL;
3950 if (!ret) {
3951 ret = qemu_file_get_error(f);
3953 if (!ret && host_bak) {
3954 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3958 ret |= wait_for_decompress_done();
3959 return ret;
3962 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3964 int ret = 0;
3965 static uint64_t seq_iter;
3967 * If the system is running in postcopy mode, page inserts into host memory
3968 * must be atomic
3970 bool postcopy_running = postcopy_is_running();
3972 seq_iter++;
3974 if (version_id != 4) {
3975 return -EINVAL;
3979 * This RCU critical section can be very long running.
3980 * When RCU reclaims in the code start to become numerous,
3981 * it will be necessary to reduce the granularity of this
3982 * critical section.
3984 WITH_RCU_READ_LOCK_GUARD() {
3985 if (postcopy_running) {
3986 ret = ram_load_postcopy(f);
3987 } else {
3988 ret = ram_load_precopy(f);
3991 trace_ram_load_complete(ret, seq_iter);
3993 return ret;
3996 static bool ram_has_postcopy(void *opaque)
3998 RAMBlock *rb;
3999 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
4000 if (ramblock_is_pmem(rb)) {
4001 info_report("Block: %s, host: %p is an nvdimm memory; postcopy "
4002 "is not supported now!", rb->idstr, rb->host);
4003 return false;
4007 return migrate_postcopy_ram();
4010 /* Sync all the dirty bitmaps with the destination VM. */
4011 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
4013 RAMBlock *block;
4014 QEMUFile *file = s->to_dst_file;
4015 int ramblock_count = 0;
4017 trace_ram_dirty_bitmap_sync_start();
4019 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
4020 qemu_savevm_send_recv_bitmap(file, block->idstr);
4021 trace_ram_dirty_bitmap_request(block->idstr);
4022 ramblock_count++;
4025 trace_ram_dirty_bitmap_sync_wait();
4027 /* Wait until all the ramblocks' dirty bitmap synced */
4028 while (ramblock_count--) {
4029 qemu_sem_wait(&s->rp_state.rp_sem);
4032 trace_ram_dirty_bitmap_sync_complete();
4034 return 0;
4037 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
4039 qemu_sem_post(&s->rp_state.rp_sem);
4043 * Read the received bitmap and invert it to form the initial dirty bitmap.
4044 * This is only used when the postcopy migration is paused but wants
4045 * to resume from a middle point.
4047 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
4049 int ret = -EINVAL;
4050 /* from_dst_file is always valid because we're within rp_thread */
4051 QEMUFile *file = s->rp_state.from_dst_file;
4052 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
4053 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
4054 uint64_t size, end_mark;
4056 trace_ram_dirty_bitmap_reload_begin(block->idstr);
4058 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
4059 error_report("%s: incorrect state %s", __func__,
4060 MigrationStatus_str(s->state));
4061 return -EINVAL;
4065 * Note: see comments in ramblock_recv_bitmap_send() on why we
4066 * need the endianness conversion, and the paddings.
4068 local_size = ROUND_UP(local_size, 8);
4070 /* Add paddings */
4071 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
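/*
 * The return path stream carries: a be64 size, the little-endian bitmap
 * itself (padded up to a multiple of 8 bytes), and a be64 end mark that
 * must equal RAMBLOCK_RECV_BITMAP_ENDING.
 */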
4073 size = qemu_get_be64(file);
4075 /* The size of the bitmap should match with our ramblock */
4076 if (size != local_size) {
4077 error_report("%s: ramblock '%s' bitmap size mismatch "
4078 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4079 block->idstr, size, local_size);
4080 ret = -EINVAL;
4081 goto out;
4084 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4085 end_mark = qemu_get_be64(file);
4087 ret = qemu_file_get_error(file);
4088 if (ret || size != local_size) {
4089 error_report("%s: read bitmap failed for ramblock '%s': %d"
4090 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4091 __func__, block->idstr, ret, local_size, size);
4092 ret = -EIO;
4093 goto out;
4096 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4097 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4098 __func__, block->idstr, end_mark);
4099 ret = -EINVAL;
4100 goto out;
4104 * Endianness conversion. We are in postcopy (though paused).
4105 * The dirty bitmap won't change. We can directly modify it.
4107 bitmap_from_le(block->bmap, le_bitmap, nbits);
4110 * What we received is the "received bitmap". Invert it to form the
4111 * initial dirty bitmap for this ramblock.
4113 bitmap_complement(block->bmap, block->bmap, nbits);
4115 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4118 * We succeeded in syncing the bitmap for the current ramblock. If this is
4119 * the last one to sync, we need to notify the main send thread.
4121 ram_dirty_bitmap_reload_notify(s);
4123 ret = 0;
4124 out:
4125 g_free(le_bitmap);
4126 return ret;
4129 static int ram_resume_prepare(MigrationState *s, void *opaque)
4131 RAMState *rs = *(RAMState **)opaque;
4132 int ret;
4134 ret = ram_dirty_bitmap_sync_all(s, rs);
4135 if (ret) {
4136 return ret;
4139 ram_state_resume_prepare(rs, s->to_dst_file);
4141 return 0;
4144 static SaveVMHandlers savevm_ram_handlers = {
4145 .save_setup = ram_save_setup,
4146 .save_live_iterate = ram_save_iterate,
4147 .save_live_complete_postcopy = ram_save_complete,
4148 .save_live_complete_precopy = ram_save_complete,
4149 .has_postcopy = ram_has_postcopy,
4150 .save_live_pending = ram_save_pending,
4151 .load_state = ram_load,
4152 .save_cleanup = ram_save_cleanup,
4153 .load_setup = ram_load_setup,
4154 .load_cleanup = ram_load_cleanup,
4155 .resume_prepare = ram_resume_prepare,
4158 static void ram_mig_ram_block_resized(RAMBlockNotifier *n, void *host,
4159 size_t old_size, size_t new_size)
4161 PostcopyState ps = postcopy_state_get();
4162 ram_addr_t offset;
4163 RAMBlock *rb = qemu_ram_block_from_host(host, false, &offset);
4164 Error *err = NULL;
4166 if (ramblock_is_ignored(rb)) {
4167 return;
4170 if (!migration_is_idle()) {
4172 * Precopy code on the source cannot deal with the size of RAM blocks
4173 * changing at random points in time - especially after sending the
4174 * RAM block sizes in the migration stream, they must no longer change.
4175 * Abort and indicate a proper reason.
4177 error_setg(&err, "RAM block '%s' resized during precopy.", rb->idstr);
4178 migrate_set_error(migrate_get_current(), err);
4179 error_free(err);
4180 migration_cancel();
4183 switch (ps) {
4184 case POSTCOPY_INCOMING_ADVISE:
4186 * Update what ram_postcopy_incoming_init()->init_range() does at the
4187 * time postcopy was advised. Syncing RAM blocks with the source will
4188 * result in RAM resizes.
4190 if (old_size < new_size) {
4191 if (ram_discard_range(rb->idstr, old_size, new_size - old_size)) {
4192 error_report("RAM block '%s' discard of resized RAM failed",
4193 rb->idstr);
4196 rb->postcopy_length = new_size;
4197 break;
4198 case POSTCOPY_INCOMING_NONE:
4199 case POSTCOPY_INCOMING_RUNNING:
4200 case POSTCOPY_INCOMING_END:
4202 * Once our guest is running, postcopy no longer cares about
4203 * resizes. When growing, the new memory was not available on the
4204 * source, so no handling is needed.
4206 break;
4207 default:
4208 error_report("RAM block '%s' resized during postcopy state: %d",
4209 rb->idstr, ps);
4210 exit(-1);
4214 static RAMBlockNotifier ram_mig_ram_notifier = {
4215 .ram_block_resized = ram_mig_ram_block_resized,
4218 void ram_mig_init(void)
4220 qemu_mutex_init(&XBZRLE.lock);
4221 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
4222 ram_block_notifier_add(&ram_mig_ram_notifier);