4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
53 #include "sysemu/cpu-throttle.h"
57 #include "sysemu/runstate.h"
59 #include "hw/boards.h" /* for machine_dump_guest_core() */
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that where filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SSAVE_FLAG_COMPRESS_PAGE just rename it.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
/* True iff the @size bytes starting at @p are all zero. */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}
89 XBZRLECacheStats xbzrle_counters
;
91 /* struct contains XBZRLE cache and a static page
92 used by the compression */
94 /* buffer used for XBZRLE encoding */
96 /* buffer for storing page content */
98 /* Cache for XBZRLE, Protected by lock. */
101 /* it will store a page full of zeros */
102 uint8_t *zero_target_page
;
103 /* buffer used for XBZRLE decoding */
104 uint8_t *decoded_buf
;
107 static void XBZRLE_cache_lock(void)
109 if (migrate_use_xbzrle()) {
110 qemu_mutex_lock(&XBZRLE
.lock
);
114 static void XBZRLE_cache_unlock(void)
116 if (migrate_use_xbzrle()) {
117 qemu_mutex_unlock(&XBZRLE
.lock
);
122 * xbzrle_cache_resize: resize the xbzrle cache
124 * This function is called from migrate_params_apply in main
125 * thread, possibly while a migration is in progress. A running
126 * migration may be using the cache and might finish during this call,
127 * hence changes to the cache are protected by XBZRLE.lock().
129 * Returns 0 for success or -1 for error
131 * @new_size: new cache size
132 * @errp: set *errp if the check failed, with reason
/*
 * NOTE(review): this excerpt looks truncated — the XBZRLE_cache_lock()
 * call and the early-return/error paths that should pair with the
 * trailing XBZRLE_cache_unlock() are not visible here; confirm against
 * the complete file.
 */
134 int xbzrle_cache_resize(uint64_t new_size
, Error
**errp
)
136 PageCache
*new_cache
;
139 /* Check for truncation */
140 if (new_size
!= (size_t)new_size
) {
141 error_setg(errp
, QERR_INVALID_PARAMETER_VALUE
, "cache size",
142 "exceeding address space");
/* Nothing to do when the requested size equals the current one. */
146 if (new_size
== migrate_xbzrle_cache_size()) {
/* An existing cache must be re-created at the new size. */
153 if (XBZRLE
.cache
!= NULL
) {
154 new_cache
= cache_init(new_size
, TARGET_PAGE_SIZE
, errp
);
/* Swap in the freshly allocated cache and release the old one. */
160 cache_fini(XBZRLE
.cache
);
161 XBZRLE
.cache
= new_cache
;
164 XBZRLE_cache_unlock();
168 bool ramblock_is_ignored(RAMBlock
*block
)
170 return !qemu_ram_is_migratable(block
) ||
171 (migrate_ignore_shared() && qemu_ram_is_shared(block
));
174 #undef RAMBLOCK_FOREACH
176 int foreach_not_ignored_block(RAMBlockIterFunc func
, void *opaque
)
181 RCU_READ_LOCK_GUARD();
183 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
184 ret
= func(block
, opaque
);
192 static void ramblock_recv_map_init(void)
196 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
197 assert(!rb
->receivedmap
);
198 rb
->receivedmap
= bitmap_new(rb
->max_length
>> qemu_target_page_bits());
202 int ramblock_recv_bitmap_test(RAMBlock
*rb
, void *host_addr
)
204 return test_bit(ramblock_recv_bitmap_offset(host_addr
, rb
),
208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock
*rb
, uint64_t byte_offset
)
210 return test_bit(byte_offset
>> TARGET_PAGE_BITS
, rb
->receivedmap
);
213 void ramblock_recv_bitmap_set(RAMBlock
*rb
, void *host_addr
)
215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr
, rb
), rb
->receivedmap
);
218 void ramblock_recv_bitmap_set_range(RAMBlock
*rb
, void *host_addr
,
221 bitmap_set_atomic(rb
->receivedmap
,
222 ramblock_recv_bitmap_offset(host_addr
, rb
),
226 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 * Returns >0 if success with sent bytes, or <0 if error.
/*
 * Send the receivedmap of @block_name on @file as:
 *   be64 size + little-endian bitmap bytes + be64 end marker.
 * Returns the payload size (>0) or a negative QEMUFile error.
 *
 * NOTE(review): the declaration of `size` and the error-path/cleanup
 * lines (e.g. freeing le_bitmap) are not visible in this excerpt —
 * confirm against the complete file.
 */
233 int64_t ramblock_recv_bitmap_send(QEMUFile
*file
,
234 const char *block_name
)
236 RAMBlock
*block
= qemu_ram_block_by_name(block_name
);
237 unsigned long *le_bitmap
, nbits
;
/* Unknown block name: report it (the surrounding check is elided). */
241 error_report("%s: invalid block name: %s", __func__
, block_name
);
/* One bit per target page of the block's postcopy-visible length. */
245 nbits
= block
->postcopy_length
>> TARGET_PAGE_BITS
;
248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249 * machines we may need 4 more bytes for padding (see below
250 * comment). So extend it a bit before hand.
252 le_bitmap
= bitmap_new(nbits
+ BITS_PER_LONG
);
255 * Always use little endian when sending the bitmap. This is
256 * required that when source and destination VMs are not using the
257 * same endianness. (Note: big endian won't work.)
259 bitmap_to_le(le_bitmap
, block
->receivedmap
, nbits
);
261 /* Size of the bitmap, in bytes */
262 size
= DIV_ROUND_UP(nbits
, 8);
265 * size is always aligned to 8 bytes for 64bit machines, but it
266 * may not be true for 32bit machines. We need this padding to
267 * make sure the migration can survive even between 32bit and
270 size
= ROUND_UP(size
, 8);
/* Wire format: size first, then the padded little-endian bitmap. */
272 qemu_put_be64(file
, size
);
273 qemu_put_buffer(file
, (const uint8_t *)le_bitmap
, size
);
275 * Mark as an end, in case the middle part is screwed up due to
276 * some "mysterious" reason.
278 qemu_put_be64(file
, RAMBLOCK_RECV_BITMAP_ENDING
);
/* Propagate any stream error; otherwise report the total bytes sent. */
283 if (qemu_file_get_error(file
)) {
284 return qemu_file_get_error(file
);
287 return size
+ sizeof(size
);
291 * An outstanding page request, on the source, having been received
294 struct RAMSrcPageRequest
{
299 QSIMPLEQ_ENTRY(RAMSrcPageRequest
) next_req
;
302 /* State of RAM for migration */
304 /* QEMUFile used for this migration */
306 /* UFFD file descriptor, used in 'write-tracking' migration */
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock
*last_seen_block
;
310 /* Last block from where we have sent data */
311 RAMBlock
*last_sent_block
;
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page
;
314 /* last ram version we have seen */
315 uint32_t last_version
;
316 /* How many times we have dirty too many pages */
317 int dirty_rate_high_cnt
;
318 /* these variables are used for bitmap sync */
319 /* last time we did a full bitmap_sync */
320 int64_t time_last_bitmap_sync
;
321 /* bytes transferred at start_time */
322 uint64_t bytes_xfer_prev
;
323 /* number of dirty pages since start_time */
324 uint64_t num_dirty_pages_period
;
325 /* xbzrle misses since the beginning of the period */
326 uint64_t xbzrle_cache_miss_prev
;
327 /* Amount of xbzrle pages since the beginning of the period */
328 uint64_t xbzrle_pages_prev
;
329 /* Amount of xbzrle encoded bytes since the beginning of the period */
330 uint64_t xbzrle_bytes_prev
;
331 /* Start using XBZRLE (e.g., after the first round). */
334 /* compression statistics since the beginning of the period */
335 /* amount of count that no free thread to compress data */
336 uint64_t compress_thread_busy_prev
;
337 /* amount bytes after compression */
338 uint64_t compressed_size_prev
;
339 /* amount of compressed pages */
340 uint64_t compress_pages_prev
;
342 /* total handled target pages at the beginning of period */
343 uint64_t target_page_count_prev
;
344 /* total handled target pages since start */
345 uint64_t target_page_count
;
346 /* number of dirty bits in the bitmap */
347 uint64_t migration_dirty_pages
;
348 /* Protects modification of the bitmap and migration dirty pages */
349 QemuMutex bitmap_mutex
;
350 /* The RAMBlock used in the last src_page_requests */
351 RAMBlock
*last_req_rb
;
352 /* Queue of outstanding page requests from the destination */
353 QemuMutex src_page_req_mutex
;
354 QSIMPLEQ_HEAD(, RAMSrcPageRequest
) src_page_requests
;
356 typedef struct RAMState RAMState
;
358 static RAMState
*ram_state
;
360 static NotifierWithReturnList precopy_notifier_list
;
362 void precopy_infrastructure_init(void)
364 notifier_with_return_list_init(&precopy_notifier_list
);
367 void precopy_add_notifier(NotifierWithReturn
*n
)
369 notifier_with_return_list_add(&precopy_notifier_list
, n
);
372 void precopy_remove_notifier(NotifierWithReturn
*n
)
374 notifier_with_return_remove(n
);
/*
 * Run the registered precopy notifiers for @reason and return the
 * result of the list notification.
 *
 * NOTE(review): the lines initialising pnd (presumably pnd.reason and
 * pnd.errp) are not visible in this excerpt — confirm pnd is filled in
 * before the notify call in the complete file.
 */
377 int precopy_notify(PrecopyNotifyReason reason
, Error
**errp
)
379 PrecopyNotifyData pnd
;
383 return notifier_with_return_list_notify(&precopy_notifier_list
, &pnd
);
386 uint64_t ram_bytes_remaining(void)
388 return ram_state
? (ram_state
->migration_dirty_pages
* TARGET_PAGE_SIZE
) :
392 MigrationStats ram_counters
;
394 /* used by the search for pages to send */
395 struct PageSearchStatus
{
396 /* Current block being searched */
398 /* Current page to search from */
400 /* Set once we wrap around */
403 typedef struct PageSearchStatus PageSearchStatus
;
405 CompressionStats compression_counters
;
407 struct CompressParam
{
417 /* internally used fields */
421 typedef struct CompressParam CompressParam
;
423 struct DecompressParam
{
433 typedef struct DecompressParam DecompressParam
;
435 static CompressParam
*comp_param
;
436 static QemuThread
*compress_threads
;
437 /* comp_done_cond is used to wake up the migration thread when
438 * one of the compression threads has finished the compression.
439 * comp_done_lock is used to co-work with comp_done_cond.
441 static QemuMutex comp_done_lock
;
442 static QemuCond comp_done_cond
;
443 /* The empty QEMUFileOps will be used by file in CompressParam */
444 static const QEMUFileOps empty_ops
= { };
446 static QEMUFile
*decomp_file
;
447 static DecompressParam
*decomp_param
;
448 static QemuThread
*decompress_threads
;
449 static QemuMutex decomp_done_lock
;
450 static QemuCond decomp_done_cond
;
452 static bool do_compress_ram_page(QEMUFile
*f
, z_stream
*stream
, RAMBlock
*block
,
453 ram_addr_t offset
, uint8_t *source_buf
);
455 static void *do_data_compress(void *opaque
)
457 CompressParam
*param
= opaque
;
462 qemu_mutex_lock(¶m
->mutex
);
463 while (!param
->quit
) {
465 block
= param
->block
;
466 offset
= param
->offset
;
468 qemu_mutex_unlock(¶m
->mutex
);
470 zero_page
= do_compress_ram_page(param
->file
, ¶m
->stream
,
471 block
, offset
, param
->originbuf
);
473 qemu_mutex_lock(&comp_done_lock
);
475 param
->zero_page
= zero_page
;
476 qemu_cond_signal(&comp_done_cond
);
477 qemu_mutex_unlock(&comp_done_lock
);
479 qemu_mutex_lock(¶m
->mutex
);
481 qemu_cond_wait(¶m
->cond
, ¶m
->mutex
);
484 qemu_mutex_unlock(¶m
->mutex
);
489 static void compress_threads_save_cleanup(void)
493 if (!migrate_use_compression() || !comp_param
) {
497 thread_count
= migrate_compress_threads();
498 for (i
= 0; i
< thread_count
; i
++) {
500 * we use it as a indicator which shows if the thread is
501 * properly init'd or not
503 if (!comp_param
[i
].file
) {
507 qemu_mutex_lock(&comp_param
[i
].mutex
);
508 comp_param
[i
].quit
= true;
509 qemu_cond_signal(&comp_param
[i
].cond
);
510 qemu_mutex_unlock(&comp_param
[i
].mutex
);
512 qemu_thread_join(compress_threads
+ i
);
513 qemu_mutex_destroy(&comp_param
[i
].mutex
);
514 qemu_cond_destroy(&comp_param
[i
].cond
);
515 deflateEnd(&comp_param
[i
].stream
);
516 g_free(comp_param
[i
].originbuf
);
517 qemu_fclose(comp_param
[i
].file
);
518 comp_param
[i
].file
= NULL
;
520 qemu_mutex_destroy(&comp_done_lock
);
521 qemu_cond_destroy(&comp_done_cond
);
522 g_free(compress_threads
);
524 compress_threads
= NULL
;
528 static int compress_threads_save_setup(void)
532 if (!migrate_use_compression()) {
535 thread_count
= migrate_compress_threads();
536 compress_threads
= g_new0(QemuThread
, thread_count
);
537 comp_param
= g_new0(CompressParam
, thread_count
);
538 qemu_cond_init(&comp_done_cond
);
539 qemu_mutex_init(&comp_done_lock
);
540 for (i
= 0; i
< thread_count
; i
++) {
541 comp_param
[i
].originbuf
= g_try_malloc(TARGET_PAGE_SIZE
);
542 if (!comp_param
[i
].originbuf
) {
546 if (deflateInit(&comp_param
[i
].stream
,
547 migrate_compress_level()) != Z_OK
) {
548 g_free(comp_param
[i
].originbuf
);
552 /* comp_param[i].file is just used as a dummy buffer to save data,
553 * set its ops to empty.
555 comp_param
[i
].file
= qemu_fopen_ops(NULL
, &empty_ops
, false);
556 comp_param
[i
].done
= true;
557 comp_param
[i
].quit
= false;
558 qemu_mutex_init(&comp_param
[i
].mutex
);
559 qemu_cond_init(&comp_param
[i
].cond
);
560 qemu_thread_create(compress_threads
+ i
, "compress",
561 do_data_compress
, comp_param
+ i
,
562 QEMU_THREAD_JOINABLE
);
567 compress_threads_save_cleanup();
572 * save_page_header: write page header to wire
574 * If this is the 1st block, it also writes the block identification
576 * Returns the number of bytes written
578 * @f: QEMUFile where to send the data
579 * @block: block that contains the page we want to send
580 * @offset: offset inside the block for the page
581 * in the lower bits, it contains flags
/*
 * Write the page header for @offset of @block to @f.  When the page
 * continues the previously sent block, RAM_SAVE_FLAG_CONTINUE is set
 * in the offset word so the block name can be omitted.
 *
 * NOTE(review): the declarations of `len`/`size` and the final return
 * are not visible in this excerpt — confirm against the complete file.
 */
583 static size_t save_page_header(RAMState
*rs
, QEMUFile
*f
, RAMBlock
*block
,
/* Same block as the last page sent: mark the stream as continuing. */
588 if (block
== rs
->last_sent_block
) {
589 offset
|= RAM_SAVE_FLAG_CONTINUE
;
591 qemu_put_be64(f
, offset
);
/* First page of a new block: send the block id (length + bytes). */
594 if (!(offset
& RAM_SAVE_FLAG_CONTINUE
)) {
595 len
= strlen(block
->idstr
);
596 qemu_put_byte(f
, len
);
597 qemu_put_buffer(f
, (uint8_t *)block
->idstr
, len
);
/* Remember the block so the next page from it can use CONTINUE. */
599 rs
->last_sent_block
= block
;
605 * mig_throttle_guest_down: throttle down the guest
607 * Reduce amount of guest cpu execution to hopefully slow down memory
608 * writes. If guest dirty memory rate is reduced below the rate at
609 * which we can transfer pages to the destination then we should be
610 * able to complete migration. Some workloads dirty memory way too
611 * fast and will not effectively converge, even with auto-converge.
613 static void mig_throttle_guest_down(uint64_t bytes_dirty_period
,
614 uint64_t bytes_dirty_threshold
)
616 MigrationState
*s
= migrate_get_current();
617 uint64_t pct_initial
= s
->parameters
.cpu_throttle_initial
;
618 uint64_t pct_increment
= s
->parameters
.cpu_throttle_increment
;
619 bool pct_tailslow
= s
->parameters
.cpu_throttle_tailslow
;
620 int pct_max
= s
->parameters
.max_cpu_throttle
;
622 uint64_t throttle_now
= cpu_throttle_get_percentage();
623 uint64_t cpu_now
, cpu_ideal
, throttle_inc
;
625 /* We have not started throttling yet. Let's start it. */
626 if (!cpu_throttle_active()) {
627 cpu_throttle_set(pct_initial
);
629 /* Throttling already on, just increase the rate */
631 throttle_inc
= pct_increment
;
633 /* Compute the ideal CPU percentage used by Guest, which may
634 * make the dirty rate match the dirty rate threshold. */
635 cpu_now
= 100 - throttle_now
;
636 cpu_ideal
= cpu_now
* (bytes_dirty_threshold
* 1.0 /
638 throttle_inc
= MIN(cpu_now
- cpu_ideal
, pct_increment
);
640 cpu_throttle_set(MIN(throttle_now
+ throttle_inc
, pct_max
));
645 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
647 * @rs: current RAM state
648 * @current_addr: address for the zero page
650 * Update the xbzrle cache to reflect a page that's been sent as all 0.
651 * The important thing is that a stale (not-yet-0'd) page be replaced
653 * As a bonus, if the page wasn't in the cache it gets added so that
654 * when a small write is made into the 0'd page it gets XBZRLE sent.
656 static void xbzrle_cache_zero_page(RAMState
*rs
, ram_addr_t current_addr
)
658 if (!rs
->xbzrle_enabled
) {
662 /* We don't care if this fails to allocate a new cache page
663 * as long as it updated an old one */
664 cache_insert(XBZRLE
.cache
, current_addr
, XBZRLE
.zero_target_page
,
665 ram_counters
.dirty_sync_count
);
668 #define ENCODING_FLAG_XBZRLE 0x1
671 * save_xbzrle_page: compress and send current page
673 * Returns: 1 means that we wrote the page
674 * 0 means that page is identical to the one already sent
675 * -1 means that xbzrle would be longer than normal
677 * @rs: current RAM state
678 * @current_data: pointer to the address of the page contents
679 * @current_addr: addr of the page
680 * @block: block that contains the page we want to send
681 * @offset: offset inside the block for the page
682 * @last_stage: if we are at the completion stage
684 static int save_xbzrle_page(RAMState
*rs
, uint8_t **current_data
,
685 ram_addr_t current_addr
, RAMBlock
*block
,
686 ram_addr_t offset
, bool last_stage
)
688 int encoded_len
= 0, bytes_xbzrle
;
689 uint8_t *prev_cached_page
;
691 if (!cache_is_cached(XBZRLE
.cache
, current_addr
,
692 ram_counters
.dirty_sync_count
)) {
693 xbzrle_counters
.cache_miss
++;
695 if (cache_insert(XBZRLE
.cache
, current_addr
, *current_data
,
696 ram_counters
.dirty_sync_count
) == -1) {
699 /* update *current_data when the page has been
700 inserted into cache */
701 *current_data
= get_cached_data(XBZRLE
.cache
, current_addr
);
708 * Reaching here means the page has hit the xbzrle cache, no matter what
709 * encoding result it is (normal encoding, overflow or skipping the page),
710 * count the page as encoded. This is used to calculate the encoding rate.
712 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
713 * 2nd page turns out to be skipped (i.e. no new bytes written to the
714 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
715 * skipped page included. In this way, the encoding rate can tell if the
716 * guest page is good for xbzrle encoding.
718 xbzrle_counters
.pages
++;
719 prev_cached_page
= get_cached_data(XBZRLE
.cache
, current_addr
);
721 /* save current buffer into memory */
722 memcpy(XBZRLE
.current_buf
, *current_data
, TARGET_PAGE_SIZE
);
724 /* XBZRLE encoding (if there is no overflow) */
725 encoded_len
= xbzrle_encode_buffer(prev_cached_page
, XBZRLE
.current_buf
,
726 TARGET_PAGE_SIZE
, XBZRLE
.encoded_buf
,
730 * Update the cache contents, so that it corresponds to the data
731 * sent, in all cases except where we skip the page.
733 if (!last_stage
&& encoded_len
!= 0) {
734 memcpy(prev_cached_page
, XBZRLE
.current_buf
, TARGET_PAGE_SIZE
);
736 * In the case where we couldn't compress, ensure that the caller
737 * sends the data from the cache, since the guest might have
738 * changed the RAM since we copied it.
740 *current_data
= prev_cached_page
;
743 if (encoded_len
== 0) {
744 trace_save_xbzrle_page_skipping();
746 } else if (encoded_len
== -1) {
747 trace_save_xbzrle_page_overflow();
748 xbzrle_counters
.overflow
++;
749 xbzrle_counters
.bytes
+= TARGET_PAGE_SIZE
;
753 /* Send XBZRLE based compressed page */
754 bytes_xbzrle
= save_page_header(rs
, rs
->f
, block
,
755 offset
| RAM_SAVE_FLAG_XBZRLE
);
756 qemu_put_byte(rs
->f
, ENCODING_FLAG_XBZRLE
);
757 qemu_put_be16(rs
->f
, encoded_len
);
758 qemu_put_buffer(rs
->f
, XBZRLE
.encoded_buf
, encoded_len
);
759 bytes_xbzrle
+= encoded_len
+ 1 + 2;
761 * Like compressed_size (please see update_compress_thread_counts),
762 * the xbzrle encoded bytes don't count the 8 byte header with
763 * RAM_SAVE_FLAG_CONTINUE.
765 xbzrle_counters
.bytes
+= bytes_xbzrle
- 8;
766 ram_counters
.transferred
+= bytes_xbzrle
;
772 * migration_bitmap_find_dirty: find the next dirty page from start
774 * Returns the page offset within memory region of the start of a dirty page
776 * @rs: current RAM state
777 * @rb: RAMBlock where to search for dirty pages
778 * @start: page where we start the search
781 unsigned long migration_bitmap_find_dirty(RAMState
*rs
, RAMBlock
*rb
,
784 unsigned long size
= rb
->used_length
>> TARGET_PAGE_BITS
;
785 unsigned long *bitmap
= rb
->bmap
;
787 if (ramblock_is_ignored(rb
)) {
791 return find_next_bit(bitmap
, size
, start
);
794 static void migration_clear_memory_region_dirty_bitmap(RAMBlock
*rb
,
800 if (!rb
->clear_bmap
|| !clear_bmap_test_and_clear(rb
, page
)) {
804 shift
= rb
->clear_bmap_shift
;
806 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
807 * can make things easier sometimes since then start address
808 * of the small chunk will always be 64 pages aligned so the
809 * bitmap will always be aligned to unsigned long. We should
810 * even be able to remove this restriction but I'm simply
815 size
= 1ULL << (TARGET_PAGE_BITS
+ shift
);
816 start
= QEMU_ALIGN_DOWN((ram_addr_t
)page
<< TARGET_PAGE_BITS
, size
);
817 trace_migration_bitmap_clear_dirty(rb
->idstr
, start
, size
, page
);
818 memory_region_clear_dirty_bitmap(rb
->mr
, start
, size
);
822 migration_clear_memory_region_dirty_bitmap_range(RAMBlock
*rb
,
824 unsigned long npages
)
826 unsigned long i
, chunk_pages
= 1UL << rb
->clear_bmap_shift
;
827 unsigned long chunk_start
= QEMU_ALIGN_DOWN(start
, chunk_pages
);
828 unsigned long chunk_end
= QEMU_ALIGN_UP(start
+ npages
, chunk_pages
);
831 * Clear pages from start to start + npages - 1, so the end boundary is
834 for (i
= chunk_start
; i
< chunk_end
; i
+= chunk_pages
) {
835 migration_clear_memory_region_dirty_bitmap(rb
, i
);
839 static inline bool migration_bitmap_clear_dirty(RAMState
*rs
,
846 * Clear dirty bitmap if needed. This _must_ be called before we
847 * send any of the page in the chunk because we need to make sure
848 * we can capture further page content changes when we sync dirty
849 * log the next time. So as long as we are going to send any of
850 * the page in the chunk we clear the remote dirty bitmap for all.
851 * Clearing it earlier won't be a problem, but too late will.
853 migration_clear_memory_region_dirty_bitmap(rb
, page
);
855 ret
= test_and_clear_bit(page
, rb
->bmap
);
857 rs
->migration_dirty_pages
--;
863 static void dirty_bitmap_clear_section(MemoryRegionSection
*section
,
866 const hwaddr offset
= section
->offset_within_region
;
867 const hwaddr size
= int128_get64(section
->size
);
868 const unsigned long start
= offset
>> TARGET_PAGE_BITS
;
869 const unsigned long npages
= size
>> TARGET_PAGE_BITS
;
870 RAMBlock
*rb
= section
->mr
->ram_block
;
871 uint64_t *cleared_bits
= opaque
;
874 * We don't grab ram_state->bitmap_mutex because we expect to run
875 * only when starting migration or during postcopy recovery where
876 * we don't have concurrent access.
878 if (!migration_in_postcopy() && !migrate_background_snapshot()) {
879 migration_clear_memory_region_dirty_bitmap_range(rb
, start
, npages
);
881 *cleared_bits
+= bitmap_count_one_with_offset(rb
->bmap
, start
, npages
);
882 bitmap_clear(rb
->bmap
, start
, npages
);
886 * Exclude all dirty pages from migration that fall into a discarded range as
887 * managed by a RamDiscardManager responsible for the mapped memory region of
888 * the RAMBlock. Clear the corresponding bits in the dirty bitmaps.
890 * Discarded pages ("logically unplugged") have undefined content and must
891 * not get migrated, because even reading these pages for migration might
892 * result in undesired behavior.
894 * Returns the number of cleared bits in the RAMBlock dirty bitmap.
896 * Note: The result is only stable while migrating (precopy/postcopy).
898 static uint64_t ramblock_dirty_bitmap_clear_discarded_pages(RAMBlock
*rb
)
900 uint64_t cleared_bits
= 0;
902 if (rb
->mr
&& rb
->bmap
&& memory_region_has_ram_discard_manager(rb
->mr
)) {
903 RamDiscardManager
*rdm
= memory_region_get_ram_discard_manager(rb
->mr
);
904 MemoryRegionSection section
= {
906 .offset_within_region
= 0,
907 .size
= int128_make64(qemu_ram_get_used_length(rb
)),
910 ram_discard_manager_replay_discarded(rdm
, §ion
,
911 dirty_bitmap_clear_section
,
918 * Check if a host-page aligned page falls into a discarded range as managed by
919 * a RamDiscardManager responsible for the mapped memory region of the RAMBlock.
921 * Note: The result is only stable while migrating (precopy/postcopy).
923 bool ramblock_page_is_discarded(RAMBlock
*rb
, ram_addr_t start
)
925 if (rb
->mr
&& memory_region_has_ram_discard_manager(rb
->mr
)) {
926 RamDiscardManager
*rdm
= memory_region_get_ram_discard_manager(rb
->mr
);
927 MemoryRegionSection section
= {
929 .offset_within_region
= start
,
930 .size
= int128_make64(qemu_ram_pagesize(rb
)),
933 return !ram_discard_manager_is_populated(rdm
, §ion
);
938 /* Called with RCU critical section */
939 static void ramblock_sync_dirty_bitmap(RAMState
*rs
, RAMBlock
*rb
)
941 uint64_t new_dirty_pages
=
942 cpu_physical_memory_sync_dirty_bitmap(rb
, 0, rb
->used_length
);
944 rs
->migration_dirty_pages
+= new_dirty_pages
;
945 rs
->num_dirty_pages_period
+= new_dirty_pages
;
949 * ram_pagesize_summary: calculate all the pagesizes of a VM
951 * Returns a summary bitmap of the page sizes of all RAMBlocks
953 * For VMs with just normal pages this is equivalent to the host page
954 * size. If it's got some huge pages then it's the OR of all the
955 * different page sizes.
957 uint64_t ram_pagesize_summary(void)
960 uint64_t summary
= 0;
962 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
963 summary
|= block
->page_size
;
969 uint64_t ram_get_total_transferred_pages(void)
971 return ram_counters
.normal
+ ram_counters
.duplicate
+
972 compression_counters
.pages
+ xbzrle_counters
.pages
;
975 static void migration_update_rates(RAMState
*rs
, int64_t end_time
)
977 uint64_t page_count
= rs
->target_page_count
- rs
->target_page_count_prev
;
978 double compressed_size
;
980 /* calculate period counters */
981 ram_counters
.dirty_pages_rate
= rs
->num_dirty_pages_period
* 1000
982 / (end_time
- rs
->time_last_bitmap_sync
);
988 if (migrate_use_xbzrle()) {
989 double encoded_size
, unencoded_size
;
991 xbzrle_counters
.cache_miss_rate
= (double)(xbzrle_counters
.cache_miss
-
992 rs
->xbzrle_cache_miss_prev
) / page_count
;
993 rs
->xbzrle_cache_miss_prev
= xbzrle_counters
.cache_miss
;
994 unencoded_size
= (xbzrle_counters
.pages
- rs
->xbzrle_pages_prev
) *
996 encoded_size
= xbzrle_counters
.bytes
- rs
->xbzrle_bytes_prev
;
997 if (xbzrle_counters
.pages
== rs
->xbzrle_pages_prev
|| !encoded_size
) {
998 xbzrle_counters
.encoding_rate
= 0;
1000 xbzrle_counters
.encoding_rate
= unencoded_size
/ encoded_size
;
1002 rs
->xbzrle_pages_prev
= xbzrle_counters
.pages
;
1003 rs
->xbzrle_bytes_prev
= xbzrle_counters
.bytes
;
1006 if (migrate_use_compression()) {
1007 compression_counters
.busy_rate
= (double)(compression_counters
.busy
-
1008 rs
->compress_thread_busy_prev
) / page_count
;
1009 rs
->compress_thread_busy_prev
= compression_counters
.busy
;
1011 compressed_size
= compression_counters
.compressed_size
-
1012 rs
->compressed_size_prev
;
1013 if (compressed_size
) {
1014 double uncompressed_size
= (compression_counters
.pages
-
1015 rs
->compress_pages_prev
) * TARGET_PAGE_SIZE
;
1017 /* Compression-Ratio = Uncompressed-size / Compressed-size */
1018 compression_counters
.compression_rate
=
1019 uncompressed_size
/ compressed_size
;
1021 rs
->compress_pages_prev
= compression_counters
.pages
;
1022 rs
->compressed_size_prev
= compression_counters
.compressed_size
;
1027 static void migration_trigger_throttle(RAMState
*rs
)
1029 MigrationState
*s
= migrate_get_current();
1030 uint64_t threshold
= s
->parameters
.throttle_trigger_threshold
;
1032 uint64_t bytes_xfer_period
= ram_counters
.transferred
- rs
->bytes_xfer_prev
;
1033 uint64_t bytes_dirty_period
= rs
->num_dirty_pages_period
* TARGET_PAGE_SIZE
;
1034 uint64_t bytes_dirty_threshold
= bytes_xfer_period
* threshold
/ 100;
1036 /* During block migration the auto-converge logic incorrectly detects
1037 * that ram migration makes no progress. Avoid this by disabling the
1038 * throttling logic during the bulk phase of block migration. */
1039 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
1040 /* The following detection logic can be refined later. For now:
1041 Check to see if the ratio between dirtied bytes and the approx.
1042 amount of bytes that just got transferred since the last time
1043 we were in this routine reaches the threshold. If that happens
1044 twice, start or increase throttling. */
1046 if ((bytes_dirty_period
> bytes_dirty_threshold
) &&
1047 (++rs
->dirty_rate_high_cnt
>= 2)) {
1048 trace_migration_throttle();
1049 rs
->dirty_rate_high_cnt
= 0;
1050 mig_throttle_guest_down(bytes_dirty_period
,
1051 bytes_dirty_threshold
);
1056 static void migration_bitmap_sync(RAMState
*rs
)
1061 ram_counters
.dirty_sync_count
++;
1063 if (!rs
->time_last_bitmap_sync
) {
1064 rs
->time_last_bitmap_sync
= qemu_clock_get_ms(QEMU_CLOCK_REALTIME
);
1067 trace_migration_bitmap_sync_start();
1068 memory_global_dirty_log_sync();
1070 qemu_mutex_lock(&rs
->bitmap_mutex
);
1071 WITH_RCU_READ_LOCK_GUARD() {
1072 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1073 ramblock_sync_dirty_bitmap(rs
, block
);
1075 ram_counters
.remaining
= ram_bytes_remaining();
1077 qemu_mutex_unlock(&rs
->bitmap_mutex
);
1079 memory_global_after_dirty_log_sync();
1080 trace_migration_bitmap_sync_end(rs
->num_dirty_pages_period
);
1082 end_time
= qemu_clock_get_ms(QEMU_CLOCK_REALTIME
);
1084 /* more than 1 second = 1000 millisecons */
1085 if (end_time
> rs
->time_last_bitmap_sync
+ 1000) {
1086 migration_trigger_throttle(rs
);
1088 migration_update_rates(rs
, end_time
);
1090 rs
->target_page_count_prev
= rs
->target_page_count
;
1092 /* reset period counters */
1093 rs
->time_last_bitmap_sync
= end_time
;
1094 rs
->num_dirty_pages_period
= 0;
1095 rs
->bytes_xfer_prev
= ram_counters
.transferred
;
1097 if (migrate_use_events()) {
1098 qapi_event_send_migration_pass(ram_counters
.dirty_sync_count
);
1102 static void migration_bitmap_sync_precopy(RAMState
*rs
)
1104 Error
*local_err
= NULL
;
1107 * The current notifier usage is just an optimization to migration, so we
1108 * don't stop the normal migration process in the error case.
1110 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC
, &local_err
)) {
1111 error_report_err(local_err
);
1115 migration_bitmap_sync(rs
);
1117 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC
, &local_err
)) {
1118 error_report_err(local_err
);
1123 * save_zero_page_to_file: send the zero page to the file
1125 * Returns the size of data written to the file, 0 means the page is not
1128 * @rs: current RAM state
1129 * @file: the file where the data is saved
1130 * @block: block that contains the page we want to send
1131 * @offset: offset inside the block for the page
1133 static int save_zero_page_to_file(RAMState
*rs
, QEMUFile
*file
,
1134 RAMBlock
*block
, ram_addr_t offset
)
1136 uint8_t *p
= block
->host
+ offset
;
1139 if (is_zero_range(p
, TARGET_PAGE_SIZE
)) {
1140 len
+= save_page_header(rs
, file
, block
, offset
| RAM_SAVE_FLAG_ZERO
);
1141 qemu_put_byte(file
, 0);
1148 * save_zero_page: send the zero page to the stream
1150 * Returns the number of pages written.
1152 * @rs: current RAM state
1153 * @block: block that contains the page we want to send
1154 * @offset: offset inside the block for the page
1156 static int save_zero_page(RAMState
*rs
, RAMBlock
*block
, ram_addr_t offset
)
1158 int len
= save_zero_page_to_file(rs
, rs
->f
, block
, offset
);
1161 ram_counters
.duplicate
++;
1162 ram_counters
.transferred
+= len
;
1168 static void ram_release_pages(const char *rbname
, uint64_t offset
, int pages
)
1170 if (!migrate_release_ram() || !migration_in_postcopy()) {
1174 ram_discard_range(rbname
, offset
, ((ram_addr_t
)pages
) << TARGET_PAGE_BITS
);
1178 * @pages: the number of pages written by the control path,
1180 * > 0 - number of pages written
1182 * Return true if the pages has been saved, otherwise false is returned.
1184 static bool control_save_page(RAMState
*rs
, RAMBlock
*block
, ram_addr_t offset
,
1187 uint64_t bytes_xmit
= 0;
1191 ret
= ram_control_save_page(rs
->f
, block
->offset
, offset
, TARGET_PAGE_SIZE
,
1193 if (ret
== RAM_SAVE_CONTROL_NOT_SUPP
) {
1198 ram_counters
.transferred
+= bytes_xmit
;
1202 if (ret
== RAM_SAVE_CONTROL_DELAYED
) {
1206 if (bytes_xmit
> 0) {
1207 ram_counters
.normal
++;
1208 } else if (bytes_xmit
== 0) {
1209 ram_counters
.duplicate
++;
1216 * directly send the page to the stream
1218 * Returns the number of pages written.
1220 * @rs: current RAM state
1221 * @block: block that contains the page we want to send
1222 * @offset: offset inside the block for the page
1223 * @buf: the page to be sent
1224 * @async: send to page asyncly
1226 static int save_normal_page(RAMState
*rs
, RAMBlock
*block
, ram_addr_t offset
,
1227 uint8_t *buf
, bool async
)
1229 ram_counters
.transferred
+= save_page_header(rs
, rs
->f
, block
,
1230 offset
| RAM_SAVE_FLAG_PAGE
);
1232 qemu_put_buffer_async(rs
->f
, buf
, TARGET_PAGE_SIZE
,
1233 migrate_release_ram() &
1234 migration_in_postcopy());
1236 qemu_put_buffer(rs
->f
, buf
, TARGET_PAGE_SIZE
);
1238 ram_counters
.transferred
+= TARGET_PAGE_SIZE
;
1239 ram_counters
.normal
++;
1244 * ram_save_page: send the given page to the stream
1246 * Returns the number of pages written.
1248 * >=0 - Number of pages written - this might legally be 0
1249 * if xbzrle noticed the page was the same.
1251 * @rs: current RAM state
1252 * @block: block that contains the page we want to send
1253 * @offset: offset inside the block for the page
1254 * @last_stage: if we are at the completion stage
1256 static int ram_save_page(RAMState
*rs
, PageSearchStatus
*pss
, bool last_stage
)
1260 bool send_async
= true;
1261 RAMBlock
*block
= pss
->block
;
1262 ram_addr_t offset
= ((ram_addr_t
)pss
->page
) << TARGET_PAGE_BITS
;
1263 ram_addr_t current_addr
= block
->offset
+ offset
;
1265 p
= block
->host
+ offset
;
1266 trace_ram_save_page(block
->idstr
, (uint64_t)offset
, p
);
1268 XBZRLE_cache_lock();
1269 if (rs
->xbzrle_enabled
&& !migration_in_postcopy()) {
1270 pages
= save_xbzrle_page(rs
, &p
, current_addr
, block
,
1271 offset
, last_stage
);
1273 /* Can't send this cached data async, since the cache page
1274 * might get updated before it gets to the wire
1280 /* XBZRLE overflow or normal page */
1282 pages
= save_normal_page(rs
, block
, offset
, p
, send_async
);
1285 XBZRLE_cache_unlock();
1290 static int ram_save_multifd_page(RAMState
*rs
, RAMBlock
*block
,
1293 if (multifd_queue_page(rs
->f
, block
, offset
) < 0) {
1296 ram_counters
.normal
++;
1301 static bool do_compress_ram_page(QEMUFile
*f
, z_stream
*stream
, RAMBlock
*block
,
1302 ram_addr_t offset
, uint8_t *source_buf
)
1304 RAMState
*rs
= ram_state
;
1305 uint8_t *p
= block
->host
+ (offset
& TARGET_PAGE_MASK
);
1306 bool zero_page
= false;
1309 if (save_zero_page_to_file(rs
, f
, block
, offset
)) {
1314 save_page_header(rs
, f
, block
, offset
| RAM_SAVE_FLAG_COMPRESS_PAGE
);
1317 * copy it to a internal buffer to avoid it being modified by VM
1318 * so that we can catch up the error during compression and
1321 memcpy(source_buf
, p
, TARGET_PAGE_SIZE
);
1322 ret
= qemu_put_compression_data(f
, stream
, source_buf
, TARGET_PAGE_SIZE
);
1324 qemu_file_set_error(migrate_get_current()->to_dst_file
, ret
);
1325 error_report("compressed data failed!");
1330 ram_release_pages(block
->idstr
, offset
& TARGET_PAGE_MASK
, 1);
1335 update_compress_thread_counts(const CompressParam
*param
, int bytes_xmit
)
1337 ram_counters
.transferred
+= bytes_xmit
;
1339 if (param
->zero_page
) {
1340 ram_counters
.duplicate
++;
1344 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1345 compression_counters
.compressed_size
+= bytes_xmit
- 8;
1346 compression_counters
.pages
++;
1349 static bool save_page_use_compression(RAMState
*rs
);
1351 static void flush_compressed_data(RAMState
*rs
)
1353 int idx
, len
, thread_count
;
1355 if (!save_page_use_compression(rs
)) {
1358 thread_count
= migrate_compress_threads();
1360 qemu_mutex_lock(&comp_done_lock
);
1361 for (idx
= 0; idx
< thread_count
; idx
++) {
1362 while (!comp_param
[idx
].done
) {
1363 qemu_cond_wait(&comp_done_cond
, &comp_done_lock
);
1366 qemu_mutex_unlock(&comp_done_lock
);
1368 for (idx
= 0; idx
< thread_count
; idx
++) {
1369 qemu_mutex_lock(&comp_param
[idx
].mutex
);
1370 if (!comp_param
[idx
].quit
) {
1371 len
= qemu_put_qemu_file(rs
->f
, comp_param
[idx
].file
);
1373 * it's safe to fetch zero_page without holding comp_done_lock
1374 * as there is no further request submitted to the thread,
1375 * i.e, the thread should be waiting for a request at this point.
1377 update_compress_thread_counts(&comp_param
[idx
], len
);
1379 qemu_mutex_unlock(&comp_param
[idx
].mutex
);
1383 static inline void set_compress_params(CompressParam
*param
, RAMBlock
*block
,
1386 param
->block
= block
;
1387 param
->offset
= offset
;
1390 static int compress_page_with_multi_thread(RAMState
*rs
, RAMBlock
*block
,
1393 int idx
, thread_count
, bytes_xmit
= -1, pages
= -1;
1394 bool wait
= migrate_compress_wait_thread();
1396 thread_count
= migrate_compress_threads();
1397 qemu_mutex_lock(&comp_done_lock
);
1399 for (idx
= 0; idx
< thread_count
; idx
++) {
1400 if (comp_param
[idx
].done
) {
1401 comp_param
[idx
].done
= false;
1402 bytes_xmit
= qemu_put_qemu_file(rs
->f
, comp_param
[idx
].file
);
1403 qemu_mutex_lock(&comp_param
[idx
].mutex
);
1404 set_compress_params(&comp_param
[idx
], block
, offset
);
1405 qemu_cond_signal(&comp_param
[idx
].cond
);
1406 qemu_mutex_unlock(&comp_param
[idx
].mutex
);
1408 update_compress_thread_counts(&comp_param
[idx
], bytes_xmit
);
1414 * wait for the free thread if the user specifies 'compress-wait-thread',
1415 * otherwise we will post the page out in the main thread as normal page.
1417 if (pages
< 0 && wait
) {
1418 qemu_cond_wait(&comp_done_cond
, &comp_done_lock
);
1421 qemu_mutex_unlock(&comp_done_lock
);
1427 * find_dirty_block: find the next dirty page and update any state
1428 * associated with the search process.
1430 * Returns true if a page is found
1432 * @rs: current RAM state
1433 * @pss: data about the state of the current dirty page scan
1434 * @again: set to false if the search has scanned the whole of RAM
1436 static bool find_dirty_block(RAMState
*rs
, PageSearchStatus
*pss
, bool *again
)
1438 pss
->page
= migration_bitmap_find_dirty(rs
, pss
->block
, pss
->page
);
1439 if (pss
->complete_round
&& pss
->block
== rs
->last_seen_block
&&
1440 pss
->page
>= rs
->last_page
) {
1442 * We've been once around the RAM and haven't found anything.
1448 if (!offset_in_ramblock(pss
->block
,
1449 ((ram_addr_t
)pss
->page
) << TARGET_PAGE_BITS
)) {
1450 /* Didn't find anything in this RAM Block */
1452 pss
->block
= QLIST_NEXT_RCU(pss
->block
, next
);
1455 * If memory migration starts over, we will meet a dirtied page
1456 * which may still exists in compression threads's ring, so we
1457 * should flush the compressed data to make sure the new page
1458 * is not overwritten by the old one in the destination.
1460 * Also If xbzrle is on, stop using the data compression at this
1461 * point. In theory, xbzrle can do better than compression.
1463 flush_compressed_data(rs
);
1465 /* Hit the end of the list */
1466 pss
->block
= QLIST_FIRST_RCU(&ram_list
.blocks
);
1467 /* Flag that we've looped */
1468 pss
->complete_round
= true;
1469 /* After the first round, enable XBZRLE. */
1470 if (migrate_use_xbzrle()) {
1471 rs
->xbzrle_enabled
= true;
1474 /* Didn't find anything this time, but try again on the new block */
1478 /* Can go around again, but... */
1480 /* We've found something so probably don't need to */
1486 * unqueue_page: gets a page of the queue
1488 * Helper for 'get_queued_page' - gets a page off the queue
1490 * Returns the block of the page (or NULL if none available)
1492 * @rs: current RAM state
1493 * @offset: used to return the offset within the RAMBlock
1495 static RAMBlock
*unqueue_page(RAMState
*rs
, ram_addr_t
*offset
)
1497 RAMBlock
*block
= NULL
;
1499 if (QSIMPLEQ_EMPTY_ATOMIC(&rs
->src_page_requests
)) {
1503 QEMU_LOCK_GUARD(&rs
->src_page_req_mutex
);
1504 if (!QSIMPLEQ_EMPTY(&rs
->src_page_requests
)) {
1505 struct RAMSrcPageRequest
*entry
=
1506 QSIMPLEQ_FIRST(&rs
->src_page_requests
);
1508 *offset
= entry
->offset
;
1510 if (entry
->len
> TARGET_PAGE_SIZE
) {
1511 entry
->len
-= TARGET_PAGE_SIZE
;
1512 entry
->offset
+= TARGET_PAGE_SIZE
;
1514 memory_region_unref(block
->mr
);
1515 QSIMPLEQ_REMOVE_HEAD(&rs
->src_page_requests
, next_req
);
1517 migration_consume_urgent_request();
1524 #if defined(__linux__)
1526 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1527 * is found, return RAM block pointer and page offset
1529 * Returns pointer to the RAMBlock containing faulting page,
1530 * NULL if no write faults are pending
1532 * @rs: current RAM state
1533 * @offset: page offset from the beginning of the block
1535 static RAMBlock
*poll_fault_page(RAMState
*rs
, ram_addr_t
*offset
)
1537 struct uffd_msg uffd_msg
;
1542 if (!migrate_background_snapshot()) {
1546 res
= uffd_read_events(rs
->uffdio_fd
, &uffd_msg
, 1);
1551 page_address
= (void *)(uintptr_t) uffd_msg
.arg
.pagefault
.address
;
1552 block
= qemu_ram_block_from_host(page_address
, false, offset
);
1553 assert(block
&& (block
->flags
& RAM_UF_WRITEPROTECT
) != 0);
1558 * ram_save_release_protection: release UFFD write protection after
1559 * a range of pages has been saved
1561 * @rs: current RAM state
1562 * @pss: page-search-status structure
1563 * @start_page: index of the first page in the range relative to pss->block
1565 * Returns 0 on success, negative value in case of an error
1567 static int ram_save_release_protection(RAMState
*rs
, PageSearchStatus
*pss
,
1568 unsigned long start_page
)
1572 /* Check if page is from UFFD-managed region. */
1573 if (pss
->block
->flags
& RAM_UF_WRITEPROTECT
) {
1574 void *page_address
= pss
->block
->host
+ (start_page
<< TARGET_PAGE_BITS
);
1575 uint64_t run_length
= (pss
->page
- start_page
+ 1) << TARGET_PAGE_BITS
;
1577 /* Flush async buffers before un-protect. */
1579 /* Un-protect memory range. */
1580 res
= uffd_change_protection(rs
->uffdio_fd
, page_address
, run_length
,
1587 /* ram_write_tracking_available: check if kernel supports required UFFD features
1589 * Returns true if supports, false otherwise
1591 bool ram_write_tracking_available(void)
1593 uint64_t uffd_features
;
1596 res
= uffd_query_features(&uffd_features
);
1598 (uffd_features
& UFFD_FEATURE_PAGEFAULT_FLAG_WP
) != 0);
1601 /* ram_write_tracking_compatible: check if guest configuration is
1602 * compatible with 'write-tracking'
1604 * Returns true if compatible, false otherwise
1606 bool ram_write_tracking_compatible(void)
1608 const uint64_t uffd_ioctls_mask
= BIT(_UFFDIO_WRITEPROTECT
);
1613 /* Open UFFD file descriptor */
1614 uffd_fd
= uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP
, false);
1619 RCU_READ_LOCK_GUARD();
1621 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1622 uint64_t uffd_ioctls
;
1624 /* Nothing to do with read-only and MMIO-writable regions */
1625 if (block
->mr
->readonly
|| block
->mr
->rom_device
) {
1628 /* Try to register block memory via UFFD-IO to track writes */
1629 if (uffd_register_memory(uffd_fd
, block
->host
, block
->max_length
,
1630 UFFDIO_REGISTER_MODE_WP
, &uffd_ioctls
)) {
1633 if ((uffd_ioctls
& uffd_ioctls_mask
) != uffd_ioctls_mask
) {
1640 uffd_close_fd(uffd_fd
);
1644 static inline void populate_read_range(RAMBlock
*block
, ram_addr_t offset
,
1648 * We read one byte of each page; this will preallocate page tables if
1649 * required and populate the shared zeropage on MAP_PRIVATE anonymous memory
1650 * where no page was populated yet. This might require adaption when
1651 * supporting other mappings, like shmem.
1653 for (; offset
< size
; offset
+= block
->page_size
) {
1654 char tmp
= *((char *)block
->host
+ offset
);
1656 /* Don't optimize the read out */
1657 asm volatile("" : "+r" (tmp
));
1661 static inline int populate_read_section(MemoryRegionSection
*section
,
1664 const hwaddr size
= int128_get64(section
->size
);
1665 hwaddr offset
= section
->offset_within_region
;
1666 RAMBlock
*block
= section
->mr
->ram_block
;
1668 populate_read_range(block
, offset
, size
);
1673 * ram_block_populate_read: preallocate page tables and populate pages in the
1674 * RAM block by reading a byte of each page.
1676 * Since it's solely used for userfault_fd WP feature, here we just
1677 * hardcode page size to qemu_real_host_page_size.
1679 * @block: RAM block to populate
1681 static void ram_block_populate_read(RAMBlock
*rb
)
1684 * Skip populating all pages that fall into a discarded range as managed by
1685 * a RamDiscardManager responsible for the mapped memory region of the
1686 * RAMBlock. Such discarded ("logically unplugged") parts of a RAMBlock
1687 * must not get populated automatically. We don't have to track
1688 * modifications via userfaultfd WP reliably, because these pages will
1689 * not be part of the migration stream either way -- see
1690 * ramblock_dirty_bitmap_exclude_discarded_pages().
1692 * Note: The result is only stable while migrating (precopy/postcopy).
1694 if (rb
->mr
&& memory_region_has_ram_discard_manager(rb
->mr
)) {
1695 RamDiscardManager
*rdm
= memory_region_get_ram_discard_manager(rb
->mr
);
1696 MemoryRegionSection section
= {
1698 .offset_within_region
= 0,
1699 .size
= rb
->mr
->size
,
1702 ram_discard_manager_replay_populated(rdm
, §ion
,
1703 populate_read_section
, NULL
);
1705 populate_read_range(rb
, 0, rb
->used_length
);
1710 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1712 void ram_write_tracking_prepare(void)
1716 RCU_READ_LOCK_GUARD();
1718 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1719 /* Nothing to do with read-only and MMIO-writable regions */
1720 if (block
->mr
->readonly
|| block
->mr
->rom_device
) {
1725 * Populate pages of the RAM block before enabling userfault_fd
1728 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1729 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1730 * pages with pte_none() entries in page table.
1732 ram_block_populate_read(block
);
1737 * ram_write_tracking_start: start UFFD-WP memory tracking
1739 * Returns 0 for success or negative value in case of error
1741 int ram_write_tracking_start(void)
1744 RAMState
*rs
= ram_state
;
1747 /* Open UFFD file descriptor */
1748 uffd_fd
= uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP
, true);
1752 rs
->uffdio_fd
= uffd_fd
;
1754 RCU_READ_LOCK_GUARD();
1756 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1757 /* Nothing to do with read-only and MMIO-writable regions */
1758 if (block
->mr
->readonly
|| block
->mr
->rom_device
) {
1762 /* Register block memory with UFFD to track writes */
1763 if (uffd_register_memory(rs
->uffdio_fd
, block
->host
,
1764 block
->max_length
, UFFDIO_REGISTER_MODE_WP
, NULL
)) {
1767 /* Apply UFFD write protection to the block memory range */
1768 if (uffd_change_protection(rs
->uffdio_fd
, block
->host
,
1769 block
->max_length
, true, false)) {
1772 block
->flags
|= RAM_UF_WRITEPROTECT
;
1773 memory_region_ref(block
->mr
);
1775 trace_ram_write_tracking_ramblock_start(block
->idstr
, block
->page_size
,
1776 block
->host
, block
->max_length
);
1782 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1784 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1785 if ((block
->flags
& RAM_UF_WRITEPROTECT
) == 0) {
1789 * In case some memory block failed to be write-protected
1790 * remove protection and unregister all succeeded RAM blocks
1792 uffd_change_protection(rs
->uffdio_fd
, block
->host
, block
->max_length
,
1794 uffd_unregister_memory(rs
->uffdio_fd
, block
->host
, block
->max_length
);
1795 /* Cleanup flags and remove reference */
1796 block
->flags
&= ~RAM_UF_WRITEPROTECT
;
1797 memory_region_unref(block
->mr
);
1800 uffd_close_fd(uffd_fd
);
1806 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1808 void ram_write_tracking_stop(void)
1810 RAMState
*rs
= ram_state
;
1813 RCU_READ_LOCK_GUARD();
1815 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
1816 if ((block
->flags
& RAM_UF_WRITEPROTECT
) == 0) {
1819 /* Remove protection and unregister all affected RAM blocks */
1820 uffd_change_protection(rs
->uffdio_fd
, block
->host
, block
->max_length
,
1822 uffd_unregister_memory(rs
->uffdio_fd
, block
->host
, block
->max_length
);
1824 trace_ram_write_tracking_ramblock_stop(block
->idstr
, block
->page_size
,
1825 block
->host
, block
->max_length
);
1827 /* Cleanup flags and remove reference */
1828 block
->flags
&= ~RAM_UF_WRITEPROTECT
;
1829 memory_region_unref(block
->mr
);
1832 /* Finally close UFFD file descriptor */
1833 uffd_close_fd(rs
->uffdio_fd
);
1838 /* No target OS support, stubs just fail or ignore */
1840 static RAMBlock
*poll_fault_page(RAMState
*rs
, ram_addr_t
*offset
)
1848 static int ram_save_release_protection(RAMState
*rs
, PageSearchStatus
*pss
,
1849 unsigned long start_page
)
1858 bool ram_write_tracking_available(void)
1863 bool ram_write_tracking_compatible(void)
1869 int ram_write_tracking_start(void)
1875 void ram_write_tracking_stop(void)
1879 #endif /* defined(__linux__) */
1882 * get_queued_page: unqueue a page from the postcopy requests
1884 * Skips pages that are already sent (!dirty)
1886 * Returns true if a queued page is found
1888 * @rs: current RAM state
1889 * @pss: data about the state of the current dirty page scan
1891 static bool get_queued_page(RAMState
*rs
, PageSearchStatus
*pss
)
1898 block
= unqueue_page(rs
, &offset
);
1900 * We're sending this page, and since it's postcopy nothing else
1901 * will dirty it, and we must make sure it doesn't get sent again
1902 * even if this queue request was received after the background
1903 * search already sent it.
1908 page
= offset
>> TARGET_PAGE_BITS
;
1909 dirty
= test_bit(page
, block
->bmap
);
1911 trace_get_queued_page_not_dirty(block
->idstr
, (uint64_t)offset
,
1914 trace_get_queued_page(block
->idstr
, (uint64_t)offset
, page
);
1918 } while (block
&& !dirty
);
1922 * Poll write faults too if background snapshot is enabled; that's
1923 * when we have vcpus got blocked by the write protected pages.
1925 block
= poll_fault_page(rs
, &offset
);
1930 * We want the background search to continue from the queued page
1931 * since the guest is likely to want other pages near to the page
1932 * it just requested.
1935 pss
->page
= offset
>> TARGET_PAGE_BITS
;
1938 * This unqueued page would break the "one round" check, even is
1941 pss
->complete_round
= false;
1948 * migration_page_queue_free: drop any remaining pages in the ram
1951 * It should be empty at the end anyway, but in error cases there may
1952 * be some left. in case that there is any page left, we drop it.
1955 static void migration_page_queue_free(RAMState
*rs
)
1957 struct RAMSrcPageRequest
*mspr
, *next_mspr
;
1958 /* This queue generally should be empty - but in the case of a failed
1959 * migration might have some droppings in.
1961 RCU_READ_LOCK_GUARD();
1962 QSIMPLEQ_FOREACH_SAFE(mspr
, &rs
->src_page_requests
, next_req
, next_mspr
) {
1963 memory_region_unref(mspr
->rb
->mr
);
1964 QSIMPLEQ_REMOVE_HEAD(&rs
->src_page_requests
, next_req
);
1970 * ram_save_queue_pages: queue the page for transmission
1972 * A request from postcopy destination for example.
1974 * Returns zero on success or negative on error
1976 * @rbname: Name of the RAMBLock of the request. NULL means the
1977 * same that last one.
1978 * @start: starting address from the start of the RAMBlock
1979 * @len: length (in bytes) to send
1981 int ram_save_queue_pages(const char *rbname
, ram_addr_t start
, ram_addr_t len
)
1984 RAMState
*rs
= ram_state
;
1986 ram_counters
.postcopy_requests
++;
1987 RCU_READ_LOCK_GUARD();
1990 /* Reuse last RAMBlock */
1991 ramblock
= rs
->last_req_rb
;
1995 * Shouldn't happen, we can't reuse the last RAMBlock if
1996 * it's the 1st request.
1998 error_report("ram_save_queue_pages no previous block");
2002 ramblock
= qemu_ram_block_by_name(rbname
);
2005 /* We shouldn't be asked for a non-existent RAMBlock */
2006 error_report("ram_save_queue_pages no block '%s'", rbname
);
2009 rs
->last_req_rb
= ramblock
;
2011 trace_ram_save_queue_pages(ramblock
->idstr
, start
, len
);
2012 if (!offset_in_ramblock(ramblock
, start
+ len
- 1)) {
2013 error_report("%s request overrun start=" RAM_ADDR_FMT
" len="
2014 RAM_ADDR_FMT
" blocklen=" RAM_ADDR_FMT
,
2015 __func__
, start
, len
, ramblock
->used_length
);
2019 struct RAMSrcPageRequest
*new_entry
=
2020 g_malloc0(sizeof(struct RAMSrcPageRequest
));
2021 new_entry
->rb
= ramblock
;
2022 new_entry
->offset
= start
;
2023 new_entry
->len
= len
;
2025 memory_region_ref(ramblock
->mr
);
2026 qemu_mutex_lock(&rs
->src_page_req_mutex
);
2027 QSIMPLEQ_INSERT_TAIL(&rs
->src_page_requests
, new_entry
, next_req
);
2028 migration_make_urgent_request();
2029 qemu_mutex_unlock(&rs
->src_page_req_mutex
);
2034 static bool save_page_use_compression(RAMState
*rs
)
2036 if (!migrate_use_compression()) {
2041 * If xbzrle is enabled (e.g., after first round of migration), stop
2042 * using the data compression. In theory, xbzrle can do better than
2045 if (rs
->xbzrle_enabled
) {
2053 * try to compress the page before posting it out, return true if the page
2054 * has been properly handled by compression, otherwise needs other
2055 * paths to handle it
2057 static bool save_compress_page(RAMState
*rs
, RAMBlock
*block
, ram_addr_t offset
)
2059 if (!save_page_use_compression(rs
)) {
2064 * When starting the process of a new block, the first page of
2065 * the block should be sent out before other pages in the same
2066 * block, and all the pages in last block should have been sent
2067 * out, keeping this order is important, because the 'cont' flag
2068 * is used to avoid resending the block name.
2070 * We post the fist page as normal page as compression will take
2071 * much CPU resource.
2073 if (block
!= rs
->last_sent_block
) {
2074 flush_compressed_data(rs
);
2078 if (compress_page_with_multi_thread(rs
, block
, offset
) > 0) {
2082 compression_counters
.busy
++;
2087 * ram_save_target_page: save one target page
2089 * Returns the number of pages written
2091 * @rs: current RAM state
2092 * @pss: data about the page we want to send
2093 * @last_stage: if we are at the completion stage
2095 static int ram_save_target_page(RAMState
*rs
, PageSearchStatus
*pss
,
2098 RAMBlock
*block
= pss
->block
;
2099 ram_addr_t offset
= ((ram_addr_t
)pss
->page
) << TARGET_PAGE_BITS
;
2102 if (control_save_page(rs
, block
, offset
, &res
)) {
2106 if (save_compress_page(rs
, block
, offset
)) {
2110 res
= save_zero_page(rs
, block
, offset
);
2112 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
2113 * page would be stale
2115 if (!save_page_use_compression(rs
)) {
2116 XBZRLE_cache_lock();
2117 xbzrle_cache_zero_page(rs
, block
->offset
+ offset
);
2118 XBZRLE_cache_unlock();
2120 ram_release_pages(block
->idstr
, offset
, res
);
2125 * Do not use multifd for:
2126 * 1. Compression as the first page in the new block should be posted out
2127 * before sending the compressed page
2128 * 2. In postcopy as one whole host page should be placed
2130 if (!save_page_use_compression(rs
) && migrate_use_multifd()
2131 && !migration_in_postcopy()) {
2132 return ram_save_multifd_page(rs
, block
, offset
);
2135 return ram_save_page(rs
, pss
, last_stage
);
2139 * ram_save_host_page: save a whole host page
2141 * Starting at *offset send pages up to the end of the current host
2142 * page. It's valid for the initial offset to point into the middle of
2143 * a host page in which case the remainder of the hostpage is sent.
2144 * Only dirty target pages are sent. Note that the host page size may
2145 * be a huge page for this block.
2146 * The saving stops at the boundary of the used_length of the block
2147 * if the RAMBlock isn't a multiple of the host page size.
2149 * Returns the number of pages written or negative on error
2151 * @rs: current RAM state
2152 * @ms: current migration state
2153 * @pss: data about the page we want to send
2154 * @last_stage: if we are at the completion stage
2156 static int ram_save_host_page(RAMState
*rs
, PageSearchStatus
*pss
,
2159 int tmppages
, pages
= 0;
2160 size_t pagesize_bits
=
2161 qemu_ram_pagesize(pss
->block
) >> TARGET_PAGE_BITS
;
2162 unsigned long hostpage_boundary
=
2163 QEMU_ALIGN_UP(pss
->page
+ 1, pagesize_bits
);
2164 unsigned long start_page
= pss
->page
;
2167 if (ramblock_is_ignored(pss
->block
)) {
2168 error_report("block %s should not be migrated !", pss
->block
->idstr
);
2173 /* Check the pages is dirty and if it is send it */
2174 if (migration_bitmap_clear_dirty(rs
, pss
->block
, pss
->page
)) {
2175 tmppages
= ram_save_target_page(rs
, pss
, last_stage
);
2182 * Allow rate limiting to happen in the middle of huge pages if
2183 * something is sent in the current iteration.
2185 if (pagesize_bits
> 1 && tmppages
> 0) {
2186 migration_rate_limit();
2189 pss
->page
= migration_bitmap_find_dirty(rs
, pss
->block
, pss
->page
);
2190 } while ((pss
->page
< hostpage_boundary
) &&
2191 offset_in_ramblock(pss
->block
,
2192 ((ram_addr_t
)pss
->page
) << TARGET_PAGE_BITS
));
2193 /* The offset we leave with is the min boundary of host page and block */
2194 pss
->page
= MIN(pss
->page
, hostpage_boundary
) - 1;
2196 res
= ram_save_release_protection(rs
, pss
, start_page
);
2197 return (res
< 0 ? res
: pages
);
2201 * ram_find_and_save_block: finds a dirty page and sends it to f
2203 * Called within an RCU critical section.
2205 * Returns the number of pages written where zero means no dirty pages,
2206 * or negative on error
2208 * @rs: current RAM state
2209 * @last_stage: if we are at the completion stage
2211 * On systems where host-page-size > target-page-size it will send all the
2212 * pages in a host page that are dirty.
2215 static int ram_find_and_save_block(RAMState
*rs
, bool last_stage
)
2217 PageSearchStatus pss
;
2221 /* No dirty page as there is zero RAM */
2222 if (!ram_bytes_total()) {
2226 pss
.block
= rs
->last_seen_block
;
2227 pss
.page
= rs
->last_page
;
2228 pss
.complete_round
= false;
2231 pss
.block
= QLIST_FIRST_RCU(&ram_list
.blocks
);
2236 found
= get_queued_page(rs
, &pss
);
2239 /* priority queue empty, so just search for something dirty */
2240 found
= find_dirty_block(rs
, &pss
, &again
);
2244 pages
= ram_save_host_page(rs
, &pss
, last_stage
);
2246 } while (!pages
&& again
);
2248 rs
->last_seen_block
= pss
.block
;
2249 rs
->last_page
= pss
.page
;
2254 void acct_update_position(QEMUFile
*f
, size_t size
, bool zero
)
2256 uint64_t pages
= size
/ TARGET_PAGE_SIZE
;
2259 ram_counters
.duplicate
+= pages
;
2261 ram_counters
.normal
+= pages
;
2262 ram_counters
.transferred
+= size
;
2263 qemu_update_position(f
, size
);
2267 static uint64_t ram_bytes_total_common(bool count_ignored
)
2272 RCU_READ_LOCK_GUARD();
2274 if (count_ignored
) {
2275 RAMBLOCK_FOREACH_MIGRATABLE(block
) {
2276 total
+= block
->used_length
;
2279 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2280 total
+= block
->used_length
;
2286 uint64_t ram_bytes_total(void)
2288 return ram_bytes_total_common(false);
2291 static void xbzrle_load_setup(void)
2293 XBZRLE
.decoded_buf
= g_malloc(TARGET_PAGE_SIZE
);
2296 static void xbzrle_load_cleanup(void)
2298 g_free(XBZRLE
.decoded_buf
);
2299 XBZRLE
.decoded_buf
= NULL
;
2302 static void ram_state_cleanup(RAMState
**rsp
)
2305 migration_page_queue_free(*rsp
);
2306 qemu_mutex_destroy(&(*rsp
)->bitmap_mutex
);
2307 qemu_mutex_destroy(&(*rsp
)->src_page_req_mutex
);
2313 static void xbzrle_cleanup(void)
2315 XBZRLE_cache_lock();
2317 cache_fini(XBZRLE
.cache
);
2318 g_free(XBZRLE
.encoded_buf
);
2319 g_free(XBZRLE
.current_buf
);
2320 g_free(XBZRLE
.zero_target_page
);
2321 XBZRLE
.cache
= NULL
;
2322 XBZRLE
.encoded_buf
= NULL
;
2323 XBZRLE
.current_buf
= NULL
;
2324 XBZRLE
.zero_target_page
= NULL
;
2326 XBZRLE_cache_unlock();
2329 static void ram_save_cleanup(void *opaque
)
2331 RAMState
**rsp
= opaque
;
2334 /* We don't use dirty log with background snapshots */
2335 if (!migrate_background_snapshot()) {
2336 /* caller have hold iothread lock or is in a bh, so there is
2337 * no writing race against the migration bitmap
2339 if (global_dirty_tracking
& GLOBAL_DIRTY_MIGRATION
) {
2341 * do not stop dirty log without starting it, since
2342 * memory_global_dirty_log_stop will assert that
2343 * memory_global_dirty_log_start/stop used in pairs
2345 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION
);
2349 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2350 g_free(block
->clear_bmap
);
2351 block
->clear_bmap
= NULL
;
2352 g_free(block
->bmap
);
2357 compress_threads_save_cleanup();
2358 ram_state_cleanup(rsp
);
2361 static void ram_state_reset(RAMState
*rs
)
2363 rs
->last_seen_block
= NULL
;
2364 rs
->last_sent_block
= NULL
;
2366 rs
->last_version
= ram_list
.version
;
2367 rs
->xbzrle_enabled
= false;
2370 #define MAX_WAIT 50 /* ms, half buffered_file limit */
/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
2406 /* **** functions for postcopy ***** */
2408 void ram_postcopy_migrated_memory_release(MigrationState
*ms
)
2410 struct RAMBlock
*block
;
2412 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2413 unsigned long *bitmap
= block
->bmap
;
2414 unsigned long range
= block
->used_length
>> TARGET_PAGE_BITS
;
2415 unsigned long run_start
= find_next_zero_bit(bitmap
, range
, 0);
2417 while (run_start
< range
) {
2418 unsigned long run_end
= find_next_bit(bitmap
, range
, run_start
+ 1);
2419 ram_discard_range(block
->idstr
,
2420 ((ram_addr_t
)run_start
) << TARGET_PAGE_BITS
,
2421 ((ram_addr_t
)(run_end
- run_start
))
2422 << TARGET_PAGE_BITS
);
2423 run_start
= find_next_zero_bit(bitmap
, range
, run_end
+ 1);
2429 * postcopy_send_discard_bm_ram: discard a RAMBlock
2431 * Returns zero on success
2433 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2435 * @ms: current migration state
2436 * @block: RAMBlock to discard
2438 static int postcopy_send_discard_bm_ram(MigrationState
*ms
, RAMBlock
*block
)
2440 unsigned long end
= block
->used_length
>> TARGET_PAGE_BITS
;
2441 unsigned long current
;
2442 unsigned long *bitmap
= block
->bmap
;
2444 for (current
= 0; current
< end
; ) {
2445 unsigned long one
= find_next_bit(bitmap
, end
, current
);
2446 unsigned long zero
, discard_length
;
2452 zero
= find_next_zero_bit(bitmap
, end
, one
+ 1);
2455 discard_length
= end
- one
;
2457 discard_length
= zero
- one
;
2459 postcopy_discard_send_range(ms
, one
, discard_length
);
2460 current
= one
+ discard_length
;
2467 * postcopy_each_ram_send_discard: discard all RAMBlocks
2469 * Returns 0 for success or negative for error
2471 * Utility for the outgoing postcopy code.
2472 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2473 * passing it bitmap indexes and name.
2474 * (qemu_ram_foreach_block ends up passing unscaled lengths
2475 * which would mean postcopy code would have to deal with target page)
2477 * @ms: current migration state
2479 static int postcopy_each_ram_send_discard(MigrationState
*ms
)
2481 struct RAMBlock
*block
;
2484 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2485 postcopy_discard_send_init(ms
, block
->idstr
);
2488 * Postcopy sends chunks of bitmap over the wire, but it
2489 * just needs indexes at this point, avoids it having
2490 * target page specific code.
2492 ret
= postcopy_send_discard_bm_ram(ms
, block
);
2493 postcopy_discard_send_finish(ms
);
2503 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2505 * Helper for postcopy_chunk_hostpages; it's called twice to
2506 * canonicalize the two bitmaps, that are similar, but one is
2509 * Postcopy requires that all target pages in a hostpage are dirty or
2510 * clean, not a mix. This function canonicalizes the bitmaps.
2512 * @ms: current migration state
2513 * @block: block that contains the page we want to canonicalize
2515 static void postcopy_chunk_hostpages_pass(MigrationState
*ms
, RAMBlock
*block
)
2517 RAMState
*rs
= ram_state
;
2518 unsigned long *bitmap
= block
->bmap
;
2519 unsigned int host_ratio
= block
->page_size
/ TARGET_PAGE_SIZE
;
2520 unsigned long pages
= block
->used_length
>> TARGET_PAGE_BITS
;
2521 unsigned long run_start
;
2523 if (block
->page_size
== TARGET_PAGE_SIZE
) {
2524 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2528 /* Find a dirty page */
2529 run_start
= find_next_bit(bitmap
, pages
, 0);
2531 while (run_start
< pages
) {
2534 * If the start of this run of pages is in the middle of a host
2535 * page, then we need to fixup this host page.
2537 if (QEMU_IS_ALIGNED(run_start
, host_ratio
)) {
2538 /* Find the end of this run */
2539 run_start
= find_next_zero_bit(bitmap
, pages
, run_start
+ 1);
2541 * If the end isn't at the start of a host page, then the
2542 * run doesn't finish at the end of a host page
2543 * and we need to discard.
2547 if (!QEMU_IS_ALIGNED(run_start
, host_ratio
)) {
2549 unsigned long fixup_start_addr
= QEMU_ALIGN_DOWN(run_start
,
2551 run_start
= QEMU_ALIGN_UP(run_start
, host_ratio
);
2553 /* Clean up the bitmap */
2554 for (page
= fixup_start_addr
;
2555 page
< fixup_start_addr
+ host_ratio
; page
++) {
2557 * Remark them as dirty, updating the count for any pages
2558 * that weren't previously dirty.
2560 rs
->migration_dirty_pages
+= !test_and_set_bit(page
, bitmap
);
2564 /* Find the next dirty page for the next iteration */
2565 run_start
= find_next_bit(bitmap
, pages
, run_start
);
2570 * postcopy_chunk_hostpages: discard any partially sent host page
2572 * Utility for the outgoing postcopy code.
2574 * Discard any partially sent host-page size chunks, mark any partially
2575 * dirty host-page size chunks as all dirty. In this case the host-page
2576 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2578 * Returns zero on success
2580 * @ms: current migration state
2581 * @block: block we want to work with
2583 static int postcopy_chunk_hostpages(MigrationState
*ms
, RAMBlock
*block
)
2585 postcopy_discard_send_init(ms
, block
->idstr
);
2588 * Ensure that all partially dirty host pages are made fully dirty.
2590 postcopy_chunk_hostpages_pass(ms
, block
);
2592 postcopy_discard_send_finish(ms
);
2597 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2599 * Returns zero on success
2601 * Transmit the set of pages to be discarded after precopy to the target
2602 * these are pages that:
2603 * a) Have been previously transmitted but are now dirty again
2604 * b) Pages that have never been transmitted, this ensures that
2605 * any pages on the destination that have been mapped by background
2606 * tasks get discarded (transparent huge pages is the specific concern)
2607 * Hopefully this is pretty sparse
2609 * @ms: current migration state
2611 int ram_postcopy_send_discard_bitmap(MigrationState
*ms
)
2613 RAMState
*rs
= ram_state
;
2617 RCU_READ_LOCK_GUARD();
2619 /* This should be our last sync, the src is now paused */
2620 migration_bitmap_sync(rs
);
2622 /* Easiest way to make sure we don't resume in the middle of a host-page */
2623 rs
->last_seen_block
= NULL
;
2624 rs
->last_sent_block
= NULL
;
2627 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2628 /* Deal with TPS != HPS and huge pages */
2629 ret
= postcopy_chunk_hostpages(ms
, block
);
2634 #ifdef DEBUG_POSTCOPY
2635 ram_debug_dump_bitmap(block
->bmap
, true,
2636 block
->used_length
>> TARGET_PAGE_BITS
);
2639 trace_ram_postcopy_send_discard_bitmap();
2641 return postcopy_each_ram_send_discard(ms
);
2645 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2647 * Returns zero on success
2649 * @rbname: name of the RAMBlock of the request. NULL means the
2650 * same that last one.
2651 * @start: RAMBlock starting page
2652 * @length: RAMBlock size
2654 int ram_discard_range(const char *rbname
, uint64_t start
, size_t length
)
2656 trace_ram_discard_range(rbname
, start
, length
);
2658 RCU_READ_LOCK_GUARD();
2659 RAMBlock
*rb
= qemu_ram_block_by_name(rbname
);
2662 error_report("ram_discard_range: Failed to find block '%s'", rbname
);
2667 * On source VM, we don't need to update the received bitmap since
2668 * we don't even have one.
2670 if (rb
->receivedmap
) {
2671 bitmap_clear(rb
->receivedmap
, start
>> qemu_target_page_bits(),
2672 length
>> qemu_target_page_bits());
2675 return ram_block_discard_range(rb
, start
, length
);
2679 * For every allocation, we will try not to crash the VM if the
2680 * allocation failed.
2682 static int xbzrle_init(void)
2684 Error
*local_err
= NULL
;
2686 if (!migrate_use_xbzrle()) {
2690 XBZRLE_cache_lock();
2692 XBZRLE
.zero_target_page
= g_try_malloc0(TARGET_PAGE_SIZE
);
2693 if (!XBZRLE
.zero_target_page
) {
2694 error_report("%s: Error allocating zero page", __func__
);
2698 XBZRLE
.cache
= cache_init(migrate_xbzrle_cache_size(),
2699 TARGET_PAGE_SIZE
, &local_err
);
2700 if (!XBZRLE
.cache
) {
2701 error_report_err(local_err
);
2702 goto free_zero_page
;
2705 XBZRLE
.encoded_buf
= g_try_malloc0(TARGET_PAGE_SIZE
);
2706 if (!XBZRLE
.encoded_buf
) {
2707 error_report("%s: Error allocating encoded_buf", __func__
);
2711 XBZRLE
.current_buf
= g_try_malloc(TARGET_PAGE_SIZE
);
2712 if (!XBZRLE
.current_buf
) {
2713 error_report("%s: Error allocating current_buf", __func__
);
2714 goto free_encoded_buf
;
2717 /* We are all good */
2718 XBZRLE_cache_unlock();
2722 g_free(XBZRLE
.encoded_buf
);
2723 XBZRLE
.encoded_buf
= NULL
;
2725 cache_fini(XBZRLE
.cache
);
2726 XBZRLE
.cache
= NULL
;
2728 g_free(XBZRLE
.zero_target_page
);
2729 XBZRLE
.zero_target_page
= NULL
;
2731 XBZRLE_cache_unlock();
2735 static int ram_state_init(RAMState
**rsp
)
2737 *rsp
= g_try_new0(RAMState
, 1);
2740 error_report("%s: Init ramstate fail", __func__
);
2744 qemu_mutex_init(&(*rsp
)->bitmap_mutex
);
2745 qemu_mutex_init(&(*rsp
)->src_page_req_mutex
);
2746 QSIMPLEQ_INIT(&(*rsp
)->src_page_requests
);
2749 * Count the total number of pages used by ram blocks not including any
2750 * gaps due to alignment or unplugs.
2751 * This must match with the initial values of dirty bitmap.
2753 (*rsp
)->migration_dirty_pages
= ram_bytes_total() >> TARGET_PAGE_BITS
;
2754 ram_state_reset(*rsp
);
2759 static void ram_list_init_bitmaps(void)
2761 MigrationState
*ms
= migrate_get_current();
2763 unsigned long pages
;
2766 /* Skip setting bitmap if there is no RAM */
2767 if (ram_bytes_total()) {
2768 shift
= ms
->clear_bitmap_shift
;
2769 if (shift
> CLEAR_BITMAP_SHIFT_MAX
) {
2770 error_report("clear_bitmap_shift (%u) too big, using "
2771 "max value (%u)", shift
, CLEAR_BITMAP_SHIFT_MAX
);
2772 shift
= CLEAR_BITMAP_SHIFT_MAX
;
2773 } else if (shift
< CLEAR_BITMAP_SHIFT_MIN
) {
2774 error_report("clear_bitmap_shift (%u) too small, using "
2775 "min value (%u)", shift
, CLEAR_BITMAP_SHIFT_MIN
);
2776 shift
= CLEAR_BITMAP_SHIFT_MIN
;
2779 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2780 pages
= block
->max_length
>> TARGET_PAGE_BITS
;
2782 * The initial dirty bitmap for migration must be set with all
2783 * ones to make sure we'll migrate every guest RAM page to
2785 * Here we set RAMBlock.bmap all to 1 because when rebegin a
2786 * new migration after a failed migration, ram_list.
2787 * dirty_memory[DIRTY_MEMORY_MIGRATION] don't include the whole
2790 block
->bmap
= bitmap_new(pages
);
2791 bitmap_set(block
->bmap
, 0, pages
);
2792 block
->clear_bmap_shift
= shift
;
2793 block
->clear_bmap
= bitmap_new(clear_bmap_size(pages
, shift
));
2798 static void migration_bitmap_clear_discarded_pages(RAMState
*rs
)
2800 unsigned long pages
;
2803 RCU_READ_LOCK_GUARD();
2805 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
2806 pages
= ramblock_dirty_bitmap_clear_discarded_pages(rb
);
2807 rs
->migration_dirty_pages
-= pages
;
2811 static void ram_init_bitmaps(RAMState
*rs
)
2813 /* For memory_global_dirty_log_start below. */
2814 qemu_mutex_lock_iothread();
2815 qemu_mutex_lock_ramlist();
2817 WITH_RCU_READ_LOCK_GUARD() {
2818 ram_list_init_bitmaps();
2819 /* We don't use dirty log with background snapshots */
2820 if (!migrate_background_snapshot()) {
2821 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION
);
2822 migration_bitmap_sync_precopy(rs
);
2825 qemu_mutex_unlock_ramlist();
2826 qemu_mutex_unlock_iothread();
2829 * After an eventual first bitmap sync, fixup the initial bitmap
2830 * containing all 1s to exclude any discarded pages from migration.
2832 migration_bitmap_clear_discarded_pages(rs
);
2835 static int ram_init_all(RAMState
**rsp
)
2837 if (ram_state_init(rsp
)) {
2841 if (xbzrle_init()) {
2842 ram_state_cleanup(rsp
);
2846 ram_init_bitmaps(*rsp
);
2851 static void ram_state_resume_prepare(RAMState
*rs
, QEMUFile
*out
)
2857 * Postcopy is not using xbzrle/compression, so no need for that.
2858 * Also, since source are already halted, we don't need to care
2859 * about dirty page logging as well.
2862 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
2863 pages
+= bitmap_count_one(block
->bmap
,
2864 block
->used_length
>> TARGET_PAGE_BITS
);
2867 /* This may not be aligned with current bitmaps. Recalculate. */
2868 rs
->migration_dirty_pages
= pages
;
2870 ram_state_reset(rs
);
2872 /* Update RAMState cache of output QEMUFile */
2875 trace_ram_state_resume_prepare(pages
);
2879 * This function clears bits of the free pages reported by the caller from the
2880 * migration dirty bitmap. @addr is the host address corresponding to the
2881 * start of the continuous guest free pages, and @len is the total bytes of
2884 void qemu_guest_free_page_hint(void *addr
, size_t len
)
2888 size_t used_len
, start
, npages
;
2889 MigrationState
*s
= migrate_get_current();
2891 /* This function is currently expected to be used during live migration */
2892 if (!migration_is_setup_or_active(s
->state
)) {
2896 for (; len
> 0; len
-= used_len
, addr
+= used_len
) {
2897 block
= qemu_ram_block_from_host(addr
, false, &offset
);
2898 if (unlikely(!block
|| offset
>= block
->used_length
)) {
2900 * The implementation might not support RAMBlock resize during
2901 * live migration, but it could happen in theory with future
2902 * updates. So we add a check here to capture that case.
2904 error_report_once("%s unexpected error", __func__
);
2908 if (len
<= block
->used_length
- offset
) {
2911 used_len
= block
->used_length
- offset
;
2914 start
= offset
>> TARGET_PAGE_BITS
;
2915 npages
= used_len
>> TARGET_PAGE_BITS
;
2917 qemu_mutex_lock(&ram_state
->bitmap_mutex
);
2919 * The skipped free pages are equivalent to being sent from clear_bmap's
2920 * perspective, so clear the bits from the memory region bitmap which
2921 * are initially set. Otherwise those skipped pages will be sent in
2922 * the next round after syncing from the memory region bitmap.
2924 migration_clear_memory_region_dirty_bitmap_range(block
, start
, npages
);
2925 ram_state
->migration_dirty_pages
-=
2926 bitmap_count_one_with_offset(block
->bmap
, start
, npages
);
2927 bitmap_clear(block
->bmap
, start
, npages
);
2928 qemu_mutex_unlock(&ram_state
->bitmap_mutex
);
2933 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2934 * long-running RCU critical section. When rcu-reclaims in the code
2935 * start to become numerous it will be necessary to reduce the
2936 * granularity of these critical sections.
2940 * ram_save_setup: Setup RAM for migration
2942 * Returns zero to indicate success and negative for error
2944 * @f: QEMUFile where to send the data
2945 * @opaque: RAMState pointer
2947 static int ram_save_setup(QEMUFile
*f
, void *opaque
)
2949 RAMState
**rsp
= opaque
;
2952 if (compress_threads_save_setup()) {
2956 /* migration has already setup the bitmap, reuse it. */
2957 if (!migration_in_colo_state()) {
2958 if (ram_init_all(rsp
) != 0) {
2959 compress_threads_save_cleanup();
2965 WITH_RCU_READ_LOCK_GUARD() {
2966 qemu_put_be64(f
, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE
);
2968 RAMBLOCK_FOREACH_MIGRATABLE(block
) {
2969 qemu_put_byte(f
, strlen(block
->idstr
));
2970 qemu_put_buffer(f
, (uint8_t *)block
->idstr
, strlen(block
->idstr
));
2971 qemu_put_be64(f
, block
->used_length
);
2972 if (migrate_postcopy_ram() && block
->page_size
!=
2973 qemu_host_page_size
) {
2974 qemu_put_be64(f
, block
->page_size
);
2976 if (migrate_ignore_shared()) {
2977 qemu_put_be64(f
, block
->mr
->addr
);
2982 ram_control_before_iterate(f
, RAM_CONTROL_SETUP
);
2983 ram_control_after_iterate(f
, RAM_CONTROL_SETUP
);
2985 multifd_send_sync_main(f
);
2986 qemu_put_be64(f
, RAM_SAVE_FLAG_EOS
);
2993 * ram_save_iterate: iterative stage for migration
2995 * Returns zero to indicate success and negative for error
2997 * @f: QEMUFile where to send the data
2998 * @opaque: RAMState pointer
3000 static int ram_save_iterate(QEMUFile
*f
, void *opaque
)
3002 RAMState
**temp
= opaque
;
3003 RAMState
*rs
= *temp
;
3009 if (blk_mig_bulk_active()) {
3010 /* Avoid transferring ram during bulk phase of block migration as
3011 * the bulk phase will usually take a long time and transferring
3012 * ram updates during that time is pointless. */
3017 * We'll take this lock a little bit long, but it's okay for two reasons.
3018 * Firstly, the only possible other thread to take it is who calls
3019 * qemu_guest_free_page_hint(), which should be rare; secondly, see
3020 * MAX_WAIT (if curious, further see commit 4508bd9ed8053ce) below, which
3021 * guarantees that we'll at least released it in a regular basis.
3023 qemu_mutex_lock(&rs
->bitmap_mutex
);
3024 WITH_RCU_READ_LOCK_GUARD() {
3025 if (ram_list
.version
!= rs
->last_version
) {
3026 ram_state_reset(rs
);
3029 /* Read version before ram_list.blocks */
3032 ram_control_before_iterate(f
, RAM_CONTROL_ROUND
);
3034 t0
= qemu_clock_get_ns(QEMU_CLOCK_REALTIME
);
3036 while ((ret
= qemu_file_rate_limit(f
)) == 0 ||
3037 !QSIMPLEQ_EMPTY(&rs
->src_page_requests
)) {
3040 if (qemu_file_get_error(f
)) {
3044 pages
= ram_find_and_save_block(rs
, false);
3045 /* no more pages to send */
3052 qemu_file_set_error(f
, pages
);
3056 rs
->target_page_count
+= pages
;
3059 * During postcopy, it is necessary to make sure one whole host
3060 * page is sent in one chunk.
3062 if (migrate_postcopy_ram()) {
3063 flush_compressed_data(rs
);
3067 * we want to check in the 1st loop, just in case it was the 1st
3068 * time and we had to sync the dirty bitmap.
3069 * qemu_clock_get_ns() is a bit expensive, so we only check each
3072 if ((i
& 63) == 0) {
3073 uint64_t t1
= (qemu_clock_get_ns(QEMU_CLOCK_REALTIME
) - t0
) /
3075 if (t1
> MAX_WAIT
) {
3076 trace_ram_save_iterate_big_wait(t1
, i
);
3083 qemu_mutex_unlock(&rs
->bitmap_mutex
);
3086 * Must occur before EOS (or any QEMUFile operation)
3087 * because of RDMA protocol.
3089 ram_control_after_iterate(f
, RAM_CONTROL_ROUND
);
3093 && migration_is_setup_or_active(migrate_get_current()->state
)) {
3094 multifd_send_sync_main(rs
->f
);
3095 qemu_put_be64(f
, RAM_SAVE_FLAG_EOS
);
3097 ram_counters
.transferred
+= 8;
3099 ret
= qemu_file_get_error(f
);
3109 * ram_save_complete: function called to send the remaining amount of ram
3111 * Returns zero to indicate success or negative on error
3113 * Called with iothread lock
3115 * @f: QEMUFile where to send the data
3116 * @opaque: RAMState pointer
3118 static int ram_save_complete(QEMUFile
*f
, void *opaque
)
3120 RAMState
**temp
= opaque
;
3121 RAMState
*rs
= *temp
;
3124 WITH_RCU_READ_LOCK_GUARD() {
3125 if (!migration_in_postcopy()) {
3126 migration_bitmap_sync_precopy(rs
);
3129 ram_control_before_iterate(f
, RAM_CONTROL_FINISH
);
3131 /* try transferring iterative blocks of memory */
3133 /* flush all remaining blocks regardless of rate limiting */
3137 pages
= ram_find_and_save_block(rs
, !migration_in_colo_state());
3138 /* no more blocks to send */
3148 flush_compressed_data(rs
);
3149 ram_control_after_iterate(f
, RAM_CONTROL_FINISH
);
3153 multifd_send_sync_main(rs
->f
);
3154 qemu_put_be64(f
, RAM_SAVE_FLAG_EOS
);
3161 static void ram_save_pending(QEMUFile
*f
, void *opaque
, uint64_t max_size
,
3162 uint64_t *res_precopy_only
,
3163 uint64_t *res_compatible
,
3164 uint64_t *res_postcopy_only
)
3166 RAMState
**temp
= opaque
;
3167 RAMState
*rs
= *temp
;
3168 uint64_t remaining_size
;
3170 remaining_size
= rs
->migration_dirty_pages
* TARGET_PAGE_SIZE
;
3172 if (!migration_in_postcopy() &&
3173 remaining_size
< max_size
) {
3174 qemu_mutex_lock_iothread();
3175 WITH_RCU_READ_LOCK_GUARD() {
3176 migration_bitmap_sync_precopy(rs
);
3178 qemu_mutex_unlock_iothread();
3179 remaining_size
= rs
->migration_dirty_pages
* TARGET_PAGE_SIZE
;
3182 if (migrate_postcopy_ram()) {
3183 /* We can do postcopy, and all the data is postcopiable */
3184 *res_compatible
+= remaining_size
;
3186 *res_precopy_only
+= remaining_size
;
3190 static int load_xbzrle(QEMUFile
*f
, ram_addr_t addr
, void *host
)
3192 unsigned int xh_len
;
3194 uint8_t *loaded_data
;
3196 /* extract RLE header */
3197 xh_flags
= qemu_get_byte(f
);
3198 xh_len
= qemu_get_be16(f
);
3200 if (xh_flags
!= ENCODING_FLAG_XBZRLE
) {
3201 error_report("Failed to load XBZRLE page - wrong compression!");
3205 if (xh_len
> TARGET_PAGE_SIZE
) {
3206 error_report("Failed to load XBZRLE page - len overflow!");
3209 loaded_data
= XBZRLE
.decoded_buf
;
3210 /* load data and decode */
3211 /* it can change loaded_data to point to an internal buffer */
3212 qemu_get_buffer_in_place(f
, &loaded_data
, xh_len
);
3215 if (xbzrle_decode_buffer(loaded_data
, xh_len
, host
,
3216 TARGET_PAGE_SIZE
) == -1) {
3217 error_report("Failed to load XBZRLE page - decode error!");
3225 * ram_block_from_stream: read a RAMBlock id from the migration stream
3227 * Must be called from within a rcu critical section.
3229 * Returns a pointer from within the RCU-protected ram_list.
3231 * @f: QEMUFile where to read the data from
3232 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3234 static inline RAMBlock
*ram_block_from_stream(QEMUFile
*f
, int flags
)
3236 static RAMBlock
*block
;
3240 if (flags
& RAM_SAVE_FLAG_CONTINUE
) {
3242 error_report("Ack, bad migration stream!");
3248 len
= qemu_get_byte(f
);
3249 qemu_get_buffer(f
, (uint8_t *)id
, len
);
3252 block
= qemu_ram_block_by_name(id
);
3254 error_report("Can't find block %s", id
);
3258 if (ramblock_is_ignored(block
)) {
3259 error_report("block %s should not be migrated !", id
);
3266 static inline void *host_from_ram_block_offset(RAMBlock
*block
,
3269 if (!offset_in_ramblock(block
, offset
)) {
3273 return block
->host
+ offset
;
3276 static void *host_page_from_ram_block_offset(RAMBlock
*block
,
3279 /* Note: Explicitly no check against offset_in_ramblock(). */
3280 return (void *)QEMU_ALIGN_DOWN((uintptr_t)(block
->host
+ offset
),
3284 static ram_addr_t
host_page_offset_from_ram_block_offset(RAMBlock
*block
,
3287 return ((uintptr_t)block
->host
+ offset
) & (block
->page_size
- 1);
3290 static inline void *colo_cache_from_block_offset(RAMBlock
*block
,
3291 ram_addr_t offset
, bool record_bitmap
)
3293 if (!offset_in_ramblock(block
, offset
)) {
3296 if (!block
->colo_cache
) {
3297 error_report("%s: colo_cache is NULL in block :%s",
3298 __func__
, block
->idstr
);
3303 * During colo checkpoint, we need bitmap of these migrated pages.
3304 * It helps us to decide which pages in ram cache should be flushed
3305 * into VM's RAM later.
3307 if (record_bitmap
&&
3308 !test_and_set_bit(offset
>> TARGET_PAGE_BITS
, block
->bmap
)) {
3309 ram_state
->migration_dirty_pages
++;
3311 return block
->colo_cache
+ offset
;
3315 * ram_handle_compressed: handle the zero page case
3317 * If a page (or a whole RDMA chunk) has been
3318 * determined to be zero, then zap it.
3320 * @host: host address for the zero page
3321 * @ch: what the page is filled from. We only support zero
3322 * @size: size of the zero page
3324 void ram_handle_compressed(void *host
, uint8_t ch
, uint64_t size
)
3326 if (ch
!= 0 || !is_zero_range(host
, size
)) {
3327 memset(host
, ch
, size
);
3331 /* return the size after decompression, or negative value on error */
3333 qemu_uncompress_data(z_stream
*stream
, uint8_t *dest
, size_t dest_len
,
3334 const uint8_t *source
, size_t source_len
)
3338 err
= inflateReset(stream
);
3343 stream
->avail_in
= source_len
;
3344 stream
->next_in
= (uint8_t *)source
;
3345 stream
->avail_out
= dest_len
;
3346 stream
->next_out
= dest
;
3348 err
= inflate(stream
, Z_NO_FLUSH
);
3349 if (err
!= Z_STREAM_END
) {
3353 return stream
->total_out
;
3356 static void *do_data_decompress(void *opaque
)
3358 DecompressParam
*param
= opaque
;
3359 unsigned long pagesize
;
3363 qemu_mutex_lock(¶m
->mutex
);
3364 while (!param
->quit
) {
3369 qemu_mutex_unlock(¶m
->mutex
);
3371 pagesize
= TARGET_PAGE_SIZE
;
3373 ret
= qemu_uncompress_data(¶m
->stream
, des
, pagesize
,
3374 param
->compbuf
, len
);
3375 if (ret
< 0 && migrate_get_current()->decompress_error_check
) {
3376 error_report("decompress data failed");
3377 qemu_file_set_error(decomp_file
, ret
);
3380 qemu_mutex_lock(&decomp_done_lock
);
3382 qemu_cond_signal(&decomp_done_cond
);
3383 qemu_mutex_unlock(&decomp_done_lock
);
3385 qemu_mutex_lock(¶m
->mutex
);
3387 qemu_cond_wait(¶m
->cond
, ¶m
->mutex
);
3390 qemu_mutex_unlock(¶m
->mutex
);
3395 static int wait_for_decompress_done(void)
3397 int idx
, thread_count
;
3399 if (!migrate_use_compression()) {
3403 thread_count
= migrate_decompress_threads();
3404 qemu_mutex_lock(&decomp_done_lock
);
3405 for (idx
= 0; idx
< thread_count
; idx
++) {
3406 while (!decomp_param
[idx
].done
) {
3407 qemu_cond_wait(&decomp_done_cond
, &decomp_done_lock
);
3410 qemu_mutex_unlock(&decomp_done_lock
);
3411 return qemu_file_get_error(decomp_file
);
3414 static void compress_threads_load_cleanup(void)
3416 int i
, thread_count
;
3418 if (!migrate_use_compression()) {
3421 thread_count
= migrate_decompress_threads();
3422 for (i
= 0; i
< thread_count
; i
++) {
3424 * we use it as an indicator which shows if the thread is
3425 * properly init'd or not
3427 if (!decomp_param
[i
].compbuf
) {
3431 qemu_mutex_lock(&decomp_param
[i
].mutex
);
3432 decomp_param
[i
].quit
= true;
3433 qemu_cond_signal(&decomp_param
[i
].cond
);
3434 qemu_mutex_unlock(&decomp_param
[i
].mutex
);
3436 for (i
= 0; i
< thread_count
; i
++) {
3437 if (!decomp_param
[i
].compbuf
) {
3441 qemu_thread_join(decompress_threads
+ i
);
3442 qemu_mutex_destroy(&decomp_param
[i
].mutex
);
3443 qemu_cond_destroy(&decomp_param
[i
].cond
);
3444 inflateEnd(&decomp_param
[i
].stream
);
3445 g_free(decomp_param
[i
].compbuf
);
3446 decomp_param
[i
].compbuf
= NULL
;
3448 g_free(decompress_threads
);
3449 g_free(decomp_param
);
3450 decompress_threads
= NULL
;
3451 decomp_param
= NULL
;
3455 static int compress_threads_load_setup(QEMUFile
*f
)
3457 int i
, thread_count
;
3459 if (!migrate_use_compression()) {
3463 thread_count
= migrate_decompress_threads();
3464 decompress_threads
= g_new0(QemuThread
, thread_count
);
3465 decomp_param
= g_new0(DecompressParam
, thread_count
);
3466 qemu_mutex_init(&decomp_done_lock
);
3467 qemu_cond_init(&decomp_done_cond
);
3469 for (i
= 0; i
< thread_count
; i
++) {
3470 if (inflateInit(&decomp_param
[i
].stream
) != Z_OK
) {
3474 decomp_param
[i
].compbuf
= g_malloc0(compressBound(TARGET_PAGE_SIZE
));
3475 qemu_mutex_init(&decomp_param
[i
].mutex
);
3476 qemu_cond_init(&decomp_param
[i
].cond
);
3477 decomp_param
[i
].done
= true;
3478 decomp_param
[i
].quit
= false;
3479 qemu_thread_create(decompress_threads
+ i
, "decompress",
3480 do_data_decompress
, decomp_param
+ i
,
3481 QEMU_THREAD_JOINABLE
);
3485 compress_threads_load_cleanup();
3489 static void decompress_data_with_multi_threads(QEMUFile
*f
,
3490 void *host
, int len
)
3492 int idx
, thread_count
;
3494 thread_count
= migrate_decompress_threads();
3495 QEMU_LOCK_GUARD(&decomp_done_lock
);
3497 for (idx
= 0; idx
< thread_count
; idx
++) {
3498 if (decomp_param
[idx
].done
) {
3499 decomp_param
[idx
].done
= false;
3500 qemu_mutex_lock(&decomp_param
[idx
].mutex
);
3501 qemu_get_buffer(f
, decomp_param
[idx
].compbuf
, len
);
3502 decomp_param
[idx
].des
= host
;
3503 decomp_param
[idx
].len
= len
;
3504 qemu_cond_signal(&decomp_param
[idx
].cond
);
3505 qemu_mutex_unlock(&decomp_param
[idx
].mutex
);
3509 if (idx
< thread_count
) {
3512 qemu_cond_wait(&decomp_done_cond
, &decomp_done_lock
);
3517 static void colo_init_ram_state(void)
3519 ram_state_init(&ram_state
);
3523 * colo cache: this is for secondary VM, we cache the whole
3524 * memory of the secondary VM, it is necessary to hold the global lock
3525 * to call this helper.
3527 int colo_init_ram_cache(void)
3531 WITH_RCU_READ_LOCK_GUARD() {
3532 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3533 block
->colo_cache
= qemu_anon_ram_alloc(block
->used_length
,
3534 NULL
, false, false);
3535 if (!block
->colo_cache
) {
3536 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3537 "size 0x" RAM_ADDR_FMT
, __func__
, block
->idstr
,
3538 block
->used_length
);
3539 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3540 if (block
->colo_cache
) {
3541 qemu_anon_ram_free(block
->colo_cache
, block
->used_length
);
3542 block
->colo_cache
= NULL
;
3547 if (!machine_dump_guest_core(current_machine
)) {
3548 qemu_madvise(block
->colo_cache
, block
->used_length
,
3549 QEMU_MADV_DONTDUMP
);
3555 * Record the dirty pages that sent by PVM, we use this dirty bitmap together
3556 * with to decide which page in cache should be flushed into SVM's RAM. Here
3557 * we use the same name 'ram_bitmap' as for migration.
3559 if (ram_bytes_total()) {
3562 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3563 unsigned long pages
= block
->max_length
>> TARGET_PAGE_BITS
;
3564 block
->bmap
= bitmap_new(pages
);
3568 colo_init_ram_state();
3572 /* TODO: duplicated with ram_init_bitmaps */
3573 void colo_incoming_start_dirty_log(void)
3575 RAMBlock
*block
= NULL
;
3576 /* For memory_global_dirty_log_start below. */
3577 qemu_mutex_lock_iothread();
3578 qemu_mutex_lock_ramlist();
3580 memory_global_dirty_log_sync();
3581 WITH_RCU_READ_LOCK_GUARD() {
3582 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3583 ramblock_sync_dirty_bitmap(ram_state
, block
);
3584 /* Discard this dirty bitmap record */
3585 bitmap_zero(block
->bmap
, block
->max_length
>> TARGET_PAGE_BITS
);
3587 memory_global_dirty_log_start(GLOBAL_DIRTY_MIGRATION
);
3589 ram_state
->migration_dirty_pages
= 0;
3590 qemu_mutex_unlock_ramlist();
3591 qemu_mutex_unlock_iothread();
3594 /* It is necessary to hold the global lock to call this helper */
3595 void colo_release_ram_cache(void)
3599 memory_global_dirty_log_stop(GLOBAL_DIRTY_MIGRATION
);
3600 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3601 g_free(block
->bmap
);
3605 WITH_RCU_READ_LOCK_GUARD() {
3606 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3607 if (block
->colo_cache
) {
3608 qemu_anon_ram_free(block
->colo_cache
, block
->used_length
);
3609 block
->colo_cache
= NULL
;
3613 ram_state_cleanup(&ram_state
);
3617 * ram_load_setup: Setup RAM for migration incoming side
3619 * Returns zero to indicate success and negative for error
3621 * @f: QEMUFile where to receive the data
3622 * @opaque: RAMState pointer
3624 static int ram_load_setup(QEMUFile
*f
, void *opaque
)
3626 if (compress_threads_load_setup(f
)) {
3630 xbzrle_load_setup();
3631 ramblock_recv_map_init();
3636 static int ram_load_cleanup(void *opaque
)
3640 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
3641 qemu_ram_block_writeback(rb
);
3644 xbzrle_load_cleanup();
3645 compress_threads_load_cleanup();
3647 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
3648 g_free(rb
->receivedmap
);
3649 rb
->receivedmap
= NULL
;
3656 * ram_postcopy_incoming_init: allocate postcopy data structures
3658 * Returns 0 for success and negative if there was one error
3660 * @mis: current migration incoming state
3662 * Allocate data structures etc needed by incoming migration with
3663 * postcopy-ram. postcopy-ram's similarly names
3664 * postcopy_ram_incoming_init does the work.
3666 int ram_postcopy_incoming_init(MigrationIncomingState
*mis
)
3668 return postcopy_ram_incoming_init(mis
);
3672 * ram_load_postcopy: load a page in postcopy case
3674 * Returns 0 for success or -errno in case of error
3676 * Called in postcopy mode by ram_load().
/*
 * NOTE(review): this extracted chunk is line-mangled — statements are
 * split across lines, leading numbers are extraction artifacts, and some
 * original lines are missing.  Code tokens are kept byte-identical;
 * only comments were added.
 *
 * ram_load_postcopy: load an incoming page stream while in postcopy
 * mode.  Target-page-sized chunks are accumulated in the temporary page
 * mis->postcopy_tmp_page and, once a whole host page has been received,
 * atomically "placed" into guest memory via postcopy_place_page() or
 * postcopy_place_page_zero().  Returns 0 on success, negative on a
 * stream or placement error.
 */
3677 * rcu_read_lock is taken prior to this being called.
3679 * @f: QEMUFile where to send the data
3681 static int ram_load_postcopy(QEMUFile
*f
)
3683 int flags
= 0, ret
= 0;
3684 bool place_needed
= false;
3685 bool matches_target_page_size
= false;
3686 MigrationIncomingState
*mis
= migration_incoming_get_current();
3687 /* Temporary page that is later 'placed' */
3688 void *postcopy_host_page
= mis
->postcopy_tmp_page
;
3689 void *host_page
= NULL
;
3690 bool all_zero
= true;
3691 int target_pages
= 0;
/* Main receive loop: one header + payload per iteration, until EOS. */
3693 while (!ret
&& !(flags
& RAM_SAVE_FLAG_EOS
)) {
3695 void *page_buffer
= NULL
;
3696 void *place_source
= NULL
;
3697 RAMBlock
*block
= NULL
;
3701 addr
= qemu_get_be64(f
);
3704 * If qemu file error, we should stop here, and then "addr"
3707 ret
= qemu_file_get_error(f
);
/* The low bits of the header word carry the flags; the rest is the offset. */
3712 flags
= addr
& ~TARGET_PAGE_MASK
;
3713 addr
&= TARGET_PAGE_MASK
;
3715 trace_ram_load_postcopy_loop((uint64_t)addr
, flags
);
3716 if (flags
& (RAM_SAVE_FLAG_ZERO
| RAM_SAVE_FLAG_PAGE
|
3717 RAM_SAVE_FLAG_COMPRESS_PAGE
)) {
3718 block
= ram_block_from_stream(f
, flags
);
3725 * Relying on used_length is racy and can result in false positives.
3726 * We might place pages beyond used_length in case RAM was shrunk
3727 * while in postcopy, which is fine - trying to place via
3728 * UFFDIO_COPY/UFFDIO_ZEROPAGE will never segfault.
3730 if (!block
->host
|| addr
>= block
->postcopy_length
) {
3731 error_report("Illegal RAM offset " RAM_ADDR_FMT
, addr
);
3736 matches_target_page_size
= block
->page_size
== TARGET_PAGE_SIZE
;
3738 * Postcopy requires that we place whole host pages atomically;
3739 * these may be huge pages for RAMBlocks that are backed by
3741 * To make it atomic, the data is read into a temporary page
3742 * that's moved into place later.
3743 * The migration protocol uses, possibly smaller, target-pages
3744 * however the source ensures it always sends all the components
3745 * of a host page in one chunk.
3747 page_buffer
= postcopy_host_page
+
3748 host_page_offset_from_ram_block_offset(block
, addr
);
3749 /* If all TP are zero then we can optimise the place */
3750 if (target_pages
== 1) {
3751 host_page
= host_page_from_ram_block_offset(block
, addr
);
3752 } else if (host_page
!= host_page_from_ram_block_offset(block
,
3754 /* not the 1st TP within the HP */
3755 error_report("Non-same host page %p/%p", host_page
,
3756 host_page_from_ram_block_offset(block
, addr
));
3762 * If it's the last part of a host page then we place the host
3765 if (target_pages
== (block
->page_size
/ TARGET_PAGE_SIZE
)) {
3766 place_needed
= true;
3768 place_source
= postcopy_host_page
;
/* Dispatch on the page-type flag (the CONTINUE bit is masked off). */
3771 switch (flags
& ~RAM_SAVE_FLAG_CONTINUE
) {
3772 case RAM_SAVE_FLAG_ZERO
:
3773 ch
= qemu_get_byte(f
);
3775 * Can skip to set page_buffer when
3776 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3778 if (ch
|| !matches_target_page_size
) {
3779 memset(page_buffer
, ch
, TARGET_PAGE_SIZE
);
3786 case RAM_SAVE_FLAG_PAGE
:
3788 if (!matches_target_page_size
) {
3789 /* For huge pages, we always use temporary buffer */
3790 qemu_get_buffer(f
, page_buffer
, TARGET_PAGE_SIZE
);
3793 * For small pages that matches target page size, we
3794 * avoid the qemu_file copy. Instead we directly use
3795 * the buffer of QEMUFile to place the page. Note: we
3796 * cannot do any QEMUFile operation before using that
3797 * buffer to make sure the buffer is valid when
3800 qemu_get_buffer_in_place(f
, (uint8_t **)&place_source
,
3804 case RAM_SAVE_FLAG_COMPRESS_PAGE
:
3806 len
= qemu_get_be32(f
);
/* Bound-check the compressed payload before handing it to the decompressor. */
3807 if (len
< 0 || len
> compressBound(TARGET_PAGE_SIZE
)) {
3808 error_report("Invalid compressed data length: %d", len
);
3812 decompress_data_with_multi_threads(f
, page_buffer
, len
);
3815 case RAM_SAVE_FLAG_EOS
:
3817 multifd_recv_sync_main();
3820 error_report("Unknown combination of migration flags: 0x%x"
3821 " (postcopy mode)", flags
);
3826 /* Got the whole host page, wait for decompress before placing. */
3828 ret
|= wait_for_decompress_done();
3831 /* Detect for any possible file errors */
3832 if (!ret
&& qemu_file_get_error(f
)) {
3833 ret
= qemu_file_get_error(f
);
/* Place the completed host page: zero-page fast path or full copy. */
3836 if (!ret
&& place_needed
) {
3838 ret
= postcopy_place_page_zero(mis
, host_page
, block
);
3840 ret
= postcopy_place_page(mis
, host_page
, place_source
,
3843 place_needed
= false;
3845 /* Assume we have a zero page until we detect something different */
3853 static bool postcopy_is_advised(void)
3855 PostcopyState ps
= postcopy_state_get();
3856 return ps
>= POSTCOPY_INCOMING_ADVISE
&& ps
< POSTCOPY_INCOMING_END
;
3859 static bool postcopy_is_running(void)
3861 PostcopyState ps
= postcopy_state_get();
3862 return ps
>= POSTCOPY_INCOMING_LISTENING
&& ps
< POSTCOPY_INCOMING_END
;
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  Some original lines (local declarations such as
 * dst_host/src_host and the inner loop header) are not visible here.
 *
 * colo_flush_ram_cache: under ram_state->bitmap_mutex and RCU, sync the
 * global dirty log into the per-block bitmaps, then copy every dirty
 * page from block->colo_cache into block->host (SVM memory).
 */
3866 * Flush content of RAM cache into SVM's memory.
3867 * Only flush the pages that be dirtied by PVM or SVM or both.
3869 void colo_flush_ram_cache(void)
3871 RAMBlock
*block
= NULL
;
3874 unsigned long offset
= 0;
/* First pass: fold the dirty log into each block's migration bitmap. */
3876 memory_global_dirty_log_sync();
3877 qemu_mutex_lock(&ram_state
->bitmap_mutex
);
3878 WITH_RCU_READ_LOCK_GUARD() {
3879 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
3880 ramblock_sync_dirty_bitmap(ram_state
, block
);
3884 trace_colo_flush_ram_cache_begin(ram_state
->migration_dirty_pages
);
/* Second pass: walk the block list, copying every dirty page from the
 * COLO cache into SVM memory and clearing its dirty bit. */
3885 WITH_RCU_READ_LOCK_GUARD() {
3886 block
= QLIST_FIRST_RCU(&ram_list
.blocks
);
3889 offset
= migration_bitmap_find_dirty(ram_state
, block
, offset
);
3891 if (!offset_in_ramblock(block
,
3892 ((ram_addr_t
)offset
) << TARGET_PAGE_BITS
)) {
3894 block
= QLIST_NEXT_RCU(block
, next
);
3896 migration_bitmap_clear_dirty(ram_state
, block
, offset
);
3897 dst_host
= block
->host
3898 + (((ram_addr_t
)offset
) << TARGET_PAGE_BITS
);
3899 src_host
= block
->colo_cache
3900 + (((ram_addr_t
)offset
) << TARGET_PAGE_BITS
);
3901 memcpy(dst_host
, src_host
, TARGET_PAGE_SIZE
);
3905 trace_colo_flush_ram_cache_end();
3906 qemu_mutex_unlock(&ram_state
->bitmap_mutex
);
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  Several original lines (break statements, error
 * returns, some declarations such as ch/id/length) are not visible.
 */
3910 * ram_load_precopy: load pages in precopy case
3912 * Returns 0 for success or -errno in case of error
3914 * Called in precopy mode by ram_load().
3915 * rcu_read_lock is taken prior to this being called.
3917 * @f: QEMUFile where to send the data
3919 static int ram_load_precopy(QEMUFile
*f
)
3921 int flags
= 0, ret
= 0, invalid_flags
= 0, len
= 0, i
= 0;
3922 /* ADVISE is earlier, it shows the source has the postcopy capability on */
3923 bool postcopy_advised
= postcopy_is_advised();
/* Compressed pages are only legal when the compression capability is on. */
3924 if (!migrate_use_compression()) {
3925 invalid_flags
|= RAM_SAVE_FLAG_COMPRESS_PAGE
;
/* Main receive loop: one header + payload per iteration, until EOS. */
3928 while (!ret
&& !(flags
& RAM_SAVE_FLAG_EOS
)) {
3929 ram_addr_t addr
, total_ram_bytes
;
3930 void *host
= NULL
, *host_bak
= NULL
;
3934 * Yield periodically to let main loop run, but an iteration of
3935 * the main loop is expensive, so do it each some iterations
3937 if ((i
& 32767) == 0 && qemu_in_coroutine()) {
3938 aio_co_schedule(qemu_get_current_aio_context(),
3939 qemu_coroutine_self());
3940 qemu_coroutine_yield();
/* Header word: low bits are flags, the rest is the page offset. */
3944 addr
= qemu_get_be64(f
);
3945 flags
= addr
& ~TARGET_PAGE_MASK
;
3946 addr
&= TARGET_PAGE_MASK
;
3948 if (flags
& invalid_flags
) {
3949 if (flags
& invalid_flags
& RAM_SAVE_FLAG_COMPRESS_PAGE
) {
3950 error_report("Received an unexpected compressed page");
3957 if (flags
& (RAM_SAVE_FLAG_ZERO
| RAM_SAVE_FLAG_PAGE
|
3958 RAM_SAVE_FLAG_COMPRESS_PAGE
| RAM_SAVE_FLAG_XBZRLE
)) {
3959 RAMBlock
*block
= ram_block_from_stream(f
, flags
);
3961 host
= host_from_ram_block_offset(block
, addr
);
3963 * After going into COLO stage, we should not load the page
3964 * into SVM's memory directly, we put them into colo_cache firstly.
3965 * NOTE: We need to keep a copy of SVM's ram in colo_cache.
3966 * Previously, we copied all these memory in preparing stage of COLO
3967 * while we need to stop VM, which is a time-consuming process.
3968 * Here we optimize it by a trick, back-up every page while in
3969 * migration process while COLO is enabled, though it affects the
3970 * speed of the migration, but it obviously reduce the downtime of
3971 * back-up all SVM'S memory in COLO preparing stage.
3973 if (migration_incoming_colo_enabled()) {
3974 if (migration_incoming_in_colo_state()) {
3975 /* In COLO stage, put all pages into cache temporarily */
3976 host
= colo_cache_from_block_offset(block
, addr
, true);
3979 * In migration stage but before COLO stage,
3980 * Put all pages into both cache and SVM's memory.
3982 host_bak
= colo_cache_from_block_offset(block
, addr
, false);
3986 error_report("Illegal RAM offset " RAM_ADDR_FMT
, addr
);
3990 if (!migration_incoming_in_colo_state()) {
3991 ramblock_recv_bitmap_set(block
, host
);
3994 trace_ram_load_loop(block
->idstr
, (uint64_t)addr
, flags
, host
);
/* Dispatch on the page-type flag (the CONTINUE bit is masked off). */
3997 switch (flags
& ~RAM_SAVE_FLAG_CONTINUE
) {
3998 case RAM_SAVE_FLAG_MEM_SIZE
:
3999 /* Synchronize RAM block list */
4000 total_ram_bytes
= addr
;
4001 while (!ret
&& total_ram_bytes
) {
4006 len
= qemu_get_byte(f
);
4007 qemu_get_buffer(f
, (uint8_t *)id
, len
);
4009 length
= qemu_get_be64(f
);
4011 block
= qemu_ram_block_by_name(id
);
4012 if (block
&& !qemu_ram_is_migratable(block
)) {
4013 error_report("block %s should not be migrated !", id
);
4016 if (length
!= block
->used_length
) {
4017 Error
*local_err
= NULL
;
4019 ret
= qemu_ram_resize(block
, length
,
4022 error_report_err(local_err
);
4025 /* For postcopy we need to check hugepage sizes match */
4026 if (postcopy_advised
&& migrate_postcopy_ram() &&
4027 block
->page_size
!= qemu_host_page_size
) {
4028 uint64_t remote_page_size
= qemu_get_be64(f
);
4029 if (remote_page_size
!= block
->page_size
) {
4030 error_report("Mismatched RAM page size %s "
4031 "(local) %zd != %" PRId64
,
4032 id
, block
->page_size
,
4037 if (migrate_ignore_shared()) {
4038 hwaddr addr
= qemu_get_be64(f
);
4039 if (ramblock_is_ignored(block
) &&
4040 block
->mr
->addr
!= addr
) {
4041 error_report("Mismatched GPAs for block %s "
4042 "%" PRId64
"!= %" PRId64
,
4044 (uint64_t)block
->mr
->addr
);
4048 ram_control_load_hook(f
, RAM_CONTROL_BLOCK_REG
,
4051 error_report("Unknown ramblock \"%s\", cannot "
4052 "accept migration", id
);
4056 total_ram_bytes
-= length
;
4060 case RAM_SAVE_FLAG_ZERO
:
4061 ch
= qemu_get_byte(f
);
4062 ram_handle_compressed(host
, ch
, TARGET_PAGE_SIZE
);
4065 case RAM_SAVE_FLAG_PAGE
:
4066 qemu_get_buffer(f
, host
, TARGET_PAGE_SIZE
);
4069 case RAM_SAVE_FLAG_COMPRESS_PAGE
:
4070 len
= qemu_get_be32(f
);
/* Bound-check the compressed payload before handing it to the decompressor. */
4071 if (len
< 0 || len
> compressBound(TARGET_PAGE_SIZE
)) {
4072 error_report("Invalid compressed data length: %d", len
);
4076 decompress_data_with_multi_threads(f
, host
, len
);
4079 case RAM_SAVE_FLAG_XBZRLE
:
4080 if (load_xbzrle(f
, addr
, host
) < 0) {
4081 error_report("Failed to decompress XBZRLE page at "
4082 RAM_ADDR_FMT
, addr
);
4087 case RAM_SAVE_FLAG_EOS
:
4089 multifd_recv_sync_main();
4092 if (flags
& RAM_SAVE_FLAG_HOOK
) {
4093 ram_control_load_hook(f
, RAM_CONTROL_HOOK
, NULL
);
4095 error_report("Unknown combination of migration flags: 0x%x",
4101 ret
= qemu_file_get_error(f
);
/* COLO: back up the page just loaded at 'host' into the cache slot
 * obtained via colo_cache_from_block_offset() above. */
4103 if (!ret
&& host_bak
) {
4104 memcpy(host_bak
, host
, TARGET_PAGE_SIZE
);
4108 ret
|= wait_for_decompress_done();
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  The ret declaration, the version-mismatch error path
 * and the final return are not visible in this chunk.
 *
 * ram_load: SaveVMHandlers.load_state entry point for the "ram"
 * section.  Only version 4 is handled; dispatches to
 * ram_load_postcopy() or ram_load_precopy() under an RCU read lock.
 */
4112 static int ram_load(QEMUFile
*f
, void *opaque
, int version_id
)
4115 static uint64_t seq_iter
;
4117 * If system is running in postcopy mode, page inserts to host memory must
4120 bool postcopy_running
= postcopy_is_running();
4124 if (version_id
!= 4) {
4129 * This RCU critical section can be very long running.
4130 * When RCU reclaims in the code start to become numerous,
4131 * it will be necessary to reduce the granularity of this
4134 WITH_RCU_READ_LOCK_GUARD() {
4135 if (postcopy_running
) {
4136 ret
= ram_load_postcopy(f
);
4138 ret
= ram_load_precopy(f
);
4141 trace_ram_load_complete(ret
, seq_iter
);
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  The rb declaration and the pmem-branch return are not
 * visible; the report text suggests pmem blocks disable postcopy — TODO
 * confirm against the full source.
 *
 * ram_has_postcopy: SaveVMHandlers.has_postcopy callback; otherwise
 * defers to the postcopy-ram capability.
 */
4146 static bool ram_has_postcopy(void *opaque
)
4149 RAMBLOCK_FOREACH_NOT_IGNORED(rb
) {
4150 if (ramblock_is_pmem(rb
)) {
4151 info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
4152 "is not supported now!", rb
->idstr
, rb
->host
);
4157 return migrate_postcopy_ram();
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  The block declaration, the ramblock_count increment
 * and the return statement are not visible in this chunk.
 *
 * ram_dirty_bitmap_sync_all: request the received bitmap of every
 * migratable ramblock from the destination, then wait on
 * s->rp_state.rp_sem once per block until all have been reloaded
 * (posted by ram_dirty_bitmap_reload_notify()).
 */
4160 /* Sync all the dirty bitmap with destination VM.  */
4161 static int ram_dirty_bitmap_sync_all(MigrationState
*s
, RAMState
*rs
)
4164 QEMUFile
*file
= s
->to_dst_file
;
4165 int ramblock_count
= 0;
4167 trace_ram_dirty_bitmap_sync_start();
4169 RAMBLOCK_FOREACH_NOT_IGNORED(block
) {
4170 qemu_savevm_send_recv_bitmap(file
, block
->idstr
);
4171 trace_ram_dirty_bitmap_request(block
->idstr
);
4175 trace_ram_dirty_bitmap_sync_wait();
4177 /* Wait until all the ramblocks' dirty bitmap synced */
4178 while (ramblock_count
--) {
4179 qemu_sem_wait(&s
->rp_state
.rp_sem
);
4182 trace_ram_dirty_bitmap_sync_complete();
4187 static void ram_dirty_bitmap_reload_notify(MigrationState
*s
)
4189 qemu_sem_post(&s
->rp_state
.rp_sem
);
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  Error returns, the le_bitmap free path and the final
 * return are not visible in this chunk; presumably le_bitmap is freed
 * on every exit path — TODO confirm against the full source.
 *
 * ram_dirty_bitmap_reload: during POSTCOPY_RECOVER, read one
 * ramblock's received bitmap from the return-path file, validate its
 * size and end marker, convert it from little-endian, and complement it
 * to become the block's initial dirty bitmap for the resumed migration.
 */
4193 * Read the received bitmap, revert it as the initial dirty bitmap.
4194 * This is only used when the postcopy migration is paused but wants
4195 * to resume from a middle point.
4197 int ram_dirty_bitmap_reload(MigrationState
*s
, RAMBlock
*block
)
4200 /* from_dst_file is always valid because we're within rp_thread */
4201 QEMUFile
*file
= s
->rp_state
.from_dst_file
;
4202 unsigned long *le_bitmap
, nbits
= block
->used_length
>> TARGET_PAGE_BITS
;
4203 uint64_t local_size
= DIV_ROUND_UP(nbits
, 8);
4204 uint64_t size
, end_mark
;
4206 trace_ram_dirty_bitmap_reload_begin(block
->idstr
);
/* Reload is only legal while paused in postcopy-recover state. */
4208 if (s
->state
!= MIGRATION_STATUS_POSTCOPY_RECOVER
) {
4209 error_report("%s: incorrect state %s", __func__
,
4210 MigrationStatus_str(s
->state
));
4215 * Note: see comments in ramblock_recv_bitmap_send() on why we
4216 * need the endianness conversion, and the paddings.
4218 local_size
= ROUND_UP(local_size
, 8);
4221 le_bitmap
= bitmap_new(nbits
+ BITS_PER_LONG
);
4223 size
= qemu_get_be64(file
);
4225 /* The size of the bitmap should match with our ramblock */
4226 if (size
!= local_size
) {
4227 error_report("%s: ramblock '%s' bitmap size mismatch "
4228 "(0x%"PRIx64
" != 0x%"PRIx64
")", __func__
,
4229 block
->idstr
, size
, local_size
);
4234 size
= qemu_get_buffer(file
, (uint8_t *)le_bitmap
, local_size
);
4235 end_mark
= qemu_get_be64(file
);
4237 ret
= qemu_file_get_error(file
);
4238 if (ret
|| size
!= local_size
) {
4239 error_report("%s: read bitmap failed for ramblock '%s': %d"
4240 " (size 0x%"PRIx64
", got: 0x%"PRIx64
")",
4241 __func__
, block
->idstr
, ret
, local_size
, size
);
4246 if (end_mark
!= RAMBLOCK_RECV_BITMAP_ENDING
) {
4247 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64
,
4248 __func__
, block
->idstr
, end_mark
);
4254 * Endianness conversion. We are during postcopy (though paused).
4255 * The dirty bitmap won't change. We can directly modify it.
4257 bitmap_from_le(block
->bmap
, le_bitmap
, nbits
);
4260 * What we received is "received bitmap". Revert it as the initial
4261 * dirty bitmap for this ramblock.
4263 bitmap_complement(block
->bmap
, block
->bmap
, nbits
);
4265 /* Clear dirty bits of discarded ranges that we don't want to migrate. */
4266 ramblock_dirty_bitmap_clear_discarded_pages(block
);
4268 /* We'll recalculate migration_dirty_pages in ram_state_resume_prepare(). */
4269 trace_ram_dirty_bitmap_reload_complete(block
->idstr
);
4272 * We succeeded to sync bitmap for current ramblock. If this is
4273 * the last one to sync, we need to notify the main send thread.
4275 ram_dirty_bitmap_reload_notify(s
);
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  The ret declaration, its error check and the final
 * return are not visible in this chunk.
 *
 * ram_resume_prepare: SaveVMHandlers.resume_prepare callback.  Syncs
 * every ramblock's dirty bitmap with the destination, then prepares the
 * RAMState for resuming via ram_state_resume_prepare().
 */
4283 static int ram_resume_prepare(MigrationState
*s
, void *opaque
)
4285 RAMState
*rs
= *(RAMState
**)opaque
;
4288 ret
= ram_dirty_bitmap_sync_all(s
, rs
);
4293 ram_state_resume_prepare(rs
, s
->to_dst_file
);
4298 static SaveVMHandlers savevm_ram_handlers
= {
4299 .save_setup
= ram_save_setup
,
4300 .save_live_iterate
= ram_save_iterate
,
4301 .save_live_complete_postcopy
= ram_save_complete
,
4302 .save_live_complete_precopy
= ram_save_complete
,
4303 .has_postcopy
= ram_has_postcopy
,
4304 .save_live_pending
= ram_save_pending
,
4305 .load_state
= ram_load
,
4306 .save_cleanup
= ram_save_cleanup
,
4307 .load_setup
= ram_load_setup
,
4308 .load_cleanup
= ram_load_cleanup
,
4309 .resume_prepare
= ram_resume_prepare
,
/*
 * NOTE(review): mangled extraction — code tokens kept byte-identical,
 * comments only.  The offset/err declarations, the switch(ps) header,
 * the returns and the default-case arguments are not visible here.
 *
 * ram_mig_ram_block_resized: RAMBlockNotifier callback invoked when a
 * RAM block changes size.  Cancels an active (non-idle) migration on
 * the source side; on an incoming side in ADVISE state, discards any
 * grown tail and refreshes rb->postcopy_length.
 */
4312 static void ram_mig_ram_block_resized(RAMBlockNotifier
*n
, void *host
,
4313 size_t old_size
, size_t new_size
)
4315 PostcopyState ps
= postcopy_state_get();
4317 RAMBlock
*rb
= qemu_ram_block_from_host(host
, false, &offset
);
4320 if (ramblock_is_ignored(rb
)) {
4324 if (!migration_is_idle()) {
4326 * Precopy code on the source cannot deal with the size of RAM blocks
4327 * changing at random points in time - especially after sending the
4328 * RAM block sizes in the migration stream, they must no longer change.
4329 * Abort and indicate a proper reason.
4331 error_setg(&err
, "RAM block '%s' resized during precopy.", rb
->idstr
);
4332 migration_cancel(err
);
4337 case POSTCOPY_INCOMING_ADVISE
:
4339 * Update what ram_postcopy_incoming_init()->init_range() does at the
4340 * time postcopy was advised. Syncing RAM blocks with the source will
4341 * result in RAM resizes.
4343 if (old_size
< new_size
) {
4344 if (ram_discard_range(rb
->idstr
, old_size
, new_size
- old_size
)) {
4345 error_report("RAM block '%s' discard of resized RAM failed",
4349 rb
->postcopy_length
= new_size
;
4351 case POSTCOPY_INCOMING_NONE
:
4352 case POSTCOPY_INCOMING_RUNNING
:
4353 case POSTCOPY_INCOMING_END
:
4355 * Once our guest is running, postcopy does no longer care about
4356 * resizes. When growing, the new memory was not available on the
4357 * source, no handler needed.
4361 error_report("RAM block '%s' resized during postcopy state: %d",
4367 static RAMBlockNotifier ram_mig_ram_notifier
= {
4368 .ram_block_resized
= ram_mig_ram_block_resized
,
4371 void ram_mig_init(void)
4373 qemu_mutex_init(&XBZRLE
.lock
);
4374 register_savevm_live("ram", 0, 4, &savevm_ram_handlers
, &ram_state
);
4375 ram_block_notifier_add(&ram_mig_ram_notifier
);