/*
 * QEMU System Emulator
 *
 * Copyright (c) 2003-2008 Fabrice Bellard
 * Copyright (c) 2011-2015 Red Hat Inc
 *
 * Authors:
 *  Juan Quintela <quintela@redhat.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
29 #include "qemu/osdep.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
59 #include "sysemu/runstate.h"
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
/***********************************************************/
/* ram save/restore */

/* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it worked
 * for pages that were filled with the same char.  We switched it to only
 * search for the zero value, and renamed it to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE.
 */

#define RAM_SAVE_FLAG_FULL     0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO     0x02
#define RAM_SAVE_FLAG_MEM_SIZE 0x04
#define RAM_SAVE_FLAG_PAGE     0x08
#define RAM_SAVE_FLAG_EOS      0x10
#define RAM_SAVE_FLAG_CONTINUE 0x20
#define RAM_SAVE_FLAG_XBZRLE   0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
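/*
 * These flags are OR'ed into the low bits of the page-aligned RAM offset
 * written by save_page_header(), so a single be64 on the wire carries both
 * the page address and its encoding.  Illustrative values only: offset
 * 0x200000 sent as a normal page within the current block goes out as
 * 0x200000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE = 0x200028.
 */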
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    return buffer_is_zero(p, size);
}

XBZRLECacheStats xbzrle_counters;

/* struct contains XBZRLE cache and a static page
   used by the compression */
static struct {
    /* buffer used for XBZRLE encoding */
    uint8_t *encoded_buf;
    /* buffer for storing page content */
    uint8_t *current_buf;
    /* Cache for XBZRLE, Protected by lock. */
    PageCache *cache;
    QemuMutex lock;
    /* it will store a page full of zeros */
    uint8_t *zero_target_page;
    /* buffer used for XBZRLE decoding */
    uint8_t *decoded_buf;
} XBZRLE;

static void XBZRLE_cache_lock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_lock(&XBZRLE.lock);
    }
}

static void XBZRLE_cache_unlock(void)
{
    if (migrate_use_xbzrle()) {
        qemu_mutex_unlock(&XBZRLE.lock);
    }
}
/**
 * xbzrle_cache_resize: resize the xbzrle cache
 *
 * This function is called from migrate_params_apply in the main
 * thread, possibly while a migration is in progress.  A running
 * migration may be using the cache and might finish during this call,
 * hence changes to the cache are protected by XBZRLE.lock().
 *
 * Returns 0 for success or -1 for error
 *
 * @new_size: new cache size
 * @errp: set *errp if the check failed, with reason
 */
int xbzrle_cache_resize(uint64_t new_size, Error **errp)
{
    PageCache *new_cache;
    int64_t ret = 0;

    /* Check for truncation */
    if (new_size != (size_t)new_size) {
        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
                   "exceeding address space");
        return -1;
    }

    if (new_size == migrate_xbzrle_cache_size()) {
        /* nothing to do */
        return 0;
    }

    XBZRLE_cache_lock();

    if (XBZRLE.cache != NULL) {
        new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
        if (!new_cache) {
            ret = -1;
            goto out;
        }

        cache_fini(XBZRLE.cache);
        XBZRLE.cache = new_cache;
    }
out:
    XBZRLE_cache_unlock();
    return ret;
}
bool ramblock_is_ignored(RAMBlock *block)
{
    return !qemu_ram_is_migratable(block) ||
           (migrate_ignore_shared() && qemu_ram_is_shared(block));
}

#undef RAMBLOCK_FOREACH

int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
{
    RAMBlock *block;
    int ret = 0;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        ret = func(block, opaque);
        if (ret) {
            break;
        }
    }
    return ret;
}

static void ramblock_recv_map_init(void)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        assert(!rb->receivedmap);
        rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
    }
}

int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
{
    return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
                    rb->receivedmap);
}

bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
{
    return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
}

void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
{
    set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
}

void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
                                    size_t nr)
{
    bitmap_set_atomic(rb->receivedmap,
                      ramblock_recv_bitmap_offset(host_addr, rb),
                      nr);
}

#define RAMBLOCK_RECV_BITMAP_ENDING  (0x0123456789abcdefULL)
/*
 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
 *
 * Returns >0 if success with sent bytes, or <0 if error.
 */
int64_t ramblock_recv_bitmap_send(QEMUFile *file,
                                  const char *block_name)
{
    RAMBlock *block = qemu_ram_block_by_name(block_name);
    unsigned long *le_bitmap, nbits;
    uint64_t size;

    if (!block) {
        error_report("%s: invalid block name: %s", __func__, block_name);
        return -1;
    }

    nbits = block->used_length >> TARGET_PAGE_BITS;

    /*
     * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
     * machines we may need 4 more bytes for padding (see below
     * comment). So extend it a bit before hand.
     */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    /*
     * Always use little endian when sending the bitmap. This is
     * required when source and destination VMs are not using the
     * same endianness. (Note: big endian won't work.)
     */
    bitmap_to_le(le_bitmap, block->receivedmap, nbits);

    /* Size of the bitmap, in bytes */
    size = DIV_ROUND_UP(nbits, 8);

    /*
     * size is always aligned to 8 bytes for 64bit machines, but it
     * may not be true for 32bit machines. We need this padding to
     * make sure the migration can survive even between 32bit and
     * 64bit machines.
     */
    size = ROUND_UP(size, 8);

    qemu_put_be64(file, size);
    qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
    /*
     * Mark as an end, in case the middle part is screwed up due to
     * some "mysterious" reason.
     */
    qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
    qemu_fflush(file);

    g_free(le_bitmap);

    if (qemu_file_get_error(file)) {
        return qemu_file_get_error(file);
    }

    return size + sizeof(size);
}
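/*
 * Illustrative wire layout produced above: for a 1 GiB RAMBlock with 4 KiB
 * target pages, nbits = 262144, so the bitmap body is
 * DIV_ROUND_UP(262144, 8) = 32768 bytes (already 8-byte aligned) and the
 * stream carries:
 *
 *   be64: 32768                    <- bitmap size in bytes
 *   32768 bytes                    <- little-endian receivedmap
 *   be64: 0x0123456789abcdef       <- RAMBLOCK_RECV_BITMAP_ENDING marker
 *
 * and the function reports 32768 + 8 bytes as sent.
 */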
/* An outstanding page request, on the source, having been received
 * and queued
 */
struct RAMSrcPageRequest {
    RAMBlock *rb;
    hwaddr    offset;
    hwaddr    len;

    QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};

/* State of RAM for migration */
struct RAMState {
    /* QEMUFile used for this migration */
    QEMUFile *f;
    /* UFFD file descriptor, used in 'write-tracking' migration */
    int uffdio_fd;
    /* Last block that we have visited searching for dirty pages */
    RAMBlock *last_seen_block;
    /* Last block from where we have sent data */
    RAMBlock *last_sent_block;
    /* Last dirty target page we have sent */
    ram_addr_t last_page;
    /* last ram version we have seen */
    uint32_t last_version;
    /* We are in the first round */
    bool ram_bulk_stage;
    /* The free page optimization is enabled */
    bool fpo_enabled;
    /* How many times we have dirty too many pages */
    int dirty_rate_high_cnt;
    /* these variables are used for bitmap sync */
    /* last time we did a full bitmap_sync */
    int64_t time_last_bitmap_sync;
    /* bytes transferred at start_time */
    uint64_t bytes_xfer_prev;
    /* number of dirty pages since start_time */
    uint64_t num_dirty_pages_period;
    /* xbzrle misses since the beginning of the period */
    uint64_t xbzrle_cache_miss_prev;
    /* Amount of xbzrle pages since the beginning of the period */
    uint64_t xbzrle_pages_prev;
    /* Amount of xbzrle encoded bytes since the beginning of the period */
    uint64_t xbzrle_bytes_prev;

    /* compression statistics since the beginning of the period */
    /* count of times there was no free thread to compress data */
    uint64_t compress_thread_busy_prev;
    /* amount of bytes after compression */
    uint64_t compressed_size_prev;
    /* amount of compressed pages */
    uint64_t compress_pages_prev;

    /* total handled target pages at the beginning of period */
    uint64_t target_page_count_prev;
    /* total handled target pages since start */
    uint64_t target_page_count;
    /* number of dirty bits in the bitmap */
    uint64_t migration_dirty_pages;
    /* Protects modification of the bitmap and migration dirty pages */
    QemuMutex bitmap_mutex;
    /* The RAMBlock used in the last src_page_requests */
    RAMBlock *last_req_rb;
    /* Queue of outstanding page requests from the destination */
    QemuMutex src_page_req_mutex;
    QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
};
typedef struct RAMState RAMState;

static RAMState *ram_state;
static NotifierWithReturnList precopy_notifier_list;

void precopy_infrastructure_init(void)
{
    notifier_with_return_list_init(&precopy_notifier_list);
}

void precopy_add_notifier(NotifierWithReturn *n)
{
    notifier_with_return_list_add(&precopy_notifier_list, n);
}

void precopy_remove_notifier(NotifierWithReturn *n)
{
    notifier_with_return_remove(n);
}

int precopy_notify(PrecopyNotifyReason reason, Error **errp)
{
    PrecopyNotifyData pnd;
    pnd.reason = reason;
    pnd.errp = errp;

    return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
}

void precopy_enable_free_page_optimization(void)
{
    if (!ram_state) {
        return;
    }

    ram_state->fpo_enabled = true;
}

uint64_t ram_bytes_remaining(void)
{
    return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
                       0;
}

MigrationStats ram_counters;
/* used by the search for pages to send */
struct PageSearchStatus {
    /* Current block being searched */
    RAMBlock    *block;
    /* Current page to search from */
    unsigned long page;
    /* Set once we wrap around */
    bool         complete_round;
};
typedef struct PageSearchStatus PageSearchStatus;

CompressionStats compression_counters;

struct CompressParam {
    bool done;
    bool quit;
    bool zero_page;
    QEMUFile *file;
    QemuMutex mutex;
    QemuCond cond;
    RAMBlock *block;
    ram_addr_t offset;

    /* internally used fields */
    z_stream stream;
    uint8_t *originbuf;
};
typedef struct CompressParam CompressParam;

struct DecompressParam {
    bool done;
    bool quit;
    QemuMutex mutex;
    QemuCond cond;
    void *des;
    uint8_t *compbuf;
    int len;
    z_stream stream;
};
typedef struct DecompressParam DecompressParam;

static CompressParam *comp_param;
static QemuThread *compress_threads;
/* comp_done_cond is used to wake up the migration thread when
 * one of the compression threads has finished the compression.
 * comp_done_lock is used to co-work with comp_done_cond.
 */
static QemuMutex comp_done_lock;
static QemuCond comp_done_cond;
/* The empty QEMUFileOps will be used by file in CompressParam */
static const QEMUFileOps empty_ops = { };

static QEMUFile *decomp_file;
static DecompressParam *decomp_param;
static QemuThread *decompress_threads;
static QemuMutex decomp_done_lock;
static QemuCond decomp_done_cond;

static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf);
static void *do_data_compress(void *opaque)
{
    CompressParam *param = opaque;
    RAMBlock *block;
    ram_addr_t offset;
    bool zero_page;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->block) {
            block = param->block;
            offset = param->offset;
            param->block = NULL;
            qemu_mutex_unlock(&param->mutex);

            zero_page = do_compress_ram_page(param->file, &param->stream,
                                             block, offset, param->originbuf);

            qemu_mutex_lock(&comp_done_lock);
            param->done = true;
            param->zero_page = zero_page;
            qemu_cond_signal(&comp_done_cond);
            qemu_mutex_unlock(&comp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
static void compress_threads_save_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression() || !comp_param) {
        return;
    }

    thread_count = migrate_compress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!comp_param[i].file) {
            break;
        }

        qemu_mutex_lock(&comp_param[i].mutex);
        comp_param[i].quit = true;
        qemu_cond_signal(&comp_param[i].cond);
        qemu_mutex_unlock(&comp_param[i].mutex);

        qemu_thread_join(compress_threads + i);
        qemu_mutex_destroy(&comp_param[i].mutex);
        qemu_cond_destroy(&comp_param[i].cond);
        deflateEnd(&comp_param[i].stream);
        g_free(comp_param[i].originbuf);
        qemu_fclose(comp_param[i].file);
        comp_param[i].file = NULL;
    }
    qemu_mutex_destroy(&comp_done_lock);
    qemu_cond_destroy(&comp_done_cond);
    g_free(compress_threads);
    g_free(comp_param);
    compress_threads = NULL;
    comp_param = NULL;
}
static int compress_threads_save_setup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }
    thread_count = migrate_compress_threads();
    compress_threads = g_new0(QemuThread, thread_count);
    comp_param = g_new0(CompressParam, thread_count);
    qemu_cond_init(&comp_done_cond);
    qemu_mutex_init(&comp_done_lock);
    for (i = 0; i < thread_count; i++) {
        comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
        if (!comp_param[i].originbuf) {
            goto exit;
        }

        if (deflateInit(&comp_param[i].stream,
                        migrate_compress_level()) != Z_OK) {
            g_free(comp_param[i].originbuf);
            goto exit;
        }

        /* comp_param[i].file is just used as a dummy buffer to save data,
         * set its ops to empty.
         */
        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
        comp_param[i].done = true;
        comp_param[i].quit = false;
        qemu_mutex_init(&comp_param[i].mutex);
        qemu_cond_init(&comp_param[i].cond);
        qemu_thread_create(compress_threads + i, "compress",
                           do_data_compress, comp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;

exit:
    compress_threads_save_cleanup();
    return -1;
}
/**
 * save_page_header: write page header to wire
 *
 * If this is the 1st block, it also writes the block identification
 *
 * Returns the number of bytes written
 *
 * @f: QEMUFile where to send the data
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 *          in the lower bits, it contains flags
 */
static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
                               ram_addr_t offset)
{
    size_t size, len;

    if (block == rs->last_sent_block) {
        offset |= RAM_SAVE_FLAG_CONTINUE;
    }
    qemu_put_be64(f, offset);
    size = 8;

    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
        len = strlen(block->idstr);
        qemu_put_byte(f, len);
        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
        size += 1 + len;
        rs->last_sent_block = block;
    }
    return size;
}
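/*
 * Illustrative header cost: the first page of a block pays
 * 8 + 1 + strlen(idstr) bytes (offset/flags word, idstr length byte, idstr),
 * e.g. 8 + 1 + 6 = 15 bytes for a block named "pc.ram"; every following page
 * of the same block pays only the 8-byte word thanks to
 * RAM_SAVE_FLAG_CONTINUE.
 */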
/**
 * mig_throttle_guest_down: throttle down the guest
 *
 * Reduce amount of guest cpu execution to hopefully slow down memory
 * writes. If guest dirty memory rate is reduced below the rate at
 * which we can transfer pages to the destination then we should be
 * able to complete migration. Some workloads dirty memory way too
 * fast and will not effectively converge, even with auto-converge.
 */
static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
                                    uint64_t bytes_dirty_threshold)
{
    MigrationState *s = migrate_get_current();
    uint64_t pct_initial = s->parameters.cpu_throttle_initial;
    uint64_t pct_increment = s->parameters.cpu_throttle_increment;
    bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
    int pct_max = s->parameters.max_cpu_throttle;

    uint64_t throttle_now = cpu_throttle_get_percentage();
    uint64_t cpu_now, cpu_ideal, throttle_inc;

    /* We have not started throttling yet. Let's start it. */
    if (!cpu_throttle_active()) {
        cpu_throttle_set(pct_initial);
    } else {
        /* Throttling already on, just increase the rate */
        if (!pct_tailslow) {
            throttle_inc = pct_increment;
        } else {
            /* Compute the ideal CPU percentage used by Guest, which may
             * make the dirty rate match the dirty rate threshold. */
            cpu_now = 100 - throttle_now;
            cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
                        bytes_dirty_period);
            throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
        }
        cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
    }
}
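/*
 * Worked example of the tailslow increment above (illustrative numbers):
 * with the throttle currently at 20% the guest gets cpu_now = 80.  If the
 * guest dirtied twice as many bytes as could be transferred in the period
 * (bytes_dirty_threshold / bytes_dirty_period = 0.5), then
 * cpu_ideal = 80 * 0.5 = 40 and the wanted increment is 80 - 40 = 40,
 * clamped to cpu_throttle_increment before being applied.
 */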
/**
 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
 *
 * @rs: current RAM state
 * @current_addr: address for the zero page
 *
 * Update the xbzrle cache to reflect a page that's been sent as all 0.
 * The important thing is that a stale (not-yet-0'd) page be replaced
 * by the new data.
 * As a bonus, if the page wasn't in the cache it gets added so that
 * when a small write is made into the 0'd page it gets XBZRLE sent.
 */
static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
{
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return;
    }

    /* We don't care if this fails to allocate a new cache page
     * as long as it updated an old one */
    cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
                 ram_counters.dirty_sync_count);
}

#define ENCODING_FLAG_XBZRLE 0x1
/**
 * save_xbzrle_page: compress and send current page
 *
 * Returns: 1 means that we wrote the page
 *          0 means that page is identical to the one already sent
 *          -1 means that xbzrle would be longer than normal
 *
 * @rs: current RAM state
 * @current_data: pointer to the address of the page contents
 * @current_addr: addr of the page
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
                            ram_addr_t current_addr, RAMBlock *block,
                            ram_addr_t offset, bool last_stage)
{
    int encoded_len = 0, bytes_xbzrle;
    uint8_t *prev_cached_page;

    if (!cache_is_cached(XBZRLE.cache, current_addr,
                         ram_counters.dirty_sync_count)) {
        xbzrle_counters.cache_miss++;
        if (!last_stage) {
            if (cache_insert(XBZRLE.cache, current_addr, *current_data,
                             ram_counters.dirty_sync_count) == -1) {
                return -1;
            } else {
                /* update *current_data when the page has been
                   inserted into cache */
                *current_data = get_cached_data(XBZRLE.cache, current_addr);
            }
        }
        return -1;
    }

    /*
     * Reaching here means the page has hit the xbzrle cache, no matter what
     * encoding result it is (normal encoding, overflow or skipping the page),
     * count the page as encoded. This is used to calculate the encoding rate.
     *
     * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
     * 2nd page turns out to be skipped (i.e. no new bytes written to the
     * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
     * skipped page included. In this way, the encoding rate can tell if the
     * guest page is good for xbzrle encoding.
     */
    xbzrle_counters.pages++;
    prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);

    /* save current buffer into memory */
    memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);

    /* XBZRLE encoding (if there is no overflow) */
    encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                       TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                       TARGET_PAGE_SIZE);

    /*
     * Update the cache contents, so that it corresponds to the data
     * sent, in all cases except where we skip the page.
     */
    if (!last_stage && encoded_len != 0) {
        memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
        /*
         * In the case where we couldn't compress, ensure that the caller
         * sends the data from the cache, since the guest might have
         * changed the RAM since we copied it.
         */
        *current_data = prev_cached_page;
    }

    if (encoded_len == 0) {
        trace_save_xbzrle_page_skipping();
        return 0;
    } else if (encoded_len == -1) {
        trace_save_xbzrle_page_overflow();
        xbzrle_counters.overflow++;
        xbzrle_counters.bytes += TARGET_PAGE_SIZE;
        return -1;
    }

    /* Send XBZRLE based compressed page */
    bytes_xbzrle = save_page_header(rs, rs->f, block,
                                    offset | RAM_SAVE_FLAG_XBZRLE);
    qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
    qemu_put_be16(rs->f, encoded_len);
    qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
    bytes_xbzrle += encoded_len + 1 + 2;

    /*
     * Like compressed_size (please see update_compress_thread_counts),
     * the xbzrle encoded bytes don't count the 8 byte header with
     * RAM_SAVE_FLAG_CONTINUE.
     */
    xbzrle_counters.bytes += bytes_xbzrle - 8;
    ram_counters.transferred += bytes_xbzrle;

    return 1;
}
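/*
 * Rough shape of an XBZRLE record as emitted above: the usual page header
 * (offset | RAM_SAVE_FLAG_XBZRLE), one byte of ENCODING_FLAG_XBZRLE, a be16
 * encoded length, then the encoded bytes.  A page whose delta encodes to,
 * say, 300 bytes therefore costs 300 + 1 + 2 payload bytes plus the header,
 * and xbzrle_counters.bytes accounts for everything except the 8-byte
 * offset/flags word.
 */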
/**
 * migration_bitmap_find_dirty: find the next dirty page from start
 *
 * Returns the page offset within memory region of the start of a dirty page
 *
 * @rs: current RAM state
 * @rb: RAMBlock where to search for dirty pages
 * @start: page where we start the search
 */
static inline
unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
                                          unsigned long start)
{
    unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
    unsigned long *bitmap = rb->bmap;
    unsigned long next;

    if (ramblock_is_ignored(rb)) {
        return size;
    }

    /*
     * When the free page optimization is enabled, we need to check the bitmap
     * to send the non-free pages rather than all the pages in the bulk stage.
     */
    if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
        next = start + 1;
    } else {
        next = find_next_bit(bitmap, size, start);
    }

    return next;
}
static inline bool migration_bitmap_clear_dirty(RAMState *rs,
                                                RAMBlock *rb,
                                                unsigned long page)
{
    bool ret;

    QEMU_LOCK_GUARD(&rs->bitmap_mutex);

    /*
     * Clear dirty bitmap if needed.  This _must_ be called before we
     * send any of the page in the chunk because we need to make sure
     * we can capture further page content changes when we sync dirty
     * log the next time.  So as long as we are going to send any of
     * the page in the chunk we clear the remote dirty bitmap for all.
     * Clearing it earlier won't be a problem, but too late will.
     */
    if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
        uint8_t shift = rb->clear_bmap_shift;
        hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
        hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);

        /*
         * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
         * can make things easier sometimes since then start address
         * of the small chunk will always be 64 pages aligned so the
         * bitmap will always be aligned to unsigned long.  We should
         * even be able to remove this restriction but I'm simply
         * keeping it.
         */
        assert(shift >= 6);
        trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
        memory_region_clear_dirty_bitmap(rb->mr, start, size);
    }

    ret = test_and_clear_bit(page, rb->bmap);
    if (ret) {
        rs->migration_dirty_pages--;
    }

    return ret;
}
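/*
 * Example of the chunked clearing above (assuming 4 KiB target pages and a
 * clear_bmap_shift of 18): one clear_bmap bit covers
 * 1ULL << (12 + 18) = 1 GiB, so the remote dirty log is cleared once per
 * 1 GiB-aligned chunk, when the first page of that chunk is about to be sent,
 * rather than once per page.
 */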
/* Called with RCU critical section */
static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
{
    uint64_t new_dirty_pages =
        cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);

    rs->migration_dirty_pages += new_dirty_pages;
    rs->num_dirty_pages_period += new_dirty_pages;
}
/**
 * ram_pagesize_summary: calculate all the pagesizes of a VM
 *
 * Returns a summary bitmap of the page sizes of all RAMBlocks
 *
 * For VMs with just normal pages this is equivalent to the host page
 * size. If it's got some huge pages then it's the OR of all the
 * different page sizes.
 */
uint64_t ram_pagesize_summary(void)
{
    RAMBlock *block;
    uint64_t summary = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        summary |= block->page_size;
    }

    return summary;
}
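/*
 * For example, a guest whose RAM is backed by normal 4 KiB pages plus one
 * 2 MiB hugetlbfs block would report 0x1000 | 0x200000 = 0x201000 here.
 */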
uint64_t ram_get_total_transferred_pages(void)
{
    return  ram_counters.normal + ram_counters.duplicate +
                compression_counters.pages + xbzrle_counters.pages;
}
static void migration_update_rates(RAMState *rs, int64_t end_time)
{
    uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
    double compressed_size;

    /* calculate period counters */
    ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
                / (end_time - rs->time_last_bitmap_sync);

    if (!page_count) {
        return;
    }

    if (migrate_use_xbzrle()) {
        double encoded_size, unencoded_size;

        xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
            rs->xbzrle_cache_miss_prev) / page_count;
        rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
        unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
                         TARGET_PAGE_SIZE;
        encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
        if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
            xbzrle_counters.encoding_rate = 0;
        } else {
            xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
        }
        rs->xbzrle_pages_prev = xbzrle_counters.pages;
        rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
    }

    if (migrate_use_compression()) {
        compression_counters.busy_rate = (double)(compression_counters.busy -
            rs->compress_thread_busy_prev) / page_count;
        rs->compress_thread_busy_prev = compression_counters.busy;

        compressed_size = compression_counters.compressed_size -
                          rs->compressed_size_prev;
        if (compressed_size) {
            double uncompressed_size = (compression_counters.pages -
                                    rs->compress_pages_prev) * TARGET_PAGE_SIZE;

            /* Compression-Ratio = Uncompressed-size / Compressed-size */
            compression_counters.compression_rate =
                                        uncompressed_size / compressed_size;

            rs->compress_pages_prev = compression_counters.pages;
            rs->compressed_size_prev = compression_counters.compressed_size;
        }
    }
}
static void migration_trigger_throttle(RAMState *rs)
{
    MigrationState *s = migrate_get_current();
    uint64_t threshold = s->parameters.throttle_trigger_threshold;

    uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
    uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;

    /* During block migration the auto-converge logic incorrectly detects
     * that ram migration makes no progress. Avoid this by disabling the
     * throttling logic during the bulk phase of block migration. */
    if (migrate_auto_converge() && !blk_mig_bulk_active()) {
        /* The following detection logic can be refined later. For now:
           Check to see if the ratio between dirtied bytes and the approx.
           amount of bytes that just got transferred since the last time
           we were in this routine reaches the threshold. If that happens
           twice, start or increase throttling. */

        if ((bytes_dirty_period > bytes_dirty_threshold) &&
            (++rs->dirty_rate_high_cnt >= 2)) {
            trace_migration_throttle();
            rs->dirty_rate_high_cnt = 0;
            mig_throttle_guest_down(bytes_dirty_period,
                                    bytes_dirty_threshold);
        }
    }
}
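/*
 * Illustrative numbers: with throttle-trigger-threshold set to 50, a sync
 * period in which the guest dirtied more bytes than half of what was
 * transferred counts as "dirtying too fast"; two such periods in a row reset
 * the counter and call mig_throttle_guest_down() with the measured values.
 */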
static void migration_bitmap_sync(RAMState *rs)
{
    RAMBlock *block;
    int64_t end_time;

    ram_counters.dirty_sync_count++;

    if (!rs->time_last_bitmap_sync) {
        rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
    }

    trace_migration_bitmap_sync_start();
    memory_global_dirty_log_sync();

    qemu_mutex_lock(&rs->bitmap_mutex);
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(rs, block);
        }
        ram_counters.remaining = ram_bytes_remaining();
    }
    qemu_mutex_unlock(&rs->bitmap_mutex);

    memory_global_after_dirty_log_sync();
    trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);

    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);

    /* more than 1 second = 1000 milliseconds */
    if (end_time > rs->time_last_bitmap_sync + 1000) {
        migration_trigger_throttle(rs);

        migration_update_rates(rs, end_time);

        rs->target_page_count_prev = rs->target_page_count;

        /* reset period counters */
        rs->time_last_bitmap_sync = end_time;
        rs->num_dirty_pages_period = 0;
        rs->bytes_xfer_prev = ram_counters.transferred;
    }
    if (migrate_use_events()) {
        qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
    }
}
static void migration_bitmap_sync_precopy(RAMState *rs)
{
    Error *local_err = NULL;

    /*
     * The current notifier usage is just an optimization to migration, so we
     * don't stop the normal migration process in the error case.
     */
    if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
        local_err = NULL;
    }

    migration_bitmap_sync(rs);

    if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
        error_report_err(local_err);
    }
}
/**
 * save_zero_page_to_file: send the zero page to the file
 *
 * Returns the size of data written to the file, 0 means the page is not
 * a zero page
 *
 * @rs: current RAM state
 * @file: the file where the data is saved
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
                                  RAMBlock *block, ram_addr_t offset)
{
    uint8_t *p = block->host + offset;
    int len = 0;

    if (is_zero_range(p, TARGET_PAGE_SIZE)) {
        len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
        qemu_put_byte(file, 0);
        len += 1;
    }
    return len;
}
/**
 * save_zero_page: send the zero page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 */
static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    int len = save_zero_page_to_file(rs, rs->f, block, offset);

    if (len) {
        ram_counters.duplicate++;
        ram_counters.transferred += len;
        return 1;
    }
    return -1;
}

static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
{
    if (!migrate_release_ram() || !migration_in_postcopy()) {
        return;
    }

    ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
}
/*
 * @pages: the number of pages written by the control path,
 *        < 0 - error
 *        > 0 - number of pages written
 *
 * Return true if the page has been saved, otherwise false is returned.
 */
static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                              int *pages)
{
    uint64_t bytes_xmit = 0;
    int ret;

    *pages = -1;
    ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
                                &bytes_xmit);
    if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
        return false;
    }

    if (bytes_xmit) {
        ram_counters.transferred += bytes_xmit;
        *pages = 1;
    }

    if (ret == RAM_SAVE_CONTROL_DELAYED) {
        return true;
    }

    if (bytes_xmit > 0) {
        ram_counters.normal++;
    } else if (bytes_xmit == 0) {
        ram_counters.duplicate++;
    }

    return true;
}
/*
 * directly send the page to the stream
 *
 * Returns the number of pages written.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @buf: the page to be sent
 * @async: send the page asynchronously
 */
static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
                            uint8_t *buf, bool async)
{
    ram_counters.transferred += save_page_header(rs, rs->f, block,
                                                 offset | RAM_SAVE_FLAG_PAGE);
    if (async) {
        qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
                              migrate_release_ram() &
                              migration_in_postcopy());
    } else {
        qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
    }
    ram_counters.transferred += TARGET_PAGE_SIZE;
    ram_counters.normal++;
    return 1;
}
/**
 * ram_save_page: send the given page to the stream
 *
 * Returns the number of pages written.
 *          < 0 - error
 *          >=0 - Number of pages written - this might legally be 0
 *                if xbzrle noticed the page was the same.
 *
 * @rs: current RAM state
 * @block: block that contains the page we want to send
 * @offset: offset inside the block for the page
 * @last_stage: if we are at the completion stage
 */
static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
{
    int pages = -1;
    uint8_t *p;
    bool send_async = true;
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    ram_addr_t current_addr = block->offset + offset;

    p = block->host + offset;
    trace_ram_save_page(block->idstr, (uint64_t)offset, p);

    XBZRLE_cache_lock();
    if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
        migrate_use_xbzrle()) {
        pages = save_xbzrle_page(rs, &p, current_addr, block,
                                 offset, last_stage);
        if (!last_stage) {
            /* Can't send this cached data async, since the cache page
             * might get updated before it gets to the wire
             */
            send_async = false;
        }
    }

    /* XBZRLE overflow or normal page */
    if (pages == -1) {
        pages = save_normal_page(rs, block, offset, p, send_async);
    }

    XBZRLE_cache_unlock();

    return pages;
}
static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
                                 ram_addr_t offset)
{
    if (multifd_queue_page(rs->f, block, offset) < 0) {
        return -1;
    }
    ram_counters.normal++;

    return 1;
}
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
                                 ram_addr_t offset, uint8_t *source_buf)
{
    RAMState *rs = ram_state;
    uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
    bool zero_page = false;
    int ret;

    if (save_zero_page_to_file(rs, f, block, offset)) {
        zero_page = true;
        goto exit;
    }

    save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);

    /*
     * copy it to an internal buffer to avoid it being modified by the VM
     * so that we can catch any error during compression and
     * decompression
     */
    memcpy(source_buf, p, TARGET_PAGE_SIZE);
    ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
    if (ret < 0) {
        qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
        error_report("compressed data failed!");
        return false;
    }

exit:
    ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
    return zero_page;
}
static void
update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
{
    ram_counters.transferred += bytes_xmit;

    if (param->zero_page) {
        ram_counters.duplicate++;
        return;
    }

    /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
    compression_counters.compressed_size += bytes_xmit - 8;
    compression_counters.pages++;
}
*rs
);
1273 static void flush_compressed_data(RAMState
*rs
)
1275 int idx
, len
, thread_count
;
1277 if (!save_page_use_compression(rs
)) {
1280 thread_count
= migrate_compress_threads();
1282 qemu_mutex_lock(&comp_done_lock
);
1283 for (idx
= 0; idx
< thread_count
; idx
++) {
1284 while (!comp_param
[idx
].done
) {
1285 qemu_cond_wait(&comp_done_cond
, &comp_done_lock
);
1288 qemu_mutex_unlock(&comp_done_lock
);
1290 for (idx
= 0; idx
< thread_count
; idx
++) {
1291 qemu_mutex_lock(&comp_param
[idx
].mutex
);
1292 if (!comp_param
[idx
].quit
) {
1293 len
= qemu_put_qemu_file(rs
->f
, comp_param
[idx
].file
);
1295 * it's safe to fetch zero_page without holding comp_done_lock
1296 * as there is no further request submitted to the thread,
1297 * i.e, the thread should be waiting for a request at this point.
1299 update_compress_thread_counts(&comp_param
[idx
], len
);
1301 qemu_mutex_unlock(&comp_param
[idx
].mutex
);
static inline void set_compress_params(CompressParam *param, RAMBlock *block,
                                       ram_addr_t offset)
{
    param->block = block;
    param->offset = offset;
}

static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
                                           ram_addr_t offset)
{
    int idx, thread_count, bytes_xmit = -1, pages = -1;
    bool wait = migrate_compress_wait_thread();

    thread_count = migrate_compress_threads();
    qemu_mutex_lock(&comp_done_lock);
retry:
    for (idx = 0; idx < thread_count; idx++) {
        if (comp_param[idx].done) {
            comp_param[idx].done = false;
            bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
            qemu_mutex_lock(&comp_param[idx].mutex);
            set_compress_params(&comp_param[idx], block, offset);
            qemu_cond_signal(&comp_param[idx].cond);
            qemu_mutex_unlock(&comp_param[idx].mutex);
            pages = 1;
            update_compress_thread_counts(&comp_param[idx], bytes_xmit);
            break;
        }
    }

    /*
     * wait for the free thread if the user specifies 'compress-wait-thread',
     * otherwise we will post the page out in the main thread as normal page.
     */
    if (pages < 0 && wait) {
        qemu_cond_wait(&comp_done_cond, &comp_done_lock);
        goto retry;
    }
    qemu_mutex_unlock(&comp_done_lock);

    return pages;
}
/**
 * find_dirty_block: find the next dirty page and update any state
 * associated with the search process.
 *
 * Returns true if a page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 * @again: set to false if the search has scanned the whole of RAM
 */
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
    pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
    if (pss->complete_round && pss->block == rs->last_seen_block &&
        pss->page >= rs->last_page) {
        /*
         * We've been once around the RAM and haven't found anything.
         * Give up.
         */
        *again = false;
        return false;
    }
    if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
        >= pss->block->used_length) {
        /* Didn't find anything in this RAM Block */
        pss->page = 0;
        pss->block = QLIST_NEXT_RCU(pss->block, next);
        if (!pss->block) {
            /*
             * If memory migration starts over, we will meet a dirtied page
             * which may still exist in the compression threads' ring, so we
             * should flush the compressed data to make sure the new page
             * is not overwritten by the old one in the destination.
             *
             * Also, if xbzrle is on, stop using the data compression at this
             * point. In theory, xbzrle can do better than compression.
             */
            flush_compressed_data(rs);

            /* Hit the end of the list */
            pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
            /* Flag that we've looped */
            pss->complete_round = true;
            rs->ram_bulk_stage = false;
        }
        /* Didn't find anything this time, but try again on the new block */
        *again = true;
        return false;
    } else {
        /* Can go around again, but... */
        *again = true;
        /* We've found something so probably don't need to */
        return true;
    }
}
/**
 * unqueue_page: gets a page of the queue
 *
 * Helper for 'get_queued_page' - gets a page off the queue
 *
 * Returns the block of the page (or NULL if none available)
 *
 * @rs: current RAM state
 * @offset: used to return the offset within the RAMBlock
 */
static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
{
    RAMBlock *block = NULL;

    if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
        return NULL;
    }

    QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
    if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
        struct RAMSrcPageRequest *entry =
                                QSIMPLEQ_FIRST(&rs->src_page_requests);
        block = entry->rb;
        *offset = entry->offset;

        if (entry->len > TARGET_PAGE_SIZE) {
            entry->len -= TARGET_PAGE_SIZE;
            entry->offset += TARGET_PAGE_SIZE;
        } else {
            memory_region_unref(block->mr);
            QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
            g_free(entry);
            migration_consume_urgent_request();
        }
    }

    return block;
}
#if defined(__linux__)
/**
 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
 *   is found, return RAM block pointer and page offset
 *
 * Returns pointer to the RAMBlock containing faulting page,
 *   NULL if no write faults are pending
 *
 * @rs: current RAM state
 * @offset: page offset from the beginning of the block
 */
static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    struct uffd_msg uffd_msg;
    void *page_address;
    RAMBlock *bs;
    int res;

    if (!migrate_background_snapshot()) {
        return NULL;
    }

    res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
    if (res <= 0) {
        return NULL;
    }

    page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
    bs = qemu_ram_block_from_host(page_address, false, offset);
    assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
    return bs;
}
/**
 * ram_save_release_protection: release UFFD write protection after
 *   a range of pages has been saved
 *
 * @rs: current RAM state
 * @pss: page-search-status structure
 * @start_page: index of the first page in the range relative to pss->block
 *
 * Returns 0 on success, negative value in case of an error
 */
static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    int res = 0;

    /* Check if page is from UFFD-managed region. */
    if (pss->block->flags & RAM_UF_WRITEPROTECT) {
        void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
        uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;

        /* Flush async buffers before un-protect. */
        qemu_fflush(rs->f);
        /* Un-protect memory range. */
        res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
                false, false);
    }

    return res;
}
/* ram_write_tracking_available: check if kernel supports required UFFD features
 *
 * Returns true if supported, false otherwise
 */
bool ram_write_tracking_available(void)
{
    uint64_t uffd_features;
    int res;

    res = uffd_query_features(&uffd_features);
    return (res == 0 &&
            (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
}
/* ram_write_tracking_compatible: check if guest configuration is
 *   compatible with 'write-tracking'
 *
 * Returns true if compatible, false otherwise
 */
bool ram_write_tracking_compatible(void)
{
    const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
    int uffd_fd;
    RAMBlock *bs;
    bool ret = false;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
    if (uffd_fd < 0) {
        return false;
    }

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        uint64_t uffd_ioctls;

        /* Nothing to do with read-only and MMIO-writable regions */
        if (bs->mr->readonly || bs->mr->rom_device) {
            continue;
        }

        /* Try to register block memory via UFFD-IO to track writes */
        if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
                UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
            goto out;
        }
        if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
            goto out;
        }
    }
    ret = true;

out:
    uffd_close_fd(uffd_fd);

    return ret;
}
/*
 * ram_write_tracking_start: start UFFD-WP memory tracking
 *
 * Returns 0 for success or negative value in case of error
 */
int ram_write_tracking_start(void)
{
    int uffd_fd;
    RAMState *rs = ram_state;
    RAMBlock *bs;

    /* Open UFFD file descriptor */
    uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
    if (uffd_fd < 0) {
        return uffd_fd;
    }
    rs->uffdio_fd = uffd_fd;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        /* Nothing to do with read-only and MMIO-writable regions */
        if (bs->mr->readonly || bs->mr->rom_device) {
            continue;
        }

        /* Register block memory with UFFD to track writes */
        if (uffd_register_memory(rs->uffdio_fd, bs->host,
                bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
            goto fail;
        }

        /* Apply UFFD write protection to the block memory range */
        if (uffd_change_protection(rs->uffdio_fd, bs->host,
                bs->max_length, true, false)) {
            goto fail;
        }
        bs->flags |= RAM_UF_WRITEPROTECT;
        memory_region_ref(bs->mr);

        trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
                bs->host, bs->max_length);
    }

    return 0;

fail:
    error_report("ram_write_tracking_start() failed: restoring initial memory state");

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /*
         * In case some memory block failed to be write-protected
         * remove protection and unregister all succeeded RAM blocks
         */
        uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
        uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
        /* Cleanup flags and remove reference */
        bs->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(bs->mr);
    }

    uffd_close_fd(uffd_fd);
    rs->uffdio_fd = -1;
    return -1;
}
/*
 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
 */
void ram_write_tracking_stop(void)
{
    RAMState *rs = ram_state;
    RAMBlock *bs;

    RCU_READ_LOCK_GUARD();

    RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
        if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
            continue;
        }
        /* Remove protection and unregister all affected RAM blocks */
        uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
        uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);

        trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
                bs->host, bs->max_length);

        /* Cleanup flags and remove reference */
        bs->flags &= ~RAM_UF_WRITEPROTECT;
        memory_region_unref(bs->mr);
    }

    /* Finally close UFFD file descriptor */
    uffd_close_fd(rs->uffdio_fd);
    rs->uffdio_fd = -1;
}
#else
/* No target OS support, stubs just fail or ignore */

static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
{
    (void) rs;
    (void) offset;

    return NULL;
}

static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
        unsigned long start_page)
{
    (void) rs;
    (void) pss;
    (void) start_page;

    return 0;
}

bool ram_write_tracking_available(void)
{
    return false;
}

bool ram_write_tracking_compatible(void)
{
    assert(0);
    return false;
}

int ram_write_tracking_start(void)
{
    assert(0);
    return -1;
}

void ram_write_tracking_stop(void)
{
    assert(0);
}
#endif /* defined(__linux__) */
/**
 * get_queued_page: unqueue a page from the postcopy requests
 *
 * Skips pages that are already sent (!dirty)
 *
 * Returns true if a queued page is found
 *
 * @rs: current RAM state
 * @pss: data about the state of the current dirty page scan
 */
static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
{
    RAMBlock  *block;
    ram_addr_t offset;
    bool dirty;

    do {
        block = unqueue_page(rs, &offset);
        /*
         * We're sending this page, and since it's postcopy nothing else
         * will dirty it, and we must make sure it doesn't get sent again
         * even if this queue request was received after the background
         * search already sent it.
         */
        if (block) {
            unsigned long page;

            page = offset >> TARGET_PAGE_BITS;
            dirty = test_bit(page, block->bmap);
            if (!dirty) {
                trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
                                                page);
            } else {
                trace_get_queued_page(block->idstr, (uint64_t)offset, page);
            }
        }

    } while (block && !dirty);

    if (!block) {
        /*
         * Poll write faults too if background snapshot is enabled; that's
         * when vcpus have been blocked by write-protected pages.
         */
        block = poll_fault_page(rs, &offset);
    }

    if (block) {
        /*
         * As soon as we start servicing pages out of order, then we have
         * to kill the bulk stage, since the bulk stage assumes
         * in (migration_bitmap_find_and_reset_dirty) that every page is
         * dirty, that's no longer true.
         */
        rs->ram_bulk_stage = false;

        /*
         * We want the background search to continue from the queued page
         * since the guest is likely to want other pages near to the page
         * it just requested.
         */
        pss->block = block;
        pss->page = offset >> TARGET_PAGE_BITS;

        /*
         * This unqueued page would break the "one round" check, even if
         * it's really rare.
         */
        pss->complete_round = false;
    }

    return !!block;
}
/**
 * migration_page_queue_free: drop any remaining pages in the ram
 * request queue
 *
 * It should be empty at the end anyway, but in error cases there may
 * be some left.  In case any page is left, we drop it.
 */
static void migration_page_queue_free(RAMState *rs)
{
    struct RAMSrcPageRequest *mspr, *next_mspr;
    /* This queue generally should be empty - but in the case of a failed
     * migration might have some droppings in.
     */
    RCU_READ_LOCK_GUARD();
    QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
        memory_region_unref(mspr->rb->mr);
        QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
        g_free(mspr);
    }
}
/**
 * ram_save_queue_pages: queue the page for transmission
 *
 * A request from postcopy destination for example.
 *
 * Returns zero on success or negative on error
 *
 * @rbname: Name of the RAMBLock of the request. NULL means the
 *          same as the last one.
 * @start: starting address from the start of the RAMBlock
 * @len: length (in bytes) to send
 */
int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
{
    RAMBlock *ramblock;
    RAMState *rs = ram_state;

    ram_counters.postcopy_requests++;
    RCU_READ_LOCK_GUARD();

    if (!rbname) {
        /* Reuse last RAMBlock */
        ramblock = rs->last_req_rb;

        if (!ramblock) {
            /*
             * Shouldn't happen, we can't reuse the last RAMBlock if
             * it's the 1st request.
             */
            error_report("ram_save_queue_pages no previous block");
            return -1;
        }
    } else {
        ramblock = qemu_ram_block_by_name(rbname);

        if (!ramblock) {
            /* We shouldn't be asked for a non-existent RAMBlock */
            error_report("ram_save_queue_pages no block '%s'", rbname);
            return -1;
        }
        rs->last_req_rb = ramblock;
    }
    trace_ram_save_queue_pages(ramblock->idstr, start, len);
    if (start + len > ramblock->used_length) {
        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
                     __func__, start, len, ramblock->used_length);
        return -1;
    }

    struct RAMSrcPageRequest *new_entry =
        g_malloc0(sizeof(struct RAMSrcPageRequest));
    new_entry->rb = ramblock;
    new_entry->offset = start;
    new_entry->len = len;

    memory_region_ref(ramblock->mr);
    qemu_mutex_lock(&rs->src_page_req_mutex);
    QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
    migration_make_urgent_request();
    qemu_mutex_unlock(&rs->src_page_req_mutex);

    return 0;
}
static bool save_page_use_compression(RAMState *rs)
{
    if (!migrate_use_compression()) {
        return false;
    }

    /*
     * If xbzrle is on, stop using the data compression after first
     * round of migration even if compression is enabled. In theory,
     * xbzrle can do better than compression.
     */
    if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
        return true;
    }

    return false;
}

/*
 * try to compress the page before posting it out, return true if the page
 * has been properly handled by compression, otherwise needs other
 * paths to handle it
 */
static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
{
    if (!save_page_use_compression(rs)) {
        return false;
    }

    /*
     * When starting the process of a new block, the first page of
     * the block should be sent out before other pages in the same
     * block, and all the pages in last block should have been sent
     * out, keeping this order is important, because the 'cont' flag
     * is used to avoid resending the block name.
     *
     * We post the first page as a normal page as compression will take
     * much CPU resource.
     */
    if (block != rs->last_sent_block) {
        flush_compressed_data(rs);
        return false;
    }

    if (compress_page_with_multi_thread(rs, block, offset) > 0) {
        return true;
    }

    compression_counters.busy++;
    return false;
}
/**
 * ram_save_target_page: save one target page
 *
 * Returns the number of pages written
 *
 * @rs: current RAM state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
                                bool last_stage)
{
    RAMBlock *block = pss->block;
    ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
    int res;

    if (control_save_page(rs, block, offset, &res)) {
        return res;
    }

    if (save_compress_page(rs, block, offset)) {
        return 1;
    }

    res = save_zero_page(rs, block, offset);
    if (res > 0) {
        /* Must let xbzrle know, otherwise a previous (now 0'd) cached
         * page would be stale
         */
        if (!save_page_use_compression(rs)) {
            XBZRLE_cache_lock();
            xbzrle_cache_zero_page(rs, block->offset + offset);
            XBZRLE_cache_unlock();
        }
        ram_release_pages(block->idstr, offset, res);
        return res;
    }

    /*
     * Do not use multifd for:
     * 1. Compression as the first page in the new block should be posted out
     *    before sending the compressed page
     * 2. In postcopy as one whole host page should be placed
     */
    if (!save_page_use_compression(rs) && migrate_use_multifd()
        && !migration_in_postcopy()) {
        return ram_save_multifd_page(rs, block, offset);
    }

    return ram_save_page(rs, pss, last_stage);
}
/**
 * ram_save_host_page: save a whole host page
 *
 * Starting at *offset send pages up to the end of the current host
 * page. It's valid for the initial offset to point into the middle of
 * a host page in which case the remainder of the hostpage is sent.
 * Only dirty target pages are sent. Note that the host page size may
 * be a huge page for this block.
 * The saving stops at the boundary of the used_length of the block
 * if the RAMBlock isn't a multiple of the host page size.
 *
 * Returns the number of pages written or negative on error
 *
 * @rs: current RAM state
 * @ms: current migration state
 * @pss: data about the page we want to send
 * @last_stage: if we are at the completion stage
 */
static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
                              bool last_stage)
{
    int tmppages, pages = 0;
    size_t pagesize_bits =
        qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
    unsigned long start_page = pss->page;
    int res;

    if (ramblock_is_ignored(pss->block)) {
        error_report("block %s should not be migrated !", pss->block->idstr);
        return 0;
    }

    do {
        /* Check if the page is dirty and if it is, send it */
        if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
            pss->page++;
            continue;
        }

        tmppages = ram_save_target_page(rs, pss, last_stage);
        if (tmppages < 0) {
            return tmppages;
        }

        pages += tmppages;
        pss->page++;
        /* Allow rate limiting to happen in the middle of huge pages */
        migration_rate_limit();
    } while ((pss->page & (pagesize_bits - 1)) &&
             offset_in_ramblock(pss->block,
                                ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
    /* The offset we leave with is the last one we looked at */
    pss->page--;

    res = ram_save_release_protection(rs, pss, start_page);
    return (res < 0 ? res : pages);
}
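/*
 * Concretely (one possible configuration): with 4 KiB target pages on top of
 * a 2 MiB hugepage-backed block, pagesize_bits is 512, so a single call can
 * walk up to 512 target pages and the destination receives the whole host
 * page together, which is what postcopy placement requires.
 */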
/**
 * ram_find_and_save_block: finds a dirty page and sends it to f
 *
 * Called within an RCU critical section.
 *
 * Returns the number of pages written where zero means no dirty pages,
 * or negative on error
 *
 * @rs: current RAM state
 * @last_stage: if we are at the completion stage
 *
 * On systems where host-page-size > target-page-size it will send all the
 * pages in a host page that are dirty.
 */
static int ram_find_and_save_block(RAMState *rs, bool last_stage)
{
    PageSearchStatus pss;
    int pages = 0;
    bool again, found;

    /* No dirty page as there is zero RAM */
    if (!ram_bytes_total()) {
        return pages;
    }

    pss.block = rs->last_seen_block;
    pss.page = rs->last_page;
    pss.complete_round = false;

    if (!pss.block) {
        pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
    }

    do {
        again = true;
        found = get_queued_page(rs, &pss);

        if (!found) {
            /* priority queue empty, so just search for something dirty */
            found = find_dirty_block(rs, &pss, &again);
        }

        if (found) {
            pages = ram_save_host_page(rs, &pss, last_stage);
        }
    } while (!pages && again);

    rs->last_seen_block = pss.block;
    rs->last_page = pss.page;

    return pages;
}
void acct_update_position(QEMUFile *f, size_t size, bool zero)
{
    uint64_t pages = size / TARGET_PAGE_SIZE;

    if (zero) {
        ram_counters.duplicate += pages;
    } else {
        ram_counters.normal += pages;
        ram_counters.transferred += size;
        qemu_update_position(f, size);
    }
}

static uint64_t ram_bytes_total_common(bool count_ignored)
{
    RAMBlock *block;
    uint64_t total = 0;

    RCU_READ_LOCK_GUARD();

    if (count_ignored) {
        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            total += block->used_length;
        }
    } else {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            total += block->used_length;
        }
    }
    return total;
}

uint64_t ram_bytes_total(void)
{
    return ram_bytes_total_common(false);
}
static void xbzrle_load_setup(void)
{
    XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
}

static void xbzrle_load_cleanup(void)
{
    g_free(XBZRLE.decoded_buf);
    XBZRLE.decoded_buf = NULL;
}

static void ram_state_cleanup(RAMState **rsp)
{
    if (*rsp) {
        migration_page_queue_free(*rsp);
        qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
        qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
        g_free(*rsp);
        *rsp = NULL;
    }
}
static void xbzrle_cleanup(void)
{
    XBZRLE_cache_lock();
    if (XBZRLE.cache) {
        cache_fini(XBZRLE.cache);
        g_free(XBZRLE.encoded_buf);
        g_free(XBZRLE.current_buf);
        g_free(XBZRLE.zero_target_page);
        XBZRLE.cache = NULL;
        XBZRLE.encoded_buf = NULL;
        XBZRLE.current_buf = NULL;
        XBZRLE.zero_target_page = NULL;
    }
    XBZRLE_cache_unlock();
}
static void ram_save_cleanup(void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    /* We don't use dirty log with background snapshots */
    if (!migrate_background_snapshot()) {
        /* the caller holds the iothread lock or is in a bh, so there is
         * no writing race against the migration bitmap
         */
        memory_global_dirty_log_stop();
    }

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->clear_bmap);
        block->clear_bmap = NULL;
        g_free(block->bmap);
        block->bmap = NULL;
    }

    xbzrle_cleanup();
    compress_threads_save_cleanup();
    ram_state_cleanup(rsp);
}
static void ram_state_reset(RAMState *rs)
{
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    rs->ram_bulk_stage = true;
    rs->fpo_enabled = false;
}

#define MAX_WAIT 50 /* ms, half buffered_file limit */
/*
 * 'expected' is the value you expect the bitmap mostly to be full
 * of; it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}

/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 *
 * @ms: current migration state
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *bitmap = block->bmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(bitmap, end, current);
        unsigned long zero, discard_length;

        if (one >= end) {
            break;
        }

        zero = find_next_zero_bit(bitmap, end, one + 1);

        if (zero >= end) {
            discard_length = end - one;
        } else {
            discard_length = zero - one;
        }
        postcopy_discard_send_range(ms, one, discard_length);
        current = one + discard_length;
    }

    return 0;
}

/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, block);
        postcopy_discard_send_finish(ms);
        if (ret) {
            return ret;
        }
    }

    return 0;
}

/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {

        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}

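/*
 * Worked example for the pass above: with 2 MiB hugepages backing the block
 * and 4 KiB target pages, host_ratio is 512.  A dirty run that starts or
 * ends in the middle of such a host page is rounded out so that all 512
 * target pages of that host page are marked dirty and handled together.
 */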
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /*
     * Ensure that all partially dirty host pages are made fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, block);

    postcopy_discard_send_finish(ms);
    return 0;
}

/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target;
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            return ret;
        }

#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(block->bmap, true,
                              block->used_length >> TARGET_PAGE_BITS);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    return postcopy_each_ram_send_discard(ms);
}

/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same that last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    trace_ram_discard_range(rbname, start, length);

    RCU_READ_LOCK_GUARD();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        return -1;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    return ram_block_discard_range(rb, start, length);
}

/*
 * For every allocation, we will try not to crash the VM if an
 * allocation fails.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}

static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     * This must match with the initial values of dirty bitmap.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
    ram_state_reset(*rsp);

    return 0;
}

static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to the
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when we restart a
             * new migration after a failed one,
             * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] does not include
             * the whole guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}

static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start();
            migration_bitmap_sync_precopy(rs);
        }
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}

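/*
 * Prepare the RAMState for a postcopy resume: recount the dirty pages from
 * the per-block bitmaps and restart the page search from scratch.
 */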
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since the source is already halted, we don't need to care
     * about dirty page logging either.
     */

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /*
     * Disable the bulk stage, otherwise we'll resend the whole RAM no
     * matter what we have sent.
     */
    rs->ram_bulk_stage = false;

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}

/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}

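/*
 * Rough sketch of the stream layout that ram_save_setup() below emits (the
 * authoritative definition is the code itself):
 *
 *   be64: total ram size | RAM_SAVE_FLAG_MEM_SIZE
 *   for each migratable block:
 *     byte:  strlen(idstr)
 *     bytes: idstr
 *     be64:  used_length
 *     be64:  page_size   (only with postcopy enabled and a page size that
 *                         differs from the host page size)
 *     be64:  mr->addr    (only with ignore-shared enabled)
 *   be64: RAM_SAVE_FLAG_EOS
 */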
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main(f);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}

/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
                !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * we want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check every
             * few iterations
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}

/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        if (!migration_in_postcopy()) {
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
            /* no more blocks to send */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret >= 0) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
    }

    return ret;
}

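/*
 * ram_save_pending: report how much data is still dirty.  When the estimate
 * drops below @max_size the bitmap is re-synced (under the iothread lock) so
 * the caller decides on completion with fresh numbers.
 */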
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}

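/*
 * An XBZRLE page on the wire is a one byte ENCODING_FLAG_XBZRLE marker,
 * a be16 encoded length, and then the encoded data itself, which is applied
 * on top of the current page contents at @host.
 */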
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}

/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}

static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}

static inline void *colo_cache_from_block_offset(RAMBlock *block,
                             ram_addr_t offset, bool record_bitmap)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }
    if (!block->colo_cache) {
        error_report("%s: colo_cache is NULL in block :%s",
                     __func__, block->idstr);
        return NULL;
    }

    /*
     * During colo checkpoint, we need the bitmap of these migrated pages.
     * It helps us to decide which pages in ram cache should be flushed
     * into VM's RAM later.
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}

/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}

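/*
 * The helpers below implement the receive side of compressed pages: a pool
 * of decompress worker threads, each with its own z_stream and
 * DecompressParam slot, fed by decompress_data_with_multi_threads().
 */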
/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}

static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}

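/*
 * Block until every decompress worker has marked its slot done, then return
 * any error that was recorded on decomp_file.
 */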
static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}

static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}

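/*
 * Allocate the decompress thread pool; on any failure the partially
 * initialised state is torn down again via compress_threads_load_cleanup().
 */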
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}

static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    QEMU_LOCK_GUARD(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
}

/*
 * we must set ram_bulk_stage to false, otherwise in
 * migration_bitmap_find_dirty the bitmap will be unused and
 * all the pages in ram cache will be flushed to the ram of
 * secondary VM.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
    ram_state->ram_bulk_stage = false;
}

/*
 * colo cache: this is for the secondary VM; we cache the whole
 * memory of the secondary VM, and the global lock needs to be held
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL, false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
        }
    }

    /*
     * Record the dirty pages that are sent by the PVM; we use this dirty
     * bitmap to decide which pages in the cache should be flushed into the
     * SVM's RAM. Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}

/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start();
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}

/* The global lock needs to be held to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}

/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}

static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}

/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}

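/*
 * Note on the postcopy receive path below: target pages belonging to one
 * host page are accumulated in mis->postcopy_tmp_page and only placed into
 * guest memory atomically once the whole host page has arrived.
 */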
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    void *this_host = NULL;
    bool all_zero = true;
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            if (target_pages == 1) {
                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                    block->page_size);
            } else {
                /* not the 1st TP within the HP */
                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
                    (uintptr_t)this_host) {
                    error_report("Non-same host page %p/%p",
                                 host, this_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that match the target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                       block->page_size);

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
            place_needed = false;
            target_pages = 0;
            /* Assume we have a zero page until we detect something different */
            all_zero = true;
        }
    }

    return ret;
}

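/* Helpers to query the incoming postcopy state machine. */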
static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}

/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that were dirtied by the PVM or SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            offset = migration_bitmap_find_dirty(ram_state, block, offset);

            if (((ram_addr_t)offset) << TARGET_PAGE_BITS
                >= block->used_length) {
                offset = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                migration_bitmap_clear_dirty(ram_state, block, offset);
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}

/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let main loop run, but an iteration of
         * the main loop is expensive, so do it only every few iterations
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly; we put it into colo_cache first.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all of this memory in the COLO preparing
             * stage, during which the VM had to be stopped, which is
             * time-consuming.  Here we optimize it by backing up every page
             * during the migration process while COLO is enabled.  Although
             * this affects the speed of the migration, it clearly reduces
             * the downtime caused by backing up all of the SVM's memory in
             * the COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                   /*
                    * In migration stage but before COLO stage,
                    * Put all pages into both cache and SVM's memory.
                    */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}

static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}

static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}

/* Sync all the dirty bitmap with destination VM.  */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmap synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}

static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}

/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match with our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If this
     * is the last one to sync, we need to notify the main send thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}

static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}

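/*
 * The RAM save/load entry points above are wired up here and registered with
 * the generic savevm code by ram_mig_init().
 */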
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
}