4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 #include "qemu/osdep.h"
31 #include "qemu/cutils.h"
32 #include "qemu/bitops.h"
33 #include "qemu/bitmap.h"
34 #include "qemu/main-loop.h"
37 #include "migration.h"
38 #include "migration/register.h"
39 #include "migration/misc.h"
40 #include "qemu-file.h"
41 #include "postcopy-ram.h"
42 #include "page_cache.h"
43 #include "qemu/error-report.h"
44 #include "qapi/error.h"
45 #include "qapi/qapi-types-migration.h"
46 #include "qapi/qapi-events-migration.h"
47 #include "qapi/qmp/qerror.h"
49 #include "exec/ram_addr.h"
50 #include "exec/target_page.h"
51 #include "qemu/rcu_queue.h"
52 #include "migration/colo.h"
54 #include "sysemu/sysemu.h"
55 #include "sysemu/cpu-throttle.h"
59 #include "sysemu/runstate.h"
61 #if defined(__linux__)
62 #include "qemu/userfaultfd.h"
63 #endif /* defined(__linux__) */
65 /***********************************************************/
66 /* ram save/restore */
68 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
69 * worked for pages that were filled with the same char. We switched
70 * it to only search for the zero value. And to avoid confusion with
71 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
74 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
75 #define RAM_SAVE_FLAG_ZERO 0x02
76 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
77 #define RAM_SAVE_FLAG_PAGE 0x08
78 #define RAM_SAVE_FLAG_EOS 0x10
79 #define RAM_SAVE_FLAG_CONTINUE 0x20
80 #define RAM_SAVE_FLAG_XBZRLE 0x40
81 /* 0x80 is reserved in migration.h; start with 0x100 next */
82 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
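/*
 * Worked example of how these flags are used by save_page_header() below:
 * a normal page at offset 0x2000 of the block we are already sending goes
 * on the wire as the big-endian 64-bit value
 * 0x2000 | RAM_SAVE_FLAG_PAGE | RAM_SAVE_FLAG_CONTINUE = 0x2028,
 * followed by the raw page contents; the block idstr is only emitted when
 * RAM_SAVE_FLAG_CONTINUE is absent.
 */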
84 static inline bool is_zero_range(uint8_t *p, uint64_t size)
86 return buffer_is_zero(p, size);
89 XBZRLECacheStats xbzrle_counters;
91 /* struct contains XBZRLE cache and a static page
92 used by the compression */
94 /* buffer used for XBZRLE encoding */
96 /* buffer for storing page content */
98 /* Cache for XBZRLE, Protected by lock. */
101 /* it will store a page full of zeros */
102 uint8_t *zero_target_page;
103 /* buffer used for XBZRLE decoding */
104 uint8_t *decoded_buf;
107 static void XBZRLE_cache_lock(void)
109 if (migrate_use_xbzrle()) {
110 qemu_mutex_lock(&XBZRLE.lock);
114 static void XBZRLE_cache_unlock(void)
116 if (migrate_use_xbzrle()) {
117 qemu_mutex_unlock(&XBZRLE.lock);
122 * xbzrle_cache_resize: resize the xbzrle cache
124 * This function is called from qmp_migrate_set_cache_size in main
125 * thread, possibly while a migration is in progress. A running
126 * migration may be using the cache and might finish during this call,
127 * hence changes to the cache are protected by XBZRLE.lock().
129 * Returns 0 for success or -1 for error
131 * @new_size: new cache size
132 * @errp: set *errp if the check failed, with reason
134 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
136 PageCache *new_cache;
139 /* Check for truncation */
140 if (new_size != (size_t)new_size) {
141 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
142 "exceeding address space");
146 if (new_size == migrate_xbzrle_cache_size()) {
153 if (XBZRLE.cache != NULL) {
154 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
160 cache_fini(XBZRLE.cache);
161 XBZRLE.cache = new_cache;
164 XBZRLE_cache_unlock();
168 bool ramblock_is_ignored(RAMBlock *block)
170 return !qemu_ram_is_migratable(block) ||
171 (migrate_ignore_shared() && qemu_ram_is_shared(block));
174 #undef RAMBLOCK_FOREACH
176 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
181 RCU_READ_LOCK_GUARD();
183 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
184 ret = func(block, opaque);
192 static void ramblock_recv_map_init(void)
196 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
197 assert(!rb->receivedmap);
198 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
202 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
204 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
208 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
210 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
213 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
215 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
218 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
221 bitmap_set_atomic(rb->receivedmap,
222 ramblock_recv_bitmap_offset(host_addr, rb),
226 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
229 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
231 * Returns >0 if success with sent bytes, or <0 if error.
233 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
234 const char *block_name)
236 RAMBlock *block = qemu_ram_block_by_name(block_name);
237 unsigned long *le_bitmap, nbits;
241 error_report("%s: invalid block name: %s", __func__, block_name);
245 nbits = block->used_length >> TARGET_PAGE_BITS;
248 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
249 * machines we may need 4 more bytes for padding (see below
250 * comment). So extend it a bit beforehand.
252 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
255 * Always use little endian when sending the bitmap. This is
256 * required when source and destination VMs are not using the
257 * same endianness. (Note: big endian won't work.)
259 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
261 /* Size of the bitmap, in bytes */
262 size = DIV_ROUND_UP(nbits, 8);
265 * size is always aligned to 8 bytes for 64bit machines, but it
266 * may not be true for 32bit machines. We need this padding to
267 * make sure the migration can survive even between 32bit and
270 size = ROUND_UP(size, 8);
272 qemu_put_be64(file, size);
273 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
275 * Mark as an end, in case the middle part is screwed up due to
276 * some "mysterious" reason.
278 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
283 if (qemu_file_get_error(file)) {
284 return qemu_file_get_error(file);
287 return size + sizeof(size);
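/*
 * Illustration of the wire format produced above, assuming 4 KiB target
 * pages: a 1 GiB block has nbits = 262144, so the stream carries the
 * 8-byte size field (32768), 32768 bytes of little-endian bitmap data,
 * and the 8-byte RAMBLOCK_RECV_BITMAP_ENDING marker; the function then
 * returns 32768 + 8.
 */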
291 * An outstanding page request, on the source, having been received
294 struct RAMSrcPageRequest {
299 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
302 /* State of RAM for migration */
304 /* QEMUFile used for this migration */
306 /* UFFD file descriptor, used in 'write-tracking' migration */
308 /* Last block that we have visited searching for dirty pages */
309 RAMBlock *last_seen_block;
310 /* Last block from where we have sent data */
311 RAMBlock *last_sent_block;
312 /* Last dirty target page we have sent */
313 ram_addr_t last_page;
314 /* last ram version we have seen */
315 uint32_t last_version;
316 /* We are in the first round */
318 /* The free page optimization is enabled */
320 /* How many times we have dirtied too many pages */
321 int dirty_rate_high_cnt;
322 /* these variables are used for bitmap sync */
323 /* last time we did a full bitmap_sync */
324 int64_t time_last_bitmap_sync;
325 /* bytes transferred at start_time */
326 uint64_t bytes_xfer_prev;
327 /* number of dirty pages since start_time */
328 uint64_t num_dirty_pages_period;
329 /* xbzrle misses since the beginning of the period */
330 uint64_t xbzrle_cache_miss_prev;
331 /* Amount of xbzrle pages since the beginning of the period */
332 uint64_t xbzrle_pages_prev;
333 /* Amount of xbzrle encoded bytes since the beginning of the period */
334 uint64_t xbzrle_bytes_prev;
336 /* compression statistics since the beginning of the period */
337 /* number of times no free thread was available to compress data */
338 uint64_t compress_thread_busy_prev;
339 /* amount of bytes after compression */
340 uint64_t compressed_size_prev;
341 /* number of compressed pages */
342 uint64_t compress_pages_prev;
344 /* total handled target pages at the beginning of period */
345 uint64_t target_page_count_prev;
346 /* total handled target pages since start */
347 uint64_t target_page_count;
348 /* number of dirty bits in the bitmap */
349 uint64_t migration_dirty_pages;
350 /* Protects modification of the bitmap and migration dirty pages */
351 QemuMutex bitmap_mutex;
352 /* The RAMBlock used in the last src_page_requests */
353 RAMBlock *last_req_rb;
354 /* Queue of outstanding page requests from the destination */
355 QemuMutex src_page_req_mutex;
356 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
358 typedef struct RAMState RAMState;
360 static RAMState *ram_state;
362 static NotifierWithReturnList precopy_notifier_list;
364 void precopy_infrastructure_init(void)
366 notifier_with_return_list_init(&precopy_notifier_list);
369 void precopy_add_notifier(NotifierWithReturn *n)
371 notifier_with_return_list_add(&precopy_notifier_list, n);
374 void precopy_remove_notifier(NotifierWithReturn *n)
376 notifier_with_return_remove(n);
379 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
381 PrecopyNotifyData pnd;
385 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
388 void precopy_enable_free_page_optimization(void)
394 ram_state->fpo_enabled = true;
397 uint64_t ram_bytes_remaining(void)
399 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) :
403 MigrationStats ram_counters;
405 /* used by the search for pages to send */
406 struct PageSearchStatus {
407 /* Current block being searched */
409 /* Current page to search from */
411 /* Set once we wrap around */
414 typedef struct PageSearchStatus PageSearchStatus;
416 CompressionStats compression_counters;
418 struct CompressParam {
428 /* internally used fields */
432 typedef struct CompressParam CompressParam;
434 struct DecompressParam {
444 typedef struct DecompressParam DecompressParam;
446 static CompressParam *comp_param;
447 static QemuThread *compress_threads;
448 /* comp_done_cond is used to wake up the migration thread when
449 * one of the compression threads has finished the compression.
450 * comp_done_lock is used to co-work with comp_done_cond.
452 static QemuMutex comp_done_lock;
453 static QemuCond comp_done_cond;
454 /* The empty QEMUFileOps will be used by file in CompressParam */
455 static const QEMUFileOps empty_ops = { };
457 static QEMUFile *decomp_file;
458 static DecompressParam *decomp_param;
459 static QemuThread *decompress_threads;
460 static QemuMutex decomp_done_lock;
461 static QemuCond decomp_done_cond;
463 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
464 ram_addr_t offset, uint8_t *source_buf);
466 static void *do_data_compress(void *opaque)
468 CompressParam *param = opaque;
473 qemu_mutex_lock(&param->mutex);
474 while (!param->quit) {
476 block = param->block;
477 offset = param->offset;
479 qemu_mutex_unlock(&param->mutex);
481 zero_page = do_compress_ram_page(param->file, &param->stream,
482 block, offset, param->originbuf);
484 qemu_mutex_lock(&comp_done_lock);
486 param->zero_page = zero_page;
487 qemu_cond_signal(&comp_done_cond);
488 qemu_mutex_unlock(&comp_done_lock);
490 qemu_mutex_lock(&param->mutex);
492 qemu_cond_wait(&param->cond, &param->mutex);
495 qemu_mutex_unlock(&param->mutex);
500 static void compress_threads_save_cleanup(void)
504 if (!migrate_use_compression() || !comp_param) {
508 thread_count = migrate_compress_threads();
509 for (i = 0; i < thread_count; i++) {
511 * we use it as an indicator which shows if the thread is
512 * properly init'd or not
514 if (!comp_param[i].file) {
518 qemu_mutex_lock(&comp_param[i].mutex);
519 comp_param[i].quit = true;
520 qemu_cond_signal(&comp_param[i].cond);
521 qemu_mutex_unlock(&comp_param[i].mutex);
523 qemu_thread_join(compress_threads + i);
524 qemu_mutex_destroy(&comp_param[i].mutex);
525 qemu_cond_destroy(&comp_param[i].cond);
526 deflateEnd(&comp_param[i].stream);
527 g_free(comp_param[i].originbuf);
528 qemu_fclose(comp_param[i].file);
529 comp_param[i].file = NULL;
531 qemu_mutex_destroy(&comp_done_lock);
532 qemu_cond_destroy(&comp_done_cond);
533 g_free(compress_threads);
535 compress_threads = NULL;
539 static int compress_threads_save_setup(void)
543 if (!migrate_use_compression()) {
546 thread_count = migrate_compress_threads();
547 compress_threads = g_new0(QemuThread, thread_count);
548 comp_param = g_new0(CompressParam, thread_count);
549 qemu_cond_init(&comp_done_cond);
550 qemu_mutex_init(&comp_done_lock);
551 for (i = 0; i < thread_count; i++) {
552 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
553 if (!comp_param[i].originbuf) {
557 if (deflateInit(&comp_param[i].stream,
558 migrate_compress_level()) != Z_OK) {
559 g_free(comp_param[i].originbuf);
563 /* comp_param[i].file is just used as a dummy buffer to save data,
564 * set its ops to empty.
566 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
567 comp_param[i].done = true;
568 comp_param[i].quit = false;
569 qemu_mutex_init(&comp_param[i].mutex);
570 qemu_cond_init(&comp_param[i].cond);
571 qemu_thread_create(compress_threads + i, "compress",
572 do_data_compress, comp_param + i,
573 QEMU_THREAD_JOINABLE);
578 compress_threads_save_cleanup();
583 * save_page_header: write page header to wire
585 * If this is the 1st block, it also writes the block identification
587 * Returns the number of bytes written
589 * @f: QEMUFile where to send the data
590 * @block: block that contains the page we want to send
591 * @offset: offset inside the block for the page
592 * in the lower bits, it contains flags
594 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
599 if (block == rs->last_sent_block) {
600 offset |= RAM_SAVE_FLAG_CONTINUE;
602 qemu_put_be64(f, offset);
605 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
606 len = strlen(block->idstr);
607 qemu_put_byte(f, len);
608 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
610 rs->last_sent_block = block;
616 * mig_throttle_guest_down: throttle down the guest
618 * Reduce amount of guest cpu execution to hopefully slow down memory
619 * writes. If guest dirty memory rate is reduced below the rate at
620 * which we can transfer pages to the destination then we should be
621 * able to complete migration. Some workloads dirty memory way too
622 * fast and will not effectively converge, even with auto-converge.
624 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
625 uint64_t bytes_dirty_threshold)
627 MigrationState *s = migrate_get_current();
628 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
629 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
630 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
631 int pct_max = s->parameters.max_cpu_throttle;
633 uint64_t throttle_now = cpu_throttle_get_percentage();
634 uint64_t cpu_now, cpu_ideal, throttle_inc;
636 /* We have not started throttling yet. Let's start it. */
637 if (!cpu_throttle_active()) {
638 cpu_throttle_set(pct_initial);
640 /* Throttling already on, just increase the rate */
642 throttle_inc = pct_increment;
644 /* Compute the ideal CPU percentage used by Guest, which may
645 * make the dirty rate match the dirty rate threshold. */
646 cpu_now = 100 - throttle_now;
647 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
649 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
651 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
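/*
 * Example of the computation above: with the throttle currently at 20%
 * the guest keeps cpu_now = 80; if the period dirtied roughly twice as
 * many bytes as the threshold allows, cpu_ideal works out to about
 * 80 * 0.5 = 40, so the throttle is raised by at most
 * MIN(80 - 40, cpu_throttle_increment) points, capped at max_cpu_throttle.
 */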
656 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
658 * @rs: current RAM state
659 * @current_addr: address for the zero page
661 * Update the xbzrle cache to reflect a page that's been sent as all 0.
662 * The important thing is that a stale (not-yet-0'd) page be replaced
664 * As a bonus, if the page wasn't in the cache it gets added so that
665 * when a small write is made into the 0'd page it gets XBZRLE sent.
667 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
669 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
673 /* We don't care if this fails to allocate a new cache page
674 * as long as it updated an old one */
675 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
676 ram_counters.dirty_sync_count);
679 #define ENCODING_FLAG_XBZRLE 0x1
682 * save_xbzrle_page: compress and send current page
684 * Returns: 1 means that we wrote the page
685 * 0 means that page is identical to the one already sent
686 * -1 means that xbzrle would be longer than normal
688 * @rs: current RAM state
689 * @current_data: pointer to the address of the page contents
690 * @current_addr: addr of the page
691 * @block: block that contains the page we want to send
692 * @offset: offset inside the block for the page
693 * @last_stage: if we are at the completion stage
695 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
696 ram_addr_t current_addr, RAMBlock *block,
697 ram_addr_t offset, bool last_stage)
699 int encoded_len = 0, bytes_xbzrle;
700 uint8_t *prev_cached_page;
702 if (!cache_is_cached(XBZRLE.cache, current_addr,
703 ram_counters.dirty_sync_count)) {
704 xbzrle_counters.cache_miss++;
706 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
707 ram_counters.dirty_sync_count) == -1) {
710 /* update *current_data when the page has been
711 inserted into cache */
712 *current_data = get_cached_data(XBZRLE.cache, current_addr);
719 * Reaching here means the page has hit the xbzrle cache, no matter what
720 * encoding result it is (normal encoding, overflow or skipping the page),
721 * count the page as encoded. This is used to calculate the encoding rate.
723 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
724 * 2nd page turns out to be skipped (i.e. no new bytes written to the
725 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
726 * skipped page included. In this way, the encoding rate can tell if the
727 * guest page is good for xbzrle encoding.
729 xbzrle_counters.pages++;
730 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
732 /* save current buffer into memory */
733 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
735 /* XBZRLE encoding (if there is no overflow) */
736 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
737 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
741 * Update the cache contents, so that it corresponds to the data
742 * sent, in all cases except where we skip the page.
744 if (!last_stage && encoded_len != 0) {
745 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
747 * In the case where we couldn't compress, ensure that the caller
748 * sends the data from the cache, since the guest might have
749 * changed the RAM since we copied it.
751 *current_data = prev_cached_page;
754 if (encoded_len == 0) {
755 trace_save_xbzrle_page_skipping();
757 } else if (encoded_len == -1) {
758 trace_save_xbzrle_page_overflow();
759 xbzrle_counters.overflow++;
760 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
764 /* Send XBZRLE based compressed page */
765 bytes_xbzrle = save_page_header(rs, rs->f, block,
766 offset | RAM_SAVE_FLAG_XBZRLE);
767 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
768 qemu_put_be16(rs->f, encoded_len);
769 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
770 bytes_xbzrle += encoded_len + 1 + 2;
772 * Like compressed_size (please see update_compress_thread_counts),
773 * the xbzrle encoded bytes don't count the 8 byte header with
774 * RAM_SAVE_FLAG_CONTINUE.
776 xbzrle_counters.bytes += bytes_xbzrle - 8;
777 ram_counters.transferred += bytes_xbzrle;
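/*
 * Accounting example for the block above: an 8-byte page header (the
 * RAM_SAVE_FLAG_CONTINUE case) plus the 1-byte ENCODING_FLAG_XBZRLE,
 * the 2-byte length and, say, 500 bytes of encoded data gives
 * bytes_xbzrle = 511; xbzrle_counters.bytes grows by 503 (header
 * excluded) while ram_counters.transferred grows by the full 511.
 */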
783 * migration_bitmap_find_dirty: find the next dirty page from start
785 * Returns the page offset within memory region of the start of a dirty page
787 * @rs: current RAM state
788 * @rb: RAMBlock where to search for dirty pages
789 * @start: page where we start the search
792 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
795 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
796 unsigned long *bitmap = rb->bmap;
799 if (ramblock_is_ignored(rb)) {
804 * When the free page optimization is enabled, we need to check the bitmap
805 * to send the non-free pages rather than all the pages in the bulk stage.
807 if (!rs->fpo_enabled && rs->ram_bulk_stage && start > 0) {
810 next = find_next_bit(bitmap, size, start);
816 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
822 qemu_mutex_lock(&rs->bitmap_mutex);
825 * Clear dirty bitmap if needed. This _must_ be called before we
826 * send any of the pages in the chunk because we need to make sure
827 * we can capture further page content changes when we sync dirty
828 * log the next time. So as long as we are going to send any of
829 * the pages in the chunk we clear the remote dirty bitmap for all.
830 * Clearing it earlier won't be a problem, but too late will.
832 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
833 uint8_t shift = rb->clear_bmap_shift;
834 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
835 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
838 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
839 * can make things easier sometimes since then start address
840 * of the small chunk will always be 64 pages aligned so the
841 * bitmap will always be aligned to unsigned long. We should
842 * even be able to remove this restriction but I'm simply
846 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
847 memory_region_clear_dirty_bitmap(rb->mr, start, size);
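/*
 * Example of the chunk size involved: with 4 KiB target pages and a
 * clear_bmap_shift of 18 (a typical default), one clear_bmap bit covers
 * 1ULL << (12 + 18) = 1 GiB, so sending the first dirty page of such a
 * chunk triggers a single dirty-log clear for the whole 1 GiB range.
 */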
850 ret = test_and_clear_bit(page, rb->bmap);
853 rs->migration_dirty_pages--;
855 qemu_mutex_unlock(&rs->bitmap_mutex);
860 /* Called with RCU critical section */
861 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
863 uint64_t new_dirty_pages =
864 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
866 rs->migration_dirty_pages += new_dirty_pages;
867 rs->num_dirty_pages_period += new_dirty_pages;
871 * ram_pagesize_summary: calculate all the pagesizes of a VM
873 * Returns a summary bitmap of the page sizes of all RAMBlocks
875 * For VMs with just normal pages this is equivalent to the host page
876 * size. If it's got some huge pages then it's the OR of all the
877 * different page sizes.
879 uint64_t ram_pagesize_summary(void)
882 uint64_t summary = 0;
884 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
885 summary |= block->page_size;
891 uint64_t ram_get_total_transferred_pages(void)
893 return ram_counters.normal + ram_counters.duplicate +
894 compression_counters.pages + xbzrle_counters.pages;
897 static void migration_update_rates(RAMState *rs, int64_t end_time)
899 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
900 double compressed_size;
902 /* calculate period counters */
903 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
904 / (end_time - rs->time_last_bitmap_sync);
910 if (migrate_use_xbzrle()) {
911 double encoded_size, unencoded_size;
913 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
914 rs->xbzrle_cache_miss_prev) / page_count;
915 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
916 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
918 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
919 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
920 xbzrle_counters.encoding_rate = 0;
922 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
924 rs->xbzrle_pages_prev = xbzrle_counters.pages;
925 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
928 if (migrate_use_compression()) {
929 compression_counters.busy_rate = (double)(compression_counters.busy -
930 rs->compress_thread_busy_prev) / page_count;
931 rs->compress_thread_busy_prev = compression_counters.busy;
933 compressed_size = compression_counters.compressed_size -
934 rs->compressed_size_prev;
935 if (compressed_size) {
936 double uncompressed_size = (compression_counters.pages -
937 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
939 /* Compression-Ratio = Uncompressed-size / Compressed-size */
940 compression_counters.compression_rate =
941 uncompressed_size / compressed_size;
943 rs->compress_pages_prev = compression_counters.pages;
944 rs->compressed_size_prev = compression_counters.compressed_size;
949 static void migration_trigger_throttle(RAMState *rs)
951 MigrationState *s = migrate_get_current();
952 uint64_t threshold = s->parameters.throttle_trigger_threshold;
954 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
955 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
956 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
958 /* During block migration the auto-converge logic incorrectly detects
959 * that ram migration makes no progress. Avoid this by disabling the
960 * throttling logic during the bulk phase of block migration. */
961 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
962 /* The following detection logic can be refined later. For now:
963 Check to see if the ratio between dirtied bytes and the approx.
964 amount of bytes that just got transferred since the last time
965 we were in this routine reaches the threshold. If that happens
966 twice, start or increase throttling. */
968 if ((bytes_dirty_period > bytes_dirty_threshold) &&
969 (++rs->dirty_rate_high_cnt >= 2)) {
970 trace_migration_throttle();
971 rs->dirty_rate_high_cnt = 0;
972 mig_throttle_guest_down(bytes_dirty_period,
973 bytes_dirty_threshold);
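/*
 * Example: with a throttle-trigger-threshold of 50, throttling kicks in
 * once the guest dirties more bytes in a sync period than half of what
 * was transferred in that period, and only after that has happened
 * twice (dirty_rate_high_cnt >= 2) since the last trigger.
 */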
978 static void migration_bitmap_sync(RAMState *rs)
983 ram_counters.dirty_sync_count++;
985 if (!rs->time_last_bitmap_sync) {
986 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
989 trace_migration_bitmap_sync_start();
990 memory_global_dirty_log_sync();
992 qemu_mutex_lock(&rs->bitmap_mutex);
993 WITH_RCU_READ_LOCK_GUARD() {
994 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
995 ramblock_sync_dirty_bitmap(rs, block);
997 ram_counters.remaining = ram_bytes_remaining();
999 qemu_mutex_unlock(&rs->bitmap_mutex);
1001 memory_global_after_dirty_log_sync();
1002 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
1004 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
1006 /* more than 1 second = 1000 milliseconds */
1007 if (end_time > rs->time_last_bitmap_sync + 1000) {
1008 migration_trigger_throttle(rs);
1010 migration_update_rates(rs, end_time);
1012 rs->target_page_count_prev = rs->target_page_count;
1014 /* reset period counters */
1015 rs->time_last_bitmap_sync = end_time;
1016 rs->num_dirty_pages_period = 0;
1017 rs->bytes_xfer_prev = ram_counters.transferred;
1019 if (migrate_use_events()) {
1020 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
1024 static void migration_bitmap_sync_precopy(RAMState *rs)
1026 Error *local_err = NULL;
1029 * The current notifier usage is just an optimization to migration, so we
1030 * don't stop the normal migration process in the error case.
1032 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1033 error_report_err(local_err);
1037 migration_bitmap_sync(rs);
1039 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1040 error_report_err(local_err);
1045 * save_zero_page_to_file: send the zero page to the file
1047 * Returns the size of data written to the file, 0 means the page is not
1050 * @rs: current RAM state
1051 * @file: the file where the data is saved
1052 * @block: block that contains the page we want to send
1053 * @offset: offset inside the block for the page
1055 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1056 RAMBlock *block, ram_addr_t offset)
1058 uint8_t *p = block->host + offset;
1061 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1062 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1063 qemu_put_byte(file, 0);
1070 * save_zero_page: send the zero page to the stream
1072 * Returns the number of pages written.
1074 * @rs: current RAM state
1075 * @block: block that contains the page we want to send
1076 * @offset: offset inside the block for the page
1078 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1080 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1083 ram_counters.duplicate++;
1084 ram_counters.transferred += len;
1090 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1092 if (!migrate_release_ram() || !migration_in_postcopy()) {
1096 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1100 * @pages: the number of pages written by the control path,
1102 * > 0 - number of pages written
1104 * Return true if the page has been saved, otherwise false is returned.
1106 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1109 uint64_t bytes_xmit = 0;
1113 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1115 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1120 ram_counters.transferred += bytes_xmit;
1124 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1128 if (bytes_xmit > 0) {
1129 ram_counters.normal++;
1130 } else if (bytes_xmit == 0) {
1131 ram_counters.duplicate++;
1138 * directly send the page to the stream
1140 * Returns the number of pages written.
1142 * @rs: current RAM state
1143 * @block: block that contains the page we want to send
1144 * @offset: offset inside the block for the page
1145 * @buf: the page to be sent
1146 * @async: send the page asynchronously
1148 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1149 uint8_t *buf, bool async)
1151 ram_counters.transferred += save_page_header(rs, rs->f, block,
1152 offset | RAM_SAVE_FLAG_PAGE);
1154 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1155 migrate_release_ram() &
1156 migration_in_postcopy());
1158 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1160 ram_counters.transferred += TARGET_PAGE_SIZE;
1161 ram_counters.normal++;
1166 * ram_save_page: send the given page to the stream
1168 * Returns the number of pages written.
1170 * >=0 - Number of pages written - this might legally be 0
1171 * if xbzrle noticed the page was the same.
1173 * @rs: current RAM state
1174 * @block: block that contains the page we want to send
1175 * @offset: offset inside the block for the page
1176 * @last_stage: if we are at the completion stage
1178 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1182 bool send_async = true;
1183 RAMBlock *block = pss->block;
1184 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1185 ram_addr_t current_addr = block->offset + offset;
1187 p = block->host + offset;
1188 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1190 XBZRLE_cache_lock();
1191 if (!rs->ram_bulk_stage && !migration_in_postcopy() &&
1192 migrate_use_xbzrle()) {
1193 pages = save_xbzrle_page(rs, &p, current_addr, block,
1194 offset, last_stage);
1196 /* Can't send this cached data async, since the cache page
1197 * might get updated before it gets to the wire
1203 /* XBZRLE overflow or normal page */
1205 pages = save_normal_page(rs, block, offset, p, send_async);
1208 XBZRLE_cache_unlock();
1213 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1216 if (multifd_queue_page(rs->f, block, offset) < 0) {
1219 ram_counters.normal++;
1224 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1225 ram_addr_t offset, uint8_t *source_buf)
1227 RAMState *rs = ram_state;
1228 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1229 bool zero_page = false;
1232 if (save_zero_page_to_file(rs, f, block, offset)) {
1237 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1240 * copy it to an internal buffer to avoid it being modified by the VM
1241 * so that we can catch up the error during compression and
1244 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1245 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1247 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1248 error_report("compressed data failed!");
1253 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1258 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1260 ram_counters.transferred += bytes_xmit;
1262 if (param->zero_page) {
1263 ram_counters.duplicate++;
1267 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1268 compression_counters.compressed_size += bytes_xmit - 8;
1269 compression_counters.pages++;
1272 static bool save_page_use_compression(RAMState *rs);
1274 static void flush_compressed_data(RAMState *rs)
1276 int idx, len, thread_count;
1278 if (!save_page_use_compression(rs)) {
1281 thread_count = migrate_compress_threads();
1283 qemu_mutex_lock(&comp_done_lock);
1284 for (idx = 0; idx < thread_count; idx++) {
1285 while (!comp_param[idx].done) {
1286 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1289 qemu_mutex_unlock(&comp_done_lock);
1291 for (idx = 0; idx < thread_count; idx++) {
1292 qemu_mutex_lock(&comp_param[idx].mutex);
1293 if (!comp_param[idx].quit) {
1294 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1296 * it's safe to fetch zero_page without holding comp_done_lock
1297 * as there is no further request submitted to the thread,
1298 * i.e., the thread should be waiting for a request at this point.
1300 update_compress_thread_counts(&comp_param[idx], len);
1302 qemu_mutex_unlock(&comp_param[idx].mutex);
1306 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1309 param->block = block;
1310 param->offset = offset;
1313 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1316 int idx, thread_count, bytes_xmit = -1, pages = -1;
1317 bool wait = migrate_compress_wait_thread();
1319 thread_count = migrate_compress_threads();
1320 qemu_mutex_lock(&comp_done_lock);
1322 for (idx = 0; idx < thread_count; idx++) {
1323 if (comp_param[idx].done) {
1324 comp_param[idx].done = false;
1325 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1326 qemu_mutex_lock(&comp_param[idx].mutex);
1327 set_compress_params(&comp_param[idx], block, offset);
1328 qemu_cond_signal(&comp_param[idx].cond);
1329 qemu_mutex_unlock(&comp_param[idx].mutex);
1331 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1337 * wait for the free thread if the user specifies 'compress-wait-thread',
1338 * otherwise we will post the page out in the main thread as a normal page.
1340 if (pages < 0 && wait) {
1341 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1344 qemu_mutex_unlock(&comp_done_lock);
1350 * find_dirty_block: find the next dirty page and update any state
1351 * associated with the search process.
1353 * Returns true if a page is found
1355 * @rs: current RAM state
1356 * @pss: data about the state of the current dirty page scan
1357 * @again: set to false if the search has scanned the whole of RAM
1359 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1361 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1362 if (pss->complete_round && pss->block == rs->last_seen_block &&
1363 pss->page >= rs->last_page) {
1365 * We've been once around the RAM and haven't found anything.
1371 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1372 >= pss->block->used_length) {
1373 /* Didn't find anything in this RAM Block */
1375 pss->block = QLIST_NEXT_RCU(pss->block, next);
1378 * If memory migration starts over, we will meet a dirtied page
1379 * which may still exist in the compression threads' ring, so we
1380 * should flush the compressed data to make sure the new page
1381 * is not overwritten by the old one in the destination.
1383 * Also, if xbzrle is on, stop using the data compression at this
1384 * point. In theory, xbzrle can do better than compression.
1386 flush_compressed_data(rs);
1388 /* Hit the end of the list */
1389 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1390 /* Flag that we've looped */
1391 pss->complete_round = true;
1392 rs->ram_bulk_stage = false;
1394 /* Didn't find anything this time, but try again on the new block */
1398 /* Can go around again, but... */
1400 /* We've found something so probably don't need to */
1406 * unqueue_page: gets a page of the queue
1408 * Helper for 'get_queued_page' - gets a page off the queue
1410 * Returns the block of the page (or NULL if none available)
1412 * @rs: current RAM state
1413 * @offset: used to return the offset within the RAMBlock
1415 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1417 RAMBlock *block = NULL;
1419 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1423 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1424 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1425 struct RAMSrcPageRequest *entry =
1426 QSIMPLEQ_FIRST(&rs->src_page_requests);
1428 *offset = entry->offset;
1430 if (entry->len > TARGET_PAGE_SIZE) {
1431 entry->len -= TARGET_PAGE_SIZE;
1432 entry->offset += TARGET_PAGE_SIZE;
1434 memory_region_unref(block->mr);
1435 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1437 migration_consume_urgent_request();
1444 #if defined(__linux__)
1446 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1447 * is found, return RAM block pointer and page offset
1449 * Returns pointer to the RAMBlock containing faulting page,
1450 * NULL if no write faults are pending
1452 * @rs: current RAM state
1453 * @offset: page offset from the beginning of the block
1455 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1457 struct uffd_msg uffd_msg;
1462 if (!migrate_background_snapshot()) {
1466 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1471 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1472 bs = qemu_ram_block_from_host(page_address, false, offset);
1473 assert(bs && (bs->flags & RAM_UF_WRITEPROTECT) != 0);
1478 * ram_save_release_protection: release UFFD write protection after
1479 * a range of pages has been saved
1481 * @rs: current RAM state
1482 * @pss: page-search-status structure
1483 * @start_page: index of the first page in the range relative to pss->block
1485 * Returns 0 on success, negative value in case of an error
1487 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1488 unsigned long start_page)
1492 /* Check if page is from UFFD-managed region. */
1493 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1494 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1495 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1497 /* Flush async buffers before un-protect. */
1499 /* Un-protect memory range. */
1500 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1507 /* ram_write_tracking_available: check if kernel supports required UFFD features
1509 * Returns true if supported, false otherwise
1511 bool ram_write_tracking_available(void)
1513 uint64_t uffd_features;
1516 res = uffd_query_features(&uffd_features);
1518 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1521 /* ram_write_tracking_compatible: check if guest configuration is
1522 * compatible with 'write-tracking'
1524 * Returns true if compatible, false otherwise
1526 bool ram_write_tracking_compatible(void)
1528 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1533 /* Open UFFD file descriptor */
1534 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1539 RCU_READ_LOCK_GUARD();
1541 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1542 uint64_t uffd_ioctls;
1544 /* Nothing to do with read-only and MMIO-writable regions */
1545 if (bs->mr->readonly || bs->mr->rom_device) {
1548 /* Try to register block memory via UFFD-IO to track writes */
1549 if (uffd_register_memory(uffd_fd, bs->host, bs->max_length,
1550 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1553 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1560 uffd_close_fd(uffd_fd);
1565 * ram_write_tracking_start: start UFFD-WP memory tracking
1567 * Returns 0 for success or negative value in case of error
1569 int ram_write_tracking_start(void)
1572 RAMState *rs = ram_state;
1575 /* Open UFFD file descriptor */
1576 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1580 rs->uffdio_fd = uffd_fd;
1582 RCU_READ_LOCK_GUARD();
1584 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1585 /* Nothing to do with read-only and MMIO-writable regions */
1586 if (bs->mr->readonly || bs->mr->rom_device) {
1590 /* Register block memory with UFFD to track writes */
1591 if (uffd_register_memory(rs->uffdio_fd, bs->host,
1592 bs->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1595 /* Apply UFFD write protection to the block memory range */
1596 if (uffd_change_protection(rs->uffdio_fd, bs->host,
1597 bs->max_length, true, false)) {
1600 bs->flags |= RAM_UF_WRITEPROTECT;
1601 memory_region_ref(bs->mr);
1603 trace_ram_write_tracking_ramblock_start(bs->idstr, bs->page_size,
1604 bs->host, bs->max_length);
1610 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1612 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1613 if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1617 * In case some memory block failed to be write-protected
1618 * remove protection and unregister all succeeded RAM blocks
1620 uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1621 uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1622 /* Cleanup flags and remove reference */
1623 bs->flags &= ~RAM_UF_WRITEPROTECT;
1624 memory_region_unref(bs->mr);
1627 uffd_close_fd(uffd_fd);
1633 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1635 void ram_write_tracking_stop(void)
1637 RAMState *rs = ram_state;
1640 RCU_READ_LOCK_GUARD();
1642 RAMBLOCK_FOREACH_NOT_IGNORED(bs) {
1643 if ((bs->flags & RAM_UF_WRITEPROTECT) == 0) {
1646 /* Remove protection and unregister all affected RAM blocks */
1647 uffd_change_protection(rs->uffdio_fd, bs->host, bs->max_length, false, false);
1648 uffd_unregister_memory(rs->uffdio_fd, bs->host, bs->max_length);
1650 trace_ram_write_tracking_ramblock_stop(bs->idstr, bs->page_size,
1651 bs->host, bs->max_length);
1653 /* Cleanup flags and remove reference */
1654 bs->flags &= ~RAM_UF_WRITEPROTECT;
1655 memory_region_unref(bs->mr);
1658 /* Finally close UFFD file descriptor */
1659 uffd_close_fd(rs->uffdio_fd);
1664 /* No target OS support, stubs just fail or ignore */
1666 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1674 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1675 unsigned long start_page)
1684 bool ram_write_tracking_available(void)
1689 bool ram_write_tracking_compatible(void)
1695 int ram_write_tracking_start(void)
1701 void ram_write_tracking_stop(void)
1705 #endif /* defined(__linux__) */
1708 * get_queued_page: unqueue a page from the postcopy requests
1710 * Skips pages that are already sent (!dirty)
1712 * Returns true if a queued page is found
1714 * @rs: current RAM state
1715 * @pss: data about the state of the current dirty page scan
1717 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1724 block = unqueue_page(rs, &offset);
1726 * We're sending this page, and since it's postcopy nothing else
1727 * will dirty it, and we must make sure it doesn't get sent again
1728 * even if this queue request was received after the background
1729 * search already sent it.
1734 page = offset >> TARGET_PAGE_BITS;
1735 dirty = test_bit(page, block->bmap);
1737 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1740 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1744 } while (block && !dirty);
1748 * Poll write faults too if background snapshot is enabled; that's
1749 * when we have vcpus that got blocked by the write protected pages.
1751 block = poll_fault_page(rs, &offset);
1756 * As soon as we start servicing pages out of order, then we have
1757 * to kill the bulk stage, since the bulk stage assumes
1758 * in (migration_bitmap_find_and_reset_dirty) that every page is
1759 * dirty, that's no longer true.
1761 rs->ram_bulk_stage = false;
1764 * We want the background search to continue from the queued page
1765 * since the guest is likely to want other pages near to the page
1766 * it just requested.
1769 pss->page = offset >> TARGET_PAGE_BITS;
1772 * This unqueued page would break the "one round" check, even if
1775 pss->complete_round = false;
1782 * migration_page_queue_free: drop any remaining pages in the ram
1785 * It should be empty at the end anyway, but in error cases there may
1786 * be some left. In case there is any page left, we drop it.
1789 static void migration_page_queue_free(RAMState *rs)
1791 struct RAMSrcPageRequest *mspr, *next_mspr;
1792 /* This queue generally should be empty - but in the case of a failed
1793 * migration might have some droppings in.
1795 RCU_READ_LOCK_GUARD();
1796 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1797 memory_region_unref(mspr->rb->mr);
1798 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1804 * ram_save_queue_pages: queue the page for transmission
1806 * A request from postcopy destination for example.
1808 * Returns zero on success or negative on error
1810 * @rbname: Name of the RAMBlock of the request. NULL means the
1811 * same as the last one.
1812 * @start: starting address from the start of the RAMBlock
1813 * @len: length (in bytes) to send
1815 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1818 RAMState *rs = ram_state;
1820 ram_counters.postcopy_requests++;
1821 RCU_READ_LOCK_GUARD();
1824 /* Reuse last RAMBlock */
1825 ramblock = rs->last_req_rb;
1829 * Shouldn't happen, we can't reuse the last RAMBlock if
1830 * it's the 1st request.
1832 error_report("ram_save_queue_pages no previous block");
1836 ramblock = qemu_ram_block_by_name(rbname);
1839 /* We shouldn't be asked for a non-existent RAMBlock */
1840 error_report("ram_save_queue_pages no block '%s'", rbname);
1843 rs->last_req_rb = ramblock;
1845 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1846 if (start + len > ramblock->used_length) {
1847 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1848 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1849 __func__, start, len, ramblock->used_length);
1853 struct RAMSrcPageRequest *new_entry =
1854 g_malloc0(sizeof(struct RAMSrcPageRequest));
1855 new_entry->rb = ramblock;
1856 new_entry->offset = start;
1857 new_entry->len = len;
1859 memory_region_ref(ramblock->mr);
1860 qemu_mutex_lock(&rs->src_page_req_mutex);
1861 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1862 migration_make_urgent_request();
1863 qemu_mutex_unlock(&rs->src_page_req_mutex);
1868 static bool save_page_use_compression(RAMState *rs)
1870 if (!migrate_use_compression()) {
1875 * If xbzrle is on, stop using the data compression after first
1876 * round of migration even if compression is enabled. In theory,
1877 * xbzrle can do better than compression.
1879 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
1887 * try to compress the page before posting it out, return true if the page
1888 * has been properly handled by compression, otherwise needs other
1889 * paths to handle it
1891 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1893 if (!save_page_use_compression(rs)) {
1898 * When starting the process of a new block, the first page of
1899 * the block should be sent out before other pages in the same
1900 * block, and all the pages in last block should have been sent
1901 * out, keeping this order is important, because the 'cont' flag
1902 * is used to avoid resending the block name.
1904 * We post the first page as a normal page as compression will take
1905 * much CPU resource.
1907 if (block != rs->last_sent_block) {
1908 flush_compressed_data(rs);
1912 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1916 compression_counters.busy++;
1921 * ram_save_target_page: save one target page
1923 * Returns the number of pages written
1925 * @rs: current RAM state
1926 * @pss: data about the page we want to send
1927 * @last_stage: if we are at the completion stage
1929 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1932 RAMBlock *block = pss->block;
1933 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1936 if (control_save_page(rs, block, offset, &res)) {
1940 if (save_compress_page(rs, block, offset)) {
1944 res = save_zero_page(rs, block, offset);
1946 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1947 * page would be stale
1949 if (!save_page_use_compression(rs)) {
1950 XBZRLE_cache_lock();
1951 xbzrle_cache_zero_page(rs, block->offset + offset);
1952 XBZRLE_cache_unlock();
1954 ram_release_pages(block->idstr, offset, res);
1959 * Do not use multifd for:
1960 * 1. Compression as the first page in the new block should be posted out
1961 * before sending the compressed page
1962 * 2. In postcopy as one whole host page should be placed
1964 if (!save_page_use_compression(rs) && migrate_use_multifd()
1965 && !migration_in_postcopy()) {
1966 return ram_save_multifd_page(rs, block, offset);
1969 return ram_save_page(rs, pss, last_stage);
1973 * ram_save_host_page: save a whole host page
1975 * Starting at *offset send pages up to the end of the current host
1976 * page. It's valid for the initial offset to point into the middle of
1977 * a host page in which case the remainder of the hostpage is sent.
1978 * Only dirty target pages are sent. Note that the host page size may
1979 * be a huge page for this block.
1980 * The saving stops at the boundary of the used_length of the block
1981 * if the RAMBlock isn't a multiple of the host page size.
1983 * Returns the number of pages written or negative on error
1985 * @rs: current RAM state
1986 * @ms: current migration state
1987 * @pss: data about the page we want to send
1988 * @last_stage: if we are at the completion stage
1990 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1993 int tmppages, pages = 0;
1994 size_t pagesize_bits =
1995 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1996 unsigned long start_page = pss->page;
1999 if (ramblock_is_ignored(pss->block)) {
2000 error_report("block %s should not be migrated !", pss->block->idstr);
2005 /* Check if the page is dirty and if it is, send it */
2006 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2011 tmppages = ram_save_target_page(rs, pss, last_stage);
2018 /* Allow rate limiting to happen in the middle of huge pages */
2019 migration_rate_limit();
2020 } while ((pss->page & (pagesize_bits - 1)) &&
2021 offset_in_ramblock(pss->block,
2022 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2023 /* The offset we leave with is the last one we looked at */
2026 res = ram_save_release_protection(rs, pss, start_page);
2027 return (res < 0 ? res : pages);
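/*
 * Example for the loop above: on a block backed by 2 MiB huge pages with
 * 4 KiB target pages, pagesize_bits is 512, so one call walks up to 512
 * target pages and only the ones still dirty in the bitmap are sent.
 */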
2031 * ram_find_and_save_block: finds a dirty page and sends it to f
2033 * Called within an RCU critical section.
2035 * Returns the number of pages written where zero means no dirty pages,
2036 * or negative on error
2038 * @rs: current RAM state
2039 * @last_stage: if we are at the completion stage
2041 * On systems where host-page-size > target-page-size it will send all the
2042 * pages in a host page that are dirty.
2045 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2047 PageSearchStatus pss;
2051 /* No dirty page as there is zero RAM */
2052 if (!ram_bytes_total()) {
2056 pss.block = rs->last_seen_block;
2057 pss.page = rs->last_page;
2058 pss.complete_round = false;
2061 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2066 found = get_queued_page(rs, &pss);
2069 /* priority queue empty, so just search for something dirty */
2070 found = find_dirty_block(rs, &pss, &again);
2074 pages = ram_save_host_page(rs, &pss, last_stage);
2076 } while (!pages && again);
2078 rs->last_seen_block = pss.block;
2079 rs->last_page = pss.page;
2084 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2086 uint64_t pages = size / TARGET_PAGE_SIZE;
2089 ram_counters.duplicate += pages;
2091 ram_counters.normal += pages;
2092 ram_counters.transferred += size;
2093 qemu_update_position(f, size);
2097 static uint64_t ram_bytes_total_common(bool count_ignored)
2102 RCU_READ_LOCK_GUARD();
2104 if (count_ignored) {
2105 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2106 total += block->used_length;
2109 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2110 total += block->used_length;
2116 uint64_t ram_bytes_total(void)
2118 return ram_bytes_total_common(false);
2121 static void xbzrle_load_setup(void)
2123 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2126 static void xbzrle_load_cleanup(void)
2128 g_free(XBZRLE.decoded_buf);
2129 XBZRLE.decoded_buf = NULL;
2132 static void ram_state_cleanup(RAMState **rsp)
2135 migration_page_queue_free(*rsp);
2136 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2137 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2143 static void xbzrle_cleanup(void)
2145 XBZRLE_cache_lock();
2147 cache_fini(XBZRLE.cache);
2148 g_free(XBZRLE.encoded_buf);
2149 g_free(XBZRLE.current_buf);
2150 g_free(XBZRLE.zero_target_page);
2151 XBZRLE.cache = NULL;
2152 XBZRLE.encoded_buf = NULL;
2153 XBZRLE.current_buf = NULL;
2154 XBZRLE.zero_target_page = NULL;
2156 XBZRLE_cache_unlock();
2159 static void ram_save_cleanup(void *opaque)
2161 RAMState **rsp = opaque;
2164 /* We don't use dirty log with background snapshots */
2165 if (!migrate_background_snapshot()) {
2166 /* the caller holds the iothread lock or is in a bh, so there is
2167 * no writing race against the migration bitmap
2169 memory_global_dirty_log_stop();
2172 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2173 g_free(block->clear_bmap);
2174 block->clear_bmap = NULL;
2175 g_free(block->bmap);
2180 compress_threads_save_cleanup();
2181 ram_state_cleanup(rsp);
2184 static void ram_state_reset(RAMState *rs)
2186 rs->last_seen_block = NULL;
2187 rs->last_sent_block = NULL;
2189 rs->last_version = ram_list.version;
2190 rs->ram_bulk_stage = true;
2191 rs->fpo_enabled = false;
#define MAX_WAIT 50 /* ms, half buffered_file limit */

/*
 * 'expected' is the value you expect the bitmap mostly to be full of;
 * it won't bother printing lines that are all this value.
 * If 'todump' is null the migration bitmap is dumped.
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    int64_t cur;
    int64_t linelen = 128;
    char linebuf[129];

    for (cur = 0; cur < pages; cur += linelen) {
        int64_t curb;
        bool found = false;
        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (cur + linelen > pages) {
            linelen = pages - cur;
        }
        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);
            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[curb] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
        }
    }
}
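/*
 * Each line of ram_debug_dump_bitmap() output covers up to 128 pages and
 * is only printed when at least one bit differs from 'expected', e.g.
 * (illustrative line):
 *
 *   0x00000080 : ...1111......1...
 *
 * where '1' is a set bit, '.' a clear one, and the leading hex value is
 * the page index of the first bit on that line.
 */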
/* **** functions for postcopy ***** */

void ram_postcopy_migrated_memory_release(MigrationState *ms)
{
    struct RAMBlock *block;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        unsigned long *bitmap = block->bmap;
        unsigned long range = block->used_length >> TARGET_PAGE_BITS;
        unsigned long run_start = find_next_zero_bit(bitmap, range, 0);

        while (run_start < range) {
            unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
            ram_discard_range(block->idstr,
                              ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
                              ((ram_addr_t)(run_end - run_start))
                                << TARGET_PAGE_BITS);
            run_start = find_next_zero_bit(bitmap, range, run_end + 1);
        }
    }
}
/**
 * postcopy_send_discard_bm_ram: discard a RAMBlock
 *
 * Returns zero on success
 *
 * Callback from postcopy_each_ram_send_discard for each RAMBlock
 *
 * @ms: current migration state
 * @block: RAMBlock to discard
 */
static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
{
    unsigned long end = block->used_length >> TARGET_PAGE_BITS;
    unsigned long current;
    unsigned long *bitmap = block->bmap;

    for (current = 0; current < end; ) {
        unsigned long one = find_next_bit(bitmap, end, current);
        unsigned long zero, discard_length;

        if (one >= end) {
            break;
        }

        zero = find_next_zero_bit(bitmap, end, one + 1);

        if (zero >= end) {
            discard_length = end - one;
        } else {
            discard_length = zero - one;
        }
        postcopy_discard_send_range(ms, one, discard_length);
        current = one + discard_length;
    }

    return 0;
}
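/*
 * Discard runs are sent as (start, length) pairs in target-page units:
 * each maximal run of set bits becomes one postcopy_discard_send_range()
 * call.  For instance, a bitmap of 0011100001b (bit 0 rightmost) is sent
 * as the two ranges (0, 1) and (5, 3).
 */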
/**
 * postcopy_each_ram_send_discard: discard all RAMBlocks
 *
 * Returns 0 for success or negative for error
 *
 * Utility for the outgoing postcopy code.
 *   Calls postcopy_send_discard_bm_ram for each RAMBlock
 *   passing it bitmap indexes and name.
 * (qemu_ram_foreach_block ends up passing unscaled lengths
 *  which would mean postcopy code would have to deal with target page)
 *
 * @ms: current migration state
 */
static int postcopy_each_ram_send_discard(MigrationState *ms)
{
    struct RAMBlock *block;
    int ret;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        postcopy_discard_send_init(ms, block->idstr);

        /*
         * Postcopy sends chunks of bitmap over the wire, but it
         * just needs indexes at this point, avoids it having
         * target page specific code.
         */
        ret = postcopy_send_discard_bm_ram(ms, block);
        postcopy_discard_send_finish(ms);
        if (ret) {
            return ret;
        }
    }

    return 0;
}
/**
 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
 *
 * Helper for postcopy_chunk_hostpages; it's called twice to
 * canonicalize the two bitmaps, that are similar, but one is
 * inverted.
 *
 * Postcopy requires that all target pages in a hostpage are dirty or
 * clean, not a mix.  This function canonicalizes the bitmaps.
 *
 * @ms: current migration state
 * @block: block that contains the page we want to canonicalize
 */
static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
{
    RAMState *rs = ram_state;
    unsigned long *bitmap = block->bmap;
    unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
    unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
    unsigned long run_start;

    if (block->page_size == TARGET_PAGE_SIZE) {
        /* Easy case - TPS==HPS for a non-huge page RAMBlock */
        return;
    }

    /* Find a dirty page */
    run_start = find_next_bit(bitmap, pages, 0);

    while (run_start < pages) {
        /*
         * If the start of this run of pages is in the middle of a host
         * page, then we need to fixup this host page.
         */
        if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
            /* Find the end of this run */
            run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
            /*
             * If the end isn't at the start of a host page, then the
             * run doesn't finish at the end of a host page
             * and we need to discard.
             */
        }

        if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
            unsigned long page;
            unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
                                                             host_ratio);
            run_start = QEMU_ALIGN_UP(run_start, host_ratio);

            /* Clean up the bitmap */
            for (page = fixup_start_addr;
                 page < fixup_start_addr + host_ratio; page++) {
                /*
                 * Remark them as dirty, updating the count for any pages
                 * that weren't previously dirty.
                 */
                rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
            }
        }

        /* Find the next dirty page for the next iteration */
        run_start = find_next_bit(bitmap, pages, run_start);
    }
}
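/*
 * Example: with 2MiB hugepages backing a RAMBlock and a 4KiB target page
 * (host_ratio == 512), a run of dirty bits that starts or ends in the
 * middle of a 512-page group causes the whole group to be marked dirty
 * by the pass above, so the host page is resent in full rather than
 * partially.  (The 2MiB/4KiB sizes are just an illustration; host_ratio
 * is whatever block->page_size / TARGET_PAGE_SIZE happens to be.)
 */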
/**
 * postcopy_chunk_hostpages: discard any partially sent host page
 *
 * Utility for the outgoing postcopy code.
 *
 * Discard any partially sent host-page size chunks, mark any partially
 * dirty host-page size chunks as all dirty.  In this case the host-page
 * is the host-page for the particular RAMBlock, i.e. it might be a huge page.
 *
 * Returns zero on success
 *
 * @ms: current migration state
 * @block: block we want to work with
 */
static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
{
    postcopy_discard_send_init(ms, block->idstr);

    /*
     * Ensure that all partially dirty host pages are made fully dirty.
     */
    postcopy_chunk_hostpages_pass(ms, block);

    postcopy_discard_send_finish(ms);
    return 0;
}
/**
 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
 *
 * Returns zero on success
 *
 * Transmit the set of pages to be discarded after precopy to the target
 * these are pages that:
 *     a) Have been previously transmitted but are now dirty again
 *     b) Pages that have never been transmitted, this ensures that
 *        any pages on the destination that have been mapped by background
 *        tasks get discarded (transparent huge pages is the specific concern)
 * Hopefully this is pretty sparse
 *
 * @ms: current migration state
 */
int ram_postcopy_send_discard_bitmap(MigrationState *ms)
{
    RAMState *rs = ram_state;
    RAMBlock *block;
    int ret;

    RCU_READ_LOCK_GUARD();

    /* This should be our last sync, the src is now paused */
    migration_bitmap_sync(rs);

    /* Easiest way to make sure we don't resume in the middle of a host-page */
    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        /* Deal with TPS != HPS and huge pages */
        ret = postcopy_chunk_hostpages(ms, block);
        if (ret) {
            return ret;
        }

#ifdef DEBUG_POSTCOPY
        ram_debug_dump_bitmap(block->bmap, true,
                              block->used_length >> TARGET_PAGE_BITS);
#endif
    }
    trace_ram_postcopy_send_discard_bitmap();

    return postcopy_each_ram_send_discard(ms);
}
/**
 * ram_discard_range: discard dirtied pages at the beginning of postcopy
 *
 * Returns zero on success
 *
 * @rbname: name of the RAMBlock of the request. NULL means the
 *          same that last one.
 * @start: RAMBlock starting page
 * @length: RAMBlock size
 */
int ram_discard_range(const char *rbname, uint64_t start, size_t length)
{
    trace_ram_discard_range(rbname, start, length);

    RCU_READ_LOCK_GUARD();
    RAMBlock *rb = qemu_ram_block_by_name(rbname);

    if (!rb) {
        error_report("ram_discard_range: Failed to find block '%s'", rbname);
        return -1;
    }

    /*
     * On source VM, we don't need to update the received bitmap since
     * we don't even have one.
     */
    if (rb->receivedmap) {
        bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
                     length >> qemu_target_page_bits());
    }

    return ram_block_discard_range(rb, start, length);
}
/*
 * For every allocation, we will try not to crash the VM if the
 * allocation failed.
 */
static int xbzrle_init(void)
{
    Error *local_err = NULL;

    if (!migrate_use_xbzrle()) {
        return 0;
    }

    XBZRLE_cache_lock();

    XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.zero_target_page) {
        error_report("%s: Error allocating zero page", __func__);
        goto err_out;
    }

    XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
                              TARGET_PAGE_SIZE, &local_err);
    if (!XBZRLE.cache) {
        error_report_err(local_err);
        goto free_zero_page;
    }

    XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
    if (!XBZRLE.encoded_buf) {
        error_report("%s: Error allocating encoded_buf", __func__);
        goto free_cache;
    }

    XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
    if (!XBZRLE.current_buf) {
        error_report("%s: Error allocating current_buf", __func__);
        goto free_encoded_buf;
    }

    /* We are all good */
    XBZRLE_cache_unlock();
    return 0;

free_encoded_buf:
    g_free(XBZRLE.encoded_buf);
    XBZRLE.encoded_buf = NULL;
free_cache:
    cache_fini(XBZRLE.cache);
    XBZRLE.cache = NULL;
free_zero_page:
    g_free(XBZRLE.zero_target_page);
    XBZRLE.zero_target_page = NULL;
err_out:
    XBZRLE_cache_unlock();
    return -ENOMEM;
}
static int ram_state_init(RAMState **rsp)
{
    *rsp = g_try_new0(RAMState, 1);

    if (!*rsp) {
        error_report("%s: Init ramstate fail", __func__);
        return -1;
    }

    qemu_mutex_init(&(*rsp)->bitmap_mutex);
    qemu_mutex_init(&(*rsp)->src_page_req_mutex);
    QSIMPLEQ_INIT(&(*rsp)->src_page_requests);

    /*
     * Count the total number of pages used by ram blocks not including any
     * gaps due to alignment or unplugs.
     * This must match with the initial values of dirty bitmap.
     */
    (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
    ram_state_reset(*rsp);

    return 0;
}
static void ram_list_init_bitmaps(void)
{
    MigrationState *ms = migrate_get_current();
    RAMBlock *block;
    unsigned long pages;
    uint8_t shift;

    /* Skip setting bitmap if there is no RAM */
    if (ram_bytes_total()) {
        shift = ms->clear_bitmap_shift;
        if (shift > CLEAR_BITMAP_SHIFT_MAX) {
            error_report("clear_bitmap_shift (%u) too big, using "
                         "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
            shift = CLEAR_BITMAP_SHIFT_MAX;
        } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
            error_report("clear_bitmap_shift (%u) too small, using "
                         "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
            shift = CLEAR_BITMAP_SHIFT_MIN;
        }

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            pages = block->max_length >> TARGET_PAGE_BITS;
            /*
             * The initial dirty bitmap for migration must be set with all
             * ones to make sure we'll migrate every guest RAM page to the
             * destination.
             * Here we set RAMBlock.bmap all to 1 because when we begin a
             * new migration after a failed one,
             * ram_list.dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't cover
             * the whole guest memory.
             */
            block->bmap = bitmap_new(pages);
            bitmap_set(block->bmap, 0, pages);
            block->clear_bmap_shift = shift;
            block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
        }
    }
}
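/*
 * Note: clear_bmap is a coarser companion of bmap: each of its bits
 * covers 2^clear_bmap_shift target pages and records whether the
 * corresponding chunk still needs its dirty-log "clear" operation, which
 * is performed lazily when the dirty bits for that chunk are consumed.
 */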
static void ram_init_bitmaps(RAMState *rs)
{
    /* For memory_global_dirty_log_start below.  */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    WITH_RCU_READ_LOCK_GUARD() {
        ram_list_init_bitmaps();
        /* We don't use dirty log with background snapshots */
        if (!migrate_background_snapshot()) {
            memory_global_dirty_log_start();
            migration_bitmap_sync_precopy(rs);
        }
    }
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
static int ram_init_all(RAMState **rsp)
{
    if (ram_state_init(rsp)) {
        return -1;
    }

    if (xbzrle_init()) {
        ram_state_cleanup(rsp);
        return -1;
    }

    ram_init_bitmaps(*rsp);

    return 0;
}
static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
{
    RAMBlock *block;
    uint64_t pages = 0;

    /*
     * Postcopy is not using xbzrle/compression, so no need for that.
     * Also, since the source is already halted, we don't need to care
     * about dirty page logging either.
     */
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        pages += bitmap_count_one(block->bmap,
                                  block->used_length >> TARGET_PAGE_BITS);
    }

    /* This may not be aligned with current bitmaps. Recalculate. */
    rs->migration_dirty_pages = pages;

    rs->last_seen_block = NULL;
    rs->last_sent_block = NULL;
    rs->last_page = 0;
    rs->last_version = ram_list.version;
    /*
     * Disable the bulk stage, otherwise we'll resend the whole RAM no
     * matter what we have sent.
     */
    rs->ram_bulk_stage = false;

    /* Update RAMState cache of output QEMUFile */
    rs->f = out;

    trace_ram_state_resume_prepare(pages);
}
/*
 * This function clears bits of the free pages reported by the caller from the
 * migration dirty bitmap. @addr is the host address corresponding to the
 * start of the continuous guest free pages, and @len is the total bytes of
 * those pages.
 */
void qemu_guest_free_page_hint(void *addr, size_t len)
{
    RAMBlock *block;
    ram_addr_t offset;
    size_t used_len, start, npages;
    MigrationState *s = migrate_get_current();

    /* This function is currently expected to be used during live migration */
    if (!migration_is_setup_or_active(s->state)) {
        return;
    }

    for (; len > 0; len -= used_len, addr += used_len) {
        block = qemu_ram_block_from_host(addr, false, &offset);
        if (unlikely(!block || offset >= block->used_length)) {
            /*
             * The implementation might not support RAMBlock resize during
             * live migration, but it could happen in theory with future
             * updates. So we add a check here to capture that case.
             */
            error_report_once("%s unexpected error", __func__);
            return;
        }

        if (len <= block->used_length - offset) {
            used_len = len;
        } else {
            used_len = block->used_length - offset;
        }

        start = offset >> TARGET_PAGE_BITS;
        npages = used_len >> TARGET_PAGE_BITS;

        qemu_mutex_lock(&ram_state->bitmap_mutex);
        ram_state->migration_dirty_pages -=
                      bitmap_count_one_with_offset(block->bmap, start, npages);
        bitmap_clear(block->bmap, start, npages);
        qemu_mutex_unlock(&ram_state->bitmap_mutex);
    }
}
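/*
 * qemu_guest_free_page_hint() is the hook used by free-page hinting
 * (e.g. the virtio-balloon free-page-hint feature) to drop pages the
 * guest reports as free from the dirty bitmap, so they are not
 * transferred.  Clearing bits here only affects the current round; if
 * the guest touches a page again it is re-dirtied and migrated normally.
 */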
/*
 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
 * long-running RCU critical section.  When rcu-reclaims in the code
 * start to become numerous it will be necessary to reduce the
 * granularity of these critical sections.
 */

/**
 * ram_save_setup: Setup RAM for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_setup(QEMUFile *f, void *opaque)
{
    RAMState **rsp = opaque;
    RAMBlock *block;

    if (compress_threads_save_setup()) {
        return -1;
    }

    /* migration has already setup the bitmap, reuse it. */
    if (!migration_in_colo_state()) {
        if (ram_init_all(rsp) != 0) {
            compress_threads_save_cleanup();
            return -1;
        }
    }
    (*rsp)->f = f;

    WITH_RCU_READ_LOCK_GUARD() {
        qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);

        RAMBLOCK_FOREACH_MIGRATABLE(block) {
            qemu_put_byte(f, strlen(block->idstr));
            qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
            qemu_put_be64(f, block->used_length);
            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }
            if (migrate_ignore_shared()) {
                qemu_put_be64(f, block->mr->addr);
            }
        }
    }

    ram_control_before_iterate(f, RAM_CONTROL_SETUP);
    ram_control_after_iterate(f, RAM_CONTROL_SETUP);

    multifd_send_sync_main(f);
    qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
    qemu_fflush(f);

    return 0;
}
/**
 * ram_save_iterate: iterative stage for migration
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_iterate(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;
    int i;
    int64_t t0;
    int done = 0;

    if (blk_mig_bulk_active()) {
        /* Avoid transferring ram during bulk phase of block migration as
         * the bulk phase will usually take a long time and transferring
         * ram updates during that time is pointless. */
        goto out;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        if (ram_list.version != rs->last_version) {
            ram_state_reset(rs);
        }

        /* Read version before ram_list.blocks */
        smp_rmb();

        ram_control_before_iterate(f, RAM_CONTROL_ROUND);

        t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
        i = 0;
        while ((ret = qemu_file_rate_limit(f)) == 0 ||
               !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
            int pages;

            if (qemu_file_get_error(f)) {
                break;
            }

            pages = ram_find_and_save_block(rs, false);
            /* no more pages to send */
            if (pages == 0) {
                done = 1;
                break;
            }

            if (pages < 0) {
                qemu_file_set_error(f, pages);
                break;
            }

            rs->target_page_count += pages;

            /*
             * During postcopy, it is necessary to make sure one whole host
             * page is sent in one chunk.
             */
            if (migrate_postcopy_ram()) {
                flush_compressed_data(rs);
            }

            /*
             * We want to check in the 1st loop, just in case it was the 1st
             * time and we had to sync the dirty bitmap.
             * qemu_clock_get_ns() is a bit expensive, so we only check every
             * few iterations.
             */
            if ((i & 63) == 0) {
                uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
                              1000000;
                if (t1 > MAX_WAIT) {
                    trace_ram_save_iterate_big_wait(t1, i);
                    break;
                }
            }
            i++;
        }
    }

    /*
     * Must occur before EOS (or any QEMUFile operation)
     * because of RDMA protocol.
     */
    ram_control_after_iterate(f, RAM_CONTROL_ROUND);

out:
    if (ret >= 0
        && migration_is_setup_or_active(migrate_get_current()->state)) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
        ram_counters.transferred += 8;

        ret = qemu_file_get_error(f);
    }
    if (ret < 0) {
        return ret;
    }

    return done;
}
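/*
 * The loop above is bounded in two ways: qemu_file_rate_limit() enforces
 * the configured migration bandwidth, and MAX_WAIT caps how long a single
 * iteration may run (checked only every 64 pages to keep
 * qemu_clock_get_ns() calls cheap), so control returns to the migration
 * loop regularly.
 */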
/**
 * ram_save_complete: function called to send the remaining amount of ram
 *
 * Returns zero to indicate success or negative on error
 *
 * Called with iothread lock
 *
 * @f: QEMUFile where to send the data
 * @opaque: RAMState pointer
 */
static int ram_save_complete(QEMUFile *f, void *opaque)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    int ret = 0;

    WITH_RCU_READ_LOCK_GUARD() {
        if (!migration_in_postcopy()) {
            migration_bitmap_sync_precopy(rs);
        }

        ram_control_before_iterate(f, RAM_CONTROL_FINISH);

        /* try transferring iterative blocks of memory */

        /* flush all remaining blocks regardless of rate limiting */
        while (true) {
            int pages;

            pages = ram_find_and_save_block(rs, !migration_in_colo_state());
            /* no more blocks to send */
            if (pages == 0) {
                break;
            }
            if (pages < 0) {
                ret = pages;
                break;
            }
        }

        flush_compressed_data(rs);
        ram_control_after_iterate(f, RAM_CONTROL_FINISH);
    }

    if (ret >= 0) {
        multifd_send_sync_main(rs->f);
        qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
        qemu_fflush(f);
    }

    return ret;
}
static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
                             uint64_t *res_precopy_only,
                             uint64_t *res_compatible,
                             uint64_t *res_postcopy_only)
{
    RAMState **temp = opaque;
    RAMState *rs = *temp;
    uint64_t remaining_size;

    remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;

    if (!migration_in_postcopy() &&
        remaining_size < max_size) {
        qemu_mutex_lock_iothread();
        WITH_RCU_READ_LOCK_GUARD() {
            migration_bitmap_sync_precopy(rs);
        }
        qemu_mutex_unlock_iothread();
        remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
    }

    if (migrate_postcopy_ram()) {
        /* We can do postcopy, and all the data is postcopiable */
        *res_compatible += remaining_size;
    } else {
        *res_precopy_only += remaining_size;
    }
}
static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
{
    unsigned int xh_len;
    int xh_flags;
    uint8_t *loaded_data;

    /* extract RLE header */
    xh_flags = qemu_get_byte(f);
    xh_len = qemu_get_be16(f);

    if (xh_flags != ENCODING_FLAG_XBZRLE) {
        error_report("Failed to load XBZRLE page - wrong compression!");
        return -1;
    }

    if (xh_len > TARGET_PAGE_SIZE) {
        error_report("Failed to load XBZRLE page - len overflow!");
        return -1;
    }
    loaded_data = XBZRLE.decoded_buf;
    /* load data and decode */
    /* it can change loaded_data to point to an internal buffer */
    qemu_get_buffer_in_place(f, &loaded_data, xh_len);

    /* decode RLE */
    if (xbzrle_decode_buffer(loaded_data, xh_len, host,
                             TARGET_PAGE_SIZE) == -1) {
        error_report("Failed to load XBZRLE page - decode error!");
        return -1;
    }

    return 0;
}
/**
 * ram_block_from_stream: read a RAMBlock id from the migration stream
 *
 * Must be called from within a rcu critical section.
 *
 * Returns a pointer from within the RCU-protected ram_list.
 *
 * @f: QEMUFile where to read the data from
 * @flags: Page flags (mostly to see if it's a continuation of previous block)
 */
static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
{
    static RAMBlock *block;
    char id[256];
    uint8_t len;

    if (flags & RAM_SAVE_FLAG_CONTINUE) {
        if (!block) {
            error_report("Ack, bad migration stream!");
            return NULL;
        }
        return block;
    }

    len = qemu_get_byte(f);
    qemu_get_buffer(f, (uint8_t *)id, len);
    id[len] = 0;

    block = qemu_ram_block_by_name(id);
    if (!block) {
        error_report("Can't find block %s", id);
        return NULL;
    }

    if (ramblock_is_ignored(block)) {
        error_report("block %s should not be migrated !", id);
        return NULL;
    }

    return block;
}
static inline void *host_from_ram_block_offset(RAMBlock *block,
                                               ram_addr_t offset)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }

    return block->host + offset;
}
static inline void *colo_cache_from_block_offset(RAMBlock *block,
                             ram_addr_t offset, bool record_bitmap)
{
    if (!offset_in_ramblock(block, offset)) {
        return NULL;
    }
    if (!block->colo_cache) {
        error_report("%s: colo_cache is NULL in block :%s",
                     __func__, block->idstr);
        return NULL;
    }

    /*
     * During colo checkpoint, we need bitmap of these migrated pages.
     * It helps us to decide which pages in ram cache should be flushed
     * into VM's RAM later.
     */
    if (record_bitmap &&
        !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
        ram_state->migration_dirty_pages++;
    }
    return block->colo_cache + offset;
}
/**
 * ram_handle_compressed: handle the zero page case
 *
 * If a page (or a whole RDMA chunk) has been
 * determined to be zero, then zap it.
 *
 * @host: host address for the zero page
 * @ch: what the page is filled from.  We only support zero
 * @size: size of the zero page
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    if (ch != 0 || !is_zero_range(host, size)) {
        memset(host, ch, size);
    }
}
/* return the size after decompression, or negative value on error */
static int
qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
                     const uint8_t *source, size_t source_len)
{
    int err;

    err = inflateReset(stream);
    if (err != Z_OK) {
        return -1;
    }

    stream->avail_in = source_len;
    stream->next_in = (uint8_t *)source;
    stream->avail_out = dest_len;
    stream->next_out = dest;

    err = inflate(stream, Z_NO_FLUSH);
    if (err != Z_STREAM_END) {
        return -1;
    }

    return stream->total_out;
}
static void *do_data_decompress(void *opaque)
{
    DecompressParam *param = opaque;
    unsigned long pagesize;
    uint8_t *des;
    int len, ret;

    qemu_mutex_lock(&param->mutex);
    while (!param->quit) {
        if (param->des) {
            des = param->des;
            len = param->len;
            param->des = 0;
            qemu_mutex_unlock(&param->mutex);

            pagesize = TARGET_PAGE_SIZE;

            ret = qemu_uncompress_data(&param->stream, des, pagesize,
                                       param->compbuf, len);
            if (ret < 0 && migrate_get_current()->decompress_error_check) {
                error_report("decompress data failed");
                qemu_file_set_error(decomp_file, ret);
            }

            qemu_mutex_lock(&decomp_done_lock);
            param->done = true;
            qemu_cond_signal(&decomp_done_cond);
            qemu_mutex_unlock(&decomp_done_lock);

            qemu_mutex_lock(&param->mutex);
        } else {
            qemu_cond_wait(&param->cond, &param->mutex);
        }
    }
    qemu_mutex_unlock(&param->mutex);

    return NULL;
}
static int wait_for_decompress_done(void)
{
    int idx, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    for (idx = 0; idx < thread_count; idx++) {
        while (!decomp_param[idx].done) {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
    return qemu_file_get_error(decomp_file);
}
static void compress_threads_load_cleanup(void)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return;
    }
    thread_count = migrate_decompress_threads();
    for (i = 0; i < thread_count; i++) {
        /*
         * we use it as an indicator which shows if the thread is
         * properly init'd or not
         */
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_mutex_lock(&decomp_param[i].mutex);
        decomp_param[i].quit = true;
        qemu_cond_signal(&decomp_param[i].cond);
        qemu_mutex_unlock(&decomp_param[i].mutex);
    }
    for (i = 0; i < thread_count; i++) {
        if (!decomp_param[i].compbuf) {
            break;
        }

        qemu_thread_join(decompress_threads + i);
        qemu_mutex_destroy(&decomp_param[i].mutex);
        qemu_cond_destroy(&decomp_param[i].cond);
        inflateEnd(&decomp_param[i].stream);
        g_free(decomp_param[i].compbuf);
        decomp_param[i].compbuf = NULL;
    }
    g_free(decompress_threads);
    g_free(decomp_param);
    decompress_threads = NULL;
    decomp_param = NULL;
    decomp_file = NULL;
}
static int compress_threads_load_setup(QEMUFile *f)
{
    int i, thread_count;

    if (!migrate_use_compression()) {
        return 0;
    }

    thread_count = migrate_decompress_threads();
    decompress_threads = g_new0(QemuThread, thread_count);
    decomp_param = g_new0(DecompressParam, thread_count);
    qemu_mutex_init(&decomp_done_lock);
    qemu_cond_init(&decomp_done_cond);
    decomp_file = f;
    for (i = 0; i < thread_count; i++) {
        if (inflateInit(&decomp_param[i].stream) != Z_OK) {
            goto exit;
        }

        decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
        qemu_mutex_init(&decomp_param[i].mutex);
        qemu_cond_init(&decomp_param[i].cond);
        decomp_param[i].done = true;
        decomp_param[i].quit = false;
        qemu_thread_create(decompress_threads + i, "decompress",
                           do_data_decompress, decomp_param + i,
                           QEMU_THREAD_JOINABLE);
    }
    return 0;
exit:
    compress_threads_load_cleanup();
    return -1;
}
static void decompress_data_with_multi_threads(QEMUFile *f,
                                               void *host, int len)
{
    int idx, thread_count;

    thread_count = migrate_decompress_threads();
    qemu_mutex_lock(&decomp_done_lock);
    while (true) {
        for (idx = 0; idx < thread_count; idx++) {
            if (decomp_param[idx].done) {
                decomp_param[idx].done = false;
                qemu_mutex_lock(&decomp_param[idx].mutex);
                qemu_get_buffer(f, decomp_param[idx].compbuf, len);
                decomp_param[idx].des = host;
                decomp_param[idx].len = len;
                qemu_cond_signal(&decomp_param[idx].cond);
                qemu_mutex_unlock(&decomp_param[idx].mutex);
                break;
            }
        }
        if (idx < thread_count) {
            break;
        } else {
            qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
        }
    }
    qemu_mutex_unlock(&decomp_done_lock);
}
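/*
 * Dispatch model for the decompress threads: the reader copies the
 * compressed payload into an idle worker's compbuf and signals that
 * worker's condition variable; if every worker is busy it blocks on
 * decomp_done_cond until one finishes.  wait_for_decompress_done() is
 * the barrier used when a page must be fully decompressed, e.g. before
 * placing a postcopy host page.
 */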
/*
 * We must set ram_bulk_stage to false, otherwise in
 * migration_bitmap_find_dirty the bitmap will be unused and
 * all the pages in ram cache will be flushed to the ram of
 * secondary VM.
 */
static void colo_init_ram_state(void)
{
    ram_state_init(&ram_state);
    ram_state->ram_bulk_stage = false;
}
/*
 * colo cache: this is for secondary VM, we cache the whole
 * memory of the secondary VM.  The global lock needs to be held
 * to call this helper.
 */
int colo_init_ram_cache(void)
{
    RAMBlock *block;

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            block->colo_cache = qemu_anon_ram_alloc(block->used_length,
                                                    NULL,
                                                    false);
            if (!block->colo_cache) {
                error_report("%s: Can't alloc memory for COLO cache of block %s,"
                             "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
                             block->used_length);
                RAMBLOCK_FOREACH_NOT_IGNORED(block) {
                    if (block->colo_cache) {
                        qemu_anon_ram_free(block->colo_cache, block->used_length);
                        block->colo_cache = NULL;
                    }
                }
                return -errno;
            }
        }
    }

    /*
     * Record the dirty pages that were sent by PVM; we use this dirty bitmap
     * to decide which pages in the cache should be flushed into SVM's RAM
     * later.  Here we use the same name 'ram_bitmap' as for migration.
     */
    if (ram_bytes_total()) {
        RAMBlock *block;

        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
            block->bmap = bitmap_new(pages);
        }
    }

    colo_init_ram_state();
    return 0;
}
/* TODO: duplicated with ram_init_bitmaps */
void colo_incoming_start_dirty_log(void)
{
    RAMBlock *block = NULL;
    /* For memory_global_dirty_log_start below. */
    qemu_mutex_lock_iothread();
    qemu_mutex_lock_ramlist();

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
            /* Discard this dirty bitmap record */
            bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
        }
        memory_global_dirty_log_start();
    }
    ram_state->migration_dirty_pages = 0;
    qemu_mutex_unlock_ramlist();
    qemu_mutex_unlock_iothread();
}
/* The global lock needs to be held to call this helper */
void colo_release_ram_cache(void)
{
    RAMBlock *block;

    memory_global_dirty_log_stop();
    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        g_free(block->bmap);
        block->bmap = NULL;
    }

    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            if (block->colo_cache) {
                qemu_anon_ram_free(block->colo_cache, block->used_length);
                block->colo_cache = NULL;
            }
        }
    }
    ram_state_cleanup(&ram_state);
}
/**
 * ram_load_setup: Setup RAM for migration incoming side
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 */
static int ram_load_setup(QEMUFile *f, void *opaque)
{
    if (compress_threads_load_setup(f)) {
        return -1;
    }

    xbzrle_load_setup();
    ramblock_recv_map_init();

    return 0;
}
static int ram_load_cleanup(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        qemu_ram_block_writeback(rb);
    }

    xbzrle_load_cleanup();
    compress_threads_load_cleanup();

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        g_free(rb->receivedmap);
        rb->receivedmap = NULL;
    }

    return 0;
}
/**
 * ram_postcopy_incoming_init: allocate postcopy data structures
 *
 * Returns 0 for success and negative if there was one error
 *
 * @mis: current migration incoming state
 *
 * Allocate data structures etc needed by incoming migration with
 * postcopy-ram. postcopy-ram's similarly named
 * postcopy_ram_incoming_init does the work.
 */
int ram_postcopy_incoming_init(MigrationIncomingState *mis)
{
    return postcopy_ram_incoming_init(mis);
}
/**
 * ram_load_postcopy: load a page in postcopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in postcopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_postcopy(QEMUFile *f)
{
    int flags = 0, ret = 0;
    bool place_needed = false;
    bool matches_target_page_size = false;
    MigrationIncomingState *mis = migration_incoming_get_current();
    /* Temporary page that is later 'placed' */
    void *postcopy_host_page = mis->postcopy_tmp_page;
    void *this_host = NULL;
    bool all_zero = true;
    int target_pages = 0;

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr;
        void *host = NULL;
        void *page_buffer = NULL;
        void *place_source = NULL;
        RAMBlock *block = NULL;
        uint8_t ch;
        int len;

        addr = qemu_get_be64(f);

        /*
         * If qemu file error, we should stop here, and then "addr"
         * may be invalid.
         */
        ret = qemu_file_get_error(f);
        if (ret) {
            break;
        }

        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE)) {
            block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            target_pages++;
            matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
            /*
             * Postcopy requires that we place whole host pages atomically;
             * these may be huge pages for RAMBlocks that are backed by
             * hugetlbfs.
             * To make it atomic, the data is read into a temporary page
             * that's moved into place later.
             * The migration protocol uses, possibly smaller, target-pages
             * however the source ensures it always sends all the components
             * of a host page in one chunk.
             */
            page_buffer = postcopy_host_page +
                          ((uintptr_t)host & (block->page_size - 1));
            if (target_pages == 1) {
                this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                    block->page_size);
            } else {
                /* not the 1st TP within the HP */
                if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
                    (uintptr_t)this_host) {
                    error_report("Non-same host page %p/%p",
                                 host, this_host);
                    ret = -EINVAL;
                    break;
                }
            }

            /*
             * If it's the last part of a host page then we place the host
             * page
             */
            if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
                place_needed = true;
            }
            place_source = postcopy_host_page;
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            /*
             * Can skip to set page_buffer when
             * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
             */
            if (ch || !matches_target_page_size) {
                memset(page_buffer, ch, TARGET_PAGE_SIZE);
            }
            if (ch) {
                all_zero = false;
            }
            break;

        case RAM_SAVE_FLAG_PAGE:
            all_zero = false;
            if (!matches_target_page_size) {
                /* For huge pages, we always use temporary buffer */
                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
            } else {
                /*
                 * For small pages that matches target page size, we
                 * avoid the qemu_file copy.  Instead we directly use
                 * the buffer of QEMUFile to place the page.  Note: we
                 * cannot do any QEMUFile operation before using that
                 * buffer to make sure the buffer is valid when
                 * placing the page.
                 */
                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
                                         TARGET_PAGE_SIZE);
            }
            break;
        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            all_zero = false;
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, page_buffer, len);
            break;

        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            error_report("Unknown combination of migration flags: 0x%x"
                         " (postcopy mode)", flags);
            ret = -EINVAL;
            break;
        }

        /* Got the whole host page, wait for decompress before placing. */
        if (place_needed) {
            ret |= wait_for_decompress_done();
        }

        /* Detect for any possible file errors */
        if (!ret && qemu_file_get_error(f)) {
            ret = qemu_file_get_error(f);
        }

        if (!ret && place_needed) {
            /* This gets called at the last target page in the host page */
            void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
                                                       block->page_size);

            if (all_zero) {
                ret = postcopy_place_page_zero(mis, place_dest,
                                               block);
            } else {
                ret = postcopy_place_page(mis, place_dest,
                                          place_source, block);
            }
            place_needed = false;
            target_pages = 0;
            /* Assume we have a zero page until we detect something different */
            all_zero = true;
        }
    }

    return ret;
}
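/*
 * Placement note for ram_load_postcopy(): target pages are accumulated in
 * mis->postcopy_tmp_page until the whole host page has arrived, and only
 * then atomically placed with postcopy_place_page() (or
 * postcopy_place_page_zero() when every component page was zero), since a
 * partially populated host page would otherwise be visible to the
 * faulting guest.
 */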
static bool postcopy_is_advised(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
}

static bool postcopy_is_running(void)
{
    PostcopyState ps = postcopy_state_get();
    return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
}
/*
 * Flush content of RAM cache into SVM's memory.
 * Only flush the pages that were dirtied by PVM or SVM or both.
 */
void colo_flush_ram_cache(void)
{
    RAMBlock *block = NULL;
    void *dst_host;
    void *src_host;
    unsigned long offset = 0;

    memory_global_dirty_log_sync();
    WITH_RCU_READ_LOCK_GUARD() {
        RAMBLOCK_FOREACH_NOT_IGNORED(block) {
            ramblock_sync_dirty_bitmap(ram_state, block);
        }
    }

    trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
    WITH_RCU_READ_LOCK_GUARD() {
        block = QLIST_FIRST_RCU(&ram_list.blocks);

        while (block) {
            offset = migration_bitmap_find_dirty(ram_state, block, offset);

            if (((ram_addr_t)offset) << TARGET_PAGE_BITS
                >= block->used_length) {
                offset = 0;
                block = QLIST_NEXT_RCU(block, next);
            } else {
                migration_bitmap_clear_dirty(ram_state, block, offset);
                dst_host = block->host
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                src_host = block->colo_cache
                         + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
                memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
            }
        }
    }
    trace_colo_flush_ram_cache_end();
}
/**
 * ram_load_precopy: load pages in precopy case
 *
 * Returns 0 for success or -errno in case of error
 *
 * Called in precopy mode by ram_load().
 * rcu_read_lock is taken prior to this being called.
 *
 * @f: QEMUFile where to send the data
 */
static int ram_load_precopy(QEMUFile *f)
{
    int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
    /* ADVISE is earlier, it shows the source has the postcopy capability on */
    bool postcopy_advised = postcopy_is_advised();
    if (!migrate_use_compression()) {
        invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
    }

    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
        ram_addr_t addr, total_ram_bytes;
        void *host = NULL, *host_bak = NULL;
        uint8_t ch;

        /*
         * Yield periodically to let main loop run, but an iteration of
         * the main loop is expensive, so do it only every so many
         * iterations.
         */
        if ((i & 32767) == 0 && qemu_in_coroutine()) {
            aio_co_schedule(qemu_get_current_aio_context(),
                            qemu_coroutine_self());
            qemu_coroutine_yield();
        }
        i++;

        addr = qemu_get_be64(f);
        flags = addr & ~TARGET_PAGE_MASK;
        addr &= TARGET_PAGE_MASK;

        if (flags & invalid_flags) {
            if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
                error_report("Received an unexpected compressed page");
            }

            ret = -EINVAL;
            break;
        }

        if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
            RAMBlock *block = ram_block_from_stream(f, flags);

            host = host_from_ram_block_offset(block, addr);
            /*
             * After going into COLO stage, we should not load the page
             * into SVM's memory directly, we put them into colo_cache firstly.
             * NOTE: We need to keep a copy of SVM's ram in colo_cache.
             * Previously, we copied all this memory in the COLO preparing
             * stage while the VM was stopped, which is a time-consuming
             * process.  Here we optimize it by backing up every page during
             * the migration process while COLO is enabled; it slows the
             * migration down a little but clearly reduces the downtime of
             * backing up all of SVM's memory in the COLO preparing stage.
             */
            if (migration_incoming_colo_enabled()) {
                if (migration_incoming_in_colo_state()) {
                    /* In COLO stage, put all pages into cache temporarily */
                    host = colo_cache_from_block_offset(block, addr, true);
                } else {
                   /*
                    * In migration stage but before COLO stage,
                    * put all pages into both cache and SVM's memory.
                    */
                    host_bak = colo_cache_from_block_offset(block, addr, false);
                }
            }
            if (!host) {
                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            if (!migration_incoming_in_colo_state()) {
                ramblock_recv_bitmap_set(block, host);
            }

            trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
        }

        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
        case RAM_SAVE_FLAG_MEM_SIZE:
            /* Synchronize RAM block list */
            total_ram_bytes = addr;
            while (!ret && total_ram_bytes) {
                RAMBlock *block;
                char id[256];
                ram_addr_t length;

                len = qemu_get_byte(f);
                qemu_get_buffer(f, (uint8_t *)id, len);
                id[len] = 0;
                length = qemu_get_be64(f);

                block = qemu_ram_block_by_name(id);
                if (block && !qemu_ram_is_migratable(block)) {
                    error_report("block %s should not be migrated !", id);
                    ret = -EINVAL;
                } else if (block) {
                    if (length != block->used_length) {
                        Error *local_err = NULL;

                        ret = qemu_ram_resize(block, length,
                                              &local_err);
                        if (local_err) {
                            error_report_err(local_err);
                        }
                    }
                    /* For postcopy we need to check hugepage sizes match */
                    if (postcopy_advised && migrate_postcopy_ram() &&
                        block->page_size != qemu_host_page_size) {
                        uint64_t remote_page_size = qemu_get_be64(f);
                        if (remote_page_size != block->page_size) {
                            error_report("Mismatched RAM page size %s "
                                         "(local) %zd != %" PRId64,
                                         id, block->page_size,
                                         remote_page_size);
                            ret = -EINVAL;
                        }
                    }
                    if (migrate_ignore_shared()) {
                        hwaddr addr = qemu_get_be64(f);
                        if (ramblock_is_ignored(block) &&
                            block->mr->addr != addr) {
                            error_report("Mismatched GPAs for block %s "
                                         "%" PRId64 "!= %" PRId64,
                                         id, (uint64_t)addr,
                                         (uint64_t)block->mr->addr);
                            ret = -EINVAL;
                        }
                    }
                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
                                          block->idstr);
                } else {
                    error_report("Unknown ramblock \"%s\", cannot "
                                 "accept migration", id);
                    ret = -EINVAL;
                }

                total_ram_bytes -= length;
            }
            break;

        case RAM_SAVE_FLAG_ZERO:
            ch = qemu_get_byte(f);
            ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_PAGE:
            qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
            break;

        case RAM_SAVE_FLAG_COMPRESS_PAGE:
            len = qemu_get_be32(f);
            if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                error_report("Invalid compressed data length: %d", len);
                ret = -EINVAL;
                break;
            }
            decompress_data_with_multi_threads(f, host, len);
            break;

        case RAM_SAVE_FLAG_XBZRLE:
            if (load_xbzrle(f, addr, host) < 0) {
                error_report("Failed to decompress XBZRLE page at "
                             RAM_ADDR_FMT, addr);
                ret = -EINVAL;
                break;
            }
            break;
        case RAM_SAVE_FLAG_EOS:
            /* normal exit */
            multifd_recv_sync_main();
            break;
        default:
            if (flags & RAM_SAVE_FLAG_HOOK) {
                ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
            } else {
                error_report("Unknown combination of migration flags: 0x%x",
                             flags);
                ret = -EINVAL;
            }
        }
        if (!ret) {
            ret = qemu_file_get_error(f);
        }
        if (!ret && host_bak) {
            memcpy(host_bak, host, TARGET_PAGE_SIZE);
        }
    }

    ret |= wait_for_decompress_done();
    return ret;
}
static int ram_load(QEMUFile *f, void *opaque, int version_id)
{
    int ret = 0;
    static uint64_t seq_iter;
    /*
     * If system is running in postcopy mode, page inserts to host memory must
     * be atomic
     */
    bool postcopy_running = postcopy_is_running();

    seq_iter++;

    if (version_id != 4) {
        return -EINVAL;
    }

    /*
     * This RCU critical section can be very long running.
     * When RCU reclaims in the code start to become numerous,
     * it will be necessary to reduce the granularity of this
     * critical section.
     */
    WITH_RCU_READ_LOCK_GUARD() {
        if (postcopy_running) {
            ret = ram_load_postcopy(f);
        } else {
            ret = ram_load_precopy(f);
        }
    }
    trace_ram_load_complete(ret, seq_iter);

    return ret;
}
static bool ram_has_postcopy(void *opaque)
{
    RAMBlock *rb;

    RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
        if (ramblock_is_pmem(rb)) {
            info_report("Block: %s, host: %p is a nvdimm memory, postcopy"
                        "is not supported now!", rb->idstr, rb->host);
            return false;
        }
    }

    return migrate_postcopy_ram();
}
/* Sync all the dirty bitmap with destination VM.  */
static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
{
    RAMBlock *block;
    QEMUFile *file = s->to_dst_file;
    int ramblock_count = 0;

    trace_ram_dirty_bitmap_sync_start();

    RAMBLOCK_FOREACH_NOT_IGNORED(block) {
        qemu_savevm_send_recv_bitmap(file, block->idstr);
        trace_ram_dirty_bitmap_request(block->idstr);
        ramblock_count++;
    }

    trace_ram_dirty_bitmap_sync_wait();

    /* Wait until all the ramblocks' dirty bitmap synced */
    while (ramblock_count--) {
        qemu_sem_wait(&s->rp_state.rp_sem);
    }

    trace_ram_dirty_bitmap_sync_complete();

    return 0;
}
static void ram_dirty_bitmap_reload_notify(MigrationState *s)
{
    qemu_sem_post(&s->rp_state.rp_sem);
}
/*
 * Read the received bitmap, revert it as the initial dirty bitmap.
 * This is only used when the postcopy migration is paused but wants
 * to resume from a middle point.
 */
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
{
    int ret = -EINVAL;
    QEMUFile *file = s->rp_state.from_dst_file;
    unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
    uint64_t local_size = DIV_ROUND_UP(nbits, 8);
    uint64_t size, end_mark;

    trace_ram_dirty_bitmap_reload_begin(block->idstr);

    if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
        error_report("%s: incorrect state %s", __func__,
                     MigrationStatus_str(s->state));
        return -EINVAL;
    }

    /*
     * Note: see comments in ramblock_recv_bitmap_send() on why we
     * need the endianness conversion, and the paddings.
     */
    local_size = ROUND_UP(local_size, 8);

    /* Add paddings */
    le_bitmap = bitmap_new(nbits + BITS_PER_LONG);

    size = qemu_get_be64(file);

    /* The size of the bitmap should match with our ramblock */
    if (size != local_size) {
        error_report("%s: ramblock '%s' bitmap size mismatch "
                     "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
                     block->idstr, size, local_size);
        ret = -EINVAL;
        goto out;
    }

    size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
    end_mark = qemu_get_be64(file);

    ret = qemu_file_get_error(file);
    if (ret || size != local_size) {
        error_report("%s: read bitmap failed for ramblock '%s': %d"
                     " (size 0x%"PRIx64", got: 0x%"PRIx64")",
                     __func__, block->idstr, ret, local_size, size);
        ret = -EIO;
        goto out;
    }

    if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
        error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
                     __func__, block->idstr, end_mark);
        ret = -EINVAL;
        goto out;
    }

    /*
     * Endianness conversion. We are during postcopy (though paused).
     * The dirty bitmap won't change. We can directly modify it.
     */
    bitmap_from_le(block->bmap, le_bitmap, nbits);

    /*
     * What we received is "received bitmap". Revert it as the initial
     * dirty bitmap for this ramblock.
     */
    bitmap_complement(block->bmap, block->bmap, nbits);

    trace_ram_dirty_bitmap_reload_complete(block->idstr);

    /*
     * We succeeded in syncing the bitmap for the current ramblock. If
     * this is the last one to sync, we need to notify the main send
     * thread.
     */
    ram_dirty_bitmap_reload_notify(s);

    ret = 0;
out:
    g_free(le_bitmap);
    return ret;
}
static int ram_resume_prepare(MigrationState *s, void *opaque)
{
    RAMState *rs = *(RAMState **)opaque;
    int ret;

    ret = ram_dirty_bitmap_sync_all(s, rs);
    if (ret) {
        return ret;
    }

    ram_state_resume_prepare(rs, s->to_dst_file);

    return 0;
}
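/*
 * The SaveVMHandlers table below wires the functions above into the
 * generic savevm/migration core: ram_save_setup/iterate/complete on the
 * outgoing side, ram_load on the incoming side, plus the postcopy resume
 * hook.  ram_mig_init() registers it under section name "ram",
 * version 4.
 */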
static SaveVMHandlers savevm_ram_handlers = {
    .save_setup = ram_save_setup,
    .save_live_iterate = ram_save_iterate,
    .save_live_complete_postcopy = ram_save_complete,
    .save_live_complete_precopy = ram_save_complete,
    .has_postcopy = ram_has_postcopy,
    .save_live_pending = ram_save_pending,
    .load_state = ram_load,
    .save_cleanup = ram_save_cleanup,
    .load_setup = ram_load_setup,
    .load_cleanup = ram_load_cleanup,
    .resume_prepare = ram_resume_prepare,
};

void ram_mig_init(void)
{
    qemu_mutex_init(&XBZRLE.lock);
    register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);
}