migrate/ram: remove "ram_bulk_stage" and "fpo_enabled"
[qemu/ar7.git] / migration / ram.c
blob bee2756cd391e1fd87fe7244bc2efbcd3f6a385d
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
29 #include "qemu/osdep.h"
30 #include "qemu/cutils.h"
31 #include "qemu/bitops.h"
32 #include "qemu/bitmap.h"
33 #include "qemu/main-loop.h"
34 #include "xbzrle.h"
35 #include "ram.h"
36 #include "migration.h"
37 #include "migration/register.h"
38 #include "migration/misc.h"
39 #include "qemu-file.h"
40 #include "postcopy-ram.h"
41 #include "page_cache.h"
42 #include "qemu/error-report.h"
43 #include "qapi/error.h"
44 #include "qapi/qapi-types-migration.h"
45 #include "qapi/qapi-events-migration.h"
46 #include "qapi/qmp/qerror.h"
47 #include "trace.h"
48 #include "exec/ram_addr.h"
49 #include "exec/target_page.h"
50 #include "qemu/rcu_queue.h"
51 #include "migration/colo.h"
52 #include "block.h"
53 #include "sysemu/cpu-throttle.h"
54 #include "savevm.h"
55 #include "qemu/iov.h"
56 #include "multifd.h"
57 #include "sysemu/runstate.h"
59 #if defined(__linux__)
60 #include "qemu/userfaultfd.h"
61 #endif /* defined(__linux__) */
63 /***********************************************************/
64 /* ram save/restore */
66 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
67 * worked for pages that were filled with the same char. We switched
68 * it to only search for the zero value. And to avoid confusion with
69 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
72 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
73 #define RAM_SAVE_FLAG_ZERO 0x02
74 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
75 #define RAM_SAVE_FLAG_PAGE 0x08
76 #define RAM_SAVE_FLAG_EOS 0x10
77 #define RAM_SAVE_FLAG_CONTINUE 0x20
78 #define RAM_SAVE_FLAG_XBZRLE 0x40
79 /* 0x80 is reserved in migration.h start with 0x100 next */
80 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
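/*
 * These flag bits are OR-ed into the low bits of the page-aligned offset
 * that save_page_header() below puts on the wire, so a single be64 value
 * carries both the page offset and its flags.
 */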
82 static inline bool is_zero_range(uint8_t *p, uint64_t size)
84 return buffer_is_zero(p, size);
87 XBZRLECacheStats xbzrle_counters;
89 /* this struct contains the XBZRLE cache and a static page
90 used by the compression */
91 static struct {
92 /* buffer used for XBZRLE encoding */
93 uint8_t *encoded_buf;
94 /* buffer for storing page content */
95 uint8_t *current_buf;
96 /* Cache for XBZRLE, Protected by lock. */
97 PageCache *cache;
98 QemuMutex lock;
99 /* it will store a page full of zeros */
100 uint8_t *zero_target_page;
101 /* buffer used for XBZRLE decoding */
102 uint8_t *decoded_buf;
103 } XBZRLE;
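/*
 * Accesses to the cache and the scratch buffers above are serialized with
 * XBZRLE_cache_lock()/XBZRLE_cache_unlock() below; the helpers only take
 * the mutex when xbzrle is in use for this migration.
 */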
105 static void XBZRLE_cache_lock(void)
107 if (migrate_use_xbzrle()) {
108 qemu_mutex_lock(&XBZRLE.lock);
112 static void XBZRLE_cache_unlock(void)
114 if (migrate_use_xbzrle()) {
115 qemu_mutex_unlock(&XBZRLE.lock);
120 * xbzrle_cache_resize: resize the xbzrle cache
122 * This function is called from migrate_params_apply in the main
123 * thread, possibly while a migration is in progress. A running
124 * migration may be using the cache and might finish during this call,
125 * hence changes to the cache are protected by XBZRLE.lock.
127 * Returns 0 for success or -1 for error
129 * @new_size: new cache size
130 * @errp: set *errp if the check failed, with reason
132 int xbzrle_cache_resize(uint64_t new_size, Error **errp)
134 PageCache *new_cache;
135 int64_t ret = 0;
137 /* Check for truncation */
138 if (new_size != (size_t)new_size) {
139 error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size",
140 "exceeding address space");
141 return -1;
144 if (new_size == migrate_xbzrle_cache_size()) {
145 /* nothing to do */
146 return 0;
149 XBZRLE_cache_lock();
151 if (XBZRLE.cache != NULL) {
152 new_cache = cache_init(new_size, TARGET_PAGE_SIZE, errp);
153 if (!new_cache) {
154 ret = -1;
155 goto out;
158 cache_fini(XBZRLE.cache);
159 XBZRLE.cache = new_cache;
161 out:
162 XBZRLE_cache_unlock();
163 return ret;
166 bool ramblock_is_ignored(RAMBlock *block)
168 return !qemu_ram_is_migratable(block) ||
169 (migrate_ignore_shared() && qemu_ram_is_shared(block));
172 #undef RAMBLOCK_FOREACH
174 int foreach_not_ignored_block(RAMBlockIterFunc func, void *opaque)
176 RAMBlock *block;
177 int ret = 0;
179 RCU_READ_LOCK_GUARD();
181 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
182 ret = func(block, opaque);
183 if (ret) {
184 break;
187 return ret;
190 static void ramblock_recv_map_init(void)
192 RAMBlock *rb;
194 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
195 assert(!rb->receivedmap);
196 rb->receivedmap = bitmap_new(rb->max_length >> qemu_target_page_bits());
200 int ramblock_recv_bitmap_test(RAMBlock *rb, void *host_addr)
202 return test_bit(ramblock_recv_bitmap_offset(host_addr, rb),
203 rb->receivedmap);
206 bool ramblock_recv_bitmap_test_byte_offset(RAMBlock *rb, uint64_t byte_offset)
208 return test_bit(byte_offset >> TARGET_PAGE_BITS, rb->receivedmap);
211 void ramblock_recv_bitmap_set(RAMBlock *rb, void *host_addr)
213 set_bit_atomic(ramblock_recv_bitmap_offset(host_addr, rb), rb->receivedmap);
216 void ramblock_recv_bitmap_set_range(RAMBlock *rb, void *host_addr,
217 size_t nr)
219 bitmap_set_atomic(rb->receivedmap,
220 ramblock_recv_bitmap_offset(host_addr, rb),
221 nr);
224 #define RAMBLOCK_RECV_BITMAP_ENDING (0x0123456789abcdefULL)
227 * Format: bitmap_size (8 bytes) + whole_bitmap (N bytes).
229 * Returns >0 if success with sent bytes, or <0 if error.
231 int64_t ramblock_recv_bitmap_send(QEMUFile *file,
232 const char *block_name)
234 RAMBlock *block = qemu_ram_block_by_name(block_name);
235 unsigned long *le_bitmap, nbits;
236 uint64_t size;
238 if (!block) {
239 error_report("%s: invalid block name: %s", __func__, block_name);
240 return -1;
243 nbits = block->used_length >> TARGET_PAGE_BITS;
246 * Make sure the tmp bitmap buffer is big enough, e.g., on 32bit
247 * machines we may need 4 more bytes for padding (see below
248 * comment). So extend it a bit beforehand.
250 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
253 * Always use little endian when sending the bitmap. This is
254 * required so that it still works when source and destination VMs
255 * are not using the same endianness. (Note: big endian won't work.)
257 bitmap_to_le(le_bitmap, block->receivedmap, nbits);
259 /* Size of the bitmap, in bytes */
260 size = DIV_ROUND_UP(nbits, 8);
263 * size is always aligned to 8 bytes for 64bit machines, but it
264 * may not be true for 32bit machines. We need this padding to
265 * make sure the migration can survive even between 32bit and
266 * 64bit machines.
268 size = ROUND_UP(size, 8);
270 qemu_put_be64(file, size);
271 qemu_put_buffer(file, (const uint8_t *)le_bitmap, size);
273 * Mark as an end, in case the middle part is screwed up due to
274 * some "mysterious" reason.
276 qemu_put_be64(file, RAMBLOCK_RECV_BITMAP_ENDING);
277 qemu_fflush(file);
279 g_free(le_bitmap);
281 if (qemu_file_get_error(file)) {
282 return qemu_file_get_error(file);
285 return size + sizeof(size);
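/*
 * Illustrative sketch of the stream layout produced above:
 *
 *   be64  size                 - bitmap size in bytes, rounded up to 8
 *   u8    bitmap[size]         - receivedmap in little-endian bit order
 *   be64  0x0123456789abcdef   - RAMBLOCK_RECV_BITMAP_ENDING marker
 */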
289 * An outstanding page request, on the source, having been received
290 * and queued
292 struct RAMSrcPageRequest {
293 RAMBlock *rb;
294 hwaddr offset;
295 hwaddr len;
297 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
300 /* State of RAM for migration */
301 struct RAMState {
302 /* QEMUFile used for this migration */
303 QEMUFile *f;
304 /* UFFD file descriptor, used in 'write-tracking' migration */
305 int uffdio_fd;
306 /* Last block that we have visited searching for dirty pages */
307 RAMBlock *last_seen_block;
308 /* Last block from where we have sent data */
309 RAMBlock *last_sent_block;
310 /* Last dirty target page we have sent */
311 ram_addr_t last_page;
312 /* last ram version we have seen */
313 uint32_t last_version;
314 /* How many times we have dirty too many pages */
315 int dirty_rate_high_cnt;
316 /* these variables are used for bitmap sync */
317 /* last time we did a full bitmap_sync */
318 int64_t time_last_bitmap_sync;
319 /* bytes transferred at start_time */
320 uint64_t bytes_xfer_prev;
321 /* number of dirty pages since start_time */
322 uint64_t num_dirty_pages_period;
323 /* xbzrle misses since the beginning of the period */
324 uint64_t xbzrle_cache_miss_prev;
325 /* Amount of xbzrle pages since the beginning of the period */
326 uint64_t xbzrle_pages_prev;
327 /* Amount of xbzrle encoded bytes since the beginning of the period */
328 uint64_t xbzrle_bytes_prev;
329 /* Start using XBZRLE (e.g., after the first round). */
330 bool xbzrle_enabled;
332 /* compression statistics since the beginning of the period */
333 /* number of times there was no free thread to compress data */
334 uint64_t compress_thread_busy_prev;
335 /* amount of bytes after compression */
336 uint64_t compressed_size_prev;
337 /* number of compressed pages */
338 uint64_t compress_pages_prev;
340 /* total handled target pages at the beginning of period */
341 uint64_t target_page_count_prev;
342 /* total handled target pages since start */
343 uint64_t target_page_count;
344 /* number of dirty bits in the bitmap */
345 uint64_t migration_dirty_pages;
346 /* Protects modification of the bitmap and migration dirty pages */
347 QemuMutex bitmap_mutex;
348 /* The RAMBlock used in the last src_page_requests */
349 RAMBlock *last_req_rb;
350 /* Queue of outstanding page requests from the destination */
351 QemuMutex src_page_req_mutex;
352 QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
354 typedef struct RAMState RAMState;
356 static RAMState *ram_state;
358 static NotifierWithReturnList precopy_notifier_list;
360 void precopy_infrastructure_init(void)
362 notifier_with_return_list_init(&precopy_notifier_list);
365 void precopy_add_notifier(NotifierWithReturn *n)
367 notifier_with_return_list_add(&precopy_notifier_list, n);
370 void precopy_remove_notifier(NotifierWithReturn *n)
372 notifier_with_return_remove(n);
375 int precopy_notify(PrecopyNotifyReason reason, Error **errp)
377 PrecopyNotifyData pnd;
378 pnd.reason = reason;
379 pnd.errp = errp;
381 return notifier_with_return_list_notify(&precopy_notifier_list, &pnd);
384 uint64_t ram_bytes_remaining(void)
386 return ram_state ? (ram_state->migration_dirty_pages * TARGET_PAGE_SIZE) : 0;
390 MigrationStats ram_counters;
392 /* used by the search for pages to send */
393 struct PageSearchStatus {
394 /* Current block being searched */
395 RAMBlock *block;
396 /* Current page to search from */
397 unsigned long page;
398 /* Set once we wrap around */
399 bool complete_round;
401 typedef struct PageSearchStatus PageSearchStatus;
403 CompressionStats compression_counters;
405 struct CompressParam {
406 bool done;
407 bool quit;
408 bool zero_page;
409 QEMUFile *file;
410 QemuMutex mutex;
411 QemuCond cond;
412 RAMBlock *block;
413 ram_addr_t offset;
415 /* internally used fields */
416 z_stream stream;
417 uint8_t *originbuf;
419 typedef struct CompressParam CompressParam;
421 struct DecompressParam {
422 bool done;
423 bool quit;
424 QemuMutex mutex;
425 QemuCond cond;
426 void *des;
427 uint8_t *compbuf;
428 int len;
429 z_stream stream;
431 typedef struct DecompressParam DecompressParam;
433 static CompressParam *comp_param;
434 static QemuThread *compress_threads;
435 /* comp_done_cond is used to wake up the migration thread when
436 * one of the compression threads has finished the compression.
437 * comp_done_lock is used together with comp_done_cond.
439 static QemuMutex comp_done_lock;
440 static QemuCond comp_done_cond;
441 /* The empty QEMUFileOps will be used by file in CompressParam */
442 static const QEMUFileOps empty_ops = { };
444 static QEMUFile *decomp_file;
445 static DecompressParam *decomp_param;
446 static QemuThread *decompress_threads;
447 static QemuMutex decomp_done_lock;
448 static QemuCond decomp_done_cond;
450 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
451 ram_addr_t offset, uint8_t *source_buf);
453 static void *do_data_compress(void *opaque)
455 CompressParam *param = opaque;
456 RAMBlock *block;
457 ram_addr_t offset;
458 bool zero_page;
460 qemu_mutex_lock(&param->mutex);
461 while (!param->quit) {
462 if (param->block) {
463 block = param->block;
464 offset = param->offset;
465 param->block = NULL;
466 qemu_mutex_unlock(&param->mutex);
468 zero_page = do_compress_ram_page(param->file, &param->stream,
469 block, offset, param->originbuf);
471 qemu_mutex_lock(&comp_done_lock);
472 param->done = true;
473 param->zero_page = zero_page;
474 qemu_cond_signal(&comp_done_cond);
475 qemu_mutex_unlock(&comp_done_lock);
477 qemu_mutex_lock(&param->mutex);
478 } else {
479 qemu_cond_wait(&param->cond, &param->mutex);
482 qemu_mutex_unlock(&param->mutex);
484 return NULL;
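/*
 * Hand-off protocol used above: the migration thread publishes work by
 * setting param->block/offset under param->mutex and signalling param->cond;
 * the worker clears param->block, compresses the page into param->file, and
 * then publishes done/zero_page under comp_done_lock, waking comp_done_cond.
 */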
487 static void compress_threads_save_cleanup(void)
489 int i, thread_count;
491 if (!migrate_use_compression() || !comp_param) {
492 return;
495 thread_count = migrate_compress_threads();
496 for (i = 0; i < thread_count; i++) {
498 * we use it as an indicator which shows if the thread is
499 * properly init'd or not
501 if (!comp_param[i].file) {
502 break;
505 qemu_mutex_lock(&comp_param[i].mutex);
506 comp_param[i].quit = true;
507 qemu_cond_signal(&comp_param[i].cond);
508 qemu_mutex_unlock(&comp_param[i].mutex);
510 qemu_thread_join(compress_threads + i);
511 qemu_mutex_destroy(&comp_param[i].mutex);
512 qemu_cond_destroy(&comp_param[i].cond);
513 deflateEnd(&comp_param[i].stream);
514 g_free(comp_param[i].originbuf);
515 qemu_fclose(comp_param[i].file);
516 comp_param[i].file = NULL;
518 qemu_mutex_destroy(&comp_done_lock);
519 qemu_cond_destroy(&comp_done_cond);
520 g_free(compress_threads);
521 g_free(comp_param);
522 compress_threads = NULL;
523 comp_param = NULL;
526 static int compress_threads_save_setup(void)
528 int i, thread_count;
530 if (!migrate_use_compression()) {
531 return 0;
533 thread_count = migrate_compress_threads();
534 compress_threads = g_new0(QemuThread, thread_count);
535 comp_param = g_new0(CompressParam, thread_count);
536 qemu_cond_init(&comp_done_cond);
537 qemu_mutex_init(&comp_done_lock);
538 for (i = 0; i < thread_count; i++) {
539 comp_param[i].originbuf = g_try_malloc(TARGET_PAGE_SIZE);
540 if (!comp_param[i].originbuf) {
541 goto exit;
544 if (deflateInit(&comp_param[i].stream,
545 migrate_compress_level()) != Z_OK) {
546 g_free(comp_param[i].originbuf);
547 goto exit;
550 /* comp_param[i].file is just used as a dummy buffer to save data,
551 * set its ops to empty.
553 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
554 comp_param[i].done = true;
555 comp_param[i].quit = false;
556 qemu_mutex_init(&comp_param[i].mutex);
557 qemu_cond_init(&comp_param[i].cond);
558 qemu_thread_create(compress_threads + i, "compress",
559 do_data_compress, comp_param + i,
560 QEMU_THREAD_JOINABLE);
562 return 0;
564 exit:
565 compress_threads_save_cleanup();
566 return -1;
570 * save_page_header: write page header to wire
572 * If this is the 1st block, it also writes the block identification
574 * Returns the number of bytes written
576 * @f: QEMUFile where to send the data
577 * @block: block that contains the page we want to send
578 * @offset: offset inside the block for the page
579 * in the lower bits, it contains flags
581 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
582 ram_addr_t offset)
584 size_t size, len;
586 if (block == rs->last_sent_block) {
587 offset |= RAM_SAVE_FLAG_CONTINUE;
589 qemu_put_be64(f, offset);
590 size = 8;
592 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
593 len = strlen(block->idstr);
594 qemu_put_byte(f, len);
595 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
596 size += 1 + len;
597 rs->last_sent_block = block;
599 return size;
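/*
 * Resulting wire header, roughly:
 *
 *   be64  offset | flags            - always present
 *   u8    len, idstr[len]           - only when RAM_SAVE_FLAG_CONTINUE is
 *                                     not set, i.e. for a new block
 */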
603 * mig_throttle_guest_down: throttle down the guest
605 * Reduce amount of guest cpu execution to hopefully slow down memory
606 * writes. If guest dirty memory rate is reduced below the rate at
607 * which we can transfer pages to the destination then we should be
608 * able to complete migration. Some workloads dirty memory way too
609 * fast and will not effectively converge, even with auto-converge.
611 static void mig_throttle_guest_down(uint64_t bytes_dirty_period,
612 uint64_t bytes_dirty_threshold)
614 MigrationState *s = migrate_get_current();
615 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
616 uint64_t pct_increment = s->parameters.cpu_throttle_increment;
617 bool pct_tailslow = s->parameters.cpu_throttle_tailslow;
618 int pct_max = s->parameters.max_cpu_throttle;
620 uint64_t throttle_now = cpu_throttle_get_percentage();
621 uint64_t cpu_now, cpu_ideal, throttle_inc;
623 /* We have not started throttling yet. Let's start it. */
624 if (!cpu_throttle_active()) {
625 cpu_throttle_set(pct_initial);
626 } else {
627 /* Throttling already on, just increase the rate */
628 if (!pct_tailslow) {
629 throttle_inc = pct_increment;
630 } else {
631 /* Compute the ideal CPU percentage used by Guest, which may
632 * make the dirty rate match the dirty rate threshold. */
633 cpu_now = 100 - throttle_now;
634 cpu_ideal = cpu_now * (bytes_dirty_threshold * 1.0 /
635 bytes_dirty_period);
636 throttle_inc = MIN(cpu_now - cpu_ideal, pct_increment);
638 cpu_throttle_set(MIN(throttle_now + throttle_inc, pct_max));
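/*
 * Worked example for the tailslow path above: if the guest is currently
 * throttled at 20% (cpu_now = 80) and the dirty threshold is half the dirty
 * rate, then cpu_ideal = 80 * 0.5 = 40, so the throttle grows by
 * MIN(80 - 40, cpu_throttle_increment), capped at max_cpu_throttle.
 */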
643 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
645 * @rs: current RAM state
646 * @current_addr: address for the zero page
648 * Update the xbzrle cache to reflect a page that's been sent as all 0.
649 * The important thing is that a stale (not-yet-0'd) page be replaced
650 * by the new data.
651 * As a bonus, if the page wasn't in the cache it gets added so that
652 * when a small write is made into the 0'd page it gets XBZRLE sent.
654 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
656 if (!rs->xbzrle_enabled) {
657 return;
660 /* We don't care if this fails to allocate a new cache page
661 * as long as it updated an old one */
662 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
663 ram_counters.dirty_sync_count);
666 #define ENCODING_FLAG_XBZRLE 0x1
669 * save_xbzrle_page: compress and send current page
671 * Returns: 1 means that we wrote the page
672 * 0 means that page is identical to the one already sent
673 * -1 means that xbzrle would be longer than normal
675 * @rs: current RAM state
676 * @current_data: pointer to the address of the page contents
677 * @current_addr: addr of the page
678 * @block: block that contains the page we want to send
679 * @offset: offset inside the block for the page
680 * @last_stage: if we are at the completion stage
682 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
683 ram_addr_t current_addr, RAMBlock *block,
684 ram_addr_t offset, bool last_stage)
686 int encoded_len = 0, bytes_xbzrle;
687 uint8_t *prev_cached_page;
689 if (!cache_is_cached(XBZRLE.cache, current_addr,
690 ram_counters.dirty_sync_count)) {
691 xbzrle_counters.cache_miss++;
692 if (!last_stage) {
693 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
694 ram_counters.dirty_sync_count) == -1) {
695 return -1;
696 } else {
697 /* update *current_data when the page has been
698 inserted into cache */
699 *current_data = get_cached_data(XBZRLE.cache, current_addr);
702 return -1;
706 * Reaching here means the page has hit the xbzrle cache, no matter what
707 * encoding result it is (normal encoding, overflow or skipping the page),
708 * count the page as encoded. This is used to calculate the encoding rate.
710 * Example: 2 pages (8KB) being encoded, first page encoding generates 2KB,
711 * 2nd page turns out to be skipped (i.e. no new bytes written to the
712 * page), the overall encoding rate will be 8KB / 2KB = 4, which has the
713 * skipped page included. In this way, the encoding rate can tell if the
714 * guest page is good for xbzrle encoding.
716 xbzrle_counters.pages++;
717 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
719 /* save current buffer into memory */
720 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
722 /* XBZRLE encoding (if there is no overflow) */
723 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
724 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
725 TARGET_PAGE_SIZE);
728 * Update the cache contents, so that it corresponds to the data
729 * sent, in all cases except where we skip the page.
731 if (!last_stage && encoded_len != 0) {
732 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
734 * In the case where we couldn't compress, ensure that the caller
735 * sends the data from the cache, since the guest might have
736 * changed the RAM since we copied it.
738 *current_data = prev_cached_page;
741 if (encoded_len == 0) {
742 trace_save_xbzrle_page_skipping();
743 return 0;
744 } else if (encoded_len == -1) {
745 trace_save_xbzrle_page_overflow();
746 xbzrle_counters.overflow++;
747 xbzrle_counters.bytes += TARGET_PAGE_SIZE;
748 return -1;
751 /* Send XBZRLE based compressed page */
752 bytes_xbzrle = save_page_header(rs, rs->f, block,
753 offset | RAM_SAVE_FLAG_XBZRLE);
754 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
755 qemu_put_be16(rs->f, encoded_len);
756 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
757 bytes_xbzrle += encoded_len + 1 + 2;
759 * Like compressed_size (please see update_compress_thread_counts),
760 * the xbzrle encoded bytes don't count the 8 byte header with
761 * RAM_SAVE_FLAG_CONTINUE.
763 xbzrle_counters.bytes += bytes_xbzrle - 8;
764 ram_counters.transferred += bytes_xbzrle;
766 return 1;
770 * migration_bitmap_find_dirty: find the next dirty page from start
772 * Returns the page offset within memory region of the start of a dirty page
774 * @rs: current RAM state
775 * @rb: RAMBlock where to search for dirty pages
776 * @start: page where we start the search
778 static inline
779 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
780 unsigned long start)
782 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
783 unsigned long *bitmap = rb->bmap;
785 if (ramblock_is_ignored(rb)) {
786 return size;
789 return find_next_bit(bitmap, size, start);
792 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
793 RAMBlock *rb,
794 unsigned long page)
796 bool ret;
798 QEMU_LOCK_GUARD(&rs->bitmap_mutex);
801 * Clear dirty bitmap if needed. This _must_ be called before we
802 * send any of the pages in the chunk because we need to make sure
803 * we can capture further page content changes when we sync dirty
804 * log the next time. So as long as we are going to send any of
805 * the pages in the chunk we clear the remote dirty bitmap for all.
806 * Clearing it earlier won't be a problem, but clearing it too late will.
808 if (rb->clear_bmap && clear_bmap_test_and_clear(rb, page)) {
809 uint8_t shift = rb->clear_bmap_shift;
810 hwaddr size = 1ULL << (TARGET_PAGE_BITS + shift);
811 hwaddr start = (((ram_addr_t)page) << TARGET_PAGE_BITS) & (-size);
814 * CLEAR_BITMAP_SHIFT_MIN should always guarantee this... this
815 * can make things easier sometimes since then the start address
816 * of the small chunk will always be aligned to 64 pages, so the
817 * bitmap will always be aligned to unsigned long. We should
818 * even be able to remove this restriction but I'm simply
819 * keeping it.
821 assert(shift >= 6);
822 trace_migration_bitmap_clear_dirty(rb->idstr, start, size, page);
823 memory_region_clear_dirty_bitmap(rb->mr, start, size);
826 ret = test_and_clear_bit(page, rb->bmap);
828 if (ret) {
829 rs->migration_dirty_pages--;
832 return ret;
835 /* Called with RCU critical section */
836 static void ramblock_sync_dirty_bitmap(RAMState *rs, RAMBlock *rb)
838 uint64_t new_dirty_pages =
839 cpu_physical_memory_sync_dirty_bitmap(rb, 0, rb->used_length);
841 rs->migration_dirty_pages += new_dirty_pages;
842 rs->num_dirty_pages_period += new_dirty_pages;
846 * ram_pagesize_summary: calculate all the pagesizes of a VM
848 * Returns a summary bitmap of the page sizes of all RAMBlocks
850 * For VMs with just normal pages this is equivalent to the host page
851 * size. If it's got some huge pages then it's the OR of all the
852 * different page sizes.
854 uint64_t ram_pagesize_summary(void)
856 RAMBlock *block;
857 uint64_t summary = 0;
859 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
860 summary |= block->page_size;
863 return summary;
866 uint64_t ram_get_total_transferred_pages(void)
868 return ram_counters.normal + ram_counters.duplicate +
869 compression_counters.pages + xbzrle_counters.pages;
872 static void migration_update_rates(RAMState *rs, int64_t end_time)
874 uint64_t page_count = rs->target_page_count - rs->target_page_count_prev;
875 double compressed_size;
877 /* calculate period counters */
878 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
879 / (end_time - rs->time_last_bitmap_sync);
881 if (!page_count) {
882 return;
885 if (migrate_use_xbzrle()) {
886 double encoded_size, unencoded_size;
888 xbzrle_counters.cache_miss_rate = (double)(xbzrle_counters.cache_miss -
889 rs->xbzrle_cache_miss_prev) / page_count;
890 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
891 unencoded_size = (xbzrle_counters.pages - rs->xbzrle_pages_prev) *
892 TARGET_PAGE_SIZE;
893 encoded_size = xbzrle_counters.bytes - rs->xbzrle_bytes_prev;
894 if (xbzrle_counters.pages == rs->xbzrle_pages_prev || !encoded_size) {
895 xbzrle_counters.encoding_rate = 0;
896 } else {
897 xbzrle_counters.encoding_rate = unencoded_size / encoded_size;
899 rs->xbzrle_pages_prev = xbzrle_counters.pages;
900 rs->xbzrle_bytes_prev = xbzrle_counters.bytes;
903 if (migrate_use_compression()) {
904 compression_counters.busy_rate = (double)(compression_counters.busy -
905 rs->compress_thread_busy_prev) / page_count;
906 rs->compress_thread_busy_prev = compression_counters.busy;
908 compressed_size = compression_counters.compressed_size -
909 rs->compressed_size_prev;
910 if (compressed_size) {
911 double uncompressed_size = (compression_counters.pages -
912 rs->compress_pages_prev) * TARGET_PAGE_SIZE;
914 /* Compression-Ratio = Uncompressed-size / Compressed-size */
915 compression_counters.compression_rate =
916 uncompressed_size / compressed_size;
918 rs->compress_pages_prev = compression_counters.pages;
919 rs->compressed_size_prev = compression_counters.compressed_size;
924 static void migration_trigger_throttle(RAMState *rs)
926 MigrationState *s = migrate_get_current();
927 uint64_t threshold = s->parameters.throttle_trigger_threshold;
929 uint64_t bytes_xfer_period = ram_counters.transferred - rs->bytes_xfer_prev;
930 uint64_t bytes_dirty_period = rs->num_dirty_pages_period * TARGET_PAGE_SIZE;
931 uint64_t bytes_dirty_threshold = bytes_xfer_period * threshold / 100;
933 /* During block migration the auto-converge logic incorrectly detects
934 * that ram migration makes no progress. Avoid this by disabling the
935 * throttling logic during the bulk phase of block migration. */
936 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
937 /* The following detection logic can be refined later. For now:
938 Check to see if the ratio between dirtied bytes and the approx.
939 amount of bytes that just got transferred since the last time
940 we were in this routine reaches the threshold. If that happens
941 twice, start or increase throttling. */
943 if ((bytes_dirty_period > bytes_dirty_threshold) &&
944 (++rs->dirty_rate_high_cnt >= 2)) {
945 trace_migration_throttle();
946 rs->dirty_rate_high_cnt = 0;
947 mig_throttle_guest_down(bytes_dirty_period,
948 bytes_dirty_threshold);
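/*
 * Example: with throttle_trigger_threshold at 50, throttling starts (or is
 * increased) once the bytes dirtied in a sync period exceed 50% of the bytes
 * transferred in that period for the second time.
 */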
953 static void migration_bitmap_sync(RAMState *rs)
955 RAMBlock *block;
956 int64_t end_time;
958 ram_counters.dirty_sync_count++;
960 if (!rs->time_last_bitmap_sync) {
961 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
964 trace_migration_bitmap_sync_start();
965 memory_global_dirty_log_sync();
967 qemu_mutex_lock(&rs->bitmap_mutex);
968 WITH_RCU_READ_LOCK_GUARD() {
969 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
970 ramblock_sync_dirty_bitmap(rs, block);
972 ram_counters.remaining = ram_bytes_remaining();
974 qemu_mutex_unlock(&rs->bitmap_mutex);
976 memory_global_after_dirty_log_sync();
977 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
979 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
981 /* more than 1 second = 1000 milliseconds */
982 if (end_time > rs->time_last_bitmap_sync + 1000) {
983 migration_trigger_throttle(rs);
985 migration_update_rates(rs, end_time);
987 rs->target_page_count_prev = rs->target_page_count;
989 /* reset period counters */
990 rs->time_last_bitmap_sync = end_time;
991 rs->num_dirty_pages_period = 0;
992 rs->bytes_xfer_prev = ram_counters.transferred;
994 if (migrate_use_events()) {
995 qapi_event_send_migration_pass(ram_counters.dirty_sync_count);
999 static void migration_bitmap_sync_precopy(RAMState *rs)
1001 Error *local_err = NULL;
1004 * The current notifier usage is just an optimization for migration, so we
1005 * don't stop the normal migration process in the error case.
1007 if (precopy_notify(PRECOPY_NOTIFY_BEFORE_BITMAP_SYNC, &local_err)) {
1008 error_report_err(local_err);
1009 local_err = NULL;
1012 migration_bitmap_sync(rs);
1014 if (precopy_notify(PRECOPY_NOTIFY_AFTER_BITMAP_SYNC, &local_err)) {
1015 error_report_err(local_err);
1020 * save_zero_page_to_file: send the zero page to the file
1022 * Returns the size of data written to the file, 0 means the page is not
1023 * a zero page
1025 * @rs: current RAM state
1026 * @file: the file where the data is saved
1027 * @block: block that contains the page we want to send
1028 * @offset: offset inside the block for the page
1030 static int save_zero_page_to_file(RAMState *rs, QEMUFile *file,
1031 RAMBlock *block, ram_addr_t offset)
1033 uint8_t *p = block->host + offset;
1034 int len = 0;
1036 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
1037 len += save_page_header(rs, file, block, offset | RAM_SAVE_FLAG_ZERO);
1038 qemu_put_byte(file, 0);
1039 len += 1;
1041 return len;
1045 * save_zero_page: send the zero page to the stream
1047 * Returns the number of pages written.
1049 * @rs: current RAM state
1050 * @block: block that contains the page we want to send
1051 * @offset: offset inside the block for the page
1053 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1055 int len = save_zero_page_to_file(rs, rs->f, block, offset);
1057 if (len) {
1058 ram_counters.duplicate++;
1059 ram_counters.transferred += len;
1060 return 1;
1062 return -1;
1065 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
1067 if (!migrate_release_ram() || !migration_in_postcopy()) {
1068 return;
1071 ram_discard_range(rbname, offset, ((ram_addr_t)pages) << TARGET_PAGE_BITS);
1075 * @pages: the number of pages written by the control path,
1076 * < 0 - error
1077 * > 0 - number of pages written
1079 * Return true if the page has been saved, otherwise false is returned.
1081 static bool control_save_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1082 int *pages)
1084 uint64_t bytes_xmit = 0;
1085 int ret;
1087 *pages = -1;
1088 ret = ram_control_save_page(rs->f, block->offset, offset, TARGET_PAGE_SIZE,
1089 &bytes_xmit);
1090 if (ret == RAM_SAVE_CONTROL_NOT_SUPP) {
1091 return false;
1094 if (bytes_xmit) {
1095 ram_counters.transferred += bytes_xmit;
1096 *pages = 1;
1099 if (ret == RAM_SAVE_CONTROL_DELAYED) {
1100 return true;
1103 if (bytes_xmit > 0) {
1104 ram_counters.normal++;
1105 } else if (bytes_xmit == 0) {
1106 ram_counters.duplicate++;
1109 return true;
1113 * directly send the page to the stream
1115 * Returns the number of pages written.
1117 * @rs: current RAM state
1118 * @block: block that contains the page we want to send
1119 * @offset: offset inside the block for the page
1120 * @buf: the page to be sent
1121 * @async: send the page asynchronously
1123 static int save_normal_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
1124 uint8_t *buf, bool async)
1126 ram_counters.transferred += save_page_header(rs, rs->f, block,
1127 offset | RAM_SAVE_FLAG_PAGE);
1128 if (async) {
1129 qemu_put_buffer_async(rs->f, buf, TARGET_PAGE_SIZE,
1130 migrate_release_ram() &
1131 migration_in_postcopy());
1132 } else {
1133 qemu_put_buffer(rs->f, buf, TARGET_PAGE_SIZE);
1135 ram_counters.transferred += TARGET_PAGE_SIZE;
1136 ram_counters.normal++;
1137 return 1;
1141 * ram_save_page: send the given page to the stream
1143 * Returns the number of pages written.
1144 * < 0 - error
1145 * >=0 - Number of pages written - this might legally be 0
1146 * if xbzrle noticed the page was the same.
1148 * @rs: current RAM state
1149 * @block: block that contains the page we want to send
1150 * @offset: offset inside the block for the page
1151 * @last_stage: if we are at the completion stage
1153 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
1155 int pages = -1;
1156 uint8_t *p;
1157 bool send_async = true;
1158 RAMBlock *block = pss->block;
1159 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1160 ram_addr_t current_addr = block->offset + offset;
1162 p = block->host + offset;
1163 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
1165 XBZRLE_cache_lock();
1166 if (rs->xbzrle_enabled && !migration_in_postcopy()) {
1167 pages = save_xbzrle_page(rs, &p, current_addr, block,
1168 offset, last_stage);
1169 if (!last_stage) {
1170 /* Can't send this cached data async, since the cache page
1171 * might get updated before it gets to the wire
1173 send_async = false;
1177 /* XBZRLE overflow or normal page */
1178 if (pages == -1) {
1179 pages = save_normal_page(rs, block, offset, p, send_async);
1182 XBZRLE_cache_unlock();
1184 return pages;
1187 static int ram_save_multifd_page(RAMState *rs, RAMBlock *block,
1188 ram_addr_t offset)
1190 if (multifd_queue_page(rs->f, block, offset) < 0) {
1191 return -1;
1193 ram_counters.normal++;
1195 return 1;
1198 static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
1199 ram_addr_t offset, uint8_t *source_buf)
1201 RAMState *rs = ram_state;
1202 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1203 bool zero_page = false;
1204 int ret;
1206 if (save_zero_page_to_file(rs, f, block, offset)) {
1207 zero_page = true;
1208 goto exit;
1211 save_page_header(rs, f, block, offset | RAM_SAVE_FLAG_COMPRESS_PAGE);
1214 * copy it to an internal buffer to avoid it being modified by the VM
1215 * so that we can catch any error during compression and
1216 * decompression
1218 memcpy(source_buf, p, TARGET_PAGE_SIZE);
1219 ret = qemu_put_compression_data(f, stream, source_buf, TARGET_PAGE_SIZE);
1220 if (ret < 0) {
1221 qemu_file_set_error(migrate_get_current()->to_dst_file, ret);
1222 error_report("compressed data failed!");
1223 return false;
1226 exit:
1227 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1228 return zero_page;
1231 static void
1232 update_compress_thread_counts(const CompressParam *param, int bytes_xmit)
1234 ram_counters.transferred += bytes_xmit;
1236 if (param->zero_page) {
1237 ram_counters.duplicate++;
1238 return;
1241 /* 8 means a header with RAM_SAVE_FLAG_CONTINUE. */
1242 compression_counters.compressed_size += bytes_xmit - 8;
1243 compression_counters.pages++;
1246 static bool save_page_use_compression(RAMState *rs);
1248 static void flush_compressed_data(RAMState *rs)
1250 int idx, len, thread_count;
1252 if (!save_page_use_compression(rs)) {
1253 return;
1255 thread_count = migrate_compress_threads();
1257 qemu_mutex_lock(&comp_done_lock);
1258 for (idx = 0; idx < thread_count; idx++) {
1259 while (!comp_param[idx].done) {
1260 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1263 qemu_mutex_unlock(&comp_done_lock);
1265 for (idx = 0; idx < thread_count; idx++) {
1266 qemu_mutex_lock(&comp_param[idx].mutex);
1267 if (!comp_param[idx].quit) {
1268 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1270 * it's safe to fetch zero_page without holding comp_done_lock
1271 * as there is no further request submitted to the thread,
1272 * i.e., the thread should be waiting for a request at this point.
1274 update_compress_thread_counts(&comp_param[idx], len);
1276 qemu_mutex_unlock(&comp_param[idx].mutex);
1280 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1281 ram_addr_t offset)
1283 param->block = block;
1284 param->offset = offset;
1287 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1288 ram_addr_t offset)
1290 int idx, thread_count, bytes_xmit = -1, pages = -1;
1291 bool wait = migrate_compress_wait_thread();
1293 thread_count = migrate_compress_threads();
1294 qemu_mutex_lock(&comp_done_lock);
1295 retry:
1296 for (idx = 0; idx < thread_count; idx++) {
1297 if (comp_param[idx].done) {
1298 comp_param[idx].done = false;
1299 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1300 qemu_mutex_lock(&comp_param[idx].mutex);
1301 set_compress_params(&comp_param[idx], block, offset);
1302 qemu_cond_signal(&comp_param[idx].cond);
1303 qemu_mutex_unlock(&comp_param[idx].mutex);
1304 pages = 1;
1305 update_compress_thread_counts(&comp_param[idx], bytes_xmit);
1306 break;
1311 * wait for a free thread if the user specifies 'compress-wait-thread',
1312 * otherwise we will post the page out in the main thread as a normal page.
1314 if (pages < 0 && wait) {
1315 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1316 goto retry;
1318 qemu_mutex_unlock(&comp_done_lock);
1320 return pages;
1324 * find_dirty_block: find the next dirty page and update any state
1325 * associated with the search process.
1327 * Returns true if a page is found
1329 * @rs: current RAM state
1330 * @pss: data about the state of the current dirty page scan
1331 * @again: set to false if the search has scanned the whole of RAM
1333 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1335 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1336 if (pss->complete_round && pss->block == rs->last_seen_block &&
1337 pss->page >= rs->last_page) {
1339 * We've been once around the RAM and haven't found anything.
1340 * Give up.
1342 *again = false;
1343 return false;
1345 if ((((ram_addr_t)pss->page) << TARGET_PAGE_BITS)
1346 >= pss->block->used_length) {
1347 /* Didn't find anything in this RAM Block */
1348 pss->page = 0;
1349 pss->block = QLIST_NEXT_RCU(pss->block, next);
1350 if (!pss->block) {
1352 * If memory migration starts over, we will meet a dirtied page
1353 * which may still exist in the compression threads' ring, so we
1354 * should flush the compressed data to make sure the new page
1355 * is not overwritten by the old one in the destination.
1357 * Also, if xbzrle is on, stop using the data compression at this
1358 * point. In theory, xbzrle can do better than compression.
1360 flush_compressed_data(rs);
1362 /* Hit the end of the list */
1363 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1364 /* Flag that we've looped */
1365 pss->complete_round = true;
1366 /* After the first round, enable XBZRLE. */
1367 if (migrate_use_xbzrle()) {
1368 rs->xbzrle_enabled = true;
1371 /* Didn't find anything this time, but try again on the new block */
1372 *again = true;
1373 return false;
1374 } else {
1375 /* Can go around again, but... */
1376 *again = true;
1377 /* We've found something so probably don't need to */
1378 return true;
1383 * unqueue_page: gets a page off the queue
1385 * Helper for 'get_queued_page' - gets a page off the queue
1387 * Returns the block of the page (or NULL if none available)
1389 * @rs: current RAM state
1390 * @offset: used to return the offset within the RAMBlock
1392 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1394 RAMBlock *block = NULL;
1396 if (QSIMPLEQ_EMPTY_ATOMIC(&rs->src_page_requests)) {
1397 return NULL;
1400 QEMU_LOCK_GUARD(&rs->src_page_req_mutex);
1401 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1402 struct RAMSrcPageRequest *entry =
1403 QSIMPLEQ_FIRST(&rs->src_page_requests);
1404 block = entry->rb;
1405 *offset = entry->offset;
1407 if (entry->len > TARGET_PAGE_SIZE) {
1408 entry->len -= TARGET_PAGE_SIZE;
1409 entry->offset += TARGET_PAGE_SIZE;
1410 } else {
1411 memory_region_unref(block->mr);
1412 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1413 g_free(entry);
1414 migration_consume_urgent_request();
1418 return block;
1421 #if defined(__linux__)
1423 * poll_fault_page: try to get next UFFD write fault page and, if pending fault
1424 * is found, return RAM block pointer and page offset
1426 * Returns pointer to the RAMBlock containing faulting page,
1427 * NULL if no write faults are pending
1429 * @rs: current RAM state
1430 * @offset: page offset from the beginning of the block
1432 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1434 struct uffd_msg uffd_msg;
1435 void *page_address;
1436 RAMBlock *block;
1437 int res;
1439 if (!migrate_background_snapshot()) {
1440 return NULL;
1443 res = uffd_read_events(rs->uffdio_fd, &uffd_msg, 1);
1444 if (res <= 0) {
1445 return NULL;
1448 page_address = (void *)(uintptr_t) uffd_msg.arg.pagefault.address;
1449 block = qemu_ram_block_from_host(page_address, false, offset);
1450 assert(block && (block->flags & RAM_UF_WRITEPROTECT) != 0);
1451 return block;
1455 * ram_save_release_protection: release UFFD write protection after
1456 * a range of pages has been saved
1458 * @rs: current RAM state
1459 * @pss: page-search-status structure
1460 * @start_page: index of the first page in the range relative to pss->block
1462 * Returns 0 on success, negative value in case of an error
1464 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1465 unsigned long start_page)
1467 int res = 0;
1469 /* Check if page is from UFFD-managed region. */
1470 if (pss->block->flags & RAM_UF_WRITEPROTECT) {
1471 void *page_address = pss->block->host + (start_page << TARGET_PAGE_BITS);
1472 uint64_t run_length = (pss->page - start_page + 1) << TARGET_PAGE_BITS;
1474 /* Flush async buffers before un-protect. */
1475 qemu_fflush(rs->f);
1476 /* Un-protect memory range. */
1477 res = uffd_change_protection(rs->uffdio_fd, page_address, run_length,
1478 false, false);
1481 return res;
1484 /* ram_write_tracking_available: check if kernel supports required UFFD features
1486 * Returns true if supports, false otherwise
1488 bool ram_write_tracking_available(void)
1490 uint64_t uffd_features;
1491 int res;
1493 res = uffd_query_features(&uffd_features);
1494 return (res == 0 &&
1495 (uffd_features & UFFD_FEATURE_PAGEFAULT_FLAG_WP) != 0);
1498 /* ram_write_tracking_compatible: check if guest configuration is
1499 * compatible with 'write-tracking'
1501 * Returns true if compatible, false otherwise
1503 bool ram_write_tracking_compatible(void)
1505 const uint64_t uffd_ioctls_mask = BIT(_UFFDIO_WRITEPROTECT);
1506 int uffd_fd;
1507 RAMBlock *block;
1508 bool ret = false;
1510 /* Open UFFD file descriptor */
1511 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, false);
1512 if (uffd_fd < 0) {
1513 return false;
1516 RCU_READ_LOCK_GUARD();
1518 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1519 uint64_t uffd_ioctls;
1521 /* Nothing to do with read-only and MMIO-writable regions */
1522 if (block->mr->readonly || block->mr->rom_device) {
1523 continue;
1525 /* Try to register block memory via UFFD-IO to track writes */
1526 if (uffd_register_memory(uffd_fd, block->host, block->max_length,
1527 UFFDIO_REGISTER_MODE_WP, &uffd_ioctls)) {
1528 goto out;
1530 if ((uffd_ioctls & uffd_ioctls_mask) != uffd_ioctls_mask) {
1531 goto out;
1534 ret = true;
1536 out:
1537 uffd_close_fd(uffd_fd);
1538 return ret;
1542 * ram_block_populate_pages: populate memory in the RAM block by reading
1543 * an integer from the beginning of each page.
1545 * Since it's solely used for the userfault_fd WP feature, here we just
1546 * hardcode the page size to qemu_real_host_page_size.
1548 * @block: RAM block to populate
1550 static void ram_block_populate_pages(RAMBlock *block)
1552 char *ptr = (char *) block->host;
1554 for (ram_addr_t offset = 0; offset < block->used_length;
1555 offset += qemu_real_host_page_size) {
1556 char tmp = *(ptr + offset);
1558 /* Don't optimize the read out */
1559 asm volatile("" : "+r" (tmp));
1564 * ram_write_tracking_prepare: prepare for UFFD-WP memory tracking
1566 void ram_write_tracking_prepare(void)
1568 RAMBlock *block;
1570 RCU_READ_LOCK_GUARD();
1572 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1573 /* Nothing to do with read-only and MMIO-writable regions */
1574 if (block->mr->readonly || block->mr->rom_device) {
1575 continue;
1579 * Populate pages of the RAM block before enabling userfault_fd
1580 * write protection.
1582 * This stage is required since ioctl(UFFDIO_WRITEPROTECT) with
1583 * UFFDIO_WRITEPROTECT_MODE_WP mode setting would silently skip
1584 * pages with pte_none() entries in page table.
1586 ram_block_populate_pages(block);
1591 * ram_write_tracking_start: start UFFD-WP memory tracking
1593 * Returns 0 for success or negative value in case of error
1595 int ram_write_tracking_start(void)
1597 int uffd_fd;
1598 RAMState *rs = ram_state;
1599 RAMBlock *block;
1601 /* Open UFFD file descriptor */
1602 uffd_fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
1603 if (uffd_fd < 0) {
1604 return uffd_fd;
1606 rs->uffdio_fd = uffd_fd;
1608 RCU_READ_LOCK_GUARD();
1610 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1611 /* Nothing to do with read-only and MMIO-writable regions */
1612 if (block->mr->readonly || block->mr->rom_device) {
1613 continue;
1616 /* Register block memory with UFFD to track writes */
1617 if (uffd_register_memory(rs->uffdio_fd, block->host,
1618 block->max_length, UFFDIO_REGISTER_MODE_WP, NULL)) {
1619 goto fail;
1621 /* Apply UFFD write protection to the block memory range */
1622 if (uffd_change_protection(rs->uffdio_fd, block->host,
1623 block->max_length, true, false)) {
1624 goto fail;
1626 block->flags |= RAM_UF_WRITEPROTECT;
1627 memory_region_ref(block->mr);
1629 trace_ram_write_tracking_ramblock_start(block->idstr, block->page_size,
1630 block->host, block->max_length);
1633 return 0;
1635 fail:
1636 error_report("ram_write_tracking_start() failed: restoring initial memory state");
1638 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1639 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1640 continue;
1643 * In case some memory block failed to be write-protected
1644 * remove protection and unregister all succeeded RAM blocks
1646 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1647 false, false);
1648 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1649 /* Cleanup flags and remove reference */
1650 block->flags &= ~RAM_UF_WRITEPROTECT;
1651 memory_region_unref(block->mr);
1654 uffd_close_fd(uffd_fd);
1655 rs->uffdio_fd = -1;
1656 return -1;
1660 * ram_write_tracking_stop: stop UFFD-WP memory tracking and remove protection
1662 void ram_write_tracking_stop(void)
1664 RAMState *rs = ram_state;
1665 RAMBlock *block;
1667 RCU_READ_LOCK_GUARD();
1669 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
1670 if ((block->flags & RAM_UF_WRITEPROTECT) == 0) {
1671 continue;
1673 /* Remove protection and unregister all affected RAM blocks */
1674 uffd_change_protection(rs->uffdio_fd, block->host, block->max_length,
1675 false, false);
1676 uffd_unregister_memory(rs->uffdio_fd, block->host, block->max_length);
1678 trace_ram_write_tracking_ramblock_stop(block->idstr, block->page_size,
1679 block->host, block->max_length);
1681 /* Cleanup flags and remove reference */
1682 block->flags &= ~RAM_UF_WRITEPROTECT;
1683 memory_region_unref(block->mr);
1686 /* Finally close UFFD file descriptor */
1687 uffd_close_fd(rs->uffdio_fd);
1688 rs->uffdio_fd = -1;
1691 #else
1692 /* No target OS support, stubs just fail or ignore */
1694 static RAMBlock *poll_fault_page(RAMState *rs, ram_addr_t *offset)
1696 (void) rs;
1697 (void) offset;
1699 return NULL;
1702 static int ram_save_release_protection(RAMState *rs, PageSearchStatus *pss,
1703 unsigned long start_page)
1705 (void) rs;
1706 (void) pss;
1707 (void) start_page;
1709 return 0;
1712 bool ram_write_tracking_available(void)
1714 return false;
1717 bool ram_write_tracking_compatible(void)
1719 assert(0);
1720 return false;
1723 int ram_write_tracking_start(void)
1725 assert(0);
1726 return -1;
1729 void ram_write_tracking_stop(void)
1731 assert(0);
1733 #endif /* defined(__linux__) */
1736 * get_queued_page: unqueue a page from the postcopy requests
1738 * Skips pages that are already sent (!dirty)
1740 * Returns true if a queued page is found
1742 * @rs: current RAM state
1743 * @pss: data about the state of the current dirty page scan
1745 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1747 RAMBlock *block;
1748 ram_addr_t offset;
1749 bool dirty;
1751 do {
1752 block = unqueue_page(rs, &offset);
1754 * We're sending this page, and since it's postcopy nothing else
1755 * will dirty it, and we must make sure it doesn't get sent again
1756 * even if this queue request was received after the background
1757 * search already sent it.
1759 if (block) {
1760 unsigned long page;
1762 page = offset >> TARGET_PAGE_BITS;
1763 dirty = test_bit(page, block->bmap);
1764 if (!dirty) {
1765 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1766 page);
1767 } else {
1768 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1772 } while (block && !dirty);
1774 if (!block) {
1776 * Poll write faults too if background snapshot is enabled; that's
1777 * when vCPUs may get blocked by the write protected pages.
1779 block = poll_fault_page(rs, &offset);
1782 if (block) {
1784 * We want the background search to continue from the queued page
1785 * since the guest is likely to want other pages near to the page
1786 * it just requested.
1788 pss->block = block;
1789 pss->page = offset >> TARGET_PAGE_BITS;
1792 * This unqueued page would break the "one round" check, even if
1793 * it is really rare.
1795 pss->complete_round = false;
1798 return !!block;
1802 * migration_page_queue_free: drop any remaining pages in the ram
1803 * request queue
1805 * It should be empty at the end anyway, but in error cases there may
1806 * be some left. In case any pages are left, we drop them.
1809 static void migration_page_queue_free(RAMState *rs)
1811 struct RAMSrcPageRequest *mspr, *next_mspr;
1812 /* This queue generally should be empty - but in the case of a failed
1813 * migration it might have some leftover entries.
1815 RCU_READ_LOCK_GUARD();
1816 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1817 memory_region_unref(mspr->rb->mr);
1818 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1819 g_free(mspr);
1824 * ram_save_queue_pages: queue the page for transmission
1826 * A request from postcopy destination for example.
1828 * Returns zero on success or negative on error
1830 * @rbname: Name of the RAMBlock of the request. NULL means the
1831 * same as the last one.
1832 * @start: starting address from the start of the RAMBlock
1833 * @len: length (in bytes) to send
1835 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1837 RAMBlock *ramblock;
1838 RAMState *rs = ram_state;
1840 ram_counters.postcopy_requests++;
1841 RCU_READ_LOCK_GUARD();
1843 if (!rbname) {
1844 /* Reuse last RAMBlock */
1845 ramblock = rs->last_req_rb;
1847 if (!ramblock) {
1849 * Shouldn't happen, we can't reuse the last RAMBlock if
1850 * it's the 1st request.
1852 error_report("ram_save_queue_pages no previous block");
1853 return -1;
1855 } else {
1856 ramblock = qemu_ram_block_by_name(rbname);
1858 if (!ramblock) {
1859 /* We shouldn't be asked for a non-existent RAMBlock */
1860 error_report("ram_save_queue_pages no block '%s'", rbname);
1861 return -1;
1863 rs->last_req_rb = ramblock;
1865 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1866 if (start + len > ramblock->used_length) {
1867 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1868 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1869 __func__, start, len, ramblock->used_length);
1870 return -1;
1873 struct RAMSrcPageRequest *new_entry =
1874 g_malloc0(sizeof(struct RAMSrcPageRequest));
1875 new_entry->rb = ramblock;
1876 new_entry->offset = start;
1877 new_entry->len = len;
1879 memory_region_ref(ramblock->mr);
1880 qemu_mutex_lock(&rs->src_page_req_mutex);
1881 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1882 migration_make_urgent_request();
1883 qemu_mutex_unlock(&rs->src_page_req_mutex);
1885 return 0;
1888 static bool save_page_use_compression(RAMState *rs)
1890 if (!migrate_use_compression()) {
1891 return false;
1895 * If xbzrle is enabled (e.g., after first round of migration), stop
1896 * using the data compression. In theory, xbzrle can do better than
1897 * compression.
1899 if (rs->xbzrle_enabled) {
1900 return false;
1903 return true;
1907 * try to compress the page before posting it out; return true if the page
1908 * has been properly handled by compression, otherwise it needs other
1909 * paths to handle it
1911 static bool save_compress_page(RAMState *rs, RAMBlock *block, ram_addr_t offset)
1913 if (!save_page_use_compression(rs)) {
1914 return false;
1918 * When starting the process of a new block, the first page of
1919 * the block should be sent out before other pages in the same
1920 * block, and all the pages in the last block should have been sent
1921 * out. Keeping this order is important, because the 'cont' flag
1922 * is used to avoid resending the block name.
1924 * We post the first page as a normal page as compression will take
1925 * much CPU resource.
1927 if (block != rs->last_sent_block) {
1928 flush_compressed_data(rs);
1929 return false;
1932 if (compress_page_with_multi_thread(rs, block, offset) > 0) {
1933 return true;
1936 compression_counters.busy++;
1937 return false;
1941 * ram_save_target_page: save one target page
1943 * Returns the number of pages written
1945 * @rs: current RAM state
1946 * @pss: data about the page we want to send
1947 * @last_stage: if we are at the completion stage
1949 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1950 bool last_stage)
1952 RAMBlock *block = pss->block;
1953 ram_addr_t offset = ((ram_addr_t)pss->page) << TARGET_PAGE_BITS;
1954 int res;
1956 if (control_save_page(rs, block, offset, &res)) {
1957 return res;
1960 if (save_compress_page(rs, block, offset)) {
1961 return 1;
1964 res = save_zero_page(rs, block, offset);
1965 if (res > 0) {
1966 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
1967 * page would be stale
1969 if (!save_page_use_compression(rs)) {
1970 XBZRLE_cache_lock();
1971 xbzrle_cache_zero_page(rs, block->offset + offset);
1972 XBZRLE_cache_unlock();
1974 ram_release_pages(block->idstr, offset, res);
1975 return res;
1979 * Do not use multifd for:
1980 * 1. Compression as the first page in the new block should be posted out
1981 * before sending the compressed page
1982 * 2. In postcopy as one whole host page should be placed
1984 if (!save_page_use_compression(rs) && migrate_use_multifd()
1985 && !migration_in_postcopy()) {
1986 return ram_save_multifd_page(rs, block, offset);
1989 return ram_save_page(rs, pss, last_stage);
1993 * ram_save_host_page: save a whole host page
1995 * Starting at *offset send pages up to the end of the current host
1996 * page. It's valid for the initial offset to point into the middle of
1997 * a host page in which case the remainder of the host page is sent.
1998 * Only dirty target pages are sent. Note that the host page size may
1999 * be a huge page for this block.
2000 * The saving stops at the boundary of the used_length of the block
2001 * if the RAMBlock isn't a multiple of the host page size.
2003 * Returns the number of pages written or negative on error
2005 * @rs: current RAM state
2007 * @pss: data about the page we want to send
2008 * @last_stage: if we are at the completion stage
2010 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
2011 bool last_stage)
2013 int tmppages, pages = 0;
2014 size_t pagesize_bits =
2015 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
2016 unsigned long start_page = pss->page;
2017 int res;
2019 if (ramblock_is_ignored(pss->block)) {
2020 error_report("block %s should not be migrated !", pss->block->idstr);
2021 return 0;
2024 do {
2025 /* Check whether the page is dirty and, if it is, send it */
2026 if (!migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
2027 pss->page++;
2028 continue;
2031 tmppages = ram_save_target_page(rs, pss, last_stage);
2032 if (tmppages < 0) {
2033 return tmppages;
2036 pages += tmppages;
2037 pss->page++;
2038 /* Allow rate limiting to happen in the middle of huge pages */
2039 migration_rate_limit();
2040 } while ((pss->page & (pagesize_bits - 1)) &&
2041 offset_in_ramblock(pss->block,
2042 ((ram_addr_t)pss->page) << TARGET_PAGE_BITS));
2043 /* The offset we leave with is the last one we looked at */
2044 pss->page--;
2046 res = ram_save_release_protection(rs, pss, start_page);
2047 return (res < 0 ? res : pages);
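/*
 * Worked example (assuming 4 KiB target pages): for a RAMBlock backed by
 * 2 MiB hugetlbfs pages, pagesize_bits is 2 MiB / 4 KiB = 512, so a single
 * call can send up to 512 dirty target pages before it stops at the host
 * page boundary (or at the block's used_length, whichever comes first).
 */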
2051 * ram_find_and_save_block: finds a dirty page and sends it to f
2053 * Called within an RCU critical section.
2055 * Returns the number of pages written where zero means no dirty pages,
2056 * or negative on error
2058 * @rs: current RAM state
2059 * @last_stage: if we are at the completion stage
2061 * On systems where host-page-size > target-page-size it will send all the
2062 * pages in a host page that are dirty.
2065 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
2067 PageSearchStatus pss;
2068 int pages = 0;
2069 bool again, found;
2071 /* No dirty page as there is zero RAM */
2072 if (!ram_bytes_total()) {
2073 return pages;
2076 pss.block = rs->last_seen_block;
2077 pss.page = rs->last_page;
2078 pss.complete_round = false;
2080 if (!pss.block) {
2081 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
2084 do {
2085 again = true;
2086 found = get_queued_page(rs, &pss);
2088 if (!found) {
2089 /* priority queue empty, so just search for something dirty */
2090 found = find_dirty_block(rs, &pss, &again);
2093 if (found) {
2094 pages = ram_save_host_page(rs, &pss, last_stage);
2096 } while (!pages && again);
2098 rs->last_seen_block = pss.block;
2099 rs->last_page = pss.page;
2101 return pages;
2104 void acct_update_position(QEMUFile *f, size_t size, bool zero)
2106 uint64_t pages = size / TARGET_PAGE_SIZE;
2108 if (zero) {
2109 ram_counters.duplicate += pages;
2110 } else {
2111 ram_counters.normal += pages;
2112 ram_counters.transferred += size;
2113 qemu_update_position(f, size);
2117 static uint64_t ram_bytes_total_common(bool count_ignored)
2119 RAMBlock *block;
2120 uint64_t total = 0;
2122 RCU_READ_LOCK_GUARD();
2124 if (count_ignored) {
2125 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2126 total += block->used_length;
2128 } else {
2129 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2130 total += block->used_length;
2133 return total;
2136 uint64_t ram_bytes_total(void)
2138 return ram_bytes_total_common(false);
2141 static void xbzrle_load_setup(void)
2143 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 static void xbzrle_load_cleanup(void)
2148 g_free(XBZRLE.decoded_buf);
2149 XBZRLE.decoded_buf = NULL;
2152 static void ram_state_cleanup(RAMState **rsp)
2154 if (*rsp) {
2155 migration_page_queue_free(*rsp);
2156 qemu_mutex_destroy(&(*rsp)->bitmap_mutex);
2157 qemu_mutex_destroy(&(*rsp)->src_page_req_mutex);
2158 g_free(*rsp);
2159 *rsp = NULL;
2163 static void xbzrle_cleanup(void)
2165 XBZRLE_cache_lock();
2166 if (XBZRLE.cache) {
2167 cache_fini(XBZRLE.cache);
2168 g_free(XBZRLE.encoded_buf);
2169 g_free(XBZRLE.current_buf);
2170 g_free(XBZRLE.zero_target_page);
2171 XBZRLE.cache = NULL;
2172 XBZRLE.encoded_buf = NULL;
2173 XBZRLE.current_buf = NULL;
2174 XBZRLE.zero_target_page = NULL;
2176 XBZRLE_cache_unlock();
2179 static void ram_save_cleanup(void *opaque)
2181 RAMState **rsp = opaque;
2182 RAMBlock *block;
2184 /* We don't use dirty log with background snapshots */
2185 if (!migrate_background_snapshot()) {
2186 /* the caller must hold the iothread lock or be in a bh, so there is
2187 * no write race against the migration bitmap
2189 memory_global_dirty_log_stop();
2192 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2193 g_free(block->clear_bmap);
2194 block->clear_bmap = NULL;
2195 g_free(block->bmap);
2196 block->bmap = NULL;
2199 xbzrle_cleanup();
2200 compress_threads_save_cleanup();
2201 ram_state_cleanup(rsp);
2204 static void ram_state_reset(RAMState *rs)
2206 rs->last_seen_block = NULL;
2207 rs->last_sent_block = NULL;
2208 rs->last_page = 0;
2209 rs->last_version = ram_list.version;
2210 rs->xbzrle_enabled = false;
2213 #define MAX_WAIT 50 /* ms, half buffered_file limit */
2216 * 'expected' is the value you expect the bitmap mostly to be full
2217 * of; it won't bother printing lines that are all this value.
2218 * If 'todump' is null the migration bitmap is dumped.
2220 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
2221 unsigned long pages)
2223 int64_t cur;
2224 int64_t linelen = 128;
2225 char linebuf[129];
2227 for (cur = 0; cur < pages; cur += linelen) {
2228 int64_t curb;
2229 bool found = false;
2231 * Last line; catch the case where the line length
2232 * is longer than remaining ram
2234 if (cur + linelen > pages) {
2235 linelen = pages - cur;
2237 for (curb = 0; curb < linelen; curb++) {
2238 bool thisbit = test_bit(cur + curb, todump);
2239 linebuf[curb] = thisbit ? '1' : '.';
2240 found = found || (thisbit != expected);
2242 if (found) {
2243 linebuf[curb] = '\0';
2244 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
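/*
 * Example output line (the leading hex number is the index of the first
 * page on the line; '1' marks a set bit, '.' a clear one):
 *   0x00000080 : 11.........1....
 * Lines whose bits all equal 'expected' are not printed.
 */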
2249 /* **** functions for postcopy ***** */
2251 void ram_postcopy_migrated_memory_release(MigrationState *ms)
2253 struct RAMBlock *block;
2255 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2256 unsigned long *bitmap = block->bmap;
2257 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
2258 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
2260 while (run_start < range) {
2261 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
2262 ram_discard_range(block->idstr,
2263 ((ram_addr_t)run_start) << TARGET_PAGE_BITS,
2264 ((ram_addr_t)(run_end - run_start))
2265 << TARGET_PAGE_BITS);
2266 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
2272 * postcopy_send_discard_bm_ram: discard a RAMBlock
2274 * Returns zero on success
2276 * Callback from postcopy_each_ram_send_discard for each RAMBlock
2278 * @ms: current migration state
2279 * @block: RAMBlock to discard
2281 static int postcopy_send_discard_bm_ram(MigrationState *ms, RAMBlock *block)
2283 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
2284 unsigned long current;
2285 unsigned long *bitmap = block->bmap;
2287 for (current = 0; current < end; ) {
2288 unsigned long one = find_next_bit(bitmap, end, current);
2289 unsigned long zero, discard_length;
2291 if (one >= end) {
2292 break;
2295 zero = find_next_zero_bit(bitmap, end, one + 1);
2297 if (zero >= end) {
2298 discard_length = end - one;
2299 } else {
2300 discard_length = zero - one;
2302 postcopy_discard_send_range(ms, one, discard_length);
2303 current = one + discard_length;
2306 return 0;
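/*
 * Example: for a dirty bitmap that looks like ...0111100..., the run of set
 * bits starting at page index 'one' with length 'discard_length' results in
 * a single postcopy_discard_send_range(ms, one, discard_length) call; the
 * loop then continues scanning from the end of that run.
 */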
2310 * postcopy_each_ram_send_discard: discard all RAMBlocks
2312 * Returns 0 for success or negative for error
2314 * Utility for the outgoing postcopy code.
2315 * Calls postcopy_send_discard_bm_ram for each RAMBlock
2316 * passing it bitmap indexes and name.
2317 * (qemu_ram_foreach_block ends up passing unscaled lengths
2318 * which would mean postcopy code would have to deal with target page)
2320 * @ms: current migration state
2322 static int postcopy_each_ram_send_discard(MigrationState *ms)
2324 struct RAMBlock *block;
2325 int ret;
2327 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2328 postcopy_discard_send_init(ms, block->idstr);
2331 * Postcopy sends chunks of bitmap over the wire, but it
2332 * just needs indexes at this point, avoids it having
2333 * target page specific code.
2335 ret = postcopy_send_discard_bm_ram(ms, block);
2336 postcopy_discard_send_finish(ms);
2337 if (ret) {
2338 return ret;
2342 return 0;
2346 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
2348 * Helper for postcopy_chunk_hostpages; it's called twice to
2349 * canonicalize the two bitmaps, which are similar, but one is
2350 * inverted.
2352 * Postcopy requires that all target pages in a hostpage are dirty or
2353 * clean, not a mix. This function canonicalizes the bitmaps.
2355 * @ms: current migration state
2356 * @block: block that contains the page we want to canonicalize
2358 static void postcopy_chunk_hostpages_pass(MigrationState *ms, RAMBlock *block)
2360 RAMState *rs = ram_state;
2361 unsigned long *bitmap = block->bmap;
2362 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
2363 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
2364 unsigned long run_start;
2366 if (block->page_size == TARGET_PAGE_SIZE) {
2367 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
2368 return;
2371 /* Find a dirty page */
2372 run_start = find_next_bit(bitmap, pages, 0);
2374 while (run_start < pages) {
2377 * If the start of this run of pages is in the middle of a host
2378 * page, then we need to fixup this host page.
2380 if (QEMU_IS_ALIGNED(run_start, host_ratio)) {
2381 /* Find the end of this run */
2382 run_start = find_next_zero_bit(bitmap, pages, run_start + 1);
2384 * If the end isn't at the start of a host page, then the
2385 * run doesn't finish at the end of a host page
2386 * and we need to discard.
2390 if (!QEMU_IS_ALIGNED(run_start, host_ratio)) {
2391 unsigned long page;
2392 unsigned long fixup_start_addr = QEMU_ALIGN_DOWN(run_start,
2393 host_ratio);
2394 run_start = QEMU_ALIGN_UP(run_start, host_ratio);
2396 /* Clean up the bitmap */
2397 for (page = fixup_start_addr;
2398 page < fixup_start_addr + host_ratio; page++) {
2400 * Remark them as dirty, updating the count for any pages
2401 * that weren't previously dirty.
2403 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
2407 /* Find the next dirty page for the next iteration */
2408 run_start = find_next_bit(bitmap, pages, run_start);
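/*
 * Example (assuming 4 KiB target pages on a 2 MiB hugepage block, so
 * host_ratio == 512): if a dirty run begins or ends partway through a
 * hugepage, every target page of that hugepage is re-marked dirty above,
 * so host pages end up either fully dirty or fully clean.
 */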
2413 * postcopy_chunk_hostpages: discard any partially sent host page
2415 * Utility for the outgoing postcopy code.
2417 * Discard any partially sent host-page size chunks, mark any partially
2418 * dirty host-page size chunks as all dirty. In this case the host-page
2419 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
2421 * Returns zero on success
2423 * @ms: current migration state
2424 * @block: block we want to work with
2426 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
2428 postcopy_discard_send_init(ms, block->idstr);
2431 * Ensure that all partially dirty host pages are made fully dirty.
2433 postcopy_chunk_hostpages_pass(ms, block);
2435 postcopy_discard_send_finish(ms);
2436 return 0;
2440 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
2442 * Returns zero on success
2444 * Transmit the set of pages to be discarded after precopy to the target;
2445 * these are pages that:
2446 * a) Have been previously transmitted but are now dirty again
2447 * b) Have never been transmitted; this ensures that
2448 * any pages on the destination that have been mapped by background
2449 * tasks get discarded (transparent huge pages are the specific concern)
2450 * Hopefully this is pretty sparse
2452 * @ms: current migration state
2454 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
2456 RAMState *rs = ram_state;
2457 RAMBlock *block;
2458 int ret;
2460 RCU_READ_LOCK_GUARD();
2462 /* This should be our last sync, the src is now paused */
2463 migration_bitmap_sync(rs);
2465 /* Easiest way to make sure we don't resume in the middle of a host-page */
2466 rs->last_seen_block = NULL;
2467 rs->last_sent_block = NULL;
2468 rs->last_page = 0;
2470 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2471 /* Deal with TPS != HPS and huge pages */
2472 ret = postcopy_chunk_hostpages(ms, block);
2473 if (ret) {
2474 return ret;
2477 #ifdef DEBUG_POSTCOPY
2478 ram_debug_dump_bitmap(block->bmap, true,
2479 block->used_length >> TARGET_PAGE_BITS);
2480 #endif
2482 trace_ram_postcopy_send_discard_bitmap();
2484 return postcopy_each_ram_send_discard(ms);
2488 * ram_discard_range: discard dirtied pages at the beginning of postcopy
2490 * Returns zero on success
2492 * @rbname: name of the RAMBlock of the request. NULL means the
2493 * same as the last one.
2494 * @start: RAMBlock starting page
2495 * @length: RAMBlock size
2497 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
2499 trace_ram_discard_range(rbname, start, length);
2501 RCU_READ_LOCK_GUARD();
2502 RAMBlock *rb = qemu_ram_block_by_name(rbname);
2504 if (!rb) {
2505 error_report("ram_discard_range: Failed to find block '%s'", rbname);
2506 return -1;
2510 * On source VM, we don't need to update the received bitmap since
2511 * we don't even have one.
2513 if (rb->receivedmap) {
2514 bitmap_clear(rb->receivedmap, start >> qemu_target_page_bits(),
2515 length >> qemu_target_page_bits());
2518 return ram_block_discard_range(rb, start, length);
2522 * For every allocation, we will try not to crash the VM if the
2523 * allocation fails.
2525 static int xbzrle_init(void)
2527 Error *local_err = NULL;
2529 if (!migrate_use_xbzrle()) {
2530 return 0;
2533 XBZRLE_cache_lock();
2535 XBZRLE.zero_target_page = g_try_malloc0(TARGET_PAGE_SIZE);
2536 if (!XBZRLE.zero_target_page) {
2537 error_report("%s: Error allocating zero page", __func__);
2538 goto err_out;
2541 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size(),
2542 TARGET_PAGE_SIZE, &local_err);
2543 if (!XBZRLE.cache) {
2544 error_report_err(local_err);
2545 goto free_zero_page;
2548 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2549 if (!XBZRLE.encoded_buf) {
2550 error_report("%s: Error allocating encoded_buf", __func__);
2551 goto free_cache;
2554 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2555 if (!XBZRLE.current_buf) {
2556 error_report("%s: Error allocating current_buf", __func__);
2557 goto free_encoded_buf;
2560 /* We are all good */
2561 XBZRLE_cache_unlock();
2562 return 0;
2564 free_encoded_buf:
2565 g_free(XBZRLE.encoded_buf);
2566 XBZRLE.encoded_buf = NULL;
2567 free_cache:
2568 cache_fini(XBZRLE.cache);
2569 XBZRLE.cache = NULL;
2570 free_zero_page:
2571 g_free(XBZRLE.zero_target_page);
2572 XBZRLE.zero_target_page = NULL;
2573 err_out:
2574 XBZRLE_cache_unlock();
2575 return -ENOMEM;
2578 static int ram_state_init(RAMState **rsp)
2580 *rsp = g_try_new0(RAMState, 1);
2582 if (!*rsp) {
2583 error_report("%s: Init ramstate fail", __func__);
2584 return -1;
2587 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2588 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2589 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2592 * Count the total number of pages used by ram blocks not including any
2593 * gaps due to alignment or unplugs.
2594 * This must match the initial values of the dirty bitmap.
2596 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2597 ram_state_reset(*rsp);
2599 return 0;
2602 static void ram_list_init_bitmaps(void)
2604 MigrationState *ms = migrate_get_current();
2605 RAMBlock *block;
2606 unsigned long pages;
2607 uint8_t shift;
2609 /* Skip setting bitmap if there is no RAM */
2610 if (ram_bytes_total()) {
2611 shift = ms->clear_bitmap_shift;
2612 if (shift > CLEAR_BITMAP_SHIFT_MAX) {
2613 error_report("clear_bitmap_shift (%u) too big, using "
2614 "max value (%u)", shift, CLEAR_BITMAP_SHIFT_MAX);
2615 shift = CLEAR_BITMAP_SHIFT_MAX;
2616 } else if (shift < CLEAR_BITMAP_SHIFT_MIN) {
2617 error_report("clear_bitmap_shift (%u) too small, using "
2618 "min value (%u)", shift, CLEAR_BITMAP_SHIFT_MIN);
2619 shift = CLEAR_BITMAP_SHIFT_MIN;
2622 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2623 pages = block->max_length >> TARGET_PAGE_BITS;
2625 * The initial dirty bitmap for migration must be set with all
2626 * ones to make sure we'll migrate every guest RAM page to the
2627 * destination.
2628 * Here we set RAMBlock.bmap all to 1 because when we restart
2629 * migration after a failed migration, ram_list.
2630 * dirty_memory[DIRTY_MEMORY_MIGRATION] doesn't include the whole
2631 * guest memory.
2633 block->bmap = bitmap_new(pages);
2634 bitmap_set(block->bmap, 0, pages);
2635 block->clear_bmap_shift = shift;
2636 block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
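/*
 * Sizing example: with 4 KiB target pages and clear_bmap_shift = 18, one
 * clear_bmap bit covers 2^18 target pages, i.e. 1 GiB of guest RAM, so
 * dirty-log clearing can be batched in 1 GiB chunks.
 */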
2641 static void ram_init_bitmaps(RAMState *rs)
2643 /* For memory_global_dirty_log_start below. */
2644 qemu_mutex_lock_iothread();
2645 qemu_mutex_lock_ramlist();
2647 WITH_RCU_READ_LOCK_GUARD() {
2648 ram_list_init_bitmaps();
2649 /* We don't use dirty log with background snapshots */
2650 if (!migrate_background_snapshot()) {
2651 memory_global_dirty_log_start();
2652 migration_bitmap_sync_precopy(rs);
2655 qemu_mutex_unlock_ramlist();
2656 qemu_mutex_unlock_iothread();
2659 static int ram_init_all(RAMState **rsp)
2661 if (ram_state_init(rsp)) {
2662 return -1;
2665 if (xbzrle_init()) {
2666 ram_state_cleanup(rsp);
2667 return -1;
2670 ram_init_bitmaps(*rsp);
2672 return 0;
2675 static void ram_state_resume_prepare(RAMState *rs, QEMUFile *out)
2677 RAMBlock *block;
2678 uint64_t pages = 0;
2681 * Postcopy is not using xbzrle/compression, so no need for that.
2682 * Also, since the source is already halted, we don't need to care
2683 * about dirty page logging either.
2686 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
2687 pages += bitmap_count_one(block->bmap,
2688 block->used_length >> TARGET_PAGE_BITS);
2691 /* This may not be aligned with current bitmaps. Recalculate. */
2692 rs->migration_dirty_pages = pages;
2694 ram_state_reset(rs);
2696 /* Update RAMState cache of output QEMUFile */
2697 rs->f = out;
2699 trace_ram_state_resume_prepare(pages);
2703 * This function clears bits of the free pages reported by the caller from the
2704 * migration dirty bitmap. @addr is the host address corresponding to the
2705 * start of the contiguous guest free pages, and @len is the total bytes of
2706 * those pages.
2708 void qemu_guest_free_page_hint(void *addr, size_t len)
2710 RAMBlock *block;
2711 ram_addr_t offset;
2712 size_t used_len, start, npages;
2713 MigrationState *s = migrate_get_current();
2715 /* This function is currently expected to be used during live migration */
2716 if (!migration_is_setup_or_active(s->state)) {
2717 return;
2720 for (; len > 0; len -= used_len, addr += used_len) {
2721 block = qemu_ram_block_from_host(addr, false, &offset);
2722 if (unlikely(!block || offset >= block->used_length)) {
2724 * The implementation might not support RAMBlock resize during
2725 * live migration, but it could happen in theory with future
2726 * updates. So we add a check here to capture that case.
2728 error_report_once("%s unexpected error", __func__);
2729 return;
2732 if (len <= block->used_length - offset) {
2733 used_len = len;
2734 } else {
2735 used_len = block->used_length - offset;
2738 start = offset >> TARGET_PAGE_BITS;
2739 npages = used_len >> TARGET_PAGE_BITS;
2741 qemu_mutex_lock(&ram_state->bitmap_mutex);
2742 ram_state->migration_dirty_pages -=
2743 bitmap_count_one_with_offset(block->bmap, start, npages);
2744 bitmap_clear(block->bmap, start, npages);
2745 qemu_mutex_unlock(&ram_state->bitmap_mutex);
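/*
 * Usage sketch (hypothetical caller): a free-page-hinting device such as
 * virtio-balloon can report a run of pages the guest considers free, e.g.
 *
 *     qemu_guest_free_page_hint(host_addr, 32 * TARGET_PAGE_SIZE);
 *
 * which clears 32 bits in the block's dirty bitmap so those pages are
 * skipped by the current migration round.
 */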
2750 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
2751 * a long-running RCU critical section. When RCU reclaims in the code
2752 * start to become numerous, it will be necessary to reduce the
2753 * granularity of these critical sections.
2757 * ram_save_setup: Setup RAM for migration
2759 * Returns zero to indicate success and negative for error
2761 * @f: QEMUFile where to send the data
2762 * @opaque: RAMState pointer
2764 static int ram_save_setup(QEMUFile *f, void *opaque)
2766 RAMState **rsp = opaque;
2767 RAMBlock *block;
2769 if (compress_threads_save_setup()) {
2770 return -1;
2773 /* migration has already setup the bitmap, reuse it. */
2774 if (!migration_in_colo_state()) {
2775 if (ram_init_all(rsp) != 0) {
2776 compress_threads_save_cleanup();
2777 return -1;
2780 (*rsp)->f = f;
2782 WITH_RCU_READ_LOCK_GUARD() {
2783 qemu_put_be64(f, ram_bytes_total_common(true) | RAM_SAVE_FLAG_MEM_SIZE);
2785 RAMBLOCK_FOREACH_MIGRATABLE(block) {
2786 qemu_put_byte(f, strlen(block->idstr));
2787 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2788 qemu_put_be64(f, block->used_length);
2789 if (migrate_postcopy_ram() && block->page_size !=
2790 qemu_host_page_size) {
2791 qemu_put_be64(f, block->page_size);
2793 if (migrate_ignore_shared()) {
2794 qemu_put_be64(f, block->mr->addr);
2799 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2800 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2802 multifd_send_sync_main(f);
2803 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2804 qemu_fflush(f);
2806 return 0;
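/*
 * Sketch of the setup-stage stream emitted above:
 *   be64  total ram bytes, OR'ed with RAM_SAVE_FLAG_MEM_SIZE
 *   per migratable block:
 *     u8    strlen(idstr)
 *     bytes idstr (not NUL-terminated)
 *     be64  used_length
 *     be64  page_size   (only with postcopy-ram and non-host page size)
 *     be64  mr->addr    (only with ignore-shared)
 *   be64  RAM_SAVE_FLAG_EOS
 */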
2810 * ram_save_iterate: iterative stage for migration
2812 * Returns zero to indicate success and negative for error
2814 * @f: QEMUFile where to send the data
2815 * @opaque: RAMState pointer
2817 static int ram_save_iterate(QEMUFile *f, void *opaque)
2819 RAMState **temp = opaque;
2820 RAMState *rs = *temp;
2821 int ret = 0;
2822 int i;
2823 int64_t t0;
2824 int done = 0;
2826 if (blk_mig_bulk_active()) {
2827 /* Avoid transferring ram during the bulk phase of block migration as
2828 * the bulk phase will usually take a long time and transferring
2829 * ram updates during that time is pointless. */
2830 goto out;
2833 WITH_RCU_READ_LOCK_GUARD() {
2834 if (ram_list.version != rs->last_version) {
2835 ram_state_reset(rs);
2838 /* Read version before ram_list.blocks */
2839 smp_rmb();
2841 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2843 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2844 i = 0;
2845 while ((ret = qemu_file_rate_limit(f)) == 0 ||
2846 !QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
2847 int pages;
2849 if (qemu_file_get_error(f)) {
2850 break;
2853 pages = ram_find_and_save_block(rs, false);
2854 /* no more pages to send */
2855 if (pages == 0) {
2856 done = 1;
2857 break;
2860 if (pages < 0) {
2861 qemu_file_set_error(f, pages);
2862 break;
2865 rs->target_page_count += pages;
2868 * During postcopy, it is necessary to make sure one whole host
2869 * page is sent in one chunk.
2871 if (migrate_postcopy_ram()) {
2872 flush_compressed_data(rs);
2876 * We want to check in the 1st loop, just in case it was the 1st
2877 * time and we had to sync the dirty bitmap.
2878 * qemu_clock_get_ns() is a bit expensive, so we only check once
2879 * every few iterations
2881 if ((i & 63) == 0) {
2882 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) /
2883 1000000;
2884 if (t1 > MAX_WAIT) {
2885 trace_ram_save_iterate_big_wait(t1, i);
2886 break;
2889 i++;
2894 * Must occur before EOS (or any QEMUFile operation)
2895 * because of RDMA protocol.
2897 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2899 out:
2900 if (ret >= 0
2901 && migration_is_setup_or_active(migrate_get_current()->state)) {
2902 multifd_send_sync_main(rs->f);
2903 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2904 qemu_fflush(f);
2905 ram_counters.transferred += 8;
2907 ret = qemu_file_get_error(f);
2909 if (ret < 0) {
2910 return ret;
2913 return done;
2917 * ram_save_complete: function called to send the remaining amount of ram
2919 * Returns zero to indicate success or negative on error
2921 * Called with iothread lock
2923 * @f: QEMUFile where to send the data
2924 * @opaque: RAMState pointer
2926 static int ram_save_complete(QEMUFile *f, void *opaque)
2928 RAMState **temp = opaque;
2929 RAMState *rs = *temp;
2930 int ret = 0;
2932 WITH_RCU_READ_LOCK_GUARD() {
2933 if (!migration_in_postcopy()) {
2934 migration_bitmap_sync_precopy(rs);
2937 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2939 /* try transferring iterative blocks of memory */
2941 /* flush all remaining blocks regardless of rate limiting */
2942 while (true) {
2943 int pages;
2945 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2946 /* no more blocks to send */
2947 if (pages == 0) {
2948 break;
2950 if (pages < 0) {
2951 ret = pages;
2952 break;
2956 flush_compressed_data(rs);
2957 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2960 if (ret >= 0) {
2961 multifd_send_sync_main(rs->f);
2962 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2963 qemu_fflush(f);
2966 return ret;
2969 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2970 uint64_t *res_precopy_only,
2971 uint64_t *res_compatible,
2972 uint64_t *res_postcopy_only)
2974 RAMState **temp = opaque;
2975 RAMState *rs = *temp;
2976 uint64_t remaining_size;
2978 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2980 if (!migration_in_postcopy() &&
2981 remaining_size < max_size) {
2982 qemu_mutex_lock_iothread();
2983 WITH_RCU_READ_LOCK_GUARD() {
2984 migration_bitmap_sync_precopy(rs);
2986 qemu_mutex_unlock_iothread();
2987 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2990 if (migrate_postcopy_ram()) {
2991 /* We can do postcopy, and all the data is postcopiable */
2992 *res_compatible += remaining_size;
2993 } else {
2994 *res_precopy_only += remaining_size;
2998 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
3000 unsigned int xh_len;
3001 int xh_flags;
3002 uint8_t *loaded_data;
3004 /* extract RLE header */
3005 xh_flags = qemu_get_byte(f);
3006 xh_len = qemu_get_be16(f);
3008 if (xh_flags != ENCODING_FLAG_XBZRLE) {
3009 error_report("Failed to load XBZRLE page - wrong compression!");
3010 return -1;
3013 if (xh_len > TARGET_PAGE_SIZE) {
3014 error_report("Failed to load XBZRLE page - len overflow!");
3015 return -1;
3017 loaded_data = XBZRLE.decoded_buf;
3018 /* load data and decode */
3019 /* it can change loaded_data to point to an internal buffer */
3020 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
3022 /* decode RLE */
3023 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
3024 TARGET_PAGE_SIZE) == -1) {
3025 error_report("Failed to load XBZRLE page - decode error!");
3026 return -1;
3029 return 0;
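/*
 * Wire format consumed above (sketch):
 *   u8    xh_flags - must be ENCODING_FLAG_XBZRLE
 *   be16  xh_len   - encoded length, at most TARGET_PAGE_SIZE
 *   bytes xh_len bytes of XBZRLE data, decoded over the existing
 *         contents of 'host'
 */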
3033 * ram_block_from_stream: read a RAMBlock id from the migration stream
3035 * Must be called from within a rcu critical section.
3037 * Returns a pointer from within the RCU-protected ram_list.
3039 * @f: QEMUFile where to read the data from
3040 * @flags: Page flags (mostly to see if it's a continuation of previous block)
3042 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
3044 static RAMBlock *block;
3045 char id[256];
3046 uint8_t len;
3048 if (flags & RAM_SAVE_FLAG_CONTINUE) {
3049 if (!block) {
3050 error_report("Ack, bad migration stream!");
3051 return NULL;
3053 return block;
3056 len = qemu_get_byte(f);
3057 qemu_get_buffer(f, (uint8_t *)id, len);
3058 id[len] = 0;
3060 block = qemu_ram_block_by_name(id);
3061 if (!block) {
3062 error_report("Can't find block %s", id);
3063 return NULL;
3066 if (ramblock_is_ignored(block)) {
3067 error_report("block %s should not be migrated !", id);
3068 return NULL;
3071 return block;
3074 static inline void *host_from_ram_block_offset(RAMBlock *block,
3075 ram_addr_t offset)
3077 if (!offset_in_ramblock(block, offset)) {
3078 return NULL;
3081 return block->host + offset;
3084 static inline void *colo_cache_from_block_offset(RAMBlock *block,
3085 ram_addr_t offset, bool record_bitmap)
3087 if (!offset_in_ramblock(block, offset)) {
3088 return NULL;
3090 if (!block->colo_cache) {
3091 error_report("%s: colo_cache is NULL in block :%s",
3092 __func__, block->idstr);
3093 return NULL;
3097 * During a colo checkpoint, we need a bitmap of these migrated pages.
3098 * It helps us decide which pages in the ram cache should be flushed
3099 * into the VM's RAM later.
3101 if (record_bitmap &&
3102 !test_and_set_bit(offset >> TARGET_PAGE_BITS, block->bmap)) {
3103 ram_state->migration_dirty_pages++;
3105 return block->colo_cache + offset;
3109 * ram_handle_compressed: handle the zero page case
3111 * If a page (or a whole RDMA chunk) has been
3112 * determined to be zero, then zap it.
3114 * @host: host address for the zero page
3115 * @ch: what the page is filled from. We only support zero
3116 * @size: size of the zero page
3118 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
3120 if (ch != 0 || !is_zero_range(host, size)) {
3121 memset(host, ch, size);
3125 /* return the size after decompression, or negative value on error */
3126 static int
3127 qemu_uncompress_data(z_stream *stream, uint8_t *dest, size_t dest_len,
3128 const uint8_t *source, size_t source_len)
3130 int err;
3132 err = inflateReset(stream);
3133 if (err != Z_OK) {
3134 return -1;
3137 stream->avail_in = source_len;
3138 stream->next_in = (uint8_t *)source;
3139 stream->avail_out = dest_len;
3140 stream->next_out = dest;
3142 err = inflate(stream, Z_NO_FLUSH);
3143 if (err != Z_STREAM_END) {
3144 return -1;
3147 return stream->total_out;
3150 static void *do_data_decompress(void *opaque)
3152 DecompressParam *param = opaque;
3153 unsigned long pagesize;
3154 uint8_t *des;
3155 int len, ret;
3157 qemu_mutex_lock(&param->mutex);
3158 while (!param->quit) {
3159 if (param->des) {
3160 des = param->des;
3161 len = param->len;
3162 param->des = 0;
3163 qemu_mutex_unlock(&param->mutex);
3165 pagesize = TARGET_PAGE_SIZE;
3167 ret = qemu_uncompress_data(&param->stream, des, pagesize,
3168 param->compbuf, len);
3169 if (ret < 0 && migrate_get_current()->decompress_error_check) {
3170 error_report("decompress data failed");
3171 qemu_file_set_error(decomp_file, ret);
3174 qemu_mutex_lock(&decomp_done_lock);
3175 param->done = true;
3176 qemu_cond_signal(&decomp_done_cond);
3177 qemu_mutex_unlock(&decomp_done_lock);
3179 qemu_mutex_lock(&param->mutex);
3180 } else {
3181 qemu_cond_wait(&param->cond, &param->mutex);
3184 qemu_mutex_unlock(&param->mutex);
3186 return NULL;
3189 static int wait_for_decompress_done(void)
3191 int idx, thread_count;
3193 if (!migrate_use_compression()) {
3194 return 0;
3197 thread_count = migrate_decompress_threads();
3198 qemu_mutex_lock(&decomp_done_lock);
3199 for (idx = 0; idx < thread_count; idx++) {
3200 while (!decomp_param[idx].done) {
3201 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3204 qemu_mutex_unlock(&decomp_done_lock);
3205 return qemu_file_get_error(decomp_file);
3208 static void compress_threads_load_cleanup(void)
3210 int i, thread_count;
3212 if (!migrate_use_compression()) {
3213 return;
3215 thread_count = migrate_decompress_threads();
3216 for (i = 0; i < thread_count; i++) {
3218 * we use it as an indicator which shows whether the thread is
3219 * properly initialized or not
3221 if (!decomp_param[i].compbuf) {
3222 break;
3225 qemu_mutex_lock(&decomp_param[i].mutex);
3226 decomp_param[i].quit = true;
3227 qemu_cond_signal(&decomp_param[i].cond);
3228 qemu_mutex_unlock(&decomp_param[i].mutex);
3230 for (i = 0; i < thread_count; i++) {
3231 if (!decomp_param[i].compbuf) {
3232 break;
3235 qemu_thread_join(decompress_threads + i);
3236 qemu_mutex_destroy(&decomp_param[i].mutex);
3237 qemu_cond_destroy(&decomp_param[i].cond);
3238 inflateEnd(&decomp_param[i].stream);
3239 g_free(decomp_param[i].compbuf);
3240 decomp_param[i].compbuf = NULL;
3242 g_free(decompress_threads);
3243 g_free(decomp_param);
3244 decompress_threads = NULL;
3245 decomp_param = NULL;
3246 decomp_file = NULL;
3249 static int compress_threads_load_setup(QEMUFile *f)
3251 int i, thread_count;
3253 if (!migrate_use_compression()) {
3254 return 0;
3257 thread_count = migrate_decompress_threads();
3258 decompress_threads = g_new0(QemuThread, thread_count);
3259 decomp_param = g_new0(DecompressParam, thread_count);
3260 qemu_mutex_init(&decomp_done_lock);
3261 qemu_cond_init(&decomp_done_cond);
3262 decomp_file = f;
3263 for (i = 0; i < thread_count; i++) {
3264 if (inflateInit(&decomp_param[i].stream) != Z_OK) {
3265 goto exit;
3268 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
3269 qemu_mutex_init(&decomp_param[i].mutex);
3270 qemu_cond_init(&decomp_param[i].cond);
3271 decomp_param[i].done = true;
3272 decomp_param[i].quit = false;
3273 qemu_thread_create(decompress_threads + i, "decompress",
3274 do_data_decompress, decomp_param + i,
3275 QEMU_THREAD_JOINABLE);
3277 return 0;
3278 exit:
3279 compress_threads_load_cleanup();
3280 return -1;
3283 static void decompress_data_with_multi_threads(QEMUFile *f,
3284 void *host, int len)
3286 int idx, thread_count;
3288 thread_count = migrate_decompress_threads();
3289 QEMU_LOCK_GUARD(&decomp_done_lock);
3290 while (true) {
3291 for (idx = 0; idx < thread_count; idx++) {
3292 if (decomp_param[idx].done) {
3293 decomp_param[idx].done = false;
3294 qemu_mutex_lock(&decomp_param[idx].mutex);
3295 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
3296 decomp_param[idx].des = host;
3297 decomp_param[idx].len = len;
3298 qemu_cond_signal(&decomp_param[idx].cond);
3299 qemu_mutex_unlock(&decomp_param[idx].mutex);
3300 break;
3303 if (idx < thread_count) {
3304 break;
3305 } else {
3306 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
3311 static void colo_init_ram_state(void)
3313 ram_state_init(&ram_state);
3317 * colo cache: this is for the secondary VM, we cache the whole
3318 * memory of the secondary VM; the global lock needs to be
3319 * held to call this helper.
3321 int colo_init_ram_cache(void)
3323 RAMBlock *block;
3325 WITH_RCU_READ_LOCK_GUARD() {
3326 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3327 block->colo_cache = qemu_anon_ram_alloc(block->used_length,
3328 NULL,
3329 false);
3330 if (!block->colo_cache) {
3331 error_report("%s: Can't alloc memory for COLO cache of block %s,"
3332 "size 0x" RAM_ADDR_FMT, __func__, block->idstr,
3333 block->used_length);
3334 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3335 if (block->colo_cache) {
3336 qemu_anon_ram_free(block->colo_cache, block->used_length);
3337 block->colo_cache = NULL;
3340 return -errno;
3346 * Record the dirty pages that are sent by the PVM; we use this dirty bitmap
3347 * to decide which pages in the cache should be flushed into the SVM's RAM. Here
3348 * we use the same name 'ram_bitmap' as for migration.
3350 if (ram_bytes_total()) {
3351 RAMBlock *block;
3353 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3354 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
3355 block->bmap = bitmap_new(pages);
3359 colo_init_ram_state();
3360 return 0;
3363 /* TODO: duplicated with ram_init_bitmaps */
3364 void colo_incoming_start_dirty_log(void)
3366 RAMBlock *block = NULL;
3367 /* For memory_global_dirty_log_start below. */
3368 qemu_mutex_lock_iothread();
3369 qemu_mutex_lock_ramlist();
3371 memory_global_dirty_log_sync();
3372 WITH_RCU_READ_LOCK_GUARD() {
3373 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3374 ramblock_sync_dirty_bitmap(ram_state, block);
3375 /* Discard this dirty bitmap record */
3376 bitmap_zero(block->bmap, block->max_length >> TARGET_PAGE_BITS);
3378 memory_global_dirty_log_start();
3380 ram_state->migration_dirty_pages = 0;
3381 qemu_mutex_unlock_ramlist();
3382 qemu_mutex_unlock_iothread();
3385 /* The global lock needs to be held to call this helper */
3386 void colo_release_ram_cache(void)
3388 RAMBlock *block;
3390 memory_global_dirty_log_stop();
3391 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3392 g_free(block->bmap);
3393 block->bmap = NULL;
3396 WITH_RCU_READ_LOCK_GUARD() {
3397 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3398 if (block->colo_cache) {
3399 qemu_anon_ram_free(block->colo_cache, block->used_length);
3400 block->colo_cache = NULL;
3404 ram_state_cleanup(&ram_state);
3408 * ram_load_setup: Setup RAM for migration incoming side
3410 * Returns zero to indicate success and negative for error
3412 * @f: QEMUFile where to receive the data
3413 * @opaque: RAMState pointer
3415 static int ram_load_setup(QEMUFile *f, void *opaque)
3417 if (compress_threads_load_setup(f)) {
3418 return -1;
3421 xbzrle_load_setup();
3422 ramblock_recv_map_init();
3424 return 0;
3427 static int ram_load_cleanup(void *opaque)
3429 RAMBlock *rb;
3431 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3432 qemu_ram_block_writeback(rb);
3435 xbzrle_load_cleanup();
3436 compress_threads_load_cleanup();
3438 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3439 g_free(rb->receivedmap);
3440 rb->receivedmap = NULL;
3443 return 0;
3447 * ram_postcopy_incoming_init: allocate postcopy data structures
3449 * Returns 0 for success and negative if there was one error
3451 * @mis: current migration incoming state
3453 * Allocate data structures etc needed by incoming migration with
3454 * postcopy-ram. postcopy-ram's similarly named
3455 * postcopy_ram_incoming_init does the work.
3457 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
3459 return postcopy_ram_incoming_init(mis);
3463 * ram_load_postcopy: load a page in postcopy case
3465 * Returns 0 for success or -errno in case of error
3467 * Called in postcopy mode by ram_load().
3468 * rcu_read_lock is taken prior to this being called.
3470 * @f: QEMUFile where to send the data
3472 static int ram_load_postcopy(QEMUFile *f)
3474 int flags = 0, ret = 0;
3475 bool place_needed = false;
3476 bool matches_target_page_size = false;
3477 MigrationIncomingState *mis = migration_incoming_get_current();
3478 /* Temporary page that is later 'placed' */
3479 void *postcopy_host_page = mis->postcopy_tmp_page;
3480 void *this_host = NULL;
3481 bool all_zero = true;
3482 int target_pages = 0;
3484 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3485 ram_addr_t addr;
3486 void *host = NULL;
3487 void *page_buffer = NULL;
3488 void *place_source = NULL;
3489 RAMBlock *block = NULL;
3490 uint8_t ch;
3491 int len;
3493 addr = qemu_get_be64(f);
3496 * If there is a qemu file error, we should stop here, since "addr"
3497 * may then be invalid
3499 ret = qemu_file_get_error(f);
3500 if (ret) {
3501 break;
3504 flags = addr & ~TARGET_PAGE_MASK;
3505 addr &= TARGET_PAGE_MASK;
3507 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
3508 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3509 RAM_SAVE_FLAG_COMPRESS_PAGE)) {
3510 block = ram_block_from_stream(f, flags);
3512 host = host_from_ram_block_offset(block, addr);
3513 if (!host) {
3514 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3515 ret = -EINVAL;
3516 break;
3518 target_pages++;
3519 matches_target_page_size = block->page_size == TARGET_PAGE_SIZE;
3521 * Postcopy requires that we place whole host pages atomically;
3522 * these may be huge pages for RAMBlocks that are backed by
3523 * hugetlbfs.
3524 * To make it atomic, the data is read into a temporary page
3525 * that's moved into place later.
3526 * The migration protocol uses, possibly smaller, target pages;
3527 * however the source ensures it always sends all the components
3528 * of a host page in one chunk.
3530 page_buffer = postcopy_host_page +
3531 ((uintptr_t)host & (block->page_size - 1));
3532 if (target_pages == 1) {
3533 this_host = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3534 block->page_size);
3535 } else {
3536 /* not the 1st TP within the HP */
3537 if (QEMU_ALIGN_DOWN((uintptr_t)host, block->page_size) !=
3538 (uintptr_t)this_host) {
3539 error_report("Non-same host page %p/%p",
3540 host, this_host);
3541 ret = -EINVAL;
3542 break;
3547 * If it's the last part of a host page then we place the host
3548 * page
3550 if (target_pages == (block->page_size / TARGET_PAGE_SIZE)) {
3551 place_needed = true;
3553 place_source = postcopy_host_page;
3556 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3557 case RAM_SAVE_FLAG_ZERO:
3558 ch = qemu_get_byte(f);
3560 * We can skip setting page_buffer when
3561 * this is a zero page and (block->page_size == TARGET_PAGE_SIZE).
3563 if (ch || !matches_target_page_size) {
3564 memset(page_buffer, ch, TARGET_PAGE_SIZE);
3566 if (ch) {
3567 all_zero = false;
3569 break;
3571 case RAM_SAVE_FLAG_PAGE:
3572 all_zero = false;
3573 if (!matches_target_page_size) {
3574 /* For huge pages, we always use temporary buffer */
3575 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
3576 } else {
3578 * For small pages that match the target page size, we
3579 * avoid the qemu_file copy. Instead we directly use
3580 * the buffer of QEMUFile to place the page. Note: we
3581 * cannot do any QEMUFile operation before using that
3582 * buffer to make sure the buffer is valid when
3583 * placing the page.
3585 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
3586 TARGET_PAGE_SIZE);
3588 break;
3589 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3590 all_zero = false;
3591 len = qemu_get_be32(f);
3592 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3593 error_report("Invalid compressed data length: %d", len);
3594 ret = -EINVAL;
3595 break;
3597 decompress_data_with_multi_threads(f, page_buffer, len);
3598 break;
3600 case RAM_SAVE_FLAG_EOS:
3601 /* normal exit */
3602 multifd_recv_sync_main();
3603 break;
3604 default:
3605 error_report("Unknown combination of migration flags: 0x%x"
3606 " (postcopy mode)", flags);
3607 ret = -EINVAL;
3608 break;
3611 /* Got the whole host page, wait for decompress before placing. */
3612 if (place_needed) {
3613 ret |= wait_for_decompress_done();
3616 /* Detect for any possible file errors */
3617 if (!ret && qemu_file_get_error(f)) {
3618 ret = qemu_file_get_error(f);
3621 if (!ret && place_needed) {
3622 /* This gets called at the last target page in the host page */
3623 void *place_dest = (void *)QEMU_ALIGN_DOWN((uintptr_t)host,
3624 block->page_size);
3626 if (all_zero) {
3627 ret = postcopy_place_page_zero(mis, place_dest,
3628 block);
3629 } else {
3630 ret = postcopy_place_page(mis, place_dest,
3631 place_source, block);
3633 place_needed = false;
3634 target_pages = 0;
3635 /* Assume we have a zero page until we detect something different */
3636 all_zero = true;
3640 return ret;
3643 static bool postcopy_is_advised(void)
3645 PostcopyState ps = postcopy_state_get();
3646 return ps >= POSTCOPY_INCOMING_ADVISE && ps < POSTCOPY_INCOMING_END;
3649 static bool postcopy_is_running(void)
3651 PostcopyState ps = postcopy_state_get();
3652 return ps >= POSTCOPY_INCOMING_LISTENING && ps < POSTCOPY_INCOMING_END;
3656 * Flush the content of the RAM cache into the SVM's memory.
3657 * Only flush the pages that have been dirtied by the PVM, the SVM, or both.
3659 void colo_flush_ram_cache(void)
3661 RAMBlock *block = NULL;
3662 void *dst_host;
3663 void *src_host;
3664 unsigned long offset = 0;
3666 memory_global_dirty_log_sync();
3667 WITH_RCU_READ_LOCK_GUARD() {
3668 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3669 ramblock_sync_dirty_bitmap(ram_state, block);
3673 trace_colo_flush_ram_cache_begin(ram_state->migration_dirty_pages);
3674 WITH_RCU_READ_LOCK_GUARD() {
3675 block = QLIST_FIRST_RCU(&ram_list.blocks);
3677 while (block) {
3678 offset = migration_bitmap_find_dirty(ram_state, block, offset);
3680 if (((ram_addr_t)offset) << TARGET_PAGE_BITS
3681 >= block->used_length) {
3682 offset = 0;
3683 block = QLIST_NEXT_RCU(block, next);
3684 } else {
3685 migration_bitmap_clear_dirty(ram_state, block, offset);
3686 dst_host = block->host
3687 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3688 src_host = block->colo_cache
3689 + (((ram_addr_t)offset) << TARGET_PAGE_BITS);
3690 memcpy(dst_host, src_host, TARGET_PAGE_SIZE);
3694 trace_colo_flush_ram_cache_end();
3698 * ram_load_precopy: load pages in precopy case
3700 * Returns 0 for success or -errno in case of error
3702 * Called in precopy mode by ram_load().
3703 * rcu_read_lock is taken prior to this being called.
3705 * @f: QEMUFile where to send the data
3707 static int ram_load_precopy(QEMUFile *f)
3709 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
3710 /* ADVISE comes earlier; it shows the source has the postcopy capability on */
3711 bool postcopy_advised = postcopy_is_advised();
3712 if (!migrate_use_compression()) {
3713 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
3716 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
3717 ram_addr_t addr, total_ram_bytes;
3718 void *host = NULL, *host_bak = NULL;
3719 uint8_t ch;
3722 * Yield periodically to let the main loop run, but an iteration of
3723 * the main loop is expensive, so only do it every so many iterations
3725 if ((i & 32767) == 0 && qemu_in_coroutine()) {
3726 aio_co_schedule(qemu_get_current_aio_context(),
3727 qemu_coroutine_self());
3728 qemu_coroutine_yield();
3730 i++;
3732 addr = qemu_get_be64(f);
3733 flags = addr & ~TARGET_PAGE_MASK;
3734 addr &= TARGET_PAGE_MASK;
3736 if (flags & invalid_flags) {
3737 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
3738 error_report("Received an unexpected compressed page");
3741 ret = -EINVAL;
3742 break;
3745 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
3746 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
3747 RAMBlock *block = ram_block_from_stream(f, flags);
3749 host = host_from_ram_block_offset(block, addr);
3751 * After going into the COLO stage, we should not load the page
3752 * into the SVM's memory directly; we put it into colo_cache first.
3753 * NOTE: We need to keep a copy of the SVM's ram in colo_cache.
3754 * Previously, we copied all this memory in the COLO preparation stage,
3755 * during which we needed to stop the VM, which is a time-consuming process.
3756 * Here we optimize it with a trick: back up every page while in the
3757 * migration process while COLO is enabled. Although this affects the
3758 * speed of the migration, it obviously reduces the downtime compared to
3759 * backing up all of the SVM's memory in the COLO preparation stage.
3761 if (migration_incoming_colo_enabled()) {
3762 if (migration_incoming_in_colo_state()) {
3763 /* In COLO stage, put all pages into cache temporarily */
3764 host = colo_cache_from_block_offset(block, addr, true);
3765 } else {
3767 * In migration stage but before COLO stage,
3768 * Put all pages into both cache and SVM's memory.
3770 host_bak = colo_cache_from_block_offset(block, addr, false);
3773 if (!host) {
3774 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
3775 ret = -EINVAL;
3776 break;
3778 if (!migration_incoming_in_colo_state()) {
3779 ramblock_recv_bitmap_set(block, host);
3782 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
3785 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
3786 case RAM_SAVE_FLAG_MEM_SIZE:
3787 /* Synchronize RAM block list */
3788 total_ram_bytes = addr;
3789 while (!ret && total_ram_bytes) {
3790 RAMBlock *block;
3791 char id[256];
3792 ram_addr_t length;
3794 len = qemu_get_byte(f);
3795 qemu_get_buffer(f, (uint8_t *)id, len);
3796 id[len] = 0;
3797 length = qemu_get_be64(f);
3799 block = qemu_ram_block_by_name(id);
3800 if (block && !qemu_ram_is_migratable(block)) {
3801 error_report("block %s should not be migrated !", id);
3802 ret = -EINVAL;
3803 } else if (block) {
3804 if (length != block->used_length) {
3805 Error *local_err = NULL;
3807 ret = qemu_ram_resize(block, length,
3808 &local_err);
3809 if (local_err) {
3810 error_report_err(local_err);
3813 /* For postcopy we need to check hugepage sizes match */
3814 if (postcopy_advised && migrate_postcopy_ram() &&
3815 block->page_size != qemu_host_page_size) {
3816 uint64_t remote_page_size = qemu_get_be64(f);
3817 if (remote_page_size != block->page_size) {
3818 error_report("Mismatched RAM page size %s "
3819 "(local) %zd != %" PRId64,
3820 id, block->page_size,
3821 remote_page_size);
3822 ret = -EINVAL;
3825 if (migrate_ignore_shared()) {
3826 hwaddr addr = qemu_get_be64(f);
3827 if (ramblock_is_ignored(block) &&
3828 block->mr->addr != addr) {
3829 error_report("Mismatched GPAs for block %s "
3830 "%" PRId64 "!= %" PRId64,
3831 id, (uint64_t)addr,
3832 (uint64_t)block->mr->addr);
3833 ret = -EINVAL;
3836 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
3837 block->idstr);
3838 } else {
3839 error_report("Unknown ramblock \"%s\", cannot "
3840 "accept migration", id);
3841 ret = -EINVAL;
3844 total_ram_bytes -= length;
3846 break;
3848 case RAM_SAVE_FLAG_ZERO:
3849 ch = qemu_get_byte(f);
3850 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
3851 break;
3853 case RAM_SAVE_FLAG_PAGE:
3854 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
3855 break;
3857 case RAM_SAVE_FLAG_COMPRESS_PAGE:
3858 len = qemu_get_be32(f);
3859 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
3860 error_report("Invalid compressed data length: %d", len);
3861 ret = -EINVAL;
3862 break;
3864 decompress_data_with_multi_threads(f, host, len);
3865 break;
3867 case RAM_SAVE_FLAG_XBZRLE:
3868 if (load_xbzrle(f, addr, host) < 0) {
3869 error_report("Failed to decompress XBZRLE page at "
3870 RAM_ADDR_FMT, addr);
3871 ret = -EINVAL;
3872 break;
3874 break;
3875 case RAM_SAVE_FLAG_EOS:
3876 /* normal exit */
3877 multifd_recv_sync_main();
3878 break;
3879 default:
3880 if (flags & RAM_SAVE_FLAG_HOOK) {
3881 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
3882 } else {
3883 error_report("Unknown combination of migration flags: 0x%x",
3884 flags);
3885 ret = -EINVAL;
3888 if (!ret) {
3889 ret = qemu_file_get_error(f);
3891 if (!ret && host_bak) {
3892 memcpy(host_bak, host, TARGET_PAGE_SIZE);
3896 ret |= wait_for_decompress_done();
3897 return ret;
3900 static int ram_load(QEMUFile *f, void *opaque, int version_id)
3902 int ret = 0;
3903 static uint64_t seq_iter;
3905 * If the system is running in postcopy mode, page inserts into host memory
3906 * must be atomic
3908 bool postcopy_running = postcopy_is_running();
3910 seq_iter++;
3912 if (version_id != 4) {
3913 return -EINVAL;
3917 * This RCU critical section can be very long running.
3918 * When RCU reclaims in the code start to become numerous,
3919 * it will be necessary to reduce the granularity of this
3920 * critical section.
3922 WITH_RCU_READ_LOCK_GUARD() {
3923 if (postcopy_running) {
3924 ret = ram_load_postcopy(f);
3925 } else {
3926 ret = ram_load_precopy(f);
3929 trace_ram_load_complete(ret, seq_iter);
3931 return ret;
3934 static bool ram_has_postcopy(void *opaque)
3936 RAMBlock *rb;
3937 RAMBLOCK_FOREACH_NOT_IGNORED(rb) {
3938 if (ramblock_is_pmem(rb)) {
3939 info_report("Block: %s, host: %p is a nvdimm memory, postcopy "
3940 "is not supported now!", rb->idstr, rb->host);
3941 return false;
3945 return migrate_postcopy_ram();
3948 /* Sync all the dirty bitmap with destination VM. */
3949 static int ram_dirty_bitmap_sync_all(MigrationState *s, RAMState *rs)
3951 RAMBlock *block;
3952 QEMUFile *file = s->to_dst_file;
3953 int ramblock_count = 0;
3955 trace_ram_dirty_bitmap_sync_start();
3957 RAMBLOCK_FOREACH_NOT_IGNORED(block) {
3958 qemu_savevm_send_recv_bitmap(file, block->idstr);
3959 trace_ram_dirty_bitmap_request(block->idstr);
3960 ramblock_count++;
3963 trace_ram_dirty_bitmap_sync_wait();
3965 /* Wait until all the ramblocks' dirty bitmaps are synced */
3966 while (ramblock_count--) {
3967 qemu_sem_wait(&s->rp_state.rp_sem);
3970 trace_ram_dirty_bitmap_sync_complete();
3972 return 0;
3975 static void ram_dirty_bitmap_reload_notify(MigrationState *s)
3977 qemu_sem_post(&s->rp_state.rp_sem);
3981 * Read the received bitmap and invert it to form the initial dirty bitmap.
3982 * This is only used when postcopy migration is paused but wants
3983 * to resume from a middle point.
3985 int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *block)
3987 int ret = -EINVAL;
3988 QEMUFile *file = s->rp_state.from_dst_file;
3989 unsigned long *le_bitmap, nbits = block->used_length >> TARGET_PAGE_BITS;
3990 uint64_t local_size = DIV_ROUND_UP(nbits, 8);
3991 uint64_t size, end_mark;
3993 trace_ram_dirty_bitmap_reload_begin(block->idstr);
3995 if (s->state != MIGRATION_STATUS_POSTCOPY_RECOVER) {
3996 error_report("%s: incorrect state %s", __func__,
3997 MigrationStatus_str(s->state));
3998 return -EINVAL;
4002 * Note: see comments in ramblock_recv_bitmap_send() on why we
4003 * need the endianness conversion, and the paddings.
4005 local_size = ROUND_UP(local_size, 8);
4007 /* Add paddings */
4008 le_bitmap = bitmap_new(nbits + BITS_PER_LONG);
4010 size = qemu_get_be64(file);
4012 /* The size of the bitmap should match our ramblock */
4013 if (size != local_size) {
4014 error_report("%s: ramblock '%s' bitmap size mismatch "
4015 "(0x%"PRIx64" != 0x%"PRIx64")", __func__,
4016 block->idstr, size, local_size);
4017 ret = -EINVAL;
4018 goto out;
4021 size = qemu_get_buffer(file, (uint8_t *)le_bitmap, local_size);
4022 end_mark = qemu_get_be64(file);
4024 ret = qemu_file_get_error(file);
4025 if (ret || size != local_size) {
4026 error_report("%s: read bitmap failed for ramblock '%s': %d"
4027 " (size 0x%"PRIx64", got: 0x%"PRIx64")",
4028 __func__, block->idstr, ret, local_size, size);
4029 ret = -EIO;
4030 goto out;
4033 if (end_mark != RAMBLOCK_RECV_BITMAP_ENDING) {
4034 error_report("%s: ramblock '%s' end mark incorrect: 0x%"PRIx64,
4035 __func__, block->idstr, end_mark);
4036 ret = -EINVAL;
4037 goto out;
4041 * Endianness conversion. We are during postcopy (though paused).
4042 * The dirty bitmap won't change. We can directly modify it.
4044 bitmap_from_le(block->bmap, le_bitmap, nbits);
4047 * What we received is the "received bitmap". Invert it to use as the
4048 * initial dirty bitmap for this ramblock.
4050 bitmap_complement(block->bmap, block->bmap, nbits);
4052 trace_ram_dirty_bitmap_reload_complete(block->idstr);
4055 * We succeeded in syncing the bitmap for the current ramblock. If this is
4056 * the last one to sync, we need to notify the main send thread.
4058 ram_dirty_bitmap_reload_notify(s);
4060 ret = 0;
4061 out:
4062 g_free(le_bitmap);
4063 return ret;
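/*
 * Wire format parsed above (sketch):
 *   be64  size of the bitmap in bytes (must equal our local_size)
 *   bytes 'size' bytes of little-endian bitmap, padded to 8 bytes
 *   be64  end mark, must be RAMBLOCK_RECV_BITMAP_ENDING
 * The received bitmap is then complemented so that already-received pages
 * become clean and everything else is dirty again for the resumed postcopy.
 */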
4066 static int ram_resume_prepare(MigrationState *s, void *opaque)
4068 RAMState *rs = *(RAMState **)opaque;
4069 int ret;
4071 ret = ram_dirty_bitmap_sync_all(s, rs);
4072 if (ret) {
4073 return ret;
4076 ram_state_resume_prepare(rs, s->to_dst_file);
4078 return 0;
4081 static SaveVMHandlers savevm_ram_handlers = {
4082 .save_setup = ram_save_setup,
4083 .save_live_iterate = ram_save_iterate,
4084 .save_live_complete_postcopy = ram_save_complete,
4085 .save_live_complete_precopy = ram_save_complete,
4086 .has_postcopy = ram_has_postcopy,
4087 .save_live_pending = ram_save_pending,
4088 .load_state = ram_load,
4089 .save_cleanup = ram_save_cleanup,
4090 .load_setup = ram_load_setup,
4091 .load_cleanup = ram_load_cleanup,
4092 .resume_prepare = ram_resume_prepare,
4095 void ram_mig_init(void)
4097 qemu_mutex_init(&XBZRLE.lock);
4098 register_savevm_live("ram", 0, 4, &savevm_ram_handlers, &ram_state);