migration/ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "trace.h"
46 #include "exec/ram_addr.h"
47 #include "qemu/rcu_queue.h"
48 #include "migration/colo.h"
50 /***********************************************************/
51 /* ram save/restore */
53 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS, it
54 * worked for pages that were filled with the same char. We switched
55 * it to only search for the zero value. And to avoid confusion with
56 * RAM_SAVE_FLAG_COMPRESS_PAGE just rename it.
59 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
60 #define RAM_SAVE_FLAG_ZERO 0x02
61 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
62 #define RAM_SAVE_FLAG_PAGE 0x08
63 #define RAM_SAVE_FLAG_EOS 0x10
64 #define RAM_SAVE_FLAG_CONTINUE 0x20
65 #define RAM_SAVE_FLAG_XBZRLE 0x40
66 /* 0x80 is reserved in migration.h start with 0x100 next */
67 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
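/* These flag values are OR'ed into the low bits of the 64-bit page offset
 * sent on the wire; offsets are target-page aligned, so the low bits are
 * free to carry them (see save_page_header() below).
 */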
69 static inline bool is_zero_range(uint8_t *p, uint64_t size)
71 return buffer_is_zero(p, size);
74 XBZRLECacheStats xbzrle_counters;
76 /* struct contains XBZRLE cache and a static page
77 used by the compression */
78 static struct {
79 /* buffer used for XBZRLE encoding */
80 uint8_t *encoded_buf;
81 /* buffer for storing page content */
82 uint8_t *current_buf;
83 /* Cache for XBZRLE, Protected by lock. */
84 PageCache *cache;
85 QemuMutex lock;
86 /* it will store a page full of zeros */
87 uint8_t *zero_target_page;
88 } XBZRLE;
90 /* buffer used for XBZRLE decoding */
91 static uint8_t *xbzrle_decoded_buf;
93 static void XBZRLE_cache_lock(void)
95 if (migrate_use_xbzrle())
96 qemu_mutex_lock(&XBZRLE.lock);
99 static void XBZRLE_cache_unlock(void)
101 if (migrate_use_xbzrle())
102 qemu_mutex_unlock(&XBZRLE.lock);
106 * xbzrle_cache_resize: resize the xbzrle cache
108 * This function is called from qmp_migrate_set_cache_size in main
109 * thread, possibly while a migration is in progress. A running
110 * migration may be using the cache and might finish during this call,
111 * hence changes to the cache are protected by XBZRLE.lock().
113 * Returns the new cache size, rounded down to a power of two, or negative in case of error.
115 * @new_size: new cache size
117 int64_t xbzrle_cache_resize(int64_t new_size)
119 PageCache *new_cache;
120 int64_t ret;
122 if (new_size < TARGET_PAGE_SIZE) {
123 return -1;
126 XBZRLE_cache_lock();
128 if (XBZRLE.cache != NULL) {
129 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
130 goto out_new_size;
132 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
133 TARGET_PAGE_SIZE);
134 if (!new_cache) {
135 error_report("Error creating cache");
136 ret = -1;
137 goto out;
140 cache_fini(XBZRLE.cache);
141 XBZRLE.cache = new_cache;
144 out_new_size:
145 ret = pow2floor(new_size);
146 out:
147 XBZRLE_cache_unlock();
148 return ret;
152 * An outstanding page request, on the source, having been received
153 * and queued
155 struct RAMSrcPageRequest {
156 RAMBlock *rb;
157 hwaddr offset;
158 hwaddr len;
160 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
163 /* State of RAM for migration */
164 struct RAMState {
165 /* QEMUFile used for this migration */
166 QEMUFile *f;
167 /* Last block that we have visited searching for dirty pages */
168 RAMBlock *last_seen_block;
169 /* Last block from where we have sent data */
170 RAMBlock *last_sent_block;
171 /* Last dirty target page we have sent */
172 ram_addr_t last_page;
173 /* last ram version we have seen */
174 uint32_t last_version;
175 /* We are in the first round */
176 bool ram_bulk_stage;
177 /* How many times we have dirty too many pages */
178 int dirty_rate_high_cnt;
179 /* these variables are used for bitmap sync */
180 /* last time we did a full bitmap_sync */
181 int64_t time_last_bitmap_sync;
182 /* bytes transferred at start_time */
183 uint64_t bytes_xfer_prev;
184 /* number of dirty pages since start_time */
185 uint64_t num_dirty_pages_period;
186 /* xbzrle misses since the beginning of the period */
187 uint64_t xbzrle_cache_miss_prev;
188 /* number of iterations at the beginning of period */
189 uint64_t iterations_prev;
190 /* Iterations since start */
191 uint64_t iterations;
192 /* protects modification of the bitmap */
193 uint64_t migration_dirty_pages;
194 /* number of dirty bits in the bitmap */
195 QemuMutex bitmap_mutex;
196 /* The RAMBlock used in the last src_page_requests */
197 RAMBlock *last_req_rb;
198 /* Queue of outstanding page requests from the destination */
199 QemuMutex src_page_req_mutex;
200 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
202 typedef struct RAMState RAMState;
204 static RAMState *ram_state;
206 uint64_t ram_bytes_remaining(void)
208 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
211 MigrationStats ram_counters;
213 /* used by the search for pages to send */
214 struct PageSearchStatus {
215 /* Current block being searched */
216 RAMBlock *block;
217 /* Current page to search from */
218 unsigned long page;
219 /* Set once we wrap around */
220 bool complete_round;
222 typedef struct PageSearchStatus PageSearchStatus;
224 struct CompressParam {
225 bool done;
226 bool quit;
227 QEMUFile *file;
228 QemuMutex mutex;
229 QemuCond cond;
230 RAMBlock *block;
231 ram_addr_t offset;
233 typedef struct CompressParam CompressParam;
235 struct DecompressParam {
236 bool done;
237 bool quit;
238 QemuMutex mutex;
239 QemuCond cond;
240 void *des;
241 uint8_t *compbuf;
242 int len;
244 typedef struct DecompressParam DecompressParam;
246 static CompressParam *comp_param;
247 static QemuThread *compress_threads;
248 /* comp_done_cond is used to wake up the migration thread when
249 * one of the compression threads has finished the compression.
250 * comp_done_lock is used to co-work with comp_done_cond.
252 static QemuMutex comp_done_lock;
253 static QemuCond comp_done_cond;
254 /* The empty QEMUFileOps will be used by file in CompressParam */
255 static const QEMUFileOps empty_ops = { };
257 static DecompressParam *decomp_param;
258 static QemuThread *decompress_threads;
259 static QemuMutex decomp_done_lock;
260 static QemuCond decomp_done_cond;
262 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
263 ram_addr_t offset);
265 static void *do_data_compress(void *opaque)
267 CompressParam *param = opaque;
268 RAMBlock *block;
269 ram_addr_t offset;
271 qemu_mutex_lock(&param->mutex);
272 while (!param->quit) {
273 if (param->block) {
274 block = param->block;
275 offset = param->offset;
276 param->block = NULL;
277 qemu_mutex_unlock(&param->mutex);
279 do_compress_ram_page(param->file, block, offset);
281 qemu_mutex_lock(&comp_done_lock);
282 param->done = true;
283 qemu_cond_signal(&comp_done_cond);
284 qemu_mutex_unlock(&comp_done_lock);
286 qemu_mutex_lock(&param->mutex);
287 } else {
288 qemu_cond_wait(&param->cond, &param->mutex);
291 qemu_mutex_unlock(&param->mutex);
293 return NULL;
296 static inline void terminate_compression_threads(void)
298 int idx, thread_count;
300 thread_count = migrate_compress_threads();
302 for (idx = 0; idx < thread_count; idx++) {
303 qemu_mutex_lock(&comp_param[idx].mutex);
304 comp_param[idx].quit = true;
305 qemu_cond_signal(&comp_param[idx].cond);
306 qemu_mutex_unlock(&comp_param[idx].mutex);
310 void migrate_compress_threads_join(void)
312 int i, thread_count;
314 if (!migrate_use_compression()) {
315 return;
317 terminate_compression_threads();
318 thread_count = migrate_compress_threads();
319 for (i = 0; i < thread_count; i++) {
320 qemu_thread_join(compress_threads + i);
321 qemu_fclose(comp_param[i].file);
322 qemu_mutex_destroy(&comp_param[i].mutex);
323 qemu_cond_destroy(&comp_param[i].cond);
325 qemu_mutex_destroy(&comp_done_lock);
326 qemu_cond_destroy(&comp_done_cond);
327 g_free(compress_threads);
328 g_free(comp_param);
329 compress_threads = NULL;
330 comp_param = NULL;
333 void migrate_compress_threads_create(void)
335 int i, thread_count;
337 if (!migrate_use_compression()) {
338 return;
340 thread_count = migrate_compress_threads();
341 compress_threads = g_new0(QemuThread, thread_count);
342 comp_param = g_new0(CompressParam, thread_count);
343 qemu_cond_init(&comp_done_cond);
344 qemu_mutex_init(&comp_done_lock);
345 for (i = 0; i < thread_count; i++) {
346 /* comp_param[i].file is just used as a dummy buffer to save data,
347 * set its ops to empty.
349 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
350 comp_param[i].done = true;
351 comp_param[i].quit = false;
352 qemu_mutex_init(&comp_param[i].mutex);
353 qemu_cond_init(&comp_param[i].cond);
354 qemu_thread_create(compress_threads + i, "compress",
355 do_data_compress, comp_param + i,
356 QEMU_THREAD_JOINABLE);
361 * save_page_header: write page header to wire
363 * If this is the 1st block, it also writes the block identification
365 * Returns the number of bytes written
367 * @f: QEMUFile where to send the data
368 * @block: block that contains the page we want to send
369 * @offset: offset inside the block for the page
370 * in the lower bits, it contains flags
372 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
373 ram_addr_t offset)
375 size_t size, len;
377 if (block == rs->last_sent_block) {
378 offset |= RAM_SAVE_FLAG_CONTINUE;
380 qemu_put_be64(f, offset);
381 size = 8;
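/* First page sent from this block: also send the block id string so the
 * destination can look the RAMBlock up by name. */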
383 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
384 len = strlen(block->idstr);
385 qemu_put_byte(f, len);
386 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
387 size += 1 + len;
388 rs->last_sent_block = block;
390 return size;
394 * mig_throttle_guest_down: throttle down the guest
396 * Reduce amount of guest cpu execution to hopefully slow down memory
397 * writes. If guest dirty memory rate is reduced below the rate at
398 * which we can transfer pages to the destination then we should be
399 * able to complete migration. Some workloads dirty memory way too
400 * fast and will not effectively converge, even with auto-converge.
402 static void mig_throttle_guest_down(void)
404 MigrationState *s = migrate_get_current();
405 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
406 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
408 /* We have not started throttling yet. Let's start it. */
409 if (!cpu_throttle_active()) {
410 cpu_throttle_set(pct_initial);
411 } else {
412 /* Throttling already on, just increase the rate */
413 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
418 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
420 * @rs: current RAM state
421 * @current_addr: address for the zero page
423 * Update the xbzrle cache to reflect a page that's been sent as all 0.
424 * The important thing is that a stale (not-yet-0'd) page be replaced
425 * by the new data.
426 * As a bonus, if the page wasn't in the cache it gets added so that
427 * when a small write is made into the 0'd page it gets XBZRLE sent.
429 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
431 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
432 return;
435 /* We don't care if this fails to allocate a new cache page
436 * as long as it updated an old one */
437 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
438 ram_counters.dirty_sync_count);
441 #define ENCODING_FLAG_XBZRLE 0x1
444 * save_xbzrle_page: compress and send current page
446 * Returns: 1 means that we wrote the page
447 * 0 means that page is identical to the one already sent
448 * -1 means that xbzrle would be longer than normal
450 * @rs: current RAM state
451 * @current_data: pointer to the address of the page contents
452 * @current_addr: addr of the page
453 * @block: block that contains the page we want to send
454 * @offset: offset inside the block for the page
455 * @last_stage: if we are at the completion stage
457 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
458 ram_addr_t current_addr, RAMBlock *block,
459 ram_addr_t offset, bool last_stage)
461 int encoded_len = 0, bytes_xbzrle;
462 uint8_t *prev_cached_page;
464 if (!cache_is_cached(XBZRLE.cache, current_addr,
465 ram_counters.dirty_sync_count)) {
466 xbzrle_counters.cache_miss++;
467 if (!last_stage) {
468 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
469 ram_counters.dirty_sync_count) == -1) {
470 return -1;
471 } else {
472 /* update *current_data when the page has been
473 inserted into cache */
474 *current_data = get_cached_data(XBZRLE.cache, current_addr);
477 return -1;
480 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
482 /* save current buffer into memory */
483 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
485 /* XBZRLE encoding (if there is no overflow) */
486 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
487 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
488 TARGET_PAGE_SIZE);
489 if (encoded_len == 0) {
490 trace_save_xbzrle_page_skipping();
491 return 0;
492 } else if (encoded_len == -1) {
493 trace_save_xbzrle_page_overflow();
494 xbzrle_counters.overflow++;
495 /* update data in the cache */
496 if (!last_stage) {
497 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
498 *current_data = prev_cached_page;
500 return -1;
503 /* we need to update the data in the cache, in order to get the same data */
504 if (!last_stage) {
505 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
508 /* Send XBZRLE based compressed page */
509 bytes_xbzrle = save_page_header(rs, rs->f, block,
510 offset | RAM_SAVE_FLAG_XBZRLE);
511 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
512 qemu_put_be16(rs->f, encoded_len);
513 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
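/* account for the ENCODING_FLAG_XBZRLE byte and the 2-byte encoded_len
 * written just above, in addition to the encoded data itself */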
514 bytes_xbzrle += encoded_len + 1 + 2;
515 xbzrle_counters.pages++;
516 xbzrle_counters.bytes += bytes_xbzrle;
517 ram_counters.transferred += bytes_xbzrle;
519 return 1;
523 * migration_bitmap_find_dirty: find the next dirty page from start
525 * Called with rcu_read_lock() to protect migration_bitmap
527 * Returns the byte offset within memory region of the start of a dirty page
529 * @rs: current RAM state
530 * @rb: RAMBlock where to search for dirty pages
531 * @start: page where we start the search
533 static inline
534 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
535 unsigned long start)
537 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
538 unsigned long *bitmap = rb->bmap;
539 unsigned long next;
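/* During the bulk stage every page is treated as dirty, so there is no
 * need to scan the bitmap: simply advance to the next page. */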
541 if (rs->ram_bulk_stage && start > 0) {
542 next = start + 1;
543 } else {
544 next = find_next_bit(bitmap, size, start);
547 return next;
550 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
551 RAMBlock *rb,
552 unsigned long page)
554 bool ret;
556 ret = test_and_clear_bit(page, rb->bmap);
558 if (ret) {
559 rs->migration_dirty_pages--;
561 return ret;
564 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
565 ram_addr_t start, ram_addr_t length)
567 rs->migration_dirty_pages +=
568 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
569 &rs->num_dirty_pages_period);
573 * ram_pagesize_summary: calculate all the pagesizes of a VM
575 * Returns a summary bitmap of the page sizes of all RAMBlocks
577 * For VMs with just normal pages this is equivalent to the host page
578 * size. If it's got some huge pages then it's the OR of all the
579 * different page sizes.
581 uint64_t ram_pagesize_summary(void)
583 RAMBlock *block;
584 uint64_t summary = 0;
586 RAMBLOCK_FOREACH(block) {
587 summary |= block->page_size;
590 return summary;
593 static void migration_bitmap_sync(RAMState *rs)
595 RAMBlock *block;
596 int64_t end_time;
597 uint64_t bytes_xfer_now;
599 ram_counters.dirty_sync_count++;
601 if (!rs->time_last_bitmap_sync) {
602 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
605 trace_migration_bitmap_sync_start();
606 memory_global_dirty_log_sync();
608 qemu_mutex_lock(&rs->bitmap_mutex);
609 rcu_read_lock();
610 RAMBLOCK_FOREACH(block) {
611 migration_bitmap_sync_range(rs, block, 0, block->used_length);
613 rcu_read_unlock();
614 qemu_mutex_unlock(&rs->bitmap_mutex);
616 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
618 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
620 /* more than 1 second = 1000 milliseconds */
621 if (end_time > rs->time_last_bitmap_sync + 1000) {
622 /* calculate period counters */
623 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
624 / (end_time - rs->time_last_bitmap_sync);
625 bytes_xfer_now = ram_counters.transferred;
627 if (migrate_auto_converge()) {
628 /* The following detection logic can be refined later. For now:
629 check whether the bytes dirtied in this period exceed half of the
630 bytes that just got transferred since the last time we
631 were in this routine. If that happens twice, start or increase
632 throttling */
634 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
635 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
636 (++rs->dirty_rate_high_cnt >= 2)) {
637 trace_migration_throttle();
638 rs->dirty_rate_high_cnt = 0;
639 mig_throttle_guest_down();
643 if (migrate_use_xbzrle()) {
644 if (rs->iterations_prev != rs->iterations) {
645 xbzrle_counters.cache_miss_rate =
646 (double)(xbzrle_counters.cache_miss -
647 rs->xbzrle_cache_miss_prev) /
648 (rs->iterations - rs->iterations_prev);
650 rs->iterations_prev = rs->iterations;
651 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
654 /* reset period counters */
655 rs->time_last_bitmap_sync = end_time;
656 rs->num_dirty_pages_period = 0;
657 rs->bytes_xfer_prev = bytes_xfer_now;
659 if (migrate_use_events()) {
660 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
665 * save_zero_page: send the zero page to the stream
667 * Returns the number of pages written.
669 * @rs: current RAM state
670 * @block: block that contains the page we want to send
671 * @offset: offset inside the block for the page
672 * @p: pointer to the page
674 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
675 uint8_t *p)
677 int pages = -1;
679 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
680 ram_counters.duplicate++;
681 ram_counters.transferred +=
682 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
683 qemu_put_byte(rs->f, 0);
684 ram_counters.transferred += 1;
685 pages = 1;
688 return pages;
691 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
693 if (!migrate_release_ram() || !migration_in_postcopy()) {
694 return;
697 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
701 * ram_save_page: send the given page to the stream
703 * Returns the number of pages written.
704 * < 0 - error
705 * >=0 - Number of pages written - this might legally be 0
706 * if xbzrle noticed the page was the same.
708 * @rs: current RAM state
709 * @block: block that contains the page we want to send
710 * @offset: offset inside the block for the page
711 * @last_stage: if we are at the completion stage
713 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
715 int pages = -1;
716 uint64_t bytes_xmit;
717 ram_addr_t current_addr;
718 uint8_t *p;
719 int ret;
720 bool send_async = true;
721 RAMBlock *block = pss->block;
722 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
724 p = block->host + offset;
725 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
727 /* If in doubt, send the page as a normal page */
728 bytes_xmit = 0;
729 ret = ram_control_save_page(rs->f, block->offset,
730 offset, TARGET_PAGE_SIZE, &bytes_xmit);
731 if (bytes_xmit) {
732 ram_counters.transferred += bytes_xmit;
733 pages = 1;
736 XBZRLE_cache_lock();
738 current_addr = block->offset + offset;
740 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
741 if (ret != RAM_SAVE_CONTROL_DELAYED) {
742 if (bytes_xmit > 0) {
743 ram_counters.normal++;
744 } else if (bytes_xmit == 0) {
745 ram_counters.duplicate++;
748 } else {
749 pages = save_zero_page(rs, block, offset, p);
750 if (pages > 0) {
751 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
752 * page would be stale
754 xbzrle_cache_zero_page(rs, current_addr);
755 ram_release_pages(block->idstr, offset, pages);
756 } else if (!rs->ram_bulk_stage &&
757 !migration_in_postcopy() && migrate_use_xbzrle()) {
758 pages = save_xbzrle_page(rs, &p, current_addr, block,
759 offset, last_stage);
760 if (!last_stage) {
761 /* Can't send this cached data async, since the cache page
762 * might get updated before it gets to the wire
764 send_async = false;
769 /* XBZRLE overflow or normal page */
770 if (pages == -1) {
771 ram_counters.transferred +=
772 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
773 if (send_async) {
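/* The last argument tells qemu_put_buffer_async() whether the page may
 * be freed once it has been sent, which is only the case when
 * release-ram is in use during postcopy. */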
774 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
775 migrate_release_ram() &
776 migration_in_postcopy());
777 } else {
778 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
780 ram_counters.transferred += TARGET_PAGE_SIZE;
781 pages = 1;
782 ram_counters.normal++;
785 XBZRLE_cache_unlock();
787 return pages;
790 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
791 ram_addr_t offset)
793 RAMState *rs = ram_state;
794 int bytes_sent, blen;
795 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
797 bytes_sent = save_page_header(rs, f, block, offset |
798 RAM_SAVE_FLAG_COMPRESS_PAGE);
799 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
800 migrate_compress_level());
801 if (blen < 0) {
802 bytes_sent = 0;
803 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
804 error_report("compressed data failed!");
805 } else {
806 bytes_sent += blen;
807 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
810 return bytes_sent;
813 static void flush_compressed_data(RAMState *rs)
815 int idx, len, thread_count;
817 if (!migrate_use_compression()) {
818 return;
820 thread_count = migrate_compress_threads();
822 qemu_mutex_lock(&comp_done_lock);
823 for (idx = 0; idx < thread_count; idx++) {
824 while (!comp_param[idx].done) {
825 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
828 qemu_mutex_unlock(&comp_done_lock);
830 for (idx = 0; idx < thread_count; idx++) {
831 qemu_mutex_lock(&comp_param[idx].mutex);
832 if (!comp_param[idx].quit) {
833 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
834 ram_counters.transferred += len;
836 qemu_mutex_unlock(&comp_param[idx].mutex);
840 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
841 ram_addr_t offset)
843 param->block = block;
844 param->offset = offset;
847 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
848 ram_addr_t offset)
850 int idx, thread_count, bytes_xmit = -1, pages = -1;
852 thread_count = migrate_compress_threads();
853 qemu_mutex_lock(&comp_done_lock);
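/* Look for an idle compression thread; if none has finished yet, wait on
 * comp_done_cond and retry, then hand the chosen thread this page. */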
854 while (true) {
855 for (idx = 0; idx < thread_count; idx++) {
856 if (comp_param[idx].done) {
857 comp_param[idx].done = false;
858 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
859 qemu_mutex_lock(&comp_param[idx].mutex);
860 set_compress_params(&comp_param[idx], block, offset);
861 qemu_cond_signal(&comp_param[idx].cond);
862 qemu_mutex_unlock(&comp_param[idx].mutex);
863 pages = 1;
864 ram_counters.normal++;
865 ram_counters.transferred += bytes_xmit;
866 break;
869 if (pages > 0) {
870 break;
871 } else {
872 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
875 qemu_mutex_unlock(&comp_done_lock);
877 return pages;
881 * ram_save_compressed_page: compress the given page and send it to the stream
883 * Returns the number of pages written.
885 * @rs: current RAM state
886 * @block: block that contains the page we want to send
887 * @offset: offset inside the block for the page
888 * @last_stage: if we are at the completion stage
890 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
891 bool last_stage)
893 int pages = -1;
894 uint64_t bytes_xmit = 0;
895 uint8_t *p;
896 int ret, blen;
897 RAMBlock *block = pss->block;
898 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
900 p = block->host + offset;
902 ret = ram_control_save_page(rs->f, block->offset,
903 offset, TARGET_PAGE_SIZE, &bytes_xmit);
904 if (bytes_xmit) {
905 ram_counters.transferred += bytes_xmit;
906 pages = 1;
908 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
909 if (ret != RAM_SAVE_CONTROL_DELAYED) {
910 if (bytes_xmit > 0) {
911 ram_counters.normal++;
912 } else if (bytes_xmit == 0) {
913 ram_counters.duplicate++;
916 } else {
917 /* When starting the process of a new block, the first page of
918 * the block should be sent out before other pages in the same
919 * block, and all the pages in the last block should have been sent
920 * out. Keeping this order is important, because the 'cont' flag
921 * is used to avoid resending the block name.
923 if (block != rs->last_sent_block) {
924 flush_compressed_data(rs);
925 pages = save_zero_page(rs, block, offset, p);
926 if (pages == -1) {
927 /* Make sure the first page is sent out before other pages */
928 bytes_xmit = save_page_header(rs, rs->f, block, offset |
929 RAM_SAVE_FLAG_COMPRESS_PAGE);
930 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
931 migrate_compress_level());
932 if (blen > 0) {
933 ram_counters.transferred += bytes_xmit + blen;
934 ram_counters.normal++;
935 pages = 1;
936 } else {
937 qemu_file_set_error(rs->f, blen);
938 error_report("compressed data failed!");
941 if (pages > 0) {
942 ram_release_pages(block->idstr, offset, pages);
944 } else {
945 pages = save_zero_page(rs, block, offset, p);
946 if (pages == -1) {
947 pages = compress_page_with_multi_thread(rs, block, offset);
948 } else {
949 ram_release_pages(block->idstr, offset, pages);
954 return pages;
958 * find_dirty_block: find the next dirty page and update any state
959 * associated with the search process.
961 * Returns if a page is found
963 * @rs: current RAM state
964 * @pss: data about the state of the current dirty page scan
965 * @again: set to false if the search has scanned the whole of RAM
967 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
969 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
970 if (pss->complete_round && pss->block == rs->last_seen_block &&
971 pss->page >= rs->last_page) {
973 * We've been once around the RAM and haven't found anything.
974 * Give up.
976 *again = false;
977 return false;
979 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
980 /* Didn't find anything in this RAM Block */
981 pss->page = 0;
982 pss->block = QLIST_NEXT_RCU(pss->block, next);
983 if (!pss->block) {
984 /* Hit the end of the list */
985 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
986 /* Flag that we've looped */
987 pss->complete_round = true;
988 rs->ram_bulk_stage = false;
989 if (migrate_use_xbzrle()) {
990 /* If xbzrle is on, stop using the data compression at this
991 * point. In theory, xbzrle can do better than compression.
993 flush_compressed_data(rs);
996 /* Didn't find anything this time, but try again on the new block */
997 *again = true;
998 return false;
999 } else {
1000 /* Can go around again, but... */
1001 *again = true;
1002 /* We've found something so probably don't need to */
1003 return true;
1008 * unqueue_page: gets a page off the queue
1010 * Helper for 'get_queued_page' - gets a page off the queue
1012 * Returns the block of the page (or NULL if none available)
1014 * @rs: current RAM state
1015 * @offset: used to return the offset within the RAMBlock
1017 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1019 RAMBlock *block = NULL;
1021 qemu_mutex_lock(&rs->src_page_req_mutex);
1022 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1023 struct RAMSrcPageRequest *entry =
1024 QSIMPLEQ_FIRST(&rs->src_page_requests);
1025 block = entry->rb;
1026 *offset = entry->offset;
1028 if (entry->len > TARGET_PAGE_SIZE) {
1029 entry->len -= TARGET_PAGE_SIZE;
1030 entry->offset += TARGET_PAGE_SIZE;
1031 } else {
1032 memory_region_unref(block->mr);
1033 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1034 g_free(entry);
1037 qemu_mutex_unlock(&rs->src_page_req_mutex);
1039 return block;
1043 * get_queued_page: unqueue a page from the postcopy requests
1045 * Skips pages that are already sent (!dirty)
1047 * Returns if a queued page is found
1049 * @rs: current RAM state
1050 * @pss: data about the state of the current dirty page scan
1052 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1054 RAMBlock *block;
1055 ram_addr_t offset;
1056 bool dirty;
1058 do {
1059 block = unqueue_page(rs, &offset);
1061 * We're sending this page, and since it's postcopy nothing else
1062 * will dirty it, and we must make sure it doesn't get sent again
1063 * even if this queue request was received after the background
1064 * search already sent it.
1066 if (block) {
1067 unsigned long page;
1069 page = offset >> TARGET_PAGE_BITS;
1070 dirty = test_bit(page, block->bmap);
1071 if (!dirty) {
1072 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1073 page, test_bit(page, block->unsentmap));
1074 } else {
1075 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1079 } while (block && !dirty);
1081 if (block) {
1083 * As soon as we start servicing pages out of order, then we have
1084 * to kill the bulk stage, since the bulk stage assumes
1085 * in (migration_bitmap_find_and_reset_dirty) that every page is
1086 * dirty, that's no longer true.
1088 rs->ram_bulk_stage = false;
1091 * We want the background search to continue from the queued page
1092 * since the guest is likely to want other pages near to the page
1093 * it just requested.
1095 pss->block = block;
1096 pss->page = offset >> TARGET_PAGE_BITS;
1099 return !!block;
1103 * migration_page_queue_free: drop any remaining pages in the ram
1104 * request queue
1106 * It should be empty at the end anyway, but in error cases there may
1107 * be some left. If any page is left, we drop it.
1110 static void migration_page_queue_free(RAMState *rs)
1112 struct RAMSrcPageRequest *mspr, *next_mspr;
1113 /* This queue generally should be empty - but in the case of a failed
1114 * migration it might have some droppings in.
1116 rcu_read_lock();
1117 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1118 memory_region_unref(mspr->rb->mr);
1119 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1120 g_free(mspr);
1122 rcu_read_unlock();
1126 * ram_save_queue_pages: queue the page for transmission
1128 * A request from postcopy destination for example.
1130 * Returns zero on success or negative on error
1132 * @rbname: Name of the RAMBlock of the request. NULL means the
1133 * same as the last one.
1134 * @start: starting address from the start of the RAMBlock
1135 * @len: length (in bytes) to send
1137 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1139 RAMBlock *ramblock;
1140 RAMState *rs = ram_state;
1142 ram_counters.postcopy_requests++;
1143 rcu_read_lock();
1144 if (!rbname) {
1145 /* Reuse last RAMBlock */
1146 ramblock = rs->last_req_rb;
1148 if (!ramblock) {
1150 * Shouldn't happen, we can't reuse the last RAMBlock if
1151 * it's the 1st request.
1153 error_report("ram_save_queue_pages no previous block");
1154 goto err;
1156 } else {
1157 ramblock = qemu_ram_block_by_name(rbname);
1159 if (!ramblock) {
1160 /* We shouldn't be asked for a non-existent RAMBlock */
1161 error_report("ram_save_queue_pages no block '%s'", rbname);
1162 goto err;
1164 rs->last_req_rb = ramblock;
1166 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1167 if (start+len > ramblock->used_length) {
1168 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1169 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1170 __func__, start, len, ramblock->used_length);
1171 goto err;
1174 struct RAMSrcPageRequest *new_entry =
1175 g_malloc0(sizeof(struct RAMSrcPageRequest));
1176 new_entry->rb = ramblock;
1177 new_entry->offset = start;
1178 new_entry->len = len;
1180 memory_region_ref(ramblock->mr);
1181 qemu_mutex_lock(&rs->src_page_req_mutex);
1182 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1183 qemu_mutex_unlock(&rs->src_page_req_mutex);
1184 rcu_read_unlock();
1186 return 0;
1188 err:
1189 rcu_read_unlock();
1190 return -1;
1194 * ram_save_target_page: save one target page
1196 * Returns the number of pages written
1198 * @rs: current RAM state
1199 * @ms: current migration state
1200 * @pss: data about the page we want to send
1201 * @last_stage: if we are at the completion stage
1203 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1204 bool last_stage)
1206 int res = 0;
1208 /* Check whether the page is dirty and, if it is, send it */
1209 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1211 * If xbzrle is on, stop using the data compression after first
1212 * round of migration even if compression is enabled. In theory,
1213 * xbzrle can do better than compression.
1215 if (migrate_use_compression() &&
1216 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1217 res = ram_save_compressed_page(rs, pss, last_stage);
1218 } else {
1219 res = ram_save_page(rs, pss, last_stage);
1222 if (res < 0) {
1223 return res;
1225 if (pss->block->unsentmap) {
1226 clear_bit(pss->page, pss->block->unsentmap);
1230 return res;
1234 * ram_save_host_page: save a whole host page
1236 * Starting at *offset send pages up to the end of the current host
1237 * page. It's valid for the initial offset to point into the middle of
1238 * a host page in which case the remainder of the hostpage is sent.
1239 * Only dirty target pages are sent. Note that the host page size may
1240 * be a huge page for this block.
1241 * The saving stops at the boundary of the used_length of the block
1242 * if the RAMBlock isn't a multiple of the host page size.
1244 * Returns the number of pages written or negative on error
1246 * @rs: current RAM state
1247 * @ms: current migration state
1248 * @pss: data about the page we want to send
1249 * @last_stage: if we are at the completion stage
1251 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1252 bool last_stage)
1254 int tmppages, pages = 0;
1255 size_t pagesize_bits =
1256 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
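/* pagesize_bits is the number of target pages that make up one host page
 * for this RAMBlock (1 unless the block is backed by huge pages) */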
1258 do {
1259 tmppages = ram_save_target_page(rs, pss, last_stage);
1260 if (tmppages < 0) {
1261 return tmppages;
1264 pages += tmppages;
1265 pss->page++;
1266 } while ((pss->page & (pagesize_bits - 1)) &&
1267 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1269 /* The offset we leave with is the last one we looked at */
1270 pss->page--;
1271 return pages;
1275 * ram_find_and_save_block: finds a dirty page and sends it to f
1277 * Called within an RCU critical section.
1279 * Returns the number of pages written where zero means no dirty pages
1281 * @rs: current RAM state
1282 * @last_stage: if we are at the completion stage
1284 * On systems where host-page-size > target-page-size it will send all the
1285 * pages in a host page that are dirty.
1288 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1290 PageSearchStatus pss;
1291 int pages = 0;
1292 bool again, found;
1294 /* No dirty page as there is zero RAM */
1295 if (!ram_bytes_total()) {
1296 return pages;
1299 pss.block = rs->last_seen_block;
1300 pss.page = rs->last_page;
1301 pss.complete_round = false;
1303 if (!pss.block) {
1304 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1307 do {
1308 again = true;
1309 found = get_queued_page(rs, &pss);
1311 if (!found) {
1312 /* priority queue empty, so just search for something dirty */
1313 found = find_dirty_block(rs, &pss, &again);
1316 if (found) {
1317 pages = ram_save_host_page(rs, &pss, last_stage);
1319 } while (!pages && again);
1321 rs->last_seen_block = pss.block;
1322 rs->last_page = pss.page;
1324 return pages;
1327 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1329 uint64_t pages = size / TARGET_PAGE_SIZE;
1331 if (zero) {
1332 ram_counters.duplicate += pages;
1333 } else {
1334 ram_counters.normal += pages;
1335 ram_counters.transferred += size;
1336 qemu_update_position(f, size);
1340 uint64_t ram_bytes_total(void)
1342 RAMBlock *block;
1343 uint64_t total = 0;
1345 rcu_read_lock();
1346 RAMBLOCK_FOREACH(block) {
1347 total += block->used_length;
1349 rcu_read_unlock();
1350 return total;
1353 void free_xbzrle_decoded_buf(void)
1355 g_free(xbzrle_decoded_buf);
1356 xbzrle_decoded_buf = NULL;
1359 static void ram_migration_cleanup(void *opaque)
1361 RAMState **rsp = opaque;
1362 RAMBlock *block;
1364 /* The caller must hold the iothread lock or be in a bottom half, so there is
1365 * no write race against this migration_bitmap
1367 memory_global_dirty_log_stop();
1369 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1370 g_free(block->bmap);
1371 block->bmap = NULL;
1372 g_free(block->unsentmap);
1373 block->unsentmap = NULL;
1376 XBZRLE_cache_lock();
1377 if (XBZRLE.cache) {
1378 cache_fini(XBZRLE.cache);
1379 g_free(XBZRLE.encoded_buf);
1380 g_free(XBZRLE.current_buf);
1381 g_free(XBZRLE.zero_target_page);
1382 XBZRLE.cache = NULL;
1383 XBZRLE.encoded_buf = NULL;
1384 XBZRLE.current_buf = NULL;
1385 XBZRLE.zero_target_page = NULL;
1387 XBZRLE_cache_unlock();
1388 migration_page_queue_free(*rsp);
1389 g_free(*rsp);
1390 *rsp = NULL;
1393 static void ram_state_reset(RAMState *rs)
1395 rs->last_seen_block = NULL;
1396 rs->last_sent_block = NULL;
1397 rs->last_page = 0;
1398 rs->last_version = ram_list.version;
1399 rs->ram_bulk_stage = true;
1402 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1405 * 'expected' is the value you expect the bitmap mostly to be full
1406 * of; it won't bother printing lines that are all this value.
1407 * If 'todump' is null the migration bitmap is dumped.
1409 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1410 unsigned long pages)
1412 int64_t cur;
1413 int64_t linelen = 128;
1414 char linebuf[129];
1416 for (cur = 0; cur < pages; cur += linelen) {
1417 int64_t curb;
1418 bool found = false;
1420 * Last line; catch the case where the line length
1421 * is longer than remaining ram
1423 if (cur + linelen > pages) {
1424 linelen = pages - cur;
1426 for (curb = 0; curb < linelen; curb++) {
1427 bool thisbit = test_bit(cur + curb, todump);
1428 linebuf[curb] = thisbit ? '1' : '.';
1429 found = found || (thisbit != expected);
1431 if (found) {
1432 linebuf[curb] = '\0';
1433 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1438 /* **** functions for postcopy ***** */
1440 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1442 struct RAMBlock *block;
1444 RAMBLOCK_FOREACH(block) {
1445 unsigned long *bitmap = block->bmap;
1446 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1447 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1449 while (run_start < range) {
1450 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1451 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1452 (run_end - run_start) << TARGET_PAGE_BITS);
1453 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1459 * postcopy_send_discard_bm_ram: discard a RAMBlock
1461 * Returns zero on success
1463 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1464 * Note: At this point the 'unsentmap' is the processed bitmap combined
1465 * with the dirtymap; so a '1' means it's either dirty or unsent.
1467 * @ms: current migration state
1468 * @pds: state for postcopy
1469 * @start: RAMBlock starting page
1470 * @length: RAMBlock size
1472 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1473 PostcopyDiscardState *pds,
1474 RAMBlock *block)
1476 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1477 unsigned long current;
1478 unsigned long *unsentmap = block->unsentmap;
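/* Walk the unsentmap looking for runs of set bits (pages that are dirty
 * or unsent) and ask the destination to discard each run. */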
1480 for (current = 0; current < end; ) {
1481 unsigned long one = find_next_bit(unsentmap, end, current);
1483 if (one <= end) {
1484 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1485 unsigned long discard_length;
1487 if (zero >= end) {
1488 discard_length = end - one;
1489 } else {
1490 discard_length = zero - one;
1492 if (discard_length) {
1493 postcopy_discard_send_range(ms, pds, one, discard_length);
1495 current = one + discard_length;
1496 } else {
1497 current = one;
1501 return 0;
1505 * postcopy_each_ram_send_discard: discard all RAMBlocks
1507 * Returns 0 for success or negative for error
1509 * Utility for the outgoing postcopy code.
1510 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1511 * passing it bitmap indexes and name.
1512 * (qemu_ram_foreach_block ends up passing unscaled lengths
1513 * which would mean postcopy code would have to deal with target page)
1515 * @ms: current migration state
1517 static int postcopy_each_ram_send_discard(MigrationState *ms)
1519 struct RAMBlock *block;
1520 int ret;
1522 RAMBLOCK_FOREACH(block) {
1523 PostcopyDiscardState *pds =
1524 postcopy_discard_send_init(ms, block->idstr);
1527 * Postcopy sends chunks of bitmap over the wire, but it
1528 * just needs indexes at this point, avoids it having
1529 * target page specific code.
1531 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1532 postcopy_discard_send_finish(ms, pds);
1533 if (ret) {
1534 return ret;
1538 return 0;
1542 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1544 * Helper for postcopy_chunk_hostpages; it's called twice to
1545 * canonicalize the two bitmaps, which are similar, but one is
1546 * inverted.
1548 * Postcopy requires that all target pages in a hostpage are dirty or
1549 * clean, not a mix. This function canonicalizes the bitmaps.
1551 * @ms: current migration state
1552 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1553 * otherwise we need to canonicalize partially dirty host pages
1554 * @block: block that contains the page we want to canonicalize
1555 * @pds: state for postcopy
1557 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1558 RAMBlock *block,
1559 PostcopyDiscardState *pds)
1561 RAMState *rs = ram_state;
1562 unsigned long *bitmap = block->bmap;
1563 unsigned long *unsentmap = block->unsentmap;
1564 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1565 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1566 unsigned long run_start;
1568 if (block->page_size == TARGET_PAGE_SIZE) {
1569 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1570 return;
1573 if (unsent_pass) {
1574 /* Find a sent page */
1575 run_start = find_next_zero_bit(unsentmap, pages, 0);
1576 } else {
1577 /* Find a dirty page */
1578 run_start = find_next_bit(bitmap, pages, 0);
1581 while (run_start < pages) {
1582 bool do_fixup = false;
1583 unsigned long fixup_start_addr;
1584 unsigned long host_offset;
1587 * If the start of this run of pages is in the middle of a host
1588 * page, then we need to fixup this host page.
1590 host_offset = run_start % host_ratio;
1591 if (host_offset) {
1592 do_fixup = true;
1593 run_start -= host_offset;
1594 fixup_start_addr = run_start;
1595 /* For the next pass */
1596 run_start = run_start + host_ratio;
1597 } else {
1598 /* Find the end of this run */
1599 unsigned long run_end;
1600 if (unsent_pass) {
1601 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1602 } else {
1603 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1606 * If the end isn't at the start of a host page, then the
1607 * run doesn't finish at the end of a host page
1608 * and we need to discard.
1610 host_offset = run_end % host_ratio;
1611 if (host_offset) {
1612 do_fixup = true;
1613 fixup_start_addr = run_end - host_offset;
1615 * This host page has gone, the next loop iteration starts
1616 * from after the fixup
1618 run_start = fixup_start_addr + host_ratio;
1619 } else {
1621 * No discards on this iteration, next loop starts from
1622 * next sent/dirty page
1624 run_start = run_end + 1;
1628 if (do_fixup) {
1629 unsigned long page;
1631 /* Tell the destination to discard this page */
1632 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1633 /* For the unsent_pass we:
1634 * discard partially sent pages
1635 * For the !unsent_pass (dirty) we:
1636 * discard partially dirty pages that were sent
1637 * (any partially sent pages were already discarded
1638 * by the previous unsent_pass)
1640 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1641 host_ratio);
1644 /* Clean up the bitmap */
1645 for (page = fixup_start_addr;
1646 page < fixup_start_addr + host_ratio; page++) {
1647 /* All pages in this host page are now not sent */
1648 set_bit(page, unsentmap);
1651 * Remark them as dirty, updating the count for any pages
1652 * that weren't previously dirty.
1654 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1658 if (unsent_pass) {
1659 /* Find the next sent page for the next iteration */
1660 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1661 } else {
1662 /* Find the next dirty page for the next iteration */
1663 run_start = find_next_bit(bitmap, pages, run_start);
1669 * postcopy_chunk_hostpages: discard any partially sent host page
1671 * Utility for the outgoing postcopy code.
1673 * Discard any partially sent host-page size chunks, mark any partially
1674 * dirty host-page size chunks as all dirty. In this case the host-page
1675 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1677 * Returns zero on success
1679 * @ms: current migration state
1680 * @block: block we want to work with
1682 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1684 PostcopyDiscardState *pds =
1685 postcopy_discard_send_init(ms, block->idstr);
1687 /* First pass: Discard all partially sent host pages */
1688 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1690 * Second pass: Ensure that all partially dirty host pages are made
1691 * fully dirty.
1693 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1695 postcopy_discard_send_finish(ms, pds);
1696 return 0;
1700 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1702 * Returns zero on success
1704 * Transmit the set of pages to be discarded after precopy to the target.
1705 * These are pages that:
1706 * a) Have been previously transmitted but are now dirty again
1707 * b) Pages that have never been transmitted, this ensures that
1708 * any pages on the destination that have been mapped by background
1709 * tasks get discarded (transparent huge pages is the specific concern)
1710 * Hopefully this is pretty sparse
1712 * @ms: current migration state
1714 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1716 RAMState *rs = ram_state;
1717 RAMBlock *block;
1718 int ret;
1720 rcu_read_lock();
1722 /* This should be our last sync, the src is now paused */
1723 migration_bitmap_sync(rs);
1725 /* Easiest way to make sure we don't resume in the middle of a host-page */
1726 rs->last_seen_block = NULL;
1727 rs->last_sent_block = NULL;
1728 rs->last_page = 0;
1730 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1731 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1732 unsigned long *bitmap = block->bmap;
1733 unsigned long *unsentmap = block->unsentmap;
1735 if (!unsentmap) {
1736 /* We don't have a safe way to resize the sentmap, so
1737 * if the bitmap was resized it will be NULL at this
1738 * point.
1740 error_report("migration ram resized during precopy phase");
1741 rcu_read_unlock();
1742 return -EINVAL;
1744 /* Deal with TPS != HPS and huge pages */
1745 ret = postcopy_chunk_hostpages(ms, block);
1746 if (ret) {
1747 rcu_read_unlock();
1748 return ret;
1752 * Update the unsentmap to be unsentmap = unsentmap | dirty
1754 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1755 #ifdef DEBUG_POSTCOPY
1756 ram_debug_dump_bitmap(unsentmap, true, pages);
1757 #endif
1759 trace_ram_postcopy_send_discard_bitmap();
1761 ret = postcopy_each_ram_send_discard(ms);
1762 rcu_read_unlock();
1764 return ret;
1768 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1770 * Returns zero on success
1772 * @rbname: name of the RAMBlock of the request. NULL means the
1773 * same as the last one.
1774 * @start: RAMBlock starting page
1775 * @length: RAMBlock size
1777 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1779 int ret = -1;
1781 trace_ram_discard_range(rbname, start, length);
1783 rcu_read_lock();
1784 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1786 if (!rb) {
1787 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1788 goto err;
1791 ret = ram_block_discard_range(rb, start, length);
1793 err:
1794 rcu_read_unlock();
1796 return ret;
1799 static int ram_state_init(RAMState **rsp)
1801 *rsp = g_new0(RAMState, 1);
1803 qemu_mutex_init(&(*rsp)->bitmap_mutex);
1804 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
1805 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
1807 if (migrate_use_xbzrle()) {
1808 XBZRLE_cache_lock();
1809 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
1810 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1811 TARGET_PAGE_SIZE,
1812 TARGET_PAGE_SIZE);
1813 if (!XBZRLE.cache) {
1814 XBZRLE_cache_unlock();
1815 error_report("Error creating cache");
1816 g_free(*rsp);
1817 *rsp = NULL;
1818 return -1;
1820 XBZRLE_cache_unlock();
1822 /* We prefer not to abort if there is no memory */
1823 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1824 if (!XBZRLE.encoded_buf) {
1825 error_report("Error allocating encoded_buf");
1826 g_free(*rsp);
1827 *rsp = NULL;
1828 return -1;
1831 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1832 if (!XBZRLE.current_buf) {
1833 error_report("Error allocating current_buf");
1834 g_free(XBZRLE.encoded_buf);
1835 XBZRLE.encoded_buf = NULL;
1836 g_free(*rsp);
1837 *rsp = NULL;
1838 return -1;
1842 /* For memory_global_dirty_log_start below. */
1843 qemu_mutex_lock_iothread();
1845 qemu_mutex_lock_ramlist();
1846 rcu_read_lock();
1847 ram_state_reset(*rsp);
1849 /* Skip setting bitmap if there is no RAM */
1850 if (ram_bytes_total()) {
1851 RAMBlock *block;
1853 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1854 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1856 block->bmap = bitmap_new(pages);
1857 bitmap_set(block->bmap, 0, pages);
1858 if (migrate_postcopy_ram()) {
1859 block->unsentmap = bitmap_new(pages);
1860 bitmap_set(block->unsentmap, 0, pages);
1866 * Count the total number of pages used by ram blocks not including any
1867 * gaps due to alignment or unplugs.
1869 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1871 memory_global_dirty_log_start();
1872 migration_bitmap_sync(*rsp);
1873 qemu_mutex_unlock_ramlist();
1874 qemu_mutex_unlock_iothread();
1875 rcu_read_unlock();
1877 return 0;
1881 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1882 * long-running RCU critical section. When rcu-reclaims in the code
1883 * start to become numerous it will be necessary to reduce the
1884 * granularity of these critical sections.
1888 * ram_save_setup: Setup RAM for migration
1890 * Returns zero to indicate success and negative for error
1892 * @f: QEMUFile where to send the data
1893 * @opaque: RAMState pointer
1895 static int ram_save_setup(QEMUFile *f, void *opaque)
1897 RAMState **rsp = opaque;
1898 RAMBlock *block;
1900 /* migration has already set up the bitmap, reuse it. */
1901 if (!migration_in_colo_state()) {
1902 if (ram_state_init(rsp) != 0) {
1903 return -1;
1906 (*rsp)->f = f;
1908 rcu_read_lock();
1910 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1912 RAMBLOCK_FOREACH(block) {
1913 qemu_put_byte(f, strlen(block->idstr));
1914 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1915 qemu_put_be64(f, block->used_length);
1916 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1917 qemu_put_be64(f, block->page_size);
1921 rcu_read_unlock();
1923 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1924 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1926 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1928 return 0;
1932 * ram_save_iterate: iterative stage for migration
1934 * Returns zero to indicate success and negative for error
1936 * @f: QEMUFile where to send the data
1937 * @opaque: RAMState pointer
1939 static int ram_save_iterate(QEMUFile *f, void *opaque)
1941 RAMState **temp = opaque;
1942 RAMState *rs = *temp;
1943 int ret;
1944 int i;
1945 int64_t t0;
1946 int done = 0;
1948 rcu_read_lock();
1949 if (ram_list.version != rs->last_version) {
1950 ram_state_reset(rs);
1953 /* Read version before ram_list.blocks */
1954 smp_rmb();
1956 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
1958 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
1959 i = 0;
1960 while ((ret = qemu_file_rate_limit(f)) == 0) {
1961 int pages;
1963 pages = ram_find_and_save_block(rs, false);
1964 /* no more pages to send */
1965 if (pages == 0) {
1966 done = 1;
1967 break;
1969 rs->iterations++;
1971 /* we want to check in the 1st loop, just in case it was the 1st time
1972 and we had to sync the dirty bitmap.
1973 qemu_clock_get_ns() is a bit expensive, so we only check every few
1974 iterations
1976 if ((i & 63) == 0) {
1977 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
1978 if (t1 > MAX_WAIT) {
1979 trace_ram_save_iterate_big_wait(t1, i);
1980 break;
1983 i++;
1985 flush_compressed_data(rs);
1986 rcu_read_unlock();
1989 * Must occur before EOS (or any QEMUFile operation)
1990 * because of RDMA protocol.
1992 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
1994 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1995 ram_counters.transferred += 8;
1997 ret = qemu_file_get_error(f);
1998 if (ret < 0) {
1999 return ret;
2002 return done;
2006 * ram_save_complete: function called to send the remaining amount of ram
2008 * Returns zero to indicate success
2010 * Called with iothread lock
2012 * @f: QEMUFile where to send the data
2013 * @opaque: RAMState pointer
2015 static int ram_save_complete(QEMUFile *f, void *opaque)
2017 RAMState **temp = opaque;
2018 RAMState *rs = *temp;
2020 rcu_read_lock();
2022 if (!migration_in_postcopy()) {
2023 migration_bitmap_sync(rs);
2026 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2028 /* try transferring iterative blocks of memory */
2030 /* flush all remaining blocks regardless of rate limiting */
2031 while (true) {
2032 int pages;
2034 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2036 /* no more blocks to send */
2036 if (pages == 0) {
2037 break;
2041 flush_compressed_data(rs);
2042 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2044 rcu_read_unlock();
2046 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2048 return 0;
2051 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2052 uint64_t *non_postcopiable_pending,
2053 uint64_t *postcopiable_pending)
2055 RAMState **temp = opaque;
2056 RAMState *rs = *temp;
2057 uint64_t remaining_size;
2059 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2061 if (!migration_in_postcopy() &&
2062 remaining_size < max_size) {
2063 qemu_mutex_lock_iothread();
2064 rcu_read_lock();
2065 migration_bitmap_sync(rs);
2066 rcu_read_unlock();
2067 qemu_mutex_unlock_iothread();
2068 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2071 /* We can do postcopy, and all the data is postcopiable */
2072 *postcopiable_pending += remaining_size;
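/* Wire layout of an XBZRLE page record, as parsed by load_xbzrle() below:
 * a one-byte encoding flag (must be ENCODING_FLAG_XBZRLE), a be16 length
 * of the encoded data (at most TARGET_PAGE_SIZE), then the encoded bytes.
 */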
2075 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2077 unsigned int xh_len;
2078 int xh_flags;
2079 uint8_t *loaded_data;
2081 if (!xbzrle_decoded_buf) {
2082 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2084 loaded_data = xbzrle_decoded_buf;
2086 /* extract RLE header */
2087 xh_flags = qemu_get_byte(f);
2088 xh_len = qemu_get_be16(f);
2090 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2091 error_report("Failed to load XBZRLE page - wrong compression!");
2092 return -1;
2095 if (xh_len > TARGET_PAGE_SIZE) {
2096 error_report("Failed to load XBZRLE page - len overflow!");
2097 return -1;
2099 /* load data and decode */
2100 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2102 /* decode RLE */
2103 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2104 TARGET_PAGE_SIZE) == -1) {
2105 error_report("Failed to load XBZRLE page - decode error!");
2106 return -1;
2109 return 0;
2113 * ram_block_from_stream: read a RAMBlock id from the migration stream
2115 * Must be called from within an RCU critical section.
2117 * Returns a pointer from within the RCU-protected ram_list.
2119 * @f: QEMUFile where to read the data from
2120 * @flags: Page flags (mostly to see if it's a continuation of the previous block)
2122 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2124 static RAMBlock *block = NULL;
2125 char id[256];
2126 uint8_t len;
2128 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2129 if (!block) {
2130 error_report("Ack, bad migration stream!");
2131 return NULL;
2133 return block;
2136 len = qemu_get_byte(f);
2137 qemu_get_buffer(f, (uint8_t *)id, len);
2138 id[len] = 0;
2140 block = qemu_ram_block_by_name(id);
2141 if (!block) {
2142 error_report("Can't find block %s", id);
2143 return NULL;
2146 return block;
2149 static inline void *host_from_ram_block_offset(RAMBlock *block,
2150 ram_addr_t offset)
2152 if (!offset_in_ramblock(block, offset)) {
2153 return NULL;
2156 return block->host + offset;
2160 * ram_handle_compressed: handle the zero page case
2162 * If a page (or a whole RDMA chunk) has been
2163 * determined to be zero, then zap it.
2165 * @host: host address for the zero page
2166 * @ch: the byte the page is filled with; only zero is supported
2167 * @size: size of the zero page
2169 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
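/* Only write when we have to: if the fill byte is zero and the range
 * already reads back as zero, skipping the memset avoids dirtying (and,
 * for still-untouched memory, likely allocating) the page on this side.
 */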
2171 if (ch != 0 || !is_zero_range(host, size)) {
2172 memset(host, ch, size);
2176 static void *do_data_decompress(void *opaque)
2178 DecompressParam *param = opaque;
2179 unsigned long pagesize;
2180 uint8_t *des;
2181 int len;
2183 qemu_mutex_lock(&param->mutex);
2184 while (!param->quit) {
2185 if (param->des) {
2186 des = param->des;
2187 len = param->len;
2188 param->des = 0;
2189 qemu_mutex_unlock(&param->mutex);
2191 pagesize = TARGET_PAGE_SIZE;
2192 /* uncompress() can fail in some cases, especially
2193 * when the page was dirtied while it was being compressed. That is
2194 * not a problem, because the dirty page will be retransferred
2195 * and uncompress() won't corrupt the data in other pages.
2197 uncompress((Bytef *)des, &pagesize,
2198 (const Bytef *)param->compbuf, len);
2200 qemu_mutex_lock(&decomp_done_lock);
2201 param->done = true;
2202 qemu_cond_signal(&decomp_done_cond);
2203 qemu_mutex_unlock(&decomp_done_lock);
2205 qemu_mutex_lock(&param->mutex);
2206 } else {
2207 qemu_cond_wait(&param->cond, &param->mutex);
2210 qemu_mutex_unlock(&param->mutex);
2212 return NULL;
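/* Handshake with the feeder side: a worker is idle while param->done is
 * true; decompress_data_with_multi_threads() clears done, fills
 * compbuf/des/len and signals param->cond, and the worker signals
 * decomp_done_cond once it has finished and set done again.
 */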
2215 static void wait_for_decompress_done(void)
2217 int idx, thread_count;
2219 if (!migrate_use_compression()) {
2220 return;
2223 thread_count = migrate_decompress_threads();
2224 qemu_mutex_lock(&decomp_done_lock);
2225 for (idx = 0; idx < thread_count; idx++) {
2226 while (!decomp_param[idx].done) {
2227 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2230 qemu_mutex_unlock(&decomp_done_lock);
2233 void migrate_decompress_threads_create(void)
2235 int i, thread_count;
2237 if (!migrate_use_compression()) {
2238 return;
2240 thread_count = migrate_decompress_threads();
2241 decompress_threads = g_new0(QemuThread, thread_count);
2242 decomp_param = g_new0(DecompressParam, thread_count);
2243 qemu_mutex_init(&decomp_done_lock);
2244 qemu_cond_init(&decomp_done_cond);
2245 for (i = 0; i < thread_count; i++) {
2246 qemu_mutex_init(&decomp_param[i].mutex);
2247 qemu_cond_init(&decomp_param[i].cond);
2248 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2249 decomp_param[i].done = true;
2250 decomp_param[i].quit = false;
2251 qemu_thread_create(decompress_threads + i, "decompress",
2252 do_data_decompress, decomp_param + i,
2253 QEMU_THREAD_JOINABLE);
2257 void migrate_decompress_threads_join(void)
2259 int i, thread_count;
2261 if (!migrate_use_compression()) {
2262 return;
2264 thread_count = migrate_decompress_threads();
2265 for (i = 0; i < thread_count; i++) {
2266 qemu_mutex_lock(&decomp_param[i].mutex);
2267 decomp_param[i].quit = true;
2268 qemu_cond_signal(&decomp_param[i].cond);
2269 qemu_mutex_unlock(&decomp_param[i].mutex);
2271 for (i = 0; i < thread_count; i++) {
2272 qemu_thread_join(decompress_threads + i);
2273 qemu_mutex_destroy(&decomp_param[i].mutex);
2274 qemu_cond_destroy(&decomp_param[i].cond);
2275 g_free(decomp_param[i].compbuf);
2277 g_free(decompress_threads);
2278 g_free(decomp_param);
2279 decompress_threads = NULL;
2280 decomp_param = NULL;
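/* Hand a compressed page to the first idle decompression thread; if all
 * threads are busy, wait on decomp_done_cond until one of them finishes.
 */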
2283 static void decompress_data_with_multi_threads(QEMUFile *f,
2284 void *host, int len)
2286 int idx, thread_count;
2288 thread_count = migrate_decompress_threads();
2289 qemu_mutex_lock(&decomp_done_lock);
2290 while (true) {
2291 for (idx = 0; idx < thread_count; idx++) {
2292 if (decomp_param[idx].done) {
2293 decomp_param[idx].done = false;
2294 qemu_mutex_lock(&decomp_param[idx].mutex);
2295 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2296 decomp_param[idx].des = host;
2297 decomp_param[idx].len = len;
2298 qemu_cond_signal(&decomp_param[idx].cond);
2299 qemu_mutex_unlock(&decomp_param[idx].mutex);
2300 break;
2303 if (idx < thread_count) {
2304 break;
2305 } else {
2306 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2309 qemu_mutex_unlock(&decomp_done_lock);
2313 * ram_postcopy_incoming_init: allocate postcopy data structures
2315 * Returns 0 for success and negative if there was an error
2317 * @mis: current migration incoming state
2319 * Allocate the data structures etc. needed by incoming migration with
2320 * postcopy-ram. postcopy-ram's similarly named
2321 * postcopy_ram_incoming_init does the work.
2323 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2325 unsigned long ram_pages = last_ram_page();
2327 return postcopy_ram_incoming_init(mis, ram_pages);
2331 * ram_load_postcopy: load a page in postcopy case
2333 * Returns 0 for success or -errno in case of error
2335 * Called in postcopy mode by ram_load().
2336 * rcu_read_lock is taken prior to this being called.
2338 * @f: QEMUFile to read the data from
2340 static int ram_load_postcopy(QEMUFile *f)
2342 int flags = 0, ret = 0;
2343 bool place_needed = false;
2344 bool matching_page_sizes = false;
2345 MigrationIncomingState *mis = migration_incoming_get_current();
2346 /* Temporary page that is later 'placed' */
2347 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2348 void *last_host = NULL;
2349 bool all_zero = false;
2351 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2352 ram_addr_t addr;
2353 void *host = NULL;
2354 void *page_buffer = NULL;
2355 void *place_source = NULL;
2356 RAMBlock *block = NULL;
2357 uint8_t ch;
2359 addr = qemu_get_be64(f);
2360 flags = addr & ~TARGET_PAGE_MASK;
2361 addr &= TARGET_PAGE_MASK;
2363 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2364 place_needed = false;
2365 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2366 block = ram_block_from_stream(f, flags);
2368 host = host_from_ram_block_offset(block, addr);
2369 if (!host) {
2370 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2371 ret = -EINVAL;
2372 break;
2374 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2376 * Postcopy requires that we place whole host pages atomically;
2377 * these may be huge pages for RAMBlocks that are backed by
2378 * hugetlbfs.
2379 * To make it atomic, the data is read into a temporary page
2380 * that's moved into place later.
2381 * The migration protocol uses (possibly smaller) target pages;
2382 * however, the source ensures it always sends all the components
2383 * of a host page in order.
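 * For example, with a 2MB hugetlbfs-backed RAMBlock and 4KB target pages,
 * 512 consecutive target pages are accumulated in postcopy_host_page
 * before the whole host page is placed in a single atomic operation.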
2385 page_buffer = postcopy_host_page +
2386 ((uintptr_t)host & (block->page_size - 1));
2387 /* If all target pages are zero then we can optimise the placement */
2388 if (!((uintptr_t)host & (block->page_size - 1))) {
2389 all_zero = true;
2390 } else {
2391 /* not the first target page within the host page */
2392 if (host != (last_host + TARGET_PAGE_SIZE)) {
2393 error_report("Non-sequential target page %p/%p",
2394 host, last_host);
2395 ret = -EINVAL;
2396 break;
2402 * If it's the last part of a host page then we place the host
2403 * page
2405 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2406 (block->page_size - 1)) == 0;
2407 place_source = postcopy_host_page;
2409 last_host = host;
2411 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2412 case RAM_SAVE_FLAG_ZERO:
2413 ch = qemu_get_byte(f);
2414 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2415 if (ch) {
2416 all_zero = false;
2418 break;
2420 case RAM_SAVE_FLAG_PAGE:
2421 all_zero = false;
2422 if (!place_needed || !matching_page_sizes) {
2423 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2424 } else {
2425 /* Avoid the qemu_file copy here, since postcopy placement is
2426 * going to copy the page later anyway; this can only be done when
2427 * we do the read in one go (matching page sizes)
2429 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2430 TARGET_PAGE_SIZE);
2432 break;
2433 case RAM_SAVE_FLAG_EOS:
2434 /* normal exit */
2435 break;
2436 default:
2437 error_report("Unknown combination of migration flags: %#x"
2438 " (postcopy mode)", flags);
2439 ret = -EINVAL;
2442 if (place_needed) {
2443 /* This gets called at the last target page in the host page */
2444 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
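/* host points at the last target page of the host page, so stepping
 * forward one target page and back one full host page yields the start
 * address handed to postcopy_place_page()/postcopy_place_page_zero(). */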
2446 if (all_zero) {
2447 ret = postcopy_place_page_zero(mis, place_dest,
2448 block->page_size);
2449 } else {
2450 ret = postcopy_place_page(mis, place_dest,
2451 place_source, block->page_size);
2454 if (!ret) {
2455 ret = qemu_file_get_error(f);
2459 return ret;
2462 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2464 int flags = 0, ret = 0, invalid_flags = 0;
2465 static uint64_t seq_iter;
2466 int len = 0;
2468 * If the system is running in postcopy mode, page inserts into host memory
2469 * must be atomic
2471 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2472 /* ADVISE comes earlier; it indicates the source has the postcopy capability enabled */
2473 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2475 seq_iter++;
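/* Only version 4 of the "ram" section format is supported; this matches
 * the version registered in ram_mig_init() below.
 */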
2477 if (version_id != 4) {
2478 ret = -EINVAL;
2481 if (!migrate_use_compression()) {
2482 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2484 /* This RCU critical section can be very long running.
2485 * If RCU reclamations in the code become numerous,
2486 * it will be necessary to reduce the granularity of this
2487 * critical section.
2489 rcu_read_lock();
2491 if (postcopy_running) {
2492 ret = ram_load_postcopy(f);
2495 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2496 ram_addr_t addr, total_ram_bytes;
2497 void *host = NULL;
2498 uint8_t ch;
2500 addr = qemu_get_be64(f);
2501 flags = addr & ~TARGET_PAGE_MASK;
2502 addr &= TARGET_PAGE_MASK;
2504 if (flags & invalid_flags) {
2505 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2506 error_report("Received an unexpected compressed page");
2509 ret = -EINVAL;
2510 break;
2513 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2514 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2515 RAMBlock *block = ram_block_from_stream(f, flags);
2517 host = host_from_ram_block_offset(block, addr);
2518 if (!host) {
2519 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2520 ret = -EINVAL;
2521 break;
2523 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2526 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2527 case RAM_SAVE_FLAG_MEM_SIZE:
2528 /* Synchronize RAM block list */
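/* The MEM_SIZE record carries the total RAM size in 'addr', followed by
 * one entry per block: a one-byte idstr length, the idstr itself, the
 * block's used_length as a be64 and, when postcopy was advised and the
 * block's page size differs from the host page size, its page size as a
 * be64, mirroring what ram_save_setup() writes on the source side.
 */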
2529 total_ram_bytes = addr;
2530 while (!ret && total_ram_bytes) {
2531 RAMBlock *block;
2532 char id[256];
2533 ram_addr_t length;
2535 len = qemu_get_byte(f);
2536 qemu_get_buffer(f, (uint8_t *)id, len);
2537 id[len] = 0;
2538 length = qemu_get_be64(f);
2540 block = qemu_ram_block_by_name(id);
2541 if (block) {
2542 if (length != block->used_length) {
2543 Error *local_err = NULL;
2545 ret = qemu_ram_resize(block, length,
2546 &local_err);
2547 if (local_err) {
2548 error_report_err(local_err);
2551 /* For postcopy we need to check that hugepage sizes match */
2552 if (postcopy_advised &&
2553 block->page_size != qemu_host_page_size) {
2554 uint64_t remote_page_size = qemu_get_be64(f);
2555 if (remote_page_size != block->page_size) {
2556 error_report("Mismatched RAM page size %s "
2557 "(local) %zd != %" PRId64,
2558 id, block->page_size,
2559 remote_page_size);
2560 ret = -EINVAL;
2563 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2564 block->idstr);
2565 } else {
2566 error_report("Unknown ramblock \"%s\", cannot "
2567 "accept migration", id);
2568 ret = -EINVAL;
2571 total_ram_bytes -= length;
2573 break;
2575 case RAM_SAVE_FLAG_ZERO:
2576 ch = qemu_get_byte(f);
2577 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2578 break;
2580 case RAM_SAVE_FLAG_PAGE:
2581 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2582 break;
2584 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2585 len = qemu_get_be32(f);
2586 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2587 error_report("Invalid compressed data length: %d", len);
2588 ret = -EINVAL;
2589 break;
2591 decompress_data_with_multi_threads(f, host, len);
2592 break;
2594 case RAM_SAVE_FLAG_XBZRLE:
2595 if (load_xbzrle(f, addr, host) < 0) {
2596 error_report("Failed to decompress XBZRLE page at "
2597 RAM_ADDR_FMT, addr);
2598 ret = -EINVAL;
2599 break;
2601 break;
2602 case RAM_SAVE_FLAG_EOS:
2603 /* normal exit */
2604 break;
2605 default:
2606 if (flags & RAM_SAVE_FLAG_HOOK) {
2607 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2608 } else {
2609 error_report("Unknown combination of migration flags: %#x",
2610 flags);
2611 ret = -EINVAL;
2614 if (!ret) {
2615 ret = qemu_file_get_error(f);
2619 wait_for_decompress_done();
2620 rcu_read_unlock();
2621 trace_ram_load_complete(ret, seq_iter);
2622 return ret;
2625 static SaveVMHandlers savevm_ram_handlers = {
2626 .save_live_setup = ram_save_setup,
2627 .save_live_iterate = ram_save_iterate,
2628 .save_live_complete_postcopy = ram_save_complete,
2629 .save_live_complete_precopy = ram_save_complete,
2630 .save_live_pending = ram_save_pending,
2631 .load_state = ram_load,
2632 .cleanup = ram_migration_cleanup,
2635 void ram_mig_init(void)
2637 qemu_mutex_init(&XBZRLE.lock);
2638 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);