migration: use dirty_rate_high_cnt more aggressively
[qemu/ar7.git] / migration / ram.c
blob26e03a5dfa35bccdebc29d8d3c00313f9e91653e
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "qemu-common.h"
30 #include "cpu.h"
31 #include <zlib.h>
32 #include "qapi-event.h"
33 #include "qemu/cutils.h"
34 #include "qemu/bitops.h"
35 #include "qemu/bitmap.h"
36 #include "qemu/timer.h"
37 #include "qemu/main-loop.h"
38 #include "xbzrle.h"
39 #include "migration/migration.h"
40 #include "migration/qemu-file.h"
41 #include "migration/vmstate.h"
42 #include "postcopy-ram.h"
43 #include "exec/address-spaces.h"
44 #include "migration/page_cache.h"
45 #include "qemu/error-report.h"
46 #include "trace.h"
47 #include "exec/ram_addr.h"
48 #include "qemu/rcu_queue.h"
49 #include "migration/colo.h"
51 /***********************************************************/
52 /* ram save/restore */
/*
 * Flags OR'ed into the page offset written by save_page_header().
 *
 * RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
 * worked for pages that were filled with the same char.  It now only
 * covers the zero value and was renamed to avoid confusion with
 * RAM_SAVE_FLAG_COMPRESS_PAGE.
 */
#define RAM_SAVE_FLAG_FULL          0x01 /* Obsolete, not used anymore */
#define RAM_SAVE_FLAG_ZERO          0x02
#define RAM_SAVE_FLAG_MEM_SIZE      0x04
#define RAM_SAVE_FLAG_PAGE          0x08
#define RAM_SAVE_FLAG_EOS           0x10
#define RAM_SAVE_FLAG_CONTINUE      0x20
#define RAM_SAVE_FLAG_XBZRLE        0x40
/* 0x80 is reserved in migration.h; start with 0x100 next */
#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100

/* Page handed to cache_insert() when a zero page is recorded in the cache */
static uint8_t *ZERO_TARGET_PAGE;
/* Report whether the @size bytes starting at @p are all zero. */
static inline bool is_zero_range(uint8_t *p, uint64_t size)
{
    bool all_zero = buffer_is_zero(p, size);

    return all_zero;
}
77 /* struct contains XBZRLE cache and a static page
78 used by the compression */
79 static struct {
80 /* buffer used for XBZRLE encoding */
81 uint8_t *encoded_buf;
82 /* buffer for storing page content */
83 uint8_t *current_buf;
84 /* Cache for XBZRLE, Protected by lock. */
85 PageCache *cache;
86 QemuMutex lock;
87 } XBZRLE;
89 /* buffer used for XBZRLE decoding */
90 static uint8_t *xbzrle_decoded_buf;
92 static void XBZRLE_cache_lock(void)
94 if (migrate_use_xbzrle())
95 qemu_mutex_lock(&XBZRLE.lock);
98 static void XBZRLE_cache_unlock(void)
100 if (migrate_use_xbzrle())
101 qemu_mutex_unlock(&XBZRLE.lock);
105 * xbzrle_cache_resize: resize the xbzrle cache
107 * This function is called from qmp_migrate_set_cache_size in main
108 * thread, possibly while a migration is in progress. A running
109 * migration may be using the cache and might finish during this call,
110 * hence changes to the cache are protected by XBZRLE.lock().
112 * Returns the new_size or negative in case of error.
114 * @new_size: new cache size
116 int64_t xbzrle_cache_resize(int64_t new_size)
118 PageCache *new_cache;
119 int64_t ret;
121 if (new_size < TARGET_PAGE_SIZE) {
122 return -1;
125 XBZRLE_cache_lock();
127 if (XBZRLE.cache != NULL) {
128 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
129 goto out_new_size;
131 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
132 TARGET_PAGE_SIZE);
133 if (!new_cache) {
134 error_report("Error creating cache");
135 ret = -1;
136 goto out;
139 cache_fini(XBZRLE.cache);
140 XBZRLE.cache = new_cache;
143 out_new_size:
144 ret = pow2floor(new_size);
145 out:
146 XBZRLE_cache_unlock();
147 return ret;
151 * An outstanding page request, on the source, having been received
152 * and queued
154 struct RAMSrcPageRequest {
155 RAMBlock *rb;
156 hwaddr offset;
157 hwaddr len;
159 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
162 /* State of RAM for migration */
163 struct RAMState {
164 /* QEMUFile used for this migration */
165 QEMUFile *f;
166 /* Last block that we have visited searching for dirty pages */
167 RAMBlock *last_seen_block;
168 /* Last block from where we have sent data */
169 RAMBlock *last_sent_block;
170 /* Last dirty target page we have sent */
171 ram_addr_t last_page;
172 /* last ram version we have seen */
173 uint32_t last_version;
174 /* We are in the first round */
175 bool ram_bulk_stage;
176 /* How many times we have dirty too many pages */
177 int dirty_rate_high_cnt;
178 /* How many times we have synchronized the bitmap */
179 uint64_t bitmap_sync_count;
180 /* these variables are used for bitmap sync */
181 /* last time we did a full bitmap_sync */
182 int64_t time_last_bitmap_sync;
183 /* bytes transferred at start_time */
184 uint64_t bytes_xfer_prev;
185 /* number of dirty pages since start_time */
186 uint64_t num_dirty_pages_period;
187 /* xbzrle misses since the beginning of the period */
188 uint64_t xbzrle_cache_miss_prev;
189 /* number of iterations at the beginning of period */
190 uint64_t iterations_prev;
191 /* Accounting fields */
192 /* number of zero pages. It used to be pages filled by the same char. */
193 uint64_t zero_pages;
194 /* number of normal transferred pages */
195 uint64_t norm_pages;
196 /* Iterations since start */
197 uint64_t iterations;
198 /* xbzrle transmitted bytes. Notice that this is with
199 * compression, they can't be calculated from the pages */
200 uint64_t xbzrle_bytes;
201 /* xbzrle transmmited pages */
202 uint64_t xbzrle_pages;
203 /* xbzrle number of cache miss */
204 uint64_t xbzrle_cache_miss;
205 /* xbzrle miss rate */
206 double xbzrle_cache_miss_rate;
207 /* xbzrle number of overflows */
208 uint64_t xbzrle_overflows;
209 /* number of dirty bits in the bitmap */
210 uint64_t migration_dirty_pages;
211 /* total number of bytes transferred */
212 uint64_t bytes_transferred;
213 /* number of dirtied pages in the last second */
214 uint64_t dirty_pages_rate;
215 /* Count of requests incoming from destination */
216 uint64_t postcopy_requests;
217 /* protects modification of the bitmap */
218 QemuMutex bitmap_mutex;
219 /* The RAMBlock used in the last src_page_requests */
220 RAMBlock *last_req_rb;
221 /* Queue of outstanding page requests from the destination */
222 QemuMutex src_page_req_mutex;
223 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
225 typedef struct RAMState RAMState;
227 static RAMState ram_state;
229 uint64_t dup_mig_pages_transferred(void)
231 return ram_state.zero_pages;
234 uint64_t norm_mig_pages_transferred(void)
236 return ram_state.norm_pages;
239 uint64_t xbzrle_mig_bytes_transferred(void)
241 return ram_state.xbzrle_bytes;
244 uint64_t xbzrle_mig_pages_transferred(void)
246 return ram_state.xbzrle_pages;
249 uint64_t xbzrle_mig_pages_cache_miss(void)
251 return ram_state.xbzrle_cache_miss;
254 double xbzrle_mig_cache_miss_rate(void)
256 return ram_state.xbzrle_cache_miss_rate;
259 uint64_t xbzrle_mig_pages_overflow(void)
261 return ram_state.xbzrle_overflows;
264 uint64_t ram_bytes_transferred(void)
266 return ram_state.bytes_transferred;
269 uint64_t ram_bytes_remaining(void)
271 return ram_state.migration_dirty_pages * TARGET_PAGE_SIZE;
274 uint64_t ram_dirty_sync_count(void)
276 return ram_state.bitmap_sync_count;
279 uint64_t ram_dirty_pages_rate(void)
281 return ram_state.dirty_pages_rate;
284 uint64_t ram_postcopy_requests(void)
286 return ram_state.postcopy_requests;
289 /* used by the search for pages to send */
290 struct PageSearchStatus {
291 /* Current block being searched */
292 RAMBlock *block;
293 /* Current page to search from */
294 unsigned long page;
295 /* Set once we wrap around */
296 bool complete_round;
298 typedef struct PageSearchStatus PageSearchStatus;
300 struct CompressParam {
301 bool done;
302 bool quit;
303 QEMUFile *file;
304 QemuMutex mutex;
305 QemuCond cond;
306 RAMBlock *block;
307 ram_addr_t offset;
309 typedef struct CompressParam CompressParam;
311 struct DecompressParam {
312 bool done;
313 bool quit;
314 QemuMutex mutex;
315 QemuCond cond;
316 void *des;
317 uint8_t *compbuf;
318 int len;
320 typedef struct DecompressParam DecompressParam;
322 static CompressParam *comp_param;
323 static QemuThread *compress_threads;
324 /* comp_done_cond is used to wake up the migration thread when
325 * one of the compression threads has finished the compression.
326 * comp_done_lock is used to co-work with comp_done_cond.
328 static QemuMutex comp_done_lock;
329 static QemuCond comp_done_cond;
330 /* The empty QEMUFileOps will be used by file in CompressParam */
331 static const QEMUFileOps empty_ops = { };
333 static DecompressParam *decomp_param;
334 static QemuThread *decompress_threads;
335 static QemuMutex decomp_done_lock;
336 static QemuCond decomp_done_cond;
338 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
339 ram_addr_t offset);
341 static void *do_data_compress(void *opaque)
343 CompressParam *param = opaque;
344 RAMBlock *block;
345 ram_addr_t offset;
347 qemu_mutex_lock(&param->mutex);
348 while (!param->quit) {
349 if (param->block) {
350 block = param->block;
351 offset = param->offset;
352 param->block = NULL;
353 qemu_mutex_unlock(&param->mutex);
355 do_compress_ram_page(param->file, block, offset);
357 qemu_mutex_lock(&comp_done_lock);
358 param->done = true;
359 qemu_cond_signal(&comp_done_cond);
360 qemu_mutex_unlock(&comp_done_lock);
362 qemu_mutex_lock(&param->mutex);
363 } else {
364 qemu_cond_wait(&param->cond, &param->mutex);
367 qemu_mutex_unlock(&param->mutex);
369 return NULL;
372 static inline void terminate_compression_threads(void)
374 int idx, thread_count;
376 thread_count = migrate_compress_threads();
378 for (idx = 0; idx < thread_count; idx++) {
379 qemu_mutex_lock(&comp_param[idx].mutex);
380 comp_param[idx].quit = true;
381 qemu_cond_signal(&comp_param[idx].cond);
382 qemu_mutex_unlock(&comp_param[idx].mutex);
386 void migrate_compress_threads_join(void)
388 int i, thread_count;
390 if (!migrate_use_compression()) {
391 return;
393 terminate_compression_threads();
394 thread_count = migrate_compress_threads();
395 for (i = 0; i < thread_count; i++) {
396 qemu_thread_join(compress_threads + i);
397 qemu_fclose(comp_param[i].file);
398 qemu_mutex_destroy(&comp_param[i].mutex);
399 qemu_cond_destroy(&comp_param[i].cond);
401 qemu_mutex_destroy(&comp_done_lock);
402 qemu_cond_destroy(&comp_done_cond);
403 g_free(compress_threads);
404 g_free(comp_param);
405 compress_threads = NULL;
406 comp_param = NULL;
409 void migrate_compress_threads_create(void)
411 int i, thread_count;
413 if (!migrate_use_compression()) {
414 return;
416 thread_count = migrate_compress_threads();
417 compress_threads = g_new0(QemuThread, thread_count);
418 comp_param = g_new0(CompressParam, thread_count);
419 qemu_cond_init(&comp_done_cond);
420 qemu_mutex_init(&comp_done_lock);
421 for (i = 0; i < thread_count; i++) {
422 /* comp_param[i].file is just used as a dummy buffer to save data,
423 * set its ops to empty.
425 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
426 comp_param[i].done = true;
427 comp_param[i].quit = false;
428 qemu_mutex_init(&comp_param[i].mutex);
429 qemu_cond_init(&comp_param[i].cond);
430 qemu_thread_create(compress_threads + i, "compress",
431 do_data_compress, comp_param + i,
432 QEMU_THREAD_JOINABLE);
437 * save_page_header: write page header to wire
439 * If this is the 1st block, it also writes the block identification
441 * Returns the number of bytes written
443 * @f: QEMUFile where to send the data
444 * @block: block that contains the page we want to send
445 * @offset: offset inside the block for the page
446 * in the lower bits, it contains flags
448 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
449 ram_addr_t offset)
451 size_t size, len;
453 if (block == rs->last_sent_block) {
454 offset |= RAM_SAVE_FLAG_CONTINUE;
456 qemu_put_be64(f, offset);
457 size = 8;
459 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
460 len = strlen(block->idstr);
461 qemu_put_byte(f, len);
462 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
463 size += 1 + len;
464 rs->last_sent_block = block;
466 return size;
470 * mig_throttle_guest_down: throotle down the guest
472 * Reduce amount of guest cpu execution to hopefully slow down memory
473 * writes. If guest dirty memory rate is reduced below the rate at
474 * which we can transfer pages to the destination then we should be
475 * able to complete migration. Some workloads dirty memory way too
476 * fast and will not effectively converge, even with auto-converge.
478 static void mig_throttle_guest_down(void)
480 MigrationState *s = migrate_get_current();
481 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
482 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
484 /* We have not started throttling yet. Let's start it. */
485 if (!cpu_throttle_active()) {
486 cpu_throttle_set(pct_initial);
487 } else {
488 /* Throttling already on, just increase the rate */
489 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
494 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
496 * @rs: current RAM state
497 * @current_addr: address for the zero page
499 * Update the xbzrle cache to reflect a page that's been sent as all 0.
500 * The important thing is that a stale (not-yet-0'd) page be replaced
501 * by the new data.
502 * As a bonus, if the page wasn't in the cache it gets added so that
503 * when a small write is made into the 0'd page it gets XBZRLE sent.
505 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
507 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
508 return;
511 /* We don't care if this fails to allocate a new cache page
512 * as long as it updated an old one */
513 cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE,
514 rs->bitmap_sync_count);
517 #define ENCODING_FLAG_XBZRLE 0x1
520 * save_xbzrle_page: compress and send current page
522 * Returns: 1 means that we wrote the page
523 * 0 means that page is identical to the one already sent
524 * -1 means that xbzrle would be longer than normal
526 * @rs: current RAM state
527 * @current_data: pointer to the address of the page contents
528 * @current_addr: addr of the page
529 * @block: block that contains the page we want to send
530 * @offset: offset inside the block for the page
531 * @last_stage: if we are at the completion stage
533 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
534 ram_addr_t current_addr, RAMBlock *block,
535 ram_addr_t offset, bool last_stage)
537 int encoded_len = 0, bytes_xbzrle;
538 uint8_t *prev_cached_page;
540 if (!cache_is_cached(XBZRLE.cache, current_addr, rs->bitmap_sync_count)) {
541 rs->xbzrle_cache_miss++;
542 if (!last_stage) {
543 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
544 rs->bitmap_sync_count) == -1) {
545 return -1;
546 } else {
547 /* update *current_data when the page has been
548 inserted into cache */
549 *current_data = get_cached_data(XBZRLE.cache, current_addr);
552 return -1;
555 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
557 /* save current buffer into memory */
558 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
560 /* XBZRLE encoding (if there is no overflow) */
561 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
562 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
563 TARGET_PAGE_SIZE);
564 if (encoded_len == 0) {
565 trace_save_xbzrle_page_skipping();
566 return 0;
567 } else if (encoded_len == -1) {
568 trace_save_xbzrle_page_overflow();
569 rs->xbzrle_overflows++;
570 /* update data in the cache */
571 if (!last_stage) {
572 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
573 *current_data = prev_cached_page;
575 return -1;
578 /* we need to update the data in the cache, in order to get the same data */
579 if (!last_stage) {
580 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
583 /* Send XBZRLE based compressed page */
584 bytes_xbzrle = save_page_header(rs, rs->f, block,
585 offset | RAM_SAVE_FLAG_XBZRLE);
586 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
587 qemu_put_be16(rs->f, encoded_len);
588 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
589 bytes_xbzrle += encoded_len + 1 + 2;
590 rs->xbzrle_pages++;
591 rs->xbzrle_bytes += bytes_xbzrle;
592 rs->bytes_transferred += bytes_xbzrle;
594 return 1;
598 * migration_bitmap_find_dirty: find the next dirty page from start
600 * Called with rcu_read_lock() to protect migration_bitmap
602 * Returns the byte offset within memory region of the start of a dirty page
604 * @rs: current RAM state
605 * @rb: RAMBlock where to search for dirty pages
606 * @start: page where we start the search
608 static inline
609 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
610 unsigned long start)
612 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
613 unsigned long *bitmap = rb->bmap;
614 unsigned long next;
616 if (rs->ram_bulk_stage && start > 0) {
617 next = start + 1;
618 } else {
619 next = find_next_bit(bitmap, size, start);
622 return next;
625 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
626 RAMBlock *rb,
627 unsigned long page)
629 bool ret;
631 ret = test_and_clear_bit(page, rb->bmap);
633 if (ret) {
634 rs->migration_dirty_pages--;
636 return ret;
639 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
640 ram_addr_t start, ram_addr_t length)
642 rs->migration_dirty_pages +=
643 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
644 &rs->num_dirty_pages_period);
648 * ram_pagesize_summary: calculate all the pagesizes of a VM
650 * Returns a summary bitmap of the page sizes of all RAMBlocks
652 * For VMs with just normal pages this is equivalent to the host page
653 * size. If it's got some huge pages then it's the OR of all the
654 * different page sizes.
656 uint64_t ram_pagesize_summary(void)
658 RAMBlock *block;
659 uint64_t summary = 0;
661 RAMBLOCK_FOREACH(block) {
662 summary |= block->page_size;
665 return summary;
668 static void migration_bitmap_sync(RAMState *rs)
670 RAMBlock *block;
671 int64_t end_time;
672 uint64_t bytes_xfer_now;
674 rs->bitmap_sync_count++;
676 if (!rs->time_last_bitmap_sync) {
677 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
680 trace_migration_bitmap_sync_start();
681 memory_global_dirty_log_sync();
683 qemu_mutex_lock(&rs->bitmap_mutex);
684 rcu_read_lock();
685 RAMBLOCK_FOREACH(block) {
686 migration_bitmap_sync_range(rs, block, 0, block->used_length);
688 rcu_read_unlock();
689 qemu_mutex_unlock(&rs->bitmap_mutex);
691 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
693 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
695 /* more than 1 second = 1000 millisecons */
696 if (end_time > rs->time_last_bitmap_sync + 1000) {
697 /* calculate period counters */
698 rs->dirty_pages_rate = rs->num_dirty_pages_period * 1000
699 / (end_time - rs->time_last_bitmap_sync);
700 bytes_xfer_now = ram_bytes_transferred();
702 if (migrate_auto_converge()) {
703 /* The following detection logic can be refined later. For now:
704 Check to see if the dirtied bytes is 50% more than the approx.
705 amount of bytes that just got transferred since the last time we
706 were in this routine. If that happens twice, start or increase
707 throttling */
709 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
710 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
711 (++rs->dirty_rate_high_cnt >= 2)) {
712 trace_migration_throttle();
713 rs->dirty_rate_high_cnt = 0;
714 mig_throttle_guest_down();
718 if (migrate_use_xbzrle()) {
719 if (rs->iterations_prev != rs->iterations) {
720 rs->xbzrle_cache_miss_rate =
721 (double)(rs->xbzrle_cache_miss -
722 rs->xbzrle_cache_miss_prev) /
723 (rs->iterations - rs->iterations_prev);
725 rs->iterations_prev = rs->iterations;
726 rs->xbzrle_cache_miss_prev = rs->xbzrle_cache_miss;
729 /* reset period counters */
730 rs->time_last_bitmap_sync = end_time;
731 rs->num_dirty_pages_period = 0;
732 rs->bytes_xfer_prev = bytes_xfer_now;
734 if (migrate_use_events()) {
735 qapi_event_send_migration_pass(rs->bitmap_sync_count, NULL);
740 * save_zero_page: send the zero page to the stream
742 * Returns the number of pages written.
744 * @rs: current RAM state
745 * @block: block that contains the page we want to send
746 * @offset: offset inside the block for the page
747 * @p: pointer to the page
749 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
750 uint8_t *p)
752 int pages = -1;
754 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
755 rs->zero_pages++;
756 rs->bytes_transferred +=
757 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
758 qemu_put_byte(rs->f, 0);
759 rs->bytes_transferred += 1;
760 pages = 1;
763 return pages;
766 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
768 if (!migrate_release_ram() || !migration_in_postcopy()) {
769 return;
772 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
776 * ram_save_page: send the given page to the stream
778 * Returns the number of pages written.
779 * < 0 - error
780 * >=0 - Number of pages written - this might legally be 0
781 * if xbzrle noticed the page was the same.
783 * @rs: current RAM state
784 * @block: block that contains the page we want to send
785 * @offset: offset inside the block for the page
786 * @last_stage: if we are at the completion stage
788 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
790 int pages = -1;
791 uint64_t bytes_xmit;
792 ram_addr_t current_addr;
793 uint8_t *p;
794 int ret;
795 bool send_async = true;
796 RAMBlock *block = pss->block;
797 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
799 p = block->host + offset;
800 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
802 /* In doubt sent page as normal */
803 bytes_xmit = 0;
804 ret = ram_control_save_page(rs->f, block->offset,
805 offset, TARGET_PAGE_SIZE, &bytes_xmit);
806 if (bytes_xmit) {
807 rs->bytes_transferred += bytes_xmit;
808 pages = 1;
811 XBZRLE_cache_lock();
813 current_addr = block->offset + offset;
815 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
816 if (ret != RAM_SAVE_CONTROL_DELAYED) {
817 if (bytes_xmit > 0) {
818 rs->norm_pages++;
819 } else if (bytes_xmit == 0) {
820 rs->zero_pages++;
823 } else {
824 pages = save_zero_page(rs, block, offset, p);
825 if (pages > 0) {
826 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
827 * page would be stale
829 xbzrle_cache_zero_page(rs, current_addr);
830 ram_release_pages(block->idstr, offset, pages);
831 } else if (!rs->ram_bulk_stage &&
832 !migration_in_postcopy() && migrate_use_xbzrle()) {
833 pages = save_xbzrle_page(rs, &p, current_addr, block,
834 offset, last_stage);
835 if (!last_stage) {
836 /* Can't send this cached data async, since the cache page
837 * might get updated before it gets to the wire
839 send_async = false;
844 /* XBZRLE overflow or normal page */
845 if (pages == -1) {
846 rs->bytes_transferred += save_page_header(rs, rs->f, block,
847 offset | RAM_SAVE_FLAG_PAGE);
848 if (send_async) {
849 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
850 migrate_release_ram() &
851 migration_in_postcopy());
852 } else {
853 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
855 rs->bytes_transferred += TARGET_PAGE_SIZE;
856 pages = 1;
857 rs->norm_pages++;
860 XBZRLE_cache_unlock();
862 return pages;
865 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
866 ram_addr_t offset)
868 RAMState *rs = &ram_state;
869 int bytes_sent, blen;
870 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
872 bytes_sent = save_page_header(rs, f, block, offset |
873 RAM_SAVE_FLAG_COMPRESS_PAGE);
874 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
875 migrate_compress_level());
876 if (blen < 0) {
877 bytes_sent = 0;
878 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
879 error_report("compressed data failed!");
880 } else {
881 bytes_sent += blen;
882 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
885 return bytes_sent;
888 static void flush_compressed_data(RAMState *rs)
890 int idx, len, thread_count;
892 if (!migrate_use_compression()) {
893 return;
895 thread_count = migrate_compress_threads();
897 qemu_mutex_lock(&comp_done_lock);
898 for (idx = 0; idx < thread_count; idx++) {
899 while (!comp_param[idx].done) {
900 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
903 qemu_mutex_unlock(&comp_done_lock);
905 for (idx = 0; idx < thread_count; idx++) {
906 qemu_mutex_lock(&comp_param[idx].mutex);
907 if (!comp_param[idx].quit) {
908 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
909 rs->bytes_transferred += len;
911 qemu_mutex_unlock(&comp_param[idx].mutex);
915 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
916 ram_addr_t offset)
918 param->block = block;
919 param->offset = offset;
922 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
923 ram_addr_t offset)
925 int idx, thread_count, bytes_xmit = -1, pages = -1;
927 thread_count = migrate_compress_threads();
928 qemu_mutex_lock(&comp_done_lock);
929 while (true) {
930 for (idx = 0; idx < thread_count; idx++) {
931 if (comp_param[idx].done) {
932 comp_param[idx].done = false;
933 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
934 qemu_mutex_lock(&comp_param[idx].mutex);
935 set_compress_params(&comp_param[idx], block, offset);
936 qemu_cond_signal(&comp_param[idx].cond);
937 qemu_mutex_unlock(&comp_param[idx].mutex);
938 pages = 1;
939 rs->norm_pages++;
940 rs->bytes_transferred += bytes_xmit;
941 break;
944 if (pages > 0) {
945 break;
946 } else {
947 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
950 qemu_mutex_unlock(&comp_done_lock);
952 return pages;
956 * ram_save_compressed_page: compress the given page and send it to the stream
958 * Returns the number of pages written.
960 * @rs: current RAM state
961 * @block: block that contains the page we want to send
962 * @offset: offset inside the block for the page
963 * @last_stage: if we are at the completion stage
965 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
966 bool last_stage)
968 int pages = -1;
969 uint64_t bytes_xmit = 0;
970 uint8_t *p;
971 int ret, blen;
972 RAMBlock *block = pss->block;
973 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
975 p = block->host + offset;
977 ret = ram_control_save_page(rs->f, block->offset,
978 offset, TARGET_PAGE_SIZE, &bytes_xmit);
979 if (bytes_xmit) {
980 rs->bytes_transferred += bytes_xmit;
981 pages = 1;
983 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
984 if (ret != RAM_SAVE_CONTROL_DELAYED) {
985 if (bytes_xmit > 0) {
986 rs->norm_pages++;
987 } else if (bytes_xmit == 0) {
988 rs->zero_pages++;
991 } else {
992 /* When starting the process of a new block, the first page of
993 * the block should be sent out before other pages in the same
994 * block, and all the pages in last block should have been sent
995 * out, keeping this order is important, because the 'cont' flag
996 * is used to avoid resending the block name.
998 if (block != rs->last_sent_block) {
999 flush_compressed_data(rs);
1000 pages = save_zero_page(rs, block, offset, p);
1001 if (pages == -1) {
1002 /* Make sure the first page is sent out before other pages */
1003 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1004 RAM_SAVE_FLAG_COMPRESS_PAGE);
1005 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1006 migrate_compress_level());
1007 if (blen > 0) {
1008 rs->bytes_transferred += bytes_xmit + blen;
1009 rs->norm_pages++;
1010 pages = 1;
1011 } else {
1012 qemu_file_set_error(rs->f, blen);
1013 error_report("compressed data failed!");
1016 if (pages > 0) {
1017 ram_release_pages(block->idstr, offset, pages);
1019 } else {
1020 pages = save_zero_page(rs, block, offset, p);
1021 if (pages == -1) {
1022 pages = compress_page_with_multi_thread(rs, block, offset);
1023 } else {
1024 ram_release_pages(block->idstr, offset, pages);
1029 return pages;
1033 * find_dirty_block: find the next dirty page and update any state
1034 * associated with the search process.
1036 * Returns if a page is found
1038 * @rs: current RAM state
1039 * @pss: data about the state of the current dirty page scan
1040 * @again: set to false if the search has scanned the whole of RAM
1042 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1044 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1045 if (pss->complete_round && pss->block == rs->last_seen_block &&
1046 pss->page >= rs->last_page) {
1048 * We've been once around the RAM and haven't found anything.
1049 * Give up.
1051 *again = false;
1052 return false;
1054 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1055 /* Didn't find anything in this RAM Block */
1056 pss->page = 0;
1057 pss->block = QLIST_NEXT_RCU(pss->block, next);
1058 if (!pss->block) {
1059 /* Hit the end of the list */
1060 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1061 /* Flag that we've looped */
1062 pss->complete_round = true;
1063 rs->ram_bulk_stage = false;
1064 if (migrate_use_xbzrle()) {
1065 /* If xbzrle is on, stop using the data compression at this
1066 * point. In theory, xbzrle can do better than compression.
1068 flush_compressed_data(rs);
1071 /* Didn't find anything this time, but try again on the new block */
1072 *again = true;
1073 return false;
1074 } else {
1075 /* Can go around again, but... */
1076 *again = true;
1077 /* We've found something so probably don't need to */
1078 return true;
1083 * unqueue_page: gets a page of the queue
1085 * Helper for 'get_queued_page' - gets a page off the queue
1087 * Returns the block of the page (or NULL if none available)
1089 * @rs: current RAM state
1090 * @offset: used to return the offset within the RAMBlock
1092 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1094 RAMBlock *block = NULL;
1096 qemu_mutex_lock(&rs->src_page_req_mutex);
1097 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1098 struct RAMSrcPageRequest *entry =
1099 QSIMPLEQ_FIRST(&rs->src_page_requests);
1100 block = entry->rb;
1101 *offset = entry->offset;
1103 if (entry->len > TARGET_PAGE_SIZE) {
1104 entry->len -= TARGET_PAGE_SIZE;
1105 entry->offset += TARGET_PAGE_SIZE;
1106 } else {
1107 memory_region_unref(block->mr);
1108 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1109 g_free(entry);
1112 qemu_mutex_unlock(&rs->src_page_req_mutex);
1114 return block;
1118 * get_queued_page: unqueue a page from the postocpy requests
1120 * Skips pages that are already sent (!dirty)
1122 * Returns if a queued page is found
1124 * @rs: current RAM state
1125 * @pss: data about the state of the current dirty page scan
1127 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1129 RAMBlock *block;
1130 ram_addr_t offset;
1131 bool dirty;
1133 do {
1134 block = unqueue_page(rs, &offset);
1136 * We're sending this page, and since it's postcopy nothing else
1137 * will dirty it, and we must make sure it doesn't get sent again
1138 * even if this queue request was received after the background
1139 * search already sent it.
1141 if (block) {
1142 unsigned long page;
1144 page = offset >> TARGET_PAGE_BITS;
1145 dirty = test_bit(page, block->bmap);
1146 if (!dirty) {
1147 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1148 page, test_bit(page, block->unsentmap));
1149 } else {
1150 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1154 } while (block && !dirty);
1156 if (block) {
1158 * As soon as we start servicing pages out of order, then we have
1159 * to kill the bulk stage, since the bulk stage assumes
1160 * in (migration_bitmap_find_and_reset_dirty) that every page is
1161 * dirty, that's no longer true.
1163 rs->ram_bulk_stage = false;
1166 * We want the background search to continue from the queued page
1167 * since the guest is likely to want other pages near to the page
1168 * it just requested.
1170 pss->block = block;
1171 pss->page = offset >> TARGET_PAGE_BITS;
1174 return !!block;
1178 * migration_page_queue_free: drop any remaining pages in the ram
1179 * request queue
1181 * It should be empty at the end anyway, but in error cases there may
1182 * be some left. in case that there is any page left, we drop it.
1185 void migration_page_queue_free(void)
1187 struct RAMSrcPageRequest *mspr, *next_mspr;
1188 RAMState *rs = &ram_state;
1189 /* This queue generally should be empty - but in the case of a failed
1190 * migration might have some droppings in.
1192 rcu_read_lock();
1193 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1194 memory_region_unref(mspr->rb->mr);
1195 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1196 g_free(mspr);
1198 rcu_read_unlock();
1202 * ram_save_queue_pages: queue the page for transmission
1204 * A request from postcopy destination for example.
1206 * Returns zero on success or negative on error
1208 * @rbname: Name of the RAMBLock of the request. NULL means the
1209 * same that last one.
1210 * @start: starting address from the start of the RAMBlock
1211 * @len: length (in bytes) to send
1213 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1215 RAMBlock *ramblock;
1216 RAMState *rs = &ram_state;
1218 rs->postcopy_requests++;
1219 rcu_read_lock();
1220 if (!rbname) {
1221 /* Reuse last RAMBlock */
1222 ramblock = rs->last_req_rb;
1224 if (!ramblock) {
1226 * Shouldn't happen, we can't reuse the last RAMBlock if
1227 * it's the 1st request.
1229 error_report("ram_save_queue_pages no previous block");
1230 goto err;
1232 } else {
1233 ramblock = qemu_ram_block_by_name(rbname);
1235 if (!ramblock) {
1236 /* We shouldn't be asked for a non-existent RAMBlock */
1237 error_report("ram_save_queue_pages no block '%s'", rbname);
1238 goto err;
1240 rs->last_req_rb = ramblock;
1242 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1243 if (start+len > ramblock->used_length) {
1244 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1245 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1246 __func__, start, len, ramblock->used_length);
1247 goto err;
1250 struct RAMSrcPageRequest *new_entry =
1251 g_malloc0(sizeof(struct RAMSrcPageRequest));
1252 new_entry->rb = ramblock;
1253 new_entry->offset = start;
1254 new_entry->len = len;
1256 memory_region_ref(ramblock->mr);
1257 qemu_mutex_lock(&rs->src_page_req_mutex);
1258 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1259 qemu_mutex_unlock(&rs->src_page_req_mutex);
1260 rcu_read_unlock();
1262 return 0;
1264 err:
1265 rcu_read_unlock();
1266 return -1;
1270 * ram_save_target_page: save one target page
1272 * Returns the number of pages written
1274 * @rs: current RAM state
1275 * @ms: current migration state
1276 * @pss: data about the page we want to send
1277 * @last_stage: if we are at the completion stage
1279 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1280 bool last_stage)
1282 int res = 0;
1284 /* Check the pages is dirty and if it is send it */
1285 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1287 * If xbzrle is on, stop using the data compression after first
1288 * round of migration even if compression is enabled. In theory,
1289 * xbzrle can do better than compression.
1291 if (migrate_use_compression() &&
1292 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1293 res = ram_save_compressed_page(rs, pss, last_stage);
1294 } else {
1295 res = ram_save_page(rs, pss, last_stage);
1298 if (res < 0) {
1299 return res;
1301 if (pss->block->unsentmap) {
1302 clear_bit(pss->page, pss->block->unsentmap);
1306 return res;
1310 * ram_save_host_page: save a whole host page
1312 * Starting at *offset send pages up to the end of the current host
1313 * page. It's valid for the initial offset to point into the middle of
1314 * a host page in which case the remainder of the hostpage is sent.
1315 * Only dirty target pages are sent. Note that the host page size may
1316 * be a huge page for this block.
1317 * The saving stops at the boundary of the used_length of the block
1318 * if the RAMBlock isn't a multiple of the host page size.
1320 * Returns the number of pages written or negative on error
1322 * @rs: current RAM state
1323 * @ms: current migration state
1324 * @pss: data about the page we want to send
1325 * @last_stage: if we are at the completion stage
1327 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1328 bool last_stage)
1330 int tmppages, pages = 0;
1331 size_t pagesize_bits =
1332 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1334 do {
1335 tmppages = ram_save_target_page(rs, pss, last_stage);
1336 if (tmppages < 0) {
1337 return tmppages;
1340 pages += tmppages;
1341 pss->page++;
1342 } while ((pss->page & (pagesize_bits - 1)) &&
1343 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1345 /* The offset we leave with is the last one we looked at */
1346 pss->page--;
1347 return pages;
1351 * ram_find_and_save_block: finds a dirty page and sends it to f
1353 * Called within an RCU critical section.
1355 * Returns the number of pages written where zero means no dirty pages
1357 * @rs: current RAM state
1358 * @last_stage: if we are at the completion stage
1360 * On systems where host-page-size > target-page-size it will send all the
1361 * pages in a host page that are dirty.
1364 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1366 PageSearchStatus pss;
1367 int pages = 0;
1368 bool again, found;
1370 /* No dirty page as there is zero RAM */
1371 if (!ram_bytes_total()) {
1372 return pages;
1375 pss.block = rs->last_seen_block;
1376 pss.page = rs->last_page;
1377 pss.complete_round = false;
1379 if (!pss.block) {
1380 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1383 do {
1384 again = true;
1385 found = get_queued_page(rs, &pss);
1387 if (!found) {
1388 /* priority queue empty, so just search for something dirty */
1389 found = find_dirty_block(rs, &pss, &again);
1392 if (found) {
1393 pages = ram_save_host_page(rs, &pss, last_stage);
1395 } while (!pages && again);
1397 rs->last_seen_block = pss.block;
1398 rs->last_page = pss.page;
1400 return pages;
1403 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1405 uint64_t pages = size / TARGET_PAGE_SIZE;
1406 RAMState *rs = &ram_state;
1408 if (zero) {
1409 rs->zero_pages += pages;
1410 } else {
1411 rs->norm_pages += pages;
1412 rs->bytes_transferred += size;
1413 qemu_update_position(f, size);
1417 uint64_t ram_bytes_total(void)
1419 RAMBlock *block;
1420 uint64_t total = 0;
1422 rcu_read_lock();
1423 RAMBLOCK_FOREACH(block) {
1424 total += block->used_length;
1426 rcu_read_unlock();
1427 return total;
1430 void free_xbzrle_decoded_buf(void)
1432 g_free(xbzrle_decoded_buf);
1433 xbzrle_decoded_buf = NULL;
1436 static void ram_migration_cleanup(void *opaque)
1438 RAMBlock *block;
1440 /* caller have hold iothread lock or is in a bh, so there is
1441 * no writing race against this migration_bitmap
1443 memory_global_dirty_log_stop();
1445 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1446 g_free(block->bmap);
1447 block->bmap = NULL;
1448 g_free(block->unsentmap);
1449 block->unsentmap = NULL;
1452 XBZRLE_cache_lock();
1453 if (XBZRLE.cache) {
1454 cache_fini(XBZRLE.cache);
1455 g_free(XBZRLE.encoded_buf);
1456 g_free(XBZRLE.current_buf);
1457 g_free(ZERO_TARGET_PAGE);
1458 XBZRLE.cache = NULL;
1459 XBZRLE.encoded_buf = NULL;
1460 XBZRLE.current_buf = NULL;
1462 XBZRLE_cache_unlock();
1465 static void ram_state_reset(RAMState *rs)
1467 rs->last_seen_block = NULL;
1468 rs->last_sent_block = NULL;
1469 rs->last_page = 0;
1470 rs->last_version = ram_list.version;
1471 rs->ram_bulk_stage = true;
/* Upper bound, in ms, on the time ram_save_iterate spends in one call */
#define MAX_WAIT 50 /* ms, half buffered_file limit */
/**
 * ram_debug_dump_bitmap: dump a bitmap to stderr, 128 bits per line
 *
 * Set bits are printed as '1' and clear bits as '.'; each line is
 * prefixed with the index of its first bit in hex.
 *
 * @todump: bitmap to dump; nothing is printed if it is NULL
 * @expected: the value you expect the bitmap mostly to be full of;
 *            lines that are entirely this value are not printed
 * @pages: number of bits in the bitmap
 */
void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
                           unsigned long pages)
{
    enum { LINE_BITS = 128 };
    char linebuf[LINE_BITS + 1];
    unsigned long cur = 0;

    if (!todump) {
        return;
    }

    while (cur < pages) {
        unsigned long linelen = pages - cur;
        unsigned long curb;
        bool found = false;

        /*
         * Last line; catch the case where the line length
         * is longer than remaining ram
         */
        if (linelen > LINE_BITS) {
            linelen = LINE_BITS;
        }

        for (curb = 0; curb < linelen; curb++) {
            bool thisbit = test_bit(cur + curb, todump);

            linebuf[curb] = thisbit ? '1' : '.';
            found = found || (thisbit != expected);
        }
        if (found) {
            linebuf[linelen] = '\0';
            fprintf(stderr, "0x%08" PRIx64 " : %s\n", (uint64_t)cur, linebuf);
        }

        cur += linelen;
    }
}
1510 /* **** functions for postcopy ***** */
1512 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1514 struct RAMBlock *block;
1516 RAMBLOCK_FOREACH(block) {
1517 unsigned long *bitmap = block->bmap;
1518 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1519 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1521 while (run_start < range) {
1522 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1523 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1524 (run_end - run_start) << TARGET_PAGE_BITS);
1525 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1531 * postcopy_send_discard_bm_ram: discard a RAMBlock
1533 * Returns zero on success
1535 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1536 * Note: At this point the 'unsentmap' is the processed bitmap combined
1537 * with the dirtymap; so a '1' means it's either dirty or unsent.
1539 * @ms: current migration state
1540 * @pds: state for postcopy
1541 * @start: RAMBlock starting page
1542 * @length: RAMBlock size
1544 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1545 PostcopyDiscardState *pds,
1546 RAMBlock *block)
1548 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1549 unsigned long current;
1550 unsigned long *unsentmap = block->unsentmap;
1552 for (current = 0; current < end; ) {
1553 unsigned long one = find_next_bit(unsentmap, end, current);
1555 if (one <= end) {
1556 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1557 unsigned long discard_length;
1559 if (zero >= end) {
1560 discard_length = end - one;
1561 } else {
1562 discard_length = zero - one;
1564 if (discard_length) {
1565 postcopy_discard_send_range(ms, pds, one, discard_length);
1567 current = one + discard_length;
1568 } else {
1569 current = one;
1573 return 0;
1577 * postcopy_each_ram_send_discard: discard all RAMBlocks
1579 * Returns 0 for success or negative for error
1581 * Utility for the outgoing postcopy code.
1582 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1583 * passing it bitmap indexes and name.
1584 * (qemu_ram_foreach_block ends up passing unscaled lengths
1585 * which would mean postcopy code would have to deal with target page)
1587 * @ms: current migration state
1589 static int postcopy_each_ram_send_discard(MigrationState *ms)
1591 struct RAMBlock *block;
1592 int ret;
1594 RAMBLOCK_FOREACH(block) {
1595 PostcopyDiscardState *pds =
1596 postcopy_discard_send_init(ms, block->idstr);
1599 * Postcopy sends chunks of bitmap over the wire, but it
1600 * just needs indexes at this point, avoids it having
1601 * target page specific code.
1603 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1604 postcopy_discard_send_finish(ms, pds);
1605 if (ret) {
1606 return ret;
1610 return 0;
1614 * postcopy_chunk_hostpages_pass: canocalize bitmap in hostpages
1616 * Helper for postcopy_chunk_hostpages; it's called twice to
1617 * canonicalize the two bitmaps, that are similar, but one is
1618 * inverted.
1620 * Postcopy requires that all target pages in a hostpage are dirty or
1621 * clean, not a mix. This function canonicalizes the bitmaps.
1623 * @ms: current migration state
1624 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1625 * otherwise we need to canonicalize partially dirty host pages
1626 * @block: block that contains the page we want to canonicalize
1627 * @pds: state for postcopy
1629 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1630 RAMBlock *block,
1631 PostcopyDiscardState *pds)
1633 RAMState *rs = &ram_state;
1634 unsigned long *bitmap = block->bmap;
1635 unsigned long *unsentmap = block->unsentmap;
1636 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1637 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1638 unsigned long run_start;
1640 if (block->page_size == TARGET_PAGE_SIZE) {
1641 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1642 return;
1645 if (unsent_pass) {
1646 /* Find a sent page */
1647 run_start = find_next_zero_bit(unsentmap, pages, 0);
1648 } else {
1649 /* Find a dirty page */
1650 run_start = find_next_bit(bitmap, pages, 0);
1653 while (run_start < pages) {
1654 bool do_fixup = false;
1655 unsigned long fixup_start_addr;
1656 unsigned long host_offset;
1659 * If the start of this run of pages is in the middle of a host
1660 * page, then we need to fixup this host page.
1662 host_offset = run_start % host_ratio;
1663 if (host_offset) {
1664 do_fixup = true;
1665 run_start -= host_offset;
1666 fixup_start_addr = run_start;
1667 /* For the next pass */
1668 run_start = run_start + host_ratio;
1669 } else {
1670 /* Find the end of this run */
1671 unsigned long run_end;
1672 if (unsent_pass) {
1673 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1674 } else {
1675 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1678 * If the end isn't at the start of a host page, then the
1679 * run doesn't finish at the end of a host page
1680 * and we need to discard.
1682 host_offset = run_end % host_ratio;
1683 if (host_offset) {
1684 do_fixup = true;
1685 fixup_start_addr = run_end - host_offset;
1687 * This host page has gone, the next loop iteration starts
1688 * from after the fixup
1690 run_start = fixup_start_addr + host_ratio;
1691 } else {
1693 * No discards on this iteration, next loop starts from
1694 * next sent/dirty page
1696 run_start = run_end + 1;
1700 if (do_fixup) {
1701 unsigned long page;
1703 /* Tell the destination to discard this page */
1704 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1705 /* For the unsent_pass we:
1706 * discard partially sent pages
1707 * For the !unsent_pass (dirty) we:
1708 * discard partially dirty pages that were sent
1709 * (any partially sent pages were already discarded
1710 * by the previous unsent_pass)
1712 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1713 host_ratio);
1716 /* Clean up the bitmap */
1717 for (page = fixup_start_addr;
1718 page < fixup_start_addr + host_ratio; page++) {
1719 /* All pages in this host page are now not sent */
1720 set_bit(page, unsentmap);
1723 * Remark them as dirty, updating the count for any pages
1724 * that weren't previously dirty.
1726 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1730 if (unsent_pass) {
1731 /* Find the next sent page for the next iteration */
1732 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1733 } else {
1734 /* Find the next dirty page for the next iteration */
1735 run_start = find_next_bit(bitmap, pages, run_start);
1741 * postcopy_chuck_hostpages: discrad any partially sent host page
1743 * Utility for the outgoing postcopy code.
1745 * Discard any partially sent host-page size chunks, mark any partially
1746 * dirty host-page size chunks as all dirty. In this case the host-page
1747 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1749 * Returns zero on success
1751 * @ms: current migration state
1752 * @block: block we want to work with
1754 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1756 PostcopyDiscardState *pds =
1757 postcopy_discard_send_init(ms, block->idstr);
1759 /* First pass: Discard all partially sent host pages */
1760 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1762 * Second pass: Ensure that all partially dirty host pages are made
1763 * fully dirty.
1765 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1767 postcopy_discard_send_finish(ms, pds);
1768 return 0;
1772 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1774 * Returns zero on success
1776 * Transmit the set of pages to be discarded after precopy to the target
1777 * these are pages that:
1778 * a) Have been previously transmitted but are now dirty again
1779 * b) Pages that have never been transmitted, this ensures that
1780 * any pages on the destination that have been mapped by background
1781 * tasks get discarded (transparent huge pages is the specific concern)
1782 * Hopefully this is pretty sparse
1784 * @ms: current migration state
1786 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1788 RAMState *rs = &ram_state;
1789 RAMBlock *block;
1790 int ret;
1792 rcu_read_lock();
1794 /* This should be our last sync, the src is now paused */
1795 migration_bitmap_sync(rs);
1797 /* Easiest way to make sure we don't resume in the middle of a host-page */
1798 rs->last_seen_block = NULL;
1799 rs->last_sent_block = NULL;
1800 rs->last_page = 0;
1802 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1803 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1804 unsigned long *bitmap = block->bmap;
1805 unsigned long *unsentmap = block->unsentmap;
1807 if (!unsentmap) {
1808 /* We don't have a safe way to resize the sentmap, so
1809 * if the bitmap was resized it will be NULL at this
1810 * point.
1812 error_report("migration ram resized during precopy phase");
1813 rcu_read_unlock();
1814 return -EINVAL;
1816 /* Deal with TPS != HPS and huge pages */
1817 ret = postcopy_chunk_hostpages(ms, block);
1818 if (ret) {
1819 rcu_read_unlock();
1820 return ret;
1824 * Update the unsentmap to be unsentmap = unsentmap | dirty
1826 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1827 #ifdef DEBUG_POSTCOPY
1828 ram_debug_dump_bitmap(unsentmap, true, pages);
1829 #endif
1831 trace_ram_postcopy_send_discard_bitmap();
1833 ret = postcopy_each_ram_send_discard(ms);
1834 rcu_read_unlock();
1836 return ret;
1840 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1842 * Returns zero on success
1844 * @rbname: name of the RAMBlock of the request. NULL means the
1845 * same that last one.
1846 * @start: RAMBlock starting page
1847 * @length: RAMBlock size
1849 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1851 int ret = -1;
1853 trace_ram_discard_range(rbname, start, length);
1855 rcu_read_lock();
1856 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1858 if (!rb) {
1859 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1860 goto err;
1863 ret = ram_block_discard_range(rb, start, length);
1865 err:
1866 rcu_read_unlock();
1868 return ret;
1871 static int ram_state_init(RAMState *rs)
1873 memset(rs, 0, sizeof(*rs));
1874 qemu_mutex_init(&rs->bitmap_mutex);
1875 qemu_mutex_init(&rs->src_page_req_mutex);
1876 QSIMPLEQ_INIT(&rs->src_page_requests);
1878 if (migrate_use_xbzrle()) {
1879 XBZRLE_cache_lock();
1880 ZERO_TARGET_PAGE = g_malloc0(TARGET_PAGE_SIZE);
1881 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
1882 TARGET_PAGE_SIZE,
1883 TARGET_PAGE_SIZE);
1884 if (!XBZRLE.cache) {
1885 XBZRLE_cache_unlock();
1886 error_report("Error creating cache");
1887 return -1;
1889 XBZRLE_cache_unlock();
1891 /* We prefer not to abort if there is no memory */
1892 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
1893 if (!XBZRLE.encoded_buf) {
1894 error_report("Error allocating encoded_buf");
1895 return -1;
1898 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
1899 if (!XBZRLE.current_buf) {
1900 error_report("Error allocating current_buf");
1901 g_free(XBZRLE.encoded_buf);
1902 XBZRLE.encoded_buf = NULL;
1903 return -1;
1907 /* For memory_global_dirty_log_start below. */
1908 qemu_mutex_lock_iothread();
1910 qemu_mutex_lock_ramlist();
1911 rcu_read_lock();
1912 ram_state_reset(rs);
1914 /* Skip setting bitmap if there is no RAM */
1915 if (ram_bytes_total()) {
1916 RAMBlock *block;
1918 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1919 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
1921 block->bmap = bitmap_new(pages);
1922 bitmap_set(block->bmap, 0, pages);
1923 if (migrate_postcopy_ram()) {
1924 block->unsentmap = bitmap_new(pages);
1925 bitmap_set(block->unsentmap, 0, pages);
1931 * Count the total number of pages used by ram blocks not including any
1932 * gaps due to alignment or unplugs.
1934 rs->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
1936 memory_global_dirty_log_start();
1937 migration_bitmap_sync(rs);
1938 qemu_mutex_unlock_ramlist();
1939 qemu_mutex_unlock_iothread();
1940 rcu_read_unlock();
1942 return 0;
1946 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
1947 * long-running RCU critical section. When rcu-reclaims in the code
1948 * start to become numerous it will be necessary to reduce the
1949 * granularity of these critical sections.
1953 * ram_save_setup: Setup RAM for migration
1955 * Returns zero to indicate success and negative for error
1957 * @f: QEMUFile where to send the data
1958 * @opaque: RAMState pointer
1960 static int ram_save_setup(QEMUFile *f, void *opaque)
1962 RAMState *rs = opaque;
1963 RAMBlock *block;
1965 /* migration has already setup the bitmap, reuse it. */
1966 if (!migration_in_colo_state()) {
1967 if (ram_state_init(rs) < 0) {
1968 return -1;
1971 rs->f = f;
1973 rcu_read_lock();
1975 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
1977 RAMBLOCK_FOREACH(block) {
1978 qemu_put_byte(f, strlen(block->idstr));
1979 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
1980 qemu_put_be64(f, block->used_length);
1981 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
1982 qemu_put_be64(f, block->page_size);
1986 rcu_read_unlock();
1988 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
1989 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
1991 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
1993 return 0;
1997 * ram_save_iterate: iterative stage for migration
1999 * Returns zero to indicate success and negative for error
2001 * @f: QEMUFile where to send the data
2002 * @opaque: RAMState pointer
2004 static int ram_save_iterate(QEMUFile *f, void *opaque)
2006 RAMState *rs = opaque;
2007 int ret;
2008 int i;
2009 int64_t t0;
2010 int done = 0;
2012 rcu_read_lock();
2013 if (ram_list.version != rs->last_version) {
2014 ram_state_reset(rs);
2017 /* Read version before ram_list.blocks */
2018 smp_rmb();
2020 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2022 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2023 i = 0;
2024 while ((ret = qemu_file_rate_limit(f)) == 0) {
2025 int pages;
2027 pages = ram_find_and_save_block(rs, false);
2028 /* no more pages to sent */
2029 if (pages == 0) {
2030 done = 1;
2031 break;
2033 rs->iterations++;
2035 /* we want to check in the 1st loop, just in case it was the 1st time
2036 and we had to sync the dirty bitmap.
2037 qemu_get_clock_ns() is a bit expensive, so we only check each some
2038 iterations
2040 if ((i & 63) == 0) {
2041 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2042 if (t1 > MAX_WAIT) {
2043 trace_ram_save_iterate_big_wait(t1, i);
2044 break;
2047 i++;
2049 flush_compressed_data(rs);
2050 rcu_read_unlock();
2053 * Must occur before EOS (or any QEMUFile operation)
2054 * because of RDMA protocol.
2056 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2058 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2059 rs->bytes_transferred += 8;
2061 ret = qemu_file_get_error(f);
2062 if (ret < 0) {
2063 return ret;
2066 return done;
2070 * ram_save_complete: function called to send the remaining amount of ram
2072 * Returns zero to indicate success
2074 * Called with iothread lock
2076 * @f: QEMUFile where to send the data
2077 * @opaque: RAMState pointer
2079 static int ram_save_complete(QEMUFile *f, void *opaque)
2081 RAMState *rs = opaque;
2083 rcu_read_lock();
2085 if (!migration_in_postcopy()) {
2086 migration_bitmap_sync(rs);
2089 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2091 /* try transferring iterative blocks of memory */
2093 /* flush all remaining blocks regardless of rate limiting */
2094 while (true) {
2095 int pages;
2097 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2098 /* no more blocks to sent */
2099 if (pages == 0) {
2100 break;
2104 flush_compressed_data(rs);
2105 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2107 rcu_read_unlock();
2109 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2111 return 0;
2114 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2115 uint64_t *non_postcopiable_pending,
2116 uint64_t *postcopiable_pending)
2118 RAMState *rs = opaque;
2119 uint64_t remaining_size;
2121 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2123 if (!migration_in_postcopy() &&
2124 remaining_size < max_size) {
2125 qemu_mutex_lock_iothread();
2126 rcu_read_lock();
2127 migration_bitmap_sync(rs);
2128 rcu_read_unlock();
2129 qemu_mutex_unlock_iothread();
2130 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2133 /* We can do postcopy, and all the data is postcopiable */
2134 *postcopiable_pending += remaining_size;
2137 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2139 unsigned int xh_len;
2140 int xh_flags;
2141 uint8_t *loaded_data;
2143 if (!xbzrle_decoded_buf) {
2144 xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE);
2146 loaded_data = xbzrle_decoded_buf;
2148 /* extract RLE header */
2149 xh_flags = qemu_get_byte(f);
2150 xh_len = qemu_get_be16(f);
2152 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2153 error_report("Failed to load XBZRLE page - wrong compression!");
2154 return -1;
2157 if (xh_len > TARGET_PAGE_SIZE) {
2158 error_report("Failed to load XBZRLE page - len overflow!");
2159 return -1;
2161 /* load data and decode */
2162 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2164 /* decode RLE */
2165 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2166 TARGET_PAGE_SIZE) == -1) {
2167 error_report("Failed to load XBZRLE page - decode error!");
2168 return -1;
2171 return 0;
2175 * ram_block_from_stream: read a RAMBlock id from the migration stream
2177 * Must be called from within a rcu critical section.
2179 * Returns a pointer from within the RCU-protected ram_list.
2181 * @f: QEMUFile where to read the data from
2182 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2184 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2186 static RAMBlock *block = NULL;
2187 char id[256];
2188 uint8_t len;
2190 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2191 if (!block) {
2192 error_report("Ack, bad migration stream!");
2193 return NULL;
2195 return block;
2198 len = qemu_get_byte(f);
2199 qemu_get_buffer(f, (uint8_t *)id, len);
2200 id[len] = 0;
2202 block = qemu_ram_block_by_name(id);
2203 if (!block) {
2204 error_report("Can't find block %s", id);
2205 return NULL;
2208 return block;
2211 static inline void *host_from_ram_block_offset(RAMBlock *block,
2212 ram_addr_t offset)
2214 if (!offset_in_ramblock(block, offset)) {
2215 return NULL;
2218 return block->host + offset;
/**
 * ram_handle_compressed: handle the zero page case
 *
 * Fills @size bytes at @host with @ch.  The write is skipped when
 * @ch is zero and the range is already all zero.
 *
 * @host: host address of the range to fill
 * @ch: the fill byte
 * @size: size of the range in bytes
 */
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
{
    /* Already zero and asked for zero: leave the memory untouched */
    if (ch == 0 && is_zero_range(host, size)) {
        return;
    }

    memset(host, ch, size);
}
2238 static void *do_data_decompress(void *opaque)
2240 DecompressParam *param = opaque;
2241 unsigned long pagesize;
2242 uint8_t *des;
2243 int len;
2245 qemu_mutex_lock(&param->mutex);
2246 while (!param->quit) {
2247 if (param->des) {
2248 des = param->des;
2249 len = param->len;
2250 param->des = 0;
2251 qemu_mutex_unlock(&param->mutex);
2253 pagesize = TARGET_PAGE_SIZE;
2254 /* uncompress() will return failed in some case, especially
2255 * when the page is dirted when doing the compression, it's
2256 * not a problem because the dirty page will be retransferred
2257 * and uncompress() won't break the data in other pages.
2259 uncompress((Bytef *)des, &pagesize,
2260 (const Bytef *)param->compbuf, len);
2262 qemu_mutex_lock(&decomp_done_lock);
2263 param->done = true;
2264 qemu_cond_signal(&decomp_done_cond);
2265 qemu_mutex_unlock(&decomp_done_lock);
2267 qemu_mutex_lock(&param->mutex);
2268 } else {
2269 qemu_cond_wait(&param->cond, &param->mutex);
2272 qemu_mutex_unlock(&param->mutex);
2274 return NULL;
2277 static void wait_for_decompress_done(void)
2279 int idx, thread_count;
2281 if (!migrate_use_compression()) {
2282 return;
2285 thread_count = migrate_decompress_threads();
2286 qemu_mutex_lock(&decomp_done_lock);
2287 for (idx = 0; idx < thread_count; idx++) {
2288 while (!decomp_param[idx].done) {
2289 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2292 qemu_mutex_unlock(&decomp_done_lock);
2295 void migrate_decompress_threads_create(void)
2297 int i, thread_count;
2299 thread_count = migrate_decompress_threads();
2300 decompress_threads = g_new0(QemuThread, thread_count);
2301 decomp_param = g_new0(DecompressParam, thread_count);
2302 qemu_mutex_init(&decomp_done_lock);
2303 qemu_cond_init(&decomp_done_cond);
2304 for (i = 0; i < thread_count; i++) {
2305 qemu_mutex_init(&decomp_param[i].mutex);
2306 qemu_cond_init(&decomp_param[i].cond);
2307 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2308 decomp_param[i].done = true;
2309 decomp_param[i].quit = false;
2310 qemu_thread_create(decompress_threads + i, "decompress",
2311 do_data_decompress, decomp_param + i,
2312 QEMU_THREAD_JOINABLE);
2316 void migrate_decompress_threads_join(void)
2318 int i, thread_count;
2320 thread_count = migrate_decompress_threads();
2321 for (i = 0; i < thread_count; i++) {
2322 qemu_mutex_lock(&decomp_param[i].mutex);
2323 decomp_param[i].quit = true;
2324 qemu_cond_signal(&decomp_param[i].cond);
2325 qemu_mutex_unlock(&decomp_param[i].mutex);
2327 for (i = 0; i < thread_count; i++) {
2328 qemu_thread_join(decompress_threads + i);
2329 qemu_mutex_destroy(&decomp_param[i].mutex);
2330 qemu_cond_destroy(&decomp_param[i].cond);
2331 g_free(decomp_param[i].compbuf);
2333 g_free(decompress_threads);
2334 g_free(decomp_param);
2335 decompress_threads = NULL;
2336 decomp_param = NULL;
2339 static void decompress_data_with_multi_threads(QEMUFile *f,
2340 void *host, int len)
2342 int idx, thread_count;
2344 thread_count = migrate_decompress_threads();
2345 qemu_mutex_lock(&decomp_done_lock);
2346 while (true) {
2347 for (idx = 0; idx < thread_count; idx++) {
2348 if (decomp_param[idx].done) {
2349 decomp_param[idx].done = false;
2350 qemu_mutex_lock(&decomp_param[idx].mutex);
2351 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2352 decomp_param[idx].des = host;
2353 decomp_param[idx].len = len;
2354 qemu_cond_signal(&decomp_param[idx].cond);
2355 qemu_mutex_unlock(&decomp_param[idx].mutex);
2356 break;
2359 if (idx < thread_count) {
2360 break;
2361 } else {
2362 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2365 qemu_mutex_unlock(&decomp_done_lock);
2369 * ram_postcopy_incoming_init: allocate postcopy data structures
2371 * Returns 0 for success and negative if there was one error
2373 * @mis: current migration incoming state
2375 * Allocate data structures etc needed by incoming migration with
2376 * postcopy-ram. postcopy-ram's similarly names
2377 * postcopy_ram_incoming_init does the work.
2379 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2381 unsigned long ram_pages = last_ram_page();
2383 return postcopy_ram_incoming_init(mis, ram_pages);
2387 * ram_load_postcopy: load a page in postcopy case
2389 * Returns 0 for success or -errno in case of error
2391 * Called in postcopy mode by ram_load().
2392 * rcu_read_lock is taken prior to this being called.
2394 * @f: QEMUFile where to send the data
2396 static int ram_load_postcopy(QEMUFile *f)
2398 int flags = 0, ret = 0;
2399 bool place_needed = false;
2400 bool matching_page_sizes = false;
2401 MigrationIncomingState *mis = migration_incoming_get_current();
2402 /* Temporary page that is later 'placed' */
2403 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2404 void *last_host = NULL;
2405 bool all_zero = false;
2407 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2408 ram_addr_t addr;
2409 void *host = NULL;
2410 void *page_buffer = NULL;
2411 void *place_source = NULL;
2412 RAMBlock *block = NULL;
2413 uint8_t ch;
2415 addr = qemu_get_be64(f);
2416 flags = addr & ~TARGET_PAGE_MASK;
2417 addr &= TARGET_PAGE_MASK;
2419 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2420 place_needed = false;
2421 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2422 block = ram_block_from_stream(f, flags);
2424 host = host_from_ram_block_offset(block, addr);
2425 if (!host) {
2426 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2427 ret = -EINVAL;
2428 break;
2430 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2432 * Postcopy requires that we place whole host pages atomically;
2433 * these may be huge pages for RAMBlocks that are backed by
2434 * hugetlbfs.
2435 * To make it atomic, the data is read into a temporary page
2436 * that's moved into place later.
2437 * The migration protocol uses, possibly smaller, target-pages
2438 * however the source ensures it always sends all the components
2439 * of a host page in order.
2441 page_buffer = postcopy_host_page +
2442 ((uintptr_t)host & (block->page_size - 1));
2443 /* If all TP are zero then we can optimise the place */
2444 if (!((uintptr_t)host & (block->page_size - 1))) {
2445 all_zero = true;
2446 } else {
2447 /* not the 1st TP within the HP */
2448 if (host != (last_host + TARGET_PAGE_SIZE)) {
2449 error_report("Non-sequential target page %p/%p",
2450 host, last_host);
2451 ret = -EINVAL;
2452 break;
2458 * If it's the last part of a host page then we place the host
2459 * page
2461 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2462 (block->page_size - 1)) == 0;
2463 place_source = postcopy_host_page;
2465 last_host = host;
2467 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2468 case RAM_SAVE_FLAG_ZERO:
2469 ch = qemu_get_byte(f);
2470 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2471 if (ch) {
2472 all_zero = false;
2474 break;
2476 case RAM_SAVE_FLAG_PAGE:
2477 all_zero = false;
2478 if (!place_needed || !matching_page_sizes) {
2479 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2480 } else {
2481 /* Avoids the qemu_file copy during postcopy, which is
2482 * going to do a copy later; can only do it when we
2483 * do this read in one go (matching page sizes)
2485 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2486 TARGET_PAGE_SIZE);
2488 break;
2489 case RAM_SAVE_FLAG_EOS:
2490 /* normal exit */
2491 break;
2492 default:
2493 error_report("Unknown combination of migration flags: %#x"
2494 " (postcopy mode)", flags);
2495 ret = -EINVAL;
2498 if (place_needed) {
2499 /* This gets called at the last target page in the host page */
2500 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2502 if (all_zero) {
2503 ret = postcopy_place_page_zero(mis, place_dest,
2504 block->page_size);
2505 } else {
2506 ret = postcopy_place_page(mis, place_dest,
2507 place_source, block->page_size);
2510 if (!ret) {
2511 ret = qemu_file_get_error(f);
2515 return ret;
2518 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2520 int flags = 0, ret = 0;
2521 static uint64_t seq_iter;
2522 int len = 0;
2524 * If system is running in postcopy mode, page inserts to host memory must
2525 * be atomic
2527 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2528 /* ADVISE is earlier, it shows the source has the postcopy capability on */
2529 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2531 seq_iter++;
2533 if (version_id != 4) {
2534 ret = -EINVAL;
2537 /* This RCU critical section can be very long running.
2538 * When RCU reclaims in the code start to become numerous,
2539 * it will be necessary to reduce the granularity of this
2540 * critical section.
2542 rcu_read_lock();
2544 if (postcopy_running) {
2545 ret = ram_load_postcopy(f);
2548 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2549 ram_addr_t addr, total_ram_bytes;
2550 void *host = NULL;
2551 uint8_t ch;
2553 addr = qemu_get_be64(f);
2554 flags = addr & ~TARGET_PAGE_MASK;
2555 addr &= TARGET_PAGE_MASK;
2557 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2558 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2559 RAMBlock *block = ram_block_from_stream(f, flags);
2561 host = host_from_ram_block_offset(block, addr);
2562 if (!host) {
2563 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2564 ret = -EINVAL;
2565 break;
2567 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2570 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2571 case RAM_SAVE_FLAG_MEM_SIZE:
2572 /* Synchronize RAM block list */
2573 total_ram_bytes = addr;
2574 while (!ret && total_ram_bytes) {
2575 RAMBlock *block;
2576 char id[256];
2577 ram_addr_t length;
2579 len = qemu_get_byte(f);
2580 qemu_get_buffer(f, (uint8_t *)id, len);
2581 id[len] = 0;
2582 length = qemu_get_be64(f);
2584 block = qemu_ram_block_by_name(id);
2585 if (block) {
2586 if (length != block->used_length) {
2587 Error *local_err = NULL;
2589 ret = qemu_ram_resize(block, length,
2590 &local_err);
2591 if (local_err) {
2592 error_report_err(local_err);
2595 /* For postcopy we need to check hugepage sizes match */
2596 if (postcopy_advised &&
2597 block->page_size != qemu_host_page_size) {
2598 uint64_t remote_page_size = qemu_get_be64(f);
2599 if (remote_page_size != block->page_size) {
2600 error_report("Mismatched RAM page size %s "
2601 "(local) %zd != %" PRId64,
2602 id, block->page_size,
2603 remote_page_size);
2604 ret = -EINVAL;
2607 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2608 block->idstr);
2609 } else {
2610 error_report("Unknown ramblock \"%s\", cannot "
2611 "accept migration", id);
2612 ret = -EINVAL;
2615 total_ram_bytes -= length;
2617 break;
2619 case RAM_SAVE_FLAG_ZERO:
2620 ch = qemu_get_byte(f);
2621 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2622 break;
2624 case RAM_SAVE_FLAG_PAGE:
2625 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2626 break;
2628 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2629 len = qemu_get_be32(f);
2630 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2631 error_report("Invalid compressed data length: %d", len);
2632 ret = -EINVAL;
2633 break;
2635 decompress_data_with_multi_threads(f, host, len);
2636 break;
2638 case RAM_SAVE_FLAG_XBZRLE:
2639 if (load_xbzrle(f, addr, host) < 0) {
2640 error_report("Failed to decompress XBZRLE page at "
2641 RAM_ADDR_FMT, addr);
2642 ret = -EINVAL;
2643 break;
2645 break;
2646 case RAM_SAVE_FLAG_EOS:
2647 /* normal exit */
2648 break;
2649 default:
2650 if (flags & RAM_SAVE_FLAG_HOOK) {
2651 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2652 } else {
2653 error_report("Unknown combination of migration flags: %#x",
2654 flags);
2655 ret = -EINVAL;
2658 if (!ret) {
2659 ret = qemu_file_get_error(f);
2663 wait_for_decompress_done();
2664 rcu_read_unlock();
2665 trace_ram_load_complete(ret, seq_iter);
2666 return ret;
2669 static SaveVMHandlers savevm_ram_handlers = {
2670 .save_live_setup = ram_save_setup,
2671 .save_live_iterate = ram_save_iterate,
2672 .save_live_complete_postcopy = ram_save_complete,
2673 .save_live_complete_precopy = ram_save_complete,
2674 .save_live_pending = ram_save_pending,
2675 .load_state = ram_load,
2676 .cleanup = ram_migration_cleanup,
2679 void ram_mig_init(void)
2681 qemu_mutex_init(&XBZRLE.lock);
2682 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);