migration/ram.c
1 /*
2 * QEMU System Emulator
4 * Copyright (c) 2003-2008 Fabrice Bellard
5 * Copyright (c) 2011-2015 Red Hat Inc
7 * Authors:
8 * Juan Quintela <quintela@redhat.com>
10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal
12 * in the Software without restriction, including without limitation the rights
13 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 * copies of the Software, and to permit persons to whom the Software is
15 * furnished to do so, subject to the following conditions:
17 * The above copyright notice and this permission notice shall be included in
18 * all copies or substantial portions of the Software.
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
23 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 * THE SOFTWARE.
28 #include "qemu/osdep.h"
29 #include "cpu.h"
30 #include <zlib.h>
31 #include "qapi-event.h"
32 #include "qemu/cutils.h"
33 #include "qemu/bitops.h"
34 #include "qemu/bitmap.h"
35 #include "qemu/main-loop.h"
36 #include "xbzrle.h"
37 #include "ram.h"
38 #include "migration.h"
39 #include "migration/register.h"
40 #include "migration/misc.h"
41 #include "qemu-file.h"
42 #include "postcopy-ram.h"
43 #include "migration/page_cache.h"
44 #include "qemu/error-report.h"
45 #include "trace.h"
46 #include "exec/ram_addr.h"
47 #include "qemu/rcu_queue.h"
48 #include "migration/colo.h"
49 #include "migration/block.h"
51 /***********************************************************/
52 /* ram save/restore */
54 /* RAM_SAVE_FLAG_ZERO used to be named RAM_SAVE_FLAG_COMPRESS; it
55 * worked for pages that were filled with the same char. We switched
56 * it to only search for the zero value, and renamed it to avoid
57 * confusion with RAM_SAVE_FLAG_COMPRESS_PAGE.
60 #define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */
61 #define RAM_SAVE_FLAG_ZERO 0x02
62 #define RAM_SAVE_FLAG_MEM_SIZE 0x04
63 #define RAM_SAVE_FLAG_PAGE 0x08
64 #define RAM_SAVE_FLAG_EOS 0x10
65 #define RAM_SAVE_FLAG_CONTINUE 0x20
66 #define RAM_SAVE_FLAG_XBZRLE 0x40
67 /* 0x80 is reserved in migration.h; start with 0x100 next */
68 #define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100
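/* Wire format note (see save_page_header() and its callers below): every
 * page record begins with a be64 word holding the page offset within its
 * RAMBlock ORed with the RAM_SAVE_FLAG_* bits above.  Unless
 * RAM_SAVE_FLAG_CONTINUE is set, a length byte plus the block idstr follow.
 * The payload then depends on the flag: a single zero byte for _ZERO, a raw
 * TARGET_PAGE_SIZE buffer for _PAGE, a flag byte plus length-prefixed
 * encoded buffer for _XBZRLE, or compressed data for _COMPRESS_PAGE.
 */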
70 static inline bool is_zero_range(uint8_t *p, uint64_t size)
72 return buffer_is_zero(p, size);
75 XBZRLECacheStats xbzrle_counters;
77 /* struct containing the XBZRLE cache and a static page
78 used by the compression */
79 static struct {
80 /* buffer used for XBZRLE encoding */
81 uint8_t *encoded_buf;
82 /* buffer for storing page content */
83 uint8_t *current_buf;
84 /* Cache for XBZRLE, Protected by lock. */
85 PageCache *cache;
86 QemuMutex lock;
87 /* it will store a page full of zeros */
88 uint8_t *zero_target_page;
89 /* buffer used for XBZRLE decoding */
90 uint8_t *decoded_buf;
91 } XBZRLE;
93 static void XBZRLE_cache_lock(void)
95 if (migrate_use_xbzrle())
96 qemu_mutex_lock(&XBZRLE.lock);
99 static void XBZRLE_cache_unlock(void)
101 if (migrate_use_xbzrle())
102 qemu_mutex_unlock(&XBZRLE.lock);
106 * xbzrle_cache_resize: resize the xbzrle cache
108 * This function is called from qmp_migrate_set_cache_size in the main
109 * thread, possibly while a migration is in progress. A running
110 * migration may be using the cache and might finish during this call,
111 * hence changes to the cache are protected by XBZRLE.lock.
113 * Returns the new_size or negative in case of error.
115 * @new_size: new cache size
117 int64_t xbzrle_cache_resize(int64_t new_size)
119 PageCache *new_cache;
120 int64_t ret;
122 if (new_size < TARGET_PAGE_SIZE) {
123 return -1;
126 XBZRLE_cache_lock();
128 if (XBZRLE.cache != NULL) {
129 if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
130 goto out_new_size;
132 new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
133 TARGET_PAGE_SIZE);
134 if (!new_cache) {
135 error_report("Error creating cache");
136 ret = -1;
137 goto out;
140 cache_fini(XBZRLE.cache);
141 XBZRLE.cache = new_cache;
144 out_new_size:
145 ret = pow2floor(new_size);
146 out:
147 XBZRLE_cache_unlock();
148 return ret;
152 * An outstanding page request, on the source, having been received
153 * and queued
155 struct RAMSrcPageRequest {
156 RAMBlock *rb;
157 hwaddr offset;
158 hwaddr len;
160 QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
163 /* State of RAM for migration */
164 struct RAMState {
165 /* QEMUFile used for this migration */
166 QEMUFile *f;
167 /* Last block that we have visited searching for dirty pages */
168 RAMBlock *last_seen_block;
169 /* Last block from where we have sent data */
170 RAMBlock *last_sent_block;
171 /* Last dirty target page we have sent */
172 ram_addr_t last_page;
173 /* last ram version we have seen */
174 uint32_t last_version;
175 /* We are in the first round */
176 bool ram_bulk_stage;
177 /* How many times we have dirty too many pages */
178 int dirty_rate_high_cnt;
179 /* these variables are used for bitmap sync */
180 /* last time we did a full bitmap_sync */
181 int64_t time_last_bitmap_sync;
182 /* bytes transferred at start_time */
183 uint64_t bytes_xfer_prev;
184 /* number of dirty pages since start_time */
185 uint64_t num_dirty_pages_period;
186 /* xbzrle misses since the beginning of the period */
187 uint64_t xbzrle_cache_miss_prev;
188 /* number of iterations at the beginning of period */
189 uint64_t iterations_prev;
190 /* Iterations since start */
191 uint64_t iterations;
192 /* number of dirty bits in the bitmap */
193 uint64_t migration_dirty_pages;
194 /* protects modification of the bitmap */
195 QemuMutex bitmap_mutex;
196 /* The RAMBlock used in the last src_page_requests */
197 RAMBlock *last_req_rb;
198 /* Queue of outstanding page requests from the destination */
199 QemuMutex src_page_req_mutex;
200 QSIMPLEQ_HEAD(src_page_requests, RAMSrcPageRequest) src_page_requests;
202 typedef struct RAMState RAMState;
204 static RAMState *ram_state;
206 uint64_t ram_bytes_remaining(void)
208 return ram_state->migration_dirty_pages * TARGET_PAGE_SIZE;
211 MigrationStats ram_counters;
213 /* used by the search for pages to send */
214 struct PageSearchStatus {
215 /* Current block being searched */
216 RAMBlock *block;
217 /* Current page to search from */
218 unsigned long page;
219 /* Set once we wrap around */
220 bool complete_round;
222 typedef struct PageSearchStatus PageSearchStatus;
224 struct CompressParam {
225 bool done;
226 bool quit;
227 QEMUFile *file;
228 QemuMutex mutex;
229 QemuCond cond;
230 RAMBlock *block;
231 ram_addr_t offset;
233 typedef struct CompressParam CompressParam;
235 struct DecompressParam {
236 bool done;
237 bool quit;
238 QemuMutex mutex;
239 QemuCond cond;
240 void *des;
241 uint8_t *compbuf;
242 int len;
244 typedef struct DecompressParam DecompressParam;
246 static CompressParam *comp_param;
247 static QemuThread *compress_threads;
248 /* comp_done_cond is used to wake up the migration thread when
249 * one of the compression threads has finished the compression.
250 * comp_done_lock is used together with comp_done_cond.
252 static QemuMutex comp_done_lock;
253 static QemuCond comp_done_cond;
254 /* The empty QEMUFileOps will be used by file in CompressParam */
255 static const QEMUFileOps empty_ops = { };
257 static DecompressParam *decomp_param;
258 static QemuThread *decompress_threads;
259 static QemuMutex decomp_done_lock;
260 static QemuCond decomp_done_cond;
262 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
263 ram_addr_t offset);
265 static void *do_data_compress(void *opaque)
267 CompressParam *param = opaque;
268 RAMBlock *block;
269 ram_addr_t offset;
271 qemu_mutex_lock(&param->mutex);
272 while (!param->quit) {
273 if (param->block) {
274 block = param->block;
275 offset = param->offset;
276 param->block = NULL;
277 qemu_mutex_unlock(&param->mutex);
279 do_compress_ram_page(param->file, block, offset);
281 qemu_mutex_lock(&comp_done_lock);
282 param->done = true;
283 qemu_cond_signal(&comp_done_cond);
284 qemu_mutex_unlock(&comp_done_lock);
286 qemu_mutex_lock(&param->mutex);
287 } else {
288 qemu_cond_wait(&param->cond, &param->mutex);
291 qemu_mutex_unlock(&param->mutex);
293 return NULL;
296 static inline void terminate_compression_threads(void)
298 int idx, thread_count;
300 thread_count = migrate_compress_threads();
302 for (idx = 0; idx < thread_count; idx++) {
303 qemu_mutex_lock(&comp_param[idx].mutex);
304 comp_param[idx].quit = true;
305 qemu_cond_signal(&comp_param[idx].cond);
306 qemu_mutex_unlock(&comp_param[idx].mutex);
310 static void compress_threads_save_cleanup(void)
312 int i, thread_count;
314 if (!migrate_use_compression()) {
315 return;
317 terminate_compression_threads();
318 thread_count = migrate_compress_threads();
319 for (i = 0; i < thread_count; i++) {
320 qemu_thread_join(compress_threads + i);
321 qemu_fclose(comp_param[i].file);
322 qemu_mutex_destroy(&comp_param[i].mutex);
323 qemu_cond_destroy(&comp_param[i].cond);
325 qemu_mutex_destroy(&comp_done_lock);
326 qemu_cond_destroy(&comp_done_cond);
327 g_free(compress_threads);
328 g_free(comp_param);
329 compress_threads = NULL;
330 comp_param = NULL;
333 static void compress_threads_save_setup(void)
335 int i, thread_count;
337 if (!migrate_use_compression()) {
338 return;
340 thread_count = migrate_compress_threads();
341 compress_threads = g_new0(QemuThread, thread_count);
342 comp_param = g_new0(CompressParam, thread_count);
343 qemu_cond_init(&comp_done_cond);
344 qemu_mutex_init(&comp_done_lock);
345 for (i = 0; i < thread_count; i++) {
346 /* comp_param[i].file is just used as a dummy buffer to save data,
347 * set its ops to empty.
349 comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
350 comp_param[i].done = true;
351 comp_param[i].quit = false;
352 qemu_mutex_init(&comp_param[i].mutex);
353 qemu_cond_init(&comp_param[i].cond);
354 qemu_thread_create(compress_threads + i, "compress",
355 do_data_compress, comp_param + i,
356 QEMU_THREAD_JOINABLE);
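/* Handshake between the migration thread and the compression workers
 * (see do_data_compress() above and compress_page_with_multi_thread()
 * below): the migration thread picks a worker whose 'done' flag is set,
 * drains that worker's private QEMUFile into the migration stream, hands
 * it a new (block, offset) via set_compress_params() and signals its
 * condvar.  The worker compresses the page into its own file, then sets
 * 'done' under comp_done_lock and signals comp_done_cond.
 * flush_compressed_data() waits for all workers and drains any remaining
 * output.
 */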
360 /* Multiple fd's */
362 struct MultiFDSendParams {
363 uint8_t id;
364 char *name;
365 QemuThread thread;
366 QemuSemaphore sem;
367 QemuMutex mutex;
368 bool quit;
370 typedef struct MultiFDSendParams MultiFDSendParams;
372 struct {
373 MultiFDSendParams *params;
374 /* number of created threads */
375 int count;
376 } *multifd_send_state;
378 static void terminate_multifd_send_threads(Error *errp)
380 int i;
382 for (i = 0; i < multifd_send_state->count; i++) {
383 MultiFDSendParams *p = &multifd_send_state->params[i];
385 qemu_mutex_lock(&p->mutex);
386 p->quit = true;
387 qemu_sem_post(&p->sem);
388 qemu_mutex_unlock(&p->mutex);
392 int multifd_save_cleanup(Error **errp)
394 int i;
395 int ret = 0;
397 if (!migrate_use_multifd()) {
398 return 0;
400 terminate_multifd_send_threads(NULL);
401 for (i = 0; i < multifd_send_state->count; i++) {
402 MultiFDSendParams *p = &multifd_send_state->params[i];
404 qemu_thread_join(&p->thread);
405 qemu_mutex_destroy(&p->mutex);
406 qemu_sem_destroy(&p->sem);
407 g_free(p->name);
408 p->name = NULL;
410 g_free(multifd_send_state->params);
411 multifd_send_state->params = NULL;
412 g_free(multifd_send_state);
413 multifd_send_state = NULL;
414 return ret;
417 static void *multifd_send_thread(void *opaque)
419 MultiFDSendParams *p = opaque;
421 while (true) {
422 qemu_mutex_lock(&p->mutex);
423 if (p->quit) {
424 qemu_mutex_unlock(&p->mutex);
425 break;
427 qemu_mutex_unlock(&p->mutex);
428 qemu_sem_wait(&p->sem);
431 return NULL;
434 int multifd_save_setup(void)
436 int thread_count;
437 uint8_t i;
439 if (!migrate_use_multifd()) {
440 return 0;
442 thread_count = migrate_multifd_channels();
443 multifd_send_state = g_malloc0(sizeof(*multifd_send_state));
444 multifd_send_state->params = g_new0(MultiFDSendParams, thread_count);
445 multifd_send_state->count = 0;
446 for (i = 0; i < thread_count; i++) {
447 MultiFDSendParams *p = &multifd_send_state->params[i];
449 qemu_mutex_init(&p->mutex);
450 qemu_sem_init(&p->sem, 0);
451 p->quit = false;
452 p->id = i;
453 p->name = g_strdup_printf("multifdsend_%d", i);
454 qemu_thread_create(&p->thread, p->name, multifd_send_thread, p,
455 QEMU_THREAD_JOINABLE);
457 multifd_send_state->count++;
459 return 0;
462 struct MultiFDRecvParams {
463 uint8_t id;
464 char *name;
465 QemuThread thread;
466 QemuSemaphore sem;
467 QemuMutex mutex;
468 bool quit;
470 typedef struct MultiFDRecvParams MultiFDRecvParams;
472 struct {
473 MultiFDRecvParams *params;
474 /* number of created threads */
475 int count;
476 } *multifd_recv_state;
478 static void terminate_multifd_recv_threads(Error *errp)
480 int i;
482 for (i = 0; i < multifd_recv_state->count; i++) {
483 MultiFDRecvParams *p = &multifd_recv_state->params[i];
485 qemu_mutex_lock(&p->mutex);
486 p->quit = true;
487 qemu_sem_post(&p->sem);
488 qemu_mutex_unlock(&p->mutex);
492 int multifd_load_cleanup(Error **errp)
494 int i;
495 int ret = 0;
497 if (!migrate_use_multifd()) {
498 return 0;
500 terminate_multifd_recv_threads(NULL);
501 for (i = 0; i < multifd_recv_state->count; i++) {
502 MultiFDRecvParams *p = &multifd_recv_state->params[i];
504 qemu_thread_join(&p->thread);
505 qemu_mutex_destroy(&p->mutex);
506 qemu_sem_destroy(&p->sem);
507 g_free(p->name);
508 p->name = NULL;
510 g_free(multifd_recv_state->params);
511 multifd_recv_state->params = NULL;
512 g_free(multifd_recv_state);
513 multifd_recv_state = NULL;
515 return ret;
518 static void *multifd_recv_thread(void *opaque)
520 MultiFDRecvParams *p = opaque;
522 while (true) {
523 qemu_mutex_lock(&p->mutex);
524 if (p->quit) {
525 qemu_mutex_unlock(&p->mutex);
526 break;
528 qemu_mutex_unlock(&p->mutex);
529 qemu_sem_wait(&p->sem);
532 return NULL;
535 int multifd_load_setup(void)
537 int thread_count;
538 uint8_t i;
540 if (!migrate_use_multifd()) {
541 return 0;
543 thread_count = migrate_multifd_channels();
544 multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
545 multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);
546 multifd_recv_state->count = 0;
547 for (i = 0; i < thread_count; i++) {
548 MultiFDRecvParams *p = &multifd_recv_state->params[i];
550 qemu_mutex_init(&p->mutex);
551 qemu_sem_init(&p->sem, 0);
552 p->quit = false;
553 p->id = i;
554 p->name = g_strdup_printf("multifdrecv_%d", i);
555 qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
556 QEMU_THREAD_JOINABLE);
557 multifd_recv_state->count++;
559 return 0;
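/* Note: at this stage the multifd send and receive threads are only
 * skeletons; they block on their semaphore and exit once 'quit' is set,
 * and no page data is moved through them yet.
 */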
563 * save_page_header: write page header to wire
565 * If this is the 1st block, it also writes the block identification
567 * Returns the number of bytes written
569 * @f: QEMUFile where to send the data
570 * @block: block that contains the page we want to send
571 * @offset: offset inside the block for the page
572 * the lower bits contain the RAM_SAVE_FLAG_* flags
574 static size_t save_page_header(RAMState *rs, QEMUFile *f, RAMBlock *block,
575 ram_addr_t offset)
577 size_t size, len;
579 if (block == rs->last_sent_block) {
580 offset |= RAM_SAVE_FLAG_CONTINUE;
582 qemu_put_be64(f, offset);
583 size = 8;
585 if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
586 len = strlen(block->idstr);
587 qemu_put_byte(f, len);
588 qemu_put_buffer(f, (uint8_t *)block->idstr, len);
589 size += 1 + len;
590 rs->last_sent_block = block;
592 return size;
596 * mig_throttle_guest_down: throttle down the guest
598 * Reduce amount of guest cpu execution to hopefully slow down memory
599 * writes. If guest dirty memory rate is reduced below the rate at
600 * which we can transfer pages to the destination then we should be
601 * able to complete migration. Some workloads dirty memory way too
602 * fast and will not effectively converge, even with auto-converge.
604 static void mig_throttle_guest_down(void)
606 MigrationState *s = migrate_get_current();
607 uint64_t pct_initial = s->parameters.cpu_throttle_initial;
608 uint64_t pct_icrement = s->parameters.cpu_throttle_increment;
610 /* We have not started throttling yet. Let's start it. */
611 if (!cpu_throttle_active()) {
612 cpu_throttle_set(pct_initial);
613 } else {
614 /* Throttling already on, just increase the rate */
615 cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
620 * xbzrle_cache_zero_page: insert a zero page in the XBZRLE cache
622 * @rs: current RAM state
623 * @current_addr: address for the zero page
625 * Update the xbzrle cache to reflect a page that's been sent as all 0.
626 * The important thing is that a stale (not-yet-0'd) page be replaced
627 * by the new data.
628 * As a bonus, if the page wasn't in the cache it gets added so that
629 * when a small write is made into the 0'd page it gets XBZRLE sent.
631 static void xbzrle_cache_zero_page(RAMState *rs, ram_addr_t current_addr)
633 if (rs->ram_bulk_stage || !migrate_use_xbzrle()) {
634 return;
637 /* We don't care if this fails to allocate a new cache page
638 * as long as it updated an old one */
639 cache_insert(XBZRLE.cache, current_addr, XBZRLE.zero_target_page,
640 ram_counters.dirty_sync_count);
643 #define ENCODING_FLAG_XBZRLE 0x1
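/* An XBZRLE page is sent as: the usual page header with
 * RAM_SAVE_FLAG_XBZRLE set, one ENCODING_FLAG_XBZRLE byte, a be16 encoded
 * length, and then the encoded buffer (see save_xbzrle_page() below).
 */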
646 * save_xbzrle_page: compress and send current page
648 * Returns: 1 means that we wrote the page
649 * 0 means that page is identical to the one already sent
650 * -1 means that xbzrle would be longer than normal
652 * @rs: current RAM state
653 * @current_data: pointer to the address of the page contents
654 * @current_addr: addr of the page
655 * @block: block that contains the page we want to send
656 * @offset: offset inside the block for the page
657 * @last_stage: if we are at the completion stage
659 static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
660 ram_addr_t current_addr, RAMBlock *block,
661 ram_addr_t offset, bool last_stage)
663 int encoded_len = 0, bytes_xbzrle;
664 uint8_t *prev_cached_page;
666 if (!cache_is_cached(XBZRLE.cache, current_addr,
667 ram_counters.dirty_sync_count)) {
668 xbzrle_counters.cache_miss++;
669 if (!last_stage) {
670 if (cache_insert(XBZRLE.cache, current_addr, *current_data,
671 ram_counters.dirty_sync_count) == -1) {
672 return -1;
673 } else {
674 /* update *current_data when the page has been
675 inserted into cache */
676 *current_data = get_cached_data(XBZRLE.cache, current_addr);
679 return -1;
682 prev_cached_page = get_cached_data(XBZRLE.cache, current_addr);
684 /* save current buffer into memory */
685 memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
687 /* XBZRLE encoding (if there is no overflow) */
688 encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
689 TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
690 TARGET_PAGE_SIZE);
691 if (encoded_len == 0) {
692 trace_save_xbzrle_page_skipping();
693 return 0;
694 } else if (encoded_len == -1) {
695 trace_save_xbzrle_page_overflow();
696 xbzrle_counters.overflow++;
697 /* update data in the cache */
698 if (!last_stage) {
699 memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE);
700 *current_data = prev_cached_page;
702 return -1;
705 /* we need to update the data in the cache, in order to get the same data */
706 if (!last_stage) {
707 memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE);
710 /* Send XBZRLE based compressed page */
711 bytes_xbzrle = save_page_header(rs, rs->f, block,
712 offset | RAM_SAVE_FLAG_XBZRLE);
713 qemu_put_byte(rs->f, ENCODING_FLAG_XBZRLE);
714 qemu_put_be16(rs->f, encoded_len);
715 qemu_put_buffer(rs->f, XBZRLE.encoded_buf, encoded_len);
716 bytes_xbzrle += encoded_len + 1 + 2;
717 xbzrle_counters.pages++;
718 xbzrle_counters.bytes += bytes_xbzrle;
719 ram_counters.transferred += bytes_xbzrle;
721 return 1;
725 * migration_bitmap_find_dirty: find the next dirty page from start
727 * Called with rcu_read_lock() to protect migration_bitmap
729 * Returns the byte offset within memory region of the start of a dirty page
731 * @rs: current RAM state
732 * @rb: RAMBlock where to search for dirty pages
733 * @start: page where we start the search
735 static inline
736 unsigned long migration_bitmap_find_dirty(RAMState *rs, RAMBlock *rb,
737 unsigned long start)
739 unsigned long size = rb->used_length >> TARGET_PAGE_BITS;
740 unsigned long *bitmap = rb->bmap;
741 unsigned long next;
743 if (rs->ram_bulk_stage && start > 0) {
744 next = start + 1;
745 } else {
746 next = find_next_bit(bitmap, size, start);
749 return next;
752 static inline bool migration_bitmap_clear_dirty(RAMState *rs,
753 RAMBlock *rb,
754 unsigned long page)
756 bool ret;
758 ret = test_and_clear_bit(page, rb->bmap);
760 if (ret) {
761 rs->migration_dirty_pages--;
763 return ret;
766 static void migration_bitmap_sync_range(RAMState *rs, RAMBlock *rb,
767 ram_addr_t start, ram_addr_t length)
769 rs->migration_dirty_pages +=
770 cpu_physical_memory_sync_dirty_bitmap(rb, start, length,
771 &rs->num_dirty_pages_period);
775 * ram_pagesize_summary: calculate all the pagesizes of a VM
777 * Returns a summary bitmap of the page sizes of all RAMBlocks
779 * For VMs with just normal pages this is equivalent to the host page
780 * size. If it's got some huge pages then it's the OR of all the
781 * different page sizes.
783 uint64_t ram_pagesize_summary(void)
785 RAMBlock *block;
786 uint64_t summary = 0;
788 RAMBLOCK_FOREACH(block) {
789 summary |= block->page_size;
792 return summary;
795 static void migration_bitmap_sync(RAMState *rs)
797 RAMBlock *block;
798 int64_t end_time;
799 uint64_t bytes_xfer_now;
801 ram_counters.dirty_sync_count++;
803 if (!rs->time_last_bitmap_sync) {
804 rs->time_last_bitmap_sync = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
807 trace_migration_bitmap_sync_start();
808 memory_global_dirty_log_sync();
810 qemu_mutex_lock(&rs->bitmap_mutex);
811 rcu_read_lock();
812 RAMBLOCK_FOREACH(block) {
813 migration_bitmap_sync_range(rs, block, 0, block->used_length);
815 rcu_read_unlock();
816 qemu_mutex_unlock(&rs->bitmap_mutex);
818 trace_migration_bitmap_sync_end(rs->num_dirty_pages_period);
820 end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
822 /* more than 1 second = 1000 milliseconds */
823 if (end_time > rs->time_last_bitmap_sync + 1000) {
824 /* calculate period counters */
825 ram_counters.dirty_pages_rate = rs->num_dirty_pages_period * 1000
826 / (end_time - rs->time_last_bitmap_sync);
827 bytes_xfer_now = ram_counters.transferred;
829 /* During block migration the auto-converge logic incorrectly detects
830 * that ram migration makes no progress. Avoid this by disabling the
831 * throttling logic during the bulk phase of block migration. */
832 if (migrate_auto_converge() && !blk_mig_bulk_active()) {
833 /* The following detection logic can be refined later. For now:
834 Check to see if the dirtied bytes are 50% more than the approx.
835 amount of bytes that just got transferred since the last time we
836 were in this routine. If that happens twice, start or increase
837 throttling */
839 if ((rs->num_dirty_pages_period * TARGET_PAGE_SIZE >
840 (bytes_xfer_now - rs->bytes_xfer_prev) / 2) &&
841 (++rs->dirty_rate_high_cnt >= 2)) {
842 trace_migration_throttle();
843 rs->dirty_rate_high_cnt = 0;
844 mig_throttle_guest_down();
848 if (migrate_use_xbzrle()) {
849 if (rs->iterations_prev != rs->iterations) {
850 xbzrle_counters.cache_miss_rate =
851 (double)(xbzrle_counters.cache_miss -
852 rs->xbzrle_cache_miss_prev) /
853 (rs->iterations - rs->iterations_prev);
855 rs->iterations_prev = rs->iterations;
856 rs->xbzrle_cache_miss_prev = xbzrle_counters.cache_miss;
859 /* reset period counters */
860 rs->time_last_bitmap_sync = end_time;
861 rs->num_dirty_pages_period = 0;
862 rs->bytes_xfer_prev = bytes_xfer_now;
864 if (migrate_use_events()) {
865 qapi_event_send_migration_pass(ram_counters.dirty_sync_count, NULL);
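/* Summary of the auto-converge path above: at most once per second the
 * dirty page rate is recomputed; outside the bulk block-migration phase,
 * a period in which the guest dirtied more bytes than half of what was
 * transferred bumps dirty_rate_high_cnt, and the second such period
 * resets the counter and calls mig_throttle_guest_down().
 */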
870 * save_zero_page: send the zero page to the stream
872 * Returns the number of pages written.
874 * @rs: current RAM state
875 * @block: block that contains the page we want to send
876 * @offset: offset inside the block for the page
877 * @p: pointer to the page
879 static int save_zero_page(RAMState *rs, RAMBlock *block, ram_addr_t offset,
880 uint8_t *p)
882 int pages = -1;
884 if (is_zero_range(p, TARGET_PAGE_SIZE)) {
885 ram_counters.duplicate++;
886 ram_counters.transferred +=
887 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_ZERO);
888 qemu_put_byte(rs->f, 0);
889 ram_counters.transferred += 1;
890 pages = 1;
893 return pages;
896 static void ram_release_pages(const char *rbname, uint64_t offset, int pages)
898 if (!migrate_release_ram() || !migration_in_postcopy()) {
899 return;
902 ram_discard_range(rbname, offset, pages << TARGET_PAGE_BITS);
906 * ram_save_page: send the given page to the stream
908 * Returns the number of pages written.
909 * < 0 - error
910 * >=0 - Number of pages written - this might legally be 0
911 * if xbzrle noticed the page was the same.
913 * @rs: current RAM state
914 * @block: block that contains the page we want to send
915 * @offset: offset inside the block for the page
916 * @last_stage: if we are at the completion stage
918 static int ram_save_page(RAMState *rs, PageSearchStatus *pss, bool last_stage)
920 int pages = -1;
921 uint64_t bytes_xmit;
922 ram_addr_t current_addr;
923 uint8_t *p;
924 int ret;
925 bool send_async = true;
926 RAMBlock *block = pss->block;
927 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
929 p = block->host + offset;
930 trace_ram_save_page(block->idstr, (uint64_t)offset, p);
932 /* When in doubt, send the page as a normal page */
933 bytes_xmit = 0;
934 ret = ram_control_save_page(rs->f, block->offset,
935 offset, TARGET_PAGE_SIZE, &bytes_xmit);
936 if (bytes_xmit) {
937 ram_counters.transferred += bytes_xmit;
938 pages = 1;
941 XBZRLE_cache_lock();
943 current_addr = block->offset + offset;
945 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
946 if (ret != RAM_SAVE_CONTROL_DELAYED) {
947 if (bytes_xmit > 0) {
948 ram_counters.normal++;
949 } else if (bytes_xmit == 0) {
950 ram_counters.duplicate++;
953 } else {
954 pages = save_zero_page(rs, block, offset, p);
955 if (pages > 0) {
956 /* Must let xbzrle know, otherwise a previous (now 0'd) cached
957 * page would be stale
959 xbzrle_cache_zero_page(rs, current_addr);
960 ram_release_pages(block->idstr, offset, pages);
961 } else if (!rs->ram_bulk_stage &&
962 !migration_in_postcopy() && migrate_use_xbzrle()) {
963 pages = save_xbzrle_page(rs, &p, current_addr, block,
964 offset, last_stage);
965 if (!last_stage) {
966 /* Can't send this cached data async, since the cache page
967 * might get updated before it gets to the wire
969 send_async = false;
974 /* XBZRLE overflow or normal page */
975 if (pages == -1) {
976 ram_counters.transferred +=
977 save_page_header(rs, rs->f, block, offset | RAM_SAVE_FLAG_PAGE);
978 if (send_async) {
979 qemu_put_buffer_async(rs->f, p, TARGET_PAGE_SIZE,
980 migrate_release_ram() &
981 migration_in_postcopy());
982 } else {
983 qemu_put_buffer(rs->f, p, TARGET_PAGE_SIZE);
985 ram_counters.transferred += TARGET_PAGE_SIZE;
986 pages = 1;
987 ram_counters.normal++;
990 XBZRLE_cache_unlock();
992 return pages;
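/* Order of attempts in ram_save_page() above: the ram_control_save_page()
 * hook (e.g. RDMA) may transmit the page itself; otherwise a zero page is
 * tried, then XBZRLE (only after the bulk stage and outside postcopy), and
 * finally the raw page.  Data coming from the XBZRLE cache is sent
 * synchronously, because the cached copy may change before it hits the
 * wire.
 */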
995 static int do_compress_ram_page(QEMUFile *f, RAMBlock *block,
996 ram_addr_t offset)
998 RAMState *rs = ram_state;
999 int bytes_sent, blen;
1000 uint8_t *p = block->host + (offset & TARGET_PAGE_MASK);
1002 bytes_sent = save_page_header(rs, f, block, offset |
1003 RAM_SAVE_FLAG_COMPRESS_PAGE);
1004 blen = qemu_put_compression_data(f, p, TARGET_PAGE_SIZE,
1005 migrate_compress_level());
1006 if (blen < 0) {
1007 bytes_sent = 0;
1008 qemu_file_set_error(migrate_get_current()->to_dst_file, blen);
1009 error_report("compressed data failed!");
1010 } else {
1011 bytes_sent += blen;
1012 ram_release_pages(block->idstr, offset & TARGET_PAGE_MASK, 1);
1015 return bytes_sent;
1018 static void flush_compressed_data(RAMState *rs)
1020 int idx, len, thread_count;
1022 if (!migrate_use_compression()) {
1023 return;
1025 thread_count = migrate_compress_threads();
1027 qemu_mutex_lock(&comp_done_lock);
1028 for (idx = 0; idx < thread_count; idx++) {
1029 while (!comp_param[idx].done) {
1030 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1033 qemu_mutex_unlock(&comp_done_lock);
1035 for (idx = 0; idx < thread_count; idx++) {
1036 qemu_mutex_lock(&comp_param[idx].mutex);
1037 if (!comp_param[idx].quit) {
1038 len = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1039 ram_counters.transferred += len;
1041 qemu_mutex_unlock(&comp_param[idx].mutex);
1045 static inline void set_compress_params(CompressParam *param, RAMBlock *block,
1046 ram_addr_t offset)
1048 param->block = block;
1049 param->offset = offset;
1052 static int compress_page_with_multi_thread(RAMState *rs, RAMBlock *block,
1053 ram_addr_t offset)
1055 int idx, thread_count, bytes_xmit = -1, pages = -1;
1057 thread_count = migrate_compress_threads();
1058 qemu_mutex_lock(&comp_done_lock);
1059 while (true) {
1060 for (idx = 0; idx < thread_count; idx++) {
1061 if (comp_param[idx].done) {
1062 comp_param[idx].done = false;
1063 bytes_xmit = qemu_put_qemu_file(rs->f, comp_param[idx].file);
1064 qemu_mutex_lock(&comp_param[idx].mutex);
1065 set_compress_params(&comp_param[idx], block, offset);
1066 qemu_cond_signal(&comp_param[idx].cond);
1067 qemu_mutex_unlock(&comp_param[idx].mutex);
1068 pages = 1;
1069 ram_counters.normal++;
1070 ram_counters.transferred += bytes_xmit;
1071 break;
1074 if (pages > 0) {
1075 break;
1076 } else {
1077 qemu_cond_wait(&comp_done_cond, &comp_done_lock);
1080 qemu_mutex_unlock(&comp_done_lock);
1082 return pages;
1086 * ram_save_compressed_page: compress the given page and send it to the stream
1088 * Returns the number of pages written.
1090 * @rs: current RAM state
1091 * @block: block that contains the page we want to send
1092 * @offset: offset inside the block for the page
1093 * @last_stage: if we are at the completion stage
1095 static int ram_save_compressed_page(RAMState *rs, PageSearchStatus *pss,
1096 bool last_stage)
1098 int pages = -1;
1099 uint64_t bytes_xmit = 0;
1100 uint8_t *p;
1101 int ret, blen;
1102 RAMBlock *block = pss->block;
1103 ram_addr_t offset = pss->page << TARGET_PAGE_BITS;
1105 p = block->host + offset;
1107 ret = ram_control_save_page(rs->f, block->offset,
1108 offset, TARGET_PAGE_SIZE, &bytes_xmit);
1109 if (bytes_xmit) {
1110 ram_counters.transferred += bytes_xmit;
1111 pages = 1;
1113 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
1114 if (ret != RAM_SAVE_CONTROL_DELAYED) {
1115 if (bytes_xmit > 0) {
1116 ram_counters.normal++;
1117 } else if (bytes_xmit == 0) {
1118 ram_counters.duplicate++;
1121 } else {
1122 /* When starting the process of a new block, the first page of
1123 * the block should be sent out before other pages in the same
1124 * block, and all the pages in last block should have been sent
1125 * out, keeping this order is important, because the 'cont' flag
1126 * is used to avoid resending the block name.
1128 if (block != rs->last_sent_block) {
1129 flush_compressed_data(rs);
1130 pages = save_zero_page(rs, block, offset, p);
1131 if (pages == -1) {
1132 /* Make sure the first page is sent out before other pages */
1133 bytes_xmit = save_page_header(rs, rs->f, block, offset |
1134 RAM_SAVE_FLAG_COMPRESS_PAGE);
1135 blen = qemu_put_compression_data(rs->f, p, TARGET_PAGE_SIZE,
1136 migrate_compress_level());
1137 if (blen > 0) {
1138 ram_counters.transferred += bytes_xmit + blen;
1139 ram_counters.normal++;
1140 pages = 1;
1141 } else {
1142 qemu_file_set_error(rs->f, blen);
1143 error_report("compressed data failed!");
1146 if (pages > 0) {
1147 ram_release_pages(block->idstr, offset, pages);
1149 } else {
1150 pages = save_zero_page(rs, block, offset, p);
1151 if (pages == -1) {
1152 pages = compress_page_with_multi_thread(rs, block, offset);
1153 } else {
1154 ram_release_pages(block->idstr, offset, pages);
1159 return pages;
1163 * find_dirty_block: find the next dirty page and update any state
1164 * associated with the search process.
1166 * Returns whether a page was found
1168 * @rs: current RAM state
1169 * @pss: data about the state of the current dirty page scan
1170 * @again: set to false if the search has scanned the whole of RAM
1172 static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
1174 pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
1175 if (pss->complete_round && pss->block == rs->last_seen_block &&
1176 pss->page >= rs->last_page) {
1178 * We've been once around the RAM and haven't found anything.
1179 * Give up.
1181 *again = false;
1182 return false;
1184 if ((pss->page << TARGET_PAGE_BITS) >= pss->block->used_length) {
1185 /* Didn't find anything in this RAM Block */
1186 pss->page = 0;
1187 pss->block = QLIST_NEXT_RCU(pss->block, next);
1188 if (!pss->block) {
1189 /* Hit the end of the list */
1190 pss->block = QLIST_FIRST_RCU(&ram_list.blocks);
1191 /* Flag that we've looped */
1192 pss->complete_round = true;
1193 rs->ram_bulk_stage = false;
1194 if (migrate_use_xbzrle()) {
1195 /* If xbzrle is on, stop using the data compression at this
1196 * point. In theory, xbzrle can do better than compression.
1198 flush_compressed_data(rs);
1201 /* Didn't find anything this time, but try again on the new block */
1202 *again = true;
1203 return false;
1204 } else {
1205 /* Can go around again, but... */
1206 *again = true;
1207 /* We've found something so probably don't need to */
1208 return true;
1213 * unqueue_page: gets a page of the queue
1215 * Helper for 'get_queued_page' - gets a page off the queue
1217 * Returns the block of the page (or NULL if none available)
1219 * @rs: current RAM state
1220 * @offset: used to return the offset within the RAMBlock
1222 static RAMBlock *unqueue_page(RAMState *rs, ram_addr_t *offset)
1224 RAMBlock *block = NULL;
1226 qemu_mutex_lock(&rs->src_page_req_mutex);
1227 if (!QSIMPLEQ_EMPTY(&rs->src_page_requests)) {
1228 struct RAMSrcPageRequest *entry =
1229 QSIMPLEQ_FIRST(&rs->src_page_requests);
1230 block = entry->rb;
1231 *offset = entry->offset;
1233 if (entry->len > TARGET_PAGE_SIZE) {
1234 entry->len -= TARGET_PAGE_SIZE;
1235 entry->offset += TARGET_PAGE_SIZE;
1236 } else {
1237 memory_region_unref(block->mr);
1238 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1239 g_free(entry);
1242 qemu_mutex_unlock(&rs->src_page_req_mutex);
1244 return block;
1248 * get_queued_page: unqueue a page from the postcopy requests
1250 * Skips pages that are already sent (!dirty)
1252 * Returns whether a queued page was found
1254 * @rs: current RAM state
1255 * @pss: data about the state of the current dirty page scan
1257 static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
1259 RAMBlock *block;
1260 ram_addr_t offset;
1261 bool dirty;
1263 do {
1264 block = unqueue_page(rs, &offset);
1266 * We're sending this page, and since it's postcopy nothing else
1267 * will dirty it, and we must make sure it doesn't get sent again
1268 * even if this queue request was received after the background
1269 * search already sent it.
1271 if (block) {
1272 unsigned long page;
1274 page = offset >> TARGET_PAGE_BITS;
1275 dirty = test_bit(page, block->bmap);
1276 if (!dirty) {
1277 trace_get_queued_page_not_dirty(block->idstr, (uint64_t)offset,
1278 page, test_bit(page, block->unsentmap));
1279 } else {
1280 trace_get_queued_page(block->idstr, (uint64_t)offset, page);
1284 } while (block && !dirty);
1286 if (block) {
1288 * As soon as we start servicing pages out of order, then we have
1289 * to kill the bulk stage, since the bulk stage assumes
1290 * in (migration_bitmap_find_and_reset_dirty) that every page is
1291 * dirty, that's no longer true.
1293 rs->ram_bulk_stage = false;
1296 * We want the background search to continue from the queued page
1297 * since the guest is likely to want other pages near to the page
1298 * it just requested.
1300 pss->block = block;
1301 pss->page = offset >> TARGET_PAGE_BITS;
1304 return !!block;
1308 * migration_page_queue_free: drop any remaining pages in the ram
1309 * request queue
1311 * It should be empty at the end anyway, but in error cases there may
1312 * be some left. In case any pages are left, we drop them.
1315 static void migration_page_queue_free(RAMState *rs)
1317 struct RAMSrcPageRequest *mspr, *next_mspr;
1318 /* This queue generally should be empty - but in the case of a failed
1319 * migration it might have some droppings in.
1321 rcu_read_lock();
1322 QSIMPLEQ_FOREACH_SAFE(mspr, &rs->src_page_requests, next_req, next_mspr) {
1323 memory_region_unref(mspr->rb->mr);
1324 QSIMPLEQ_REMOVE_HEAD(&rs->src_page_requests, next_req);
1325 g_free(mspr);
1327 rcu_read_unlock();
1331 * ram_save_queue_pages: queue the page for transmission
1333 * A request from postcopy destination for example.
1335 * Returns zero on success or negative on error
1337 * @rbname: Name of the RAMBlock of the request. NULL means the
1338 * same as the last one.
1339 * @start: starting address from the start of the RAMBlock
1340 * @len: length (in bytes) to send
1342 int ram_save_queue_pages(const char *rbname, ram_addr_t start, ram_addr_t len)
1344 RAMBlock *ramblock;
1345 RAMState *rs = ram_state;
1347 ram_counters.postcopy_requests++;
1348 rcu_read_lock();
1349 if (!rbname) {
1350 /* Reuse last RAMBlock */
1351 ramblock = rs->last_req_rb;
1353 if (!ramblock) {
1355 * Shouldn't happen, we can't reuse the last RAMBlock if
1356 * it's the 1st request.
1358 error_report("ram_save_queue_pages no previous block");
1359 goto err;
1361 } else {
1362 ramblock = qemu_ram_block_by_name(rbname);
1364 if (!ramblock) {
1365 /* We shouldn't be asked for a non-existent RAMBlock */
1366 error_report("ram_save_queue_pages no block '%s'", rbname);
1367 goto err;
1369 rs->last_req_rb = ramblock;
1371 trace_ram_save_queue_pages(ramblock->idstr, start, len);
1372 if (start+len > ramblock->used_length) {
1373 error_report("%s request overrun start=" RAM_ADDR_FMT " len="
1374 RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
1375 __func__, start, len, ramblock->used_length);
1376 goto err;
1379 struct RAMSrcPageRequest *new_entry =
1380 g_malloc0(sizeof(struct RAMSrcPageRequest));
1381 new_entry->rb = ramblock;
1382 new_entry->offset = start;
1383 new_entry->len = len;
1385 memory_region_ref(ramblock->mr);
1386 qemu_mutex_lock(&rs->src_page_req_mutex);
1387 QSIMPLEQ_INSERT_TAIL(&rs->src_page_requests, new_entry, next_req);
1388 qemu_mutex_unlock(&rs->src_page_req_mutex);
1389 rcu_read_unlock();
1391 return 0;
1393 err:
1394 rcu_read_unlock();
1395 return -1;
1399 * ram_save_target_page: save one target page
1401 * Returns the number of pages written
1403 * @rs: current RAM state
1404 * @ms: current migration state
1405 * @pss: data about the page we want to send
1406 * @last_stage: if we are at the completion stage
1408 static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss,
1409 bool last_stage)
1411 int res = 0;
1413 /* Check if the page is dirty and, if so, send it */
1414 if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
1416 * If xbzrle is on, stop using the data compression after first
1417 * round of migration even if compression is enabled. In theory,
1418 * xbzrle can do better than compression.
1420 if (migrate_use_compression() &&
1421 (rs->ram_bulk_stage || !migrate_use_xbzrle())) {
1422 res = ram_save_compressed_page(rs, pss, last_stage);
1423 } else {
1424 res = ram_save_page(rs, pss, last_stage);
1427 if (res < 0) {
1428 return res;
1430 if (pss->block->unsentmap) {
1431 clear_bit(pss->page, pss->block->unsentmap);
1435 return res;
1439 * ram_save_host_page: save a whole host page
1441 * Starting at *offset send pages up to the end of the current host
1442 * page. It's valid for the initial offset to point into the middle of
1443 * a host page in which case the remainder of the hostpage is sent.
1444 * Only dirty target pages are sent. Note that the host page size may
1445 * be a huge page for this block.
1446 * The saving stops at the boundary of the used_length of the block
1447 * if the RAMBlock isn't a multiple of the host page size.
1449 * Returns the number of pages written or negative on error
1451 * @rs: current RAM state
1452 * @ms: current migration state
1453 * @pss: data about the page we want to send
1454 * @last_stage: if we are at the completion stage
1456 static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss,
1457 bool last_stage)
1459 int tmppages, pages = 0;
1460 size_t pagesize_bits =
1461 qemu_ram_pagesize(pss->block) >> TARGET_PAGE_BITS;
1463 do {
1464 tmppages = ram_save_target_page(rs, pss, last_stage);
1465 if (tmppages < 0) {
1466 return tmppages;
1469 pages += tmppages;
1470 pss->page++;
1471 } while ((pss->page & (pagesize_bits - 1)) &&
1472 offset_in_ramblock(pss->block, pss->page << TARGET_PAGE_BITS));
1474 /* The offset we leave with is the last one we looked at */
1475 pss->page--;
1476 return pages;
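/* Example (assuming 4 KiB target pages): for a RAMBlock backed by 2 MiB
 * huge pages, pagesize_bits is 512, so a single call can send up to 512
 * consecutive target pages before stopping at the host-page boundary.
 */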
1480 * ram_find_and_save_block: finds a dirty page and sends it to f
1482 * Called within an RCU critical section.
1484 * Returns the number of pages written where zero means no dirty pages
1486 * @rs: current RAM state
1487 * @last_stage: if we are at the completion stage
1489 * On systems where host-page-size > target-page-size it will send all the
1490 * pages in a host page that are dirty.
1493 static int ram_find_and_save_block(RAMState *rs, bool last_stage)
1495 PageSearchStatus pss;
1496 int pages = 0;
1497 bool again, found;
1499 /* No dirty page as there is zero RAM */
1500 if (!ram_bytes_total()) {
1501 return pages;
1504 pss.block = rs->last_seen_block;
1505 pss.page = rs->last_page;
1506 pss.complete_round = false;
1508 if (!pss.block) {
1509 pss.block = QLIST_FIRST_RCU(&ram_list.blocks);
1512 do {
1513 again = true;
1514 found = get_queued_page(rs, &pss);
1516 if (!found) {
1517 /* priority queue empty, so just search for something dirty */
1518 found = find_dirty_block(rs, &pss, &again);
1521 if (found) {
1522 pages = ram_save_host_page(rs, &pss, last_stage);
1524 } while (!pages && again);
1526 rs->last_seen_block = pss.block;
1527 rs->last_page = pss.page;
1529 return pages;
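/* The search order above: queued postcopy requests (get_queued_page) take
 * priority over the linear dirty-bitmap scan (find_dirty_block); the loop
 * ends once a host page has been sent or the whole of RAM has been
 * scanned, and the next call resumes from last_seen_block/last_page.
 */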
1532 void acct_update_position(QEMUFile *f, size_t size, bool zero)
1534 uint64_t pages = size / TARGET_PAGE_SIZE;
1536 if (zero) {
1537 ram_counters.duplicate += pages;
1538 } else {
1539 ram_counters.normal += pages;
1540 ram_counters.transferred += size;
1541 qemu_update_position(f, size);
1545 uint64_t ram_bytes_total(void)
1547 RAMBlock *block;
1548 uint64_t total = 0;
1550 rcu_read_lock();
1551 RAMBLOCK_FOREACH(block) {
1552 total += block->used_length;
1554 rcu_read_unlock();
1555 return total;
1558 static void xbzrle_load_setup(void)
1560 XBZRLE.decoded_buf = g_malloc(TARGET_PAGE_SIZE);
1563 static void xbzrle_load_cleanup(void)
1565 g_free(XBZRLE.decoded_buf);
1566 XBZRLE.decoded_buf = NULL;
1569 static void ram_save_cleanup(void *opaque)
1571 RAMState **rsp = opaque;
1572 RAMBlock *block;
1574 /* The caller has to hold the iothread lock or be in a bh, so there is
1575 * no writing race against this migration_bitmap
1577 memory_global_dirty_log_stop();
1579 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1580 g_free(block->bmap);
1581 block->bmap = NULL;
1582 g_free(block->unsentmap);
1583 block->unsentmap = NULL;
1586 XBZRLE_cache_lock();
1587 if (XBZRLE.cache) {
1588 cache_fini(XBZRLE.cache);
1589 g_free(XBZRLE.encoded_buf);
1590 g_free(XBZRLE.current_buf);
1591 g_free(XBZRLE.zero_target_page);
1592 XBZRLE.cache = NULL;
1593 XBZRLE.encoded_buf = NULL;
1594 XBZRLE.current_buf = NULL;
1595 XBZRLE.zero_target_page = NULL;
1597 XBZRLE_cache_unlock();
1598 migration_page_queue_free(*rsp);
1599 compress_threads_save_cleanup();
1600 g_free(*rsp);
1601 *rsp = NULL;
1604 static void ram_state_reset(RAMState *rs)
1606 rs->last_seen_block = NULL;
1607 rs->last_sent_block = NULL;
1608 rs->last_page = 0;
1609 rs->last_version = ram_list.version;
1610 rs->ram_bulk_stage = true;
1613 #define MAX_WAIT 50 /* ms, half buffered_file limit */
1616 * 'expected' is the value you expect the bitmap mostly to be full
1617 * of; it won't bother printing lines that are all this value.
1618 * If 'todump' is null the migration bitmap is dumped.
1620 void ram_debug_dump_bitmap(unsigned long *todump, bool expected,
1621 unsigned long pages)
1623 int64_t cur;
1624 int64_t linelen = 128;
1625 char linebuf[129];
1627 for (cur = 0; cur < pages; cur += linelen) {
1628 int64_t curb;
1629 bool found = false;
1631 * Last line; catch the case where the line length
1632 * is longer than remaining ram
1634 if (cur + linelen > pages) {
1635 linelen = pages - cur;
1637 for (curb = 0; curb < linelen; curb++) {
1638 bool thisbit = test_bit(cur + curb, todump);
1639 linebuf[curb] = thisbit ? '1' : '.';
1640 found = found || (thisbit != expected);
1642 if (found) {
1643 linebuf[curb] = '\0';
1644 fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf);
1649 /* **** functions for postcopy ***** */
1651 void ram_postcopy_migrated_memory_release(MigrationState *ms)
1653 struct RAMBlock *block;
1655 RAMBLOCK_FOREACH(block) {
1656 unsigned long *bitmap = block->bmap;
1657 unsigned long range = block->used_length >> TARGET_PAGE_BITS;
1658 unsigned long run_start = find_next_zero_bit(bitmap, range, 0);
1660 while (run_start < range) {
1661 unsigned long run_end = find_next_bit(bitmap, range, run_start + 1);
1662 ram_discard_range(block->idstr, run_start << TARGET_PAGE_BITS,
1663 (run_end - run_start) << TARGET_PAGE_BITS);
1664 run_start = find_next_zero_bit(bitmap, range, run_end + 1);
1670 * postcopy_send_discard_bm_ram: discard a RAMBlock
1672 * Returns zero on success
1674 * Callback from postcopy_each_ram_send_discard for each RAMBlock
1675 * Note: At this point the 'unsentmap' is the processed bitmap combined
1676 * with the dirtymap; so a '1' means it's either dirty or unsent.
1678 * @ms: current migration state
1679 * @pds: state for postcopy
1680 * @start: RAMBlock starting page
1681 * @length: RAMBlock size
1683 static int postcopy_send_discard_bm_ram(MigrationState *ms,
1684 PostcopyDiscardState *pds,
1685 RAMBlock *block)
1687 unsigned long end = block->used_length >> TARGET_PAGE_BITS;
1688 unsigned long current;
1689 unsigned long *unsentmap = block->unsentmap;
1691 for (current = 0; current < end; ) {
1692 unsigned long one = find_next_bit(unsentmap, end, current);
1694 if (one <= end) {
1695 unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
1696 unsigned long discard_length;
1698 if (zero >= end) {
1699 discard_length = end - one;
1700 } else {
1701 discard_length = zero - one;
1703 if (discard_length) {
1704 postcopy_discard_send_range(ms, pds, one, discard_length);
1706 current = one + discard_length;
1707 } else {
1708 current = one;
1712 return 0;
1716 * postcopy_each_ram_send_discard: discard all RAMBlocks
1718 * Returns 0 for success or negative for error
1720 * Utility for the outgoing postcopy code.
1721 * Calls postcopy_send_discard_bm_ram for each RAMBlock
1722 * passing it bitmap indexes and name.
1723 * (qemu_ram_foreach_block ends up passing unscaled lengths
1724 * which would mean postcopy code would have to deal with target page)
1726 * @ms: current migration state
1728 static int postcopy_each_ram_send_discard(MigrationState *ms)
1730 struct RAMBlock *block;
1731 int ret;
1733 RAMBLOCK_FOREACH(block) {
1734 PostcopyDiscardState *pds =
1735 postcopy_discard_send_init(ms, block->idstr);
1738 * Postcopy sends chunks of bitmap over the wire, but it
1739 * just needs indexes at this point, avoids it having
1740 * target page specific code.
1742 ret = postcopy_send_discard_bm_ram(ms, pds, block);
1743 postcopy_discard_send_finish(ms, pds);
1744 if (ret) {
1745 return ret;
1749 return 0;
1753 * postcopy_chunk_hostpages_pass: canonicalize bitmap in hostpages
1755 * Helper for postcopy_chunk_hostpages; it's called twice to
1756 * canonicalize the two bitmaps, which are similar, but one is
1757 * inverted.
1759 * Postcopy requires that all target pages in a hostpage are dirty or
1760 * clean, not a mix. This function canonicalizes the bitmaps.
1762 * @ms: current migration state
1763 * @unsent_pass: if true we need to canonicalize partially unsent host pages
1764 * otherwise we need to canonicalize partially dirty host pages
1765 * @block: block that contains the page we want to canonicalize
1766 * @pds: state for postcopy
1768 static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
1769 RAMBlock *block,
1770 PostcopyDiscardState *pds)
1772 RAMState *rs = ram_state;
1773 unsigned long *bitmap = block->bmap;
1774 unsigned long *unsentmap = block->unsentmap;
1775 unsigned int host_ratio = block->page_size / TARGET_PAGE_SIZE;
1776 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1777 unsigned long run_start;
1779 if (block->page_size == TARGET_PAGE_SIZE) {
1780 /* Easy case - TPS==HPS for a non-huge page RAMBlock */
1781 return;
1784 if (unsent_pass) {
1785 /* Find a sent page */
1786 run_start = find_next_zero_bit(unsentmap, pages, 0);
1787 } else {
1788 /* Find a dirty page */
1789 run_start = find_next_bit(bitmap, pages, 0);
1792 while (run_start < pages) {
1793 bool do_fixup = false;
1794 unsigned long fixup_start_addr;
1795 unsigned long host_offset;
1798 * If the start of this run of pages is in the middle of a host
1799 * page, then we need to fixup this host page.
1801 host_offset = run_start % host_ratio;
1802 if (host_offset) {
1803 do_fixup = true;
1804 run_start -= host_offset;
1805 fixup_start_addr = run_start;
1806 /* For the next pass */
1807 run_start = run_start + host_ratio;
1808 } else {
1809 /* Find the end of this run */
1810 unsigned long run_end;
1811 if (unsent_pass) {
1812 run_end = find_next_bit(unsentmap, pages, run_start + 1);
1813 } else {
1814 run_end = find_next_zero_bit(bitmap, pages, run_start + 1);
1817 * If the end isn't at the start of a host page, then the
1818 * run doesn't finish at the end of a host page
1819 * and we need to discard.
1821 host_offset = run_end % host_ratio;
1822 if (host_offset) {
1823 do_fixup = true;
1824 fixup_start_addr = run_end - host_offset;
1826 * This host page has gone, the next loop iteration starts
1827 * from after the fixup
1829 run_start = fixup_start_addr + host_ratio;
1830 } else {
1832 * No discards on this iteration, next loop starts from
1833 * next sent/dirty page
1835 run_start = run_end + 1;
1839 if (do_fixup) {
1840 unsigned long page;
1842 /* Tell the destination to discard this page */
1843 if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
1844 /* For the unsent_pass we:
1845 * discard partially sent pages
1846 * For the !unsent_pass (dirty) we:
1847 * discard partially dirty pages that were sent
1848 * (any partially sent pages were already discarded
1849 * by the previous unsent_pass)
1851 postcopy_discard_send_range(ms, pds, fixup_start_addr,
1852 host_ratio);
1855 /* Clean up the bitmap */
1856 for (page = fixup_start_addr;
1857 page < fixup_start_addr + host_ratio; page++) {
1858 /* All pages in this host page are now not sent */
1859 set_bit(page, unsentmap);
1862 * Remark them as dirty, updating the count for any pages
1863 * that weren't previously dirty.
1865 rs->migration_dirty_pages += !test_and_set_bit(page, bitmap);
1869 if (unsent_pass) {
1870 /* Find the next sent page for the next iteration */
1871 run_start = find_next_zero_bit(unsentmap, pages, run_start);
1872 } else {
1873 /* Find the next dirty page for the next iteration */
1874 run_start = find_next_bit(bitmap, pages, run_start);
1880 * postcopy_chunk_hostpages: discard any partially sent host page
1882 * Utility for the outgoing postcopy code.
1884 * Discard any partially sent host-page size chunks, mark any partially
1885 * dirty host-page size chunks as all dirty. In this case the host-page
1886 * is the host-page for the particular RAMBlock, i.e. it might be a huge page
1888 * Returns zero on success
1890 * @ms: current migration state
1891 * @block: block we want to work with
1893 static int postcopy_chunk_hostpages(MigrationState *ms, RAMBlock *block)
1895 PostcopyDiscardState *pds =
1896 postcopy_discard_send_init(ms, block->idstr);
1898 /* First pass: Discard all partially sent host pages */
1899 postcopy_chunk_hostpages_pass(ms, true, block, pds);
1901 * Second pass: Ensure that all partially dirty host pages are made
1902 * fully dirty.
1904 postcopy_chunk_hostpages_pass(ms, false, block, pds);
1906 postcopy_discard_send_finish(ms, pds);
1907 return 0;
1911 * ram_postcopy_send_discard_bitmap: transmit the discard bitmap
1913 * Returns zero on success
1915 * Transmit the set of pages to be discarded after precopy to the target
1916 * these are pages that:
1917 * a) Have been previously transmitted but are now dirty again
1918 * b) Pages that have never been transmitted; this ensures that
1919 * any pages on the destination that have been mapped by background
1920 * tasks get discarded (transparent huge pages is the specific concern)
1921 * Hopefully this is pretty sparse
1923 * @ms: current migration state
1925 int ram_postcopy_send_discard_bitmap(MigrationState *ms)
1927 RAMState *rs = ram_state;
1928 RAMBlock *block;
1929 int ret;
1931 rcu_read_lock();
1933 /* This should be our last sync, the src is now paused */
1934 migration_bitmap_sync(rs);
1936 /* Easiest way to make sure we don't resume in the middle of a host-page */
1937 rs->last_seen_block = NULL;
1938 rs->last_sent_block = NULL;
1939 rs->last_page = 0;
1941 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
1942 unsigned long pages = block->used_length >> TARGET_PAGE_BITS;
1943 unsigned long *bitmap = block->bmap;
1944 unsigned long *unsentmap = block->unsentmap;
1946 if (!unsentmap) {
1947 /* We don't have a safe way to resize the sentmap, so
1948 * if the bitmap was resized it will be NULL at this
1949 * point.
1951 error_report("migration ram resized during precopy phase");
1952 rcu_read_unlock();
1953 return -EINVAL;
1955 /* Deal with TPS != HPS and huge pages */
1956 ret = postcopy_chunk_hostpages(ms, block);
1957 if (ret) {
1958 rcu_read_unlock();
1959 return ret;
1963 * Update the unsentmap to be unsentmap = unsentmap | dirty
1965 bitmap_or(unsentmap, unsentmap, bitmap, pages);
1966 #ifdef DEBUG_POSTCOPY
1967 ram_debug_dump_bitmap(unsentmap, true, pages);
1968 #endif
1970 trace_ram_postcopy_send_discard_bitmap();
1972 ret = postcopy_each_ram_send_discard(ms);
1973 rcu_read_unlock();
1975 return ret;
1979 * ram_discard_range: discard dirtied pages at the beginning of postcopy
1981 * Returns zero on success
1983 * @rbname: name of the RAMBlock of the request. NULL means the
1984 * same as the last one.
1985 * @start: RAMBlock starting page
1986 * @length: RAMBlock size
1988 int ram_discard_range(const char *rbname, uint64_t start, size_t length)
1990 int ret = -1;
1992 trace_ram_discard_range(rbname, start, length);
1994 rcu_read_lock();
1995 RAMBlock *rb = qemu_ram_block_by_name(rbname);
1997 if (!rb) {
1998 error_report("ram_discard_range: Failed to find block '%s'", rbname);
1999 goto err;
2002 ret = ram_block_discard_range(rb, start, length);
2004 err:
2005 rcu_read_unlock();
2007 return ret;
2010 static int ram_state_init(RAMState **rsp)
2012 *rsp = g_new0(RAMState, 1);
2014 qemu_mutex_init(&(*rsp)->bitmap_mutex);
2015 qemu_mutex_init(&(*rsp)->src_page_req_mutex);
2016 QSIMPLEQ_INIT(&(*rsp)->src_page_requests);
2018 if (migrate_use_xbzrle()) {
2019 XBZRLE_cache_lock();
2020 XBZRLE.zero_target_page = g_malloc0(TARGET_PAGE_SIZE);
2021 XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
2022 TARGET_PAGE_SIZE,
2023 TARGET_PAGE_SIZE);
2024 if (!XBZRLE.cache) {
2025 XBZRLE_cache_unlock();
2026 error_report("Error creating cache");
2027 g_free(*rsp);
2028 *rsp = NULL;
2029 return -1;
2031 XBZRLE_cache_unlock();
2033 /* We prefer not to abort if there is no memory */
2034 XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE);
2035 if (!XBZRLE.encoded_buf) {
2036 error_report("Error allocating encoded_buf");
2037 g_free(*rsp);
2038 *rsp = NULL;
2039 return -1;
2042 XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE);
2043 if (!XBZRLE.current_buf) {
2044 error_report("Error allocating current_buf");
2045 g_free(XBZRLE.encoded_buf);
2046 XBZRLE.encoded_buf = NULL;
2047 g_free(*rsp);
2048 *rsp = NULL;
2049 return -1;
2053 /* For memory_global_dirty_log_start below. */
2054 qemu_mutex_lock_iothread();
2056 qemu_mutex_lock_ramlist();
2057 rcu_read_lock();
2058 ram_state_reset(*rsp);
2060 /* Skip setting bitmap if there is no RAM */
2061 if (ram_bytes_total()) {
2062 RAMBlock *block;
2064 QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
2065 unsigned long pages = block->max_length >> TARGET_PAGE_BITS;
2067 block->bmap = bitmap_new(pages);
2068 bitmap_set(block->bmap, 0, pages);
2069 if (migrate_postcopy_ram()) {
2070 block->unsentmap = bitmap_new(pages);
2071 bitmap_set(block->unsentmap, 0, pages);
2077 * Count the total number of pages used by RAM blocks, not including any
2078 * gaps due to alignment or unplugs.
2080 (*rsp)->migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS;
2082 memory_global_dirty_log_start();
2083 migration_bitmap_sync(*rsp);
2084 qemu_mutex_unlock_ramlist();
2085 qemu_mutex_unlock_iothread();
2086 rcu_read_unlock();
2088 return 0;
2092 * Each of ram_save_setup, ram_save_iterate and ram_save_complete has a
2093 * long-running RCU critical section.  When RCU reclaims in the code
2094 * start to become numerous, it will be necessary to reduce the
2095 * granularity of these critical sections.
2099 * ram_save_setup: Setup RAM for migration
2101 * Returns zero to indicate success and negative for error
2103 * @f: QEMUFile where to send the data
2104 * @opaque: RAMState pointer
2106 static int ram_save_setup(QEMUFile *f, void *opaque)
2108 RAMState **rsp = opaque;
2109 RAMBlock *block;
2111 /* migration has already set up the bitmap; reuse it. */
2112 if (!migration_in_colo_state()) {
2113 if (ram_state_init(rsp) != 0) {
2114 return -1;
2117 (*rsp)->f = f;
2119 rcu_read_lock();
2121 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
2123 RAMBLOCK_FOREACH(block) {
2124 qemu_put_byte(f, strlen(block->idstr));
2125 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
2126 qemu_put_be64(f, block->used_length);
2127 if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) {
2128 qemu_put_be64(f, block->page_size);
2132 rcu_read_unlock();
2133 compress_threads_save_setup();
2135 ram_control_before_iterate(f, RAM_CONTROL_SETUP);
2136 ram_control_after_iterate(f, RAM_CONTROL_SETUP);
2138 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2140 return 0;
2144 * ram_save_iterate: iterative stage for migration
2146 * Returns zero to indicate success and negative for error
2148 * @f: QEMUFile where to send the data
2149 * @opaque: RAMState pointer
2151 static int ram_save_iterate(QEMUFile *f, void *opaque)
2153 RAMState **temp = opaque;
2154 RAMState *rs = *temp;
2155 int ret;
2156 int i;
2157 int64_t t0;
2158 int done = 0;
2160 rcu_read_lock();
2161 if (ram_list.version != rs->last_version) {
2162 ram_state_reset(rs);
2165 /* Read version before ram_list.blocks */
2166 smp_rmb();
2168 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
2170 t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
2171 i = 0;
2172 while ((ret = qemu_file_rate_limit(f)) == 0) {
2173 int pages;
2175 pages = ram_find_and_save_block(rs, false);
2176 /* no more pages to send */
2177 if (pages == 0) {
2178 done = 1;
2179 break;
2181 rs->iterations++;
2183 /* we want to check in the 1st loop, just in case it was the 1st time
2184 and we had to sync the dirty bitmap.
2185 qemu_clock_get_ns() is a bit expensive, so we only check every few
2186 iterations
2188 if ((i & 63) == 0) {
2189 uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000;
2190 if (t1 > MAX_WAIT) {
2191 trace_ram_save_iterate_big_wait(t1, i);
2192 break;
2195 i++;
2197 flush_compressed_data(rs);
2198 rcu_read_unlock();
2201 * Must occur before EOS (or any QEMUFile operation)
2202 * because of the RDMA protocol.
2204 ram_control_after_iterate(f, RAM_CONTROL_ROUND);
2206 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2207 ram_counters.transferred += 8;
2209 ret = qemu_file_get_error(f);
2210 if (ret < 0) {
2211 return ret;
2214 return done;
2218 * ram_save_complete: function called to send the remaining amount of ram
2220 * Returns zero to indicate success
2222 * Called with the iothread lock held
2224 * @f: QEMUFile where to send the data
2225 * @opaque: RAMState pointer
2227 static int ram_save_complete(QEMUFile *f, void *opaque)
2229 RAMState **temp = opaque;
2230 RAMState *rs = *temp;
2232 rcu_read_lock();
2234 if (!migration_in_postcopy()) {
2235 migration_bitmap_sync(rs);
2238 ram_control_before_iterate(f, RAM_CONTROL_FINISH);
2240 /* try transferring iterative blocks of memory */
2242 /* flush all remaining blocks regardless of rate limiting */
2243 while (true) {
2244 int pages;
2246 pages = ram_find_and_save_block(rs, !migration_in_colo_state());
2248 /* no more blocks to send */
2248 if (pages == 0) {
2249 break;
2253 flush_compressed_data(rs);
2254 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
2256 rcu_read_unlock();
2258 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
2260 return 0;
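/**
 * ram_save_pending: estimate the amount of RAM left to send
 *
 * If the estimate drops below @max_size (and we are not in postcopy),
 * the dirty bitmap is re-synced to refine it.  The result is reported
 * as postcopiable when the postcopy-ram capability is enabled.
 *
 * @f: QEMUFile the data will be sent to
 * @opaque: RAMState pointer
 * @max_size: threshold below which the bitmap is re-synced
 * @non_postcopiable_pending: pending bytes that must be sent in precopy
 * @postcopiable_pending: pending bytes that can be sent in postcopy
 */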
2263 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
2264 uint64_t *non_postcopiable_pending,
2265 uint64_t *postcopiable_pending)
2267 RAMState **temp = opaque;
2268 RAMState *rs = *temp;
2269 uint64_t remaining_size;
2271 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2273 if (!migration_in_postcopy() &&
2274 remaining_size < max_size) {
2275 qemu_mutex_lock_iothread();
2276 rcu_read_lock();
2277 migration_bitmap_sync(rs);
2278 rcu_read_unlock();
2279 qemu_mutex_unlock_iothread();
2280 remaining_size = rs->migration_dirty_pages * TARGET_PAGE_SIZE;
2283 if (migrate_postcopy_ram()) {
2284 /* We can do postcopy, and all the data is postcopiable */
2285 *postcopiable_pending += remaining_size;
2286 } else {
2287 *non_postcopiable_pending += remaining_size;
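/**
 * load_xbzrle: decode one XBZRLE-encoded page from the stream
 *
 * Returns zero on success and -1 on error
 *
 * @f: QEMUFile to read the encoded page from
 * @addr: guest address of the page
 * @host: host address the decoded page is written to
 */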
2291 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
2293 unsigned int xh_len;
2294 int xh_flags;
2295 uint8_t *loaded_data;
2297 /* extract RLE header */
2298 xh_flags = qemu_get_byte(f);
2299 xh_len = qemu_get_be16(f);
2301 if (xh_flags != ENCODING_FLAG_XBZRLE) {
2302 error_report("Failed to load XBZRLE page - wrong compression!");
2303 return -1;
2306 if (xh_len > TARGET_PAGE_SIZE) {
2307 error_report("Failed to load XBZRLE page - len overflow!");
2308 return -1;
2310 loaded_data = XBZRLE.decoded_buf;
2311 /* load data and decode */
2312 /* it can change loaded_data to point to an internal buffer */
2313 qemu_get_buffer_in_place(f, &loaded_data, xh_len);
2315 /* decode RLE */
2316 if (xbzrle_decode_buffer(loaded_data, xh_len, host,
2317 TARGET_PAGE_SIZE) == -1) {
2318 error_report("Failed to load XBZRLE page - decode error!");
2319 return -1;
2322 return 0;
2326 * ram_block_from_stream: read a RAMBlock id from the migration stream
2328 * Must be called from within a rcu critical section.
2330 * Returns a pointer from within the RCU-protected ram_list.
2332 * @f: QEMUFile where to read the data from
2333 * @flags: Page flags (mostly to see if it's a continuation of previous block)
2335 static inline RAMBlock *ram_block_from_stream(QEMUFile *f, int flags)
2337 static RAMBlock *block = NULL;
2338 char id[256];
2339 uint8_t len;
2341 if (flags & RAM_SAVE_FLAG_CONTINUE) {
2342 if (!block) {
2343 error_report("Ack, bad migration stream!");
2344 return NULL;
2346 return block;
2349 len = qemu_get_byte(f);
2350 qemu_get_buffer(f, (uint8_t *)id, len);
2351 id[len] = 0;
2353 block = qemu_ram_block_by_name(id);
2354 if (!block) {
2355 error_report("Can't find block %s", id);
2356 return NULL;
2359 return block;
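/* Return the host address for @offset within @block, or NULL if the
 * offset is outside the block's used range. */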
2362 static inline void *host_from_ram_block_offset(RAMBlock *block,
2363 ram_addr_t offset)
2365 if (!offset_in_ramblock(block, offset)) {
2366 return NULL;
2369 return block->host + offset;
2373 * ram_handle_compressed: handle the zero page case
2375 * If a page (or a whole RDMA chunk) has been
2376 * determined to be zero, then zap it.
2378 * @host: host address for the zero page
2379 * @ch: the byte the page is filled with; only zero is supported
2380 * @size: size of the zero page
2382 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size)
2384 if (ch != 0 || !is_zero_range(host, size)) {
2385 memset(host, ch, size);
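/* Decompression worker thread: waits for a page to be handed over in
 * its DecompressParam, uncompresses it straight into guest memory and
 * signals completion on decomp_done_cond. */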
2389 static void *do_data_decompress(void *opaque)
2391 DecompressParam *param = opaque;
2392 unsigned long pagesize;
2393 uint8_t *des;
2394 int len;
2396 qemu_mutex_lock(&param->mutex);
2397 while (!param->quit) {
2398 if (param->des) {
2399 des = param->des;
2400 len = param->len;
2401 param->des = 0;
2402 qemu_mutex_unlock(&param->mutex);
2404 pagesize = TARGET_PAGE_SIZE;
2405 /* uncompress() may fail in some cases, especially when the
2406 * page was dirtied while it was being compressed.  That's not
2407 * a problem because the dirty page will be retransmitted and
2408 * uncompress() won't corrupt the data in other pages.
2410 uncompress((Bytef *)des, &pagesize,
2411 (const Bytef *)param->compbuf, len);
2413 qemu_mutex_lock(&decomp_done_lock);
2414 param->done = true;
2415 qemu_cond_signal(&decomp_done_cond);
2416 qemu_mutex_unlock(&decomp_done_lock);
2418 qemu_mutex_lock(&param->mutex);
2419 } else {
2420 qemu_cond_wait(&param->cond, &param->mutex);
2423 qemu_mutex_unlock(&param->mutex);
2425 return NULL;
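/* Wait until every decompression worker has finished its current page */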
2428 static void wait_for_decompress_done(void)
2430 int idx, thread_count;
2432 if (!migrate_use_compression()) {
2433 return;
2436 thread_count = migrate_decompress_threads();
2437 qemu_mutex_lock(&decomp_done_lock);
2438 for (idx = 0; idx < thread_count; idx++) {
2439 while (!decomp_param[idx].done) {
2440 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2443 qemu_mutex_unlock(&decomp_done_lock);
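/* Create the decompression worker threads and their per-thread state */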
2446 static void compress_threads_load_setup(void)
2448 int i, thread_count;
2450 if (!migrate_use_compression()) {
2451 return;
2453 thread_count = migrate_decompress_threads();
2454 decompress_threads = g_new0(QemuThread, thread_count);
2455 decomp_param = g_new0(DecompressParam, thread_count);
2456 qemu_mutex_init(&decomp_done_lock);
2457 qemu_cond_init(&decomp_done_cond);
2458 for (i = 0; i < thread_count; i++) {
2459 qemu_mutex_init(&decomp_param[i].mutex);
2460 qemu_cond_init(&decomp_param[i].cond);
2461 decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE));
2462 decomp_param[i].done = true;
2463 decomp_param[i].quit = false;
2464 qemu_thread_create(decompress_threads + i, "decompress",
2465 do_data_decompress, decomp_param + i,
2466 QEMU_THREAD_JOINABLE);
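/* Ask the decompression workers to quit, join them and free their state */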
2470 static void compress_threads_load_cleanup(void)
2472 int i, thread_count;
2474 if (!migrate_use_compression()) {
2475 return;
2477 thread_count = migrate_decompress_threads();
2478 for (i = 0; i < thread_count; i++) {
2479 qemu_mutex_lock(&decomp_param[i].mutex);
2480 decomp_param[i].quit = true;
2481 qemu_cond_signal(&decomp_param[i].cond);
2482 qemu_mutex_unlock(&decomp_param[i].mutex);
2484 for (i = 0; i < thread_count; i++) {
2485 qemu_thread_join(decompress_threads + i);
2486 qemu_mutex_destroy(&decomp_param[i].mutex);
2487 qemu_cond_destroy(&decomp_param[i].cond);
2488 g_free(decomp_param[i].compbuf);
2490 g_free(decompress_threads);
2491 g_free(decomp_param);
2492 decompress_threads = NULL;
2493 decomp_param = NULL;
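/* Hand one compressed page to an idle decompression thread; if all of
 * them are busy, wait on decomp_done_cond until one becomes free. */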
2496 static void decompress_data_with_multi_threads(QEMUFile *f,
2497 void *host, int len)
2499 int idx, thread_count;
2501 thread_count = migrate_decompress_threads();
2502 qemu_mutex_lock(&decomp_done_lock);
2503 while (true) {
2504 for (idx = 0; idx < thread_count; idx++) {
2505 if (decomp_param[idx].done) {
2506 decomp_param[idx].done = false;
2507 qemu_mutex_lock(&decomp_param[idx].mutex);
2508 qemu_get_buffer(f, decomp_param[idx].compbuf, len);
2509 decomp_param[idx].des = host;
2510 decomp_param[idx].len = len;
2511 qemu_cond_signal(&decomp_param[idx].cond);
2512 qemu_mutex_unlock(&decomp_param[idx].mutex);
2513 break;
2516 if (idx < thread_count) {
2517 break;
2518 } else {
2519 qemu_cond_wait(&decomp_done_cond, &decomp_done_lock);
2522 qemu_mutex_unlock(&decomp_done_lock);
2526 * ram_load_setup: Setup RAM for migration incoming side
2528 * Returns zero to indicate success and negative for error
2530 * @f: QEMUFile where to receive the data
2531 * @opaque: RAMState pointer
2533 static int ram_load_setup(QEMUFile *f, void *opaque)
2535 xbzrle_load_setup();
2536 compress_threads_load_setup();
2537 return 0;
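/**
 * ram_load_cleanup: free the resources set up by ram_load_setup()
 *
 * Returns zero to indicate success
 *
 * @opaque: RAMState pointer
 */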
2540 static int ram_load_cleanup(void *opaque)
2542 xbzrle_load_cleanup();
2543 compress_threads_load_cleanup();
2544 return 0;
2548 * ram_postcopy_incoming_init: allocate postcopy data structures
2550 * Returns 0 for success and negative on error
2552 * @mis: current migration incoming state
2554 * Allocate data structures etc needed by incoming migration with
2555 * postcopy-ram. postcopy-ram's similarly named
2556 * postcopy_ram_incoming_init does the work.
2558 int ram_postcopy_incoming_init(MigrationIncomingState *mis)
2560 unsigned long ram_pages = last_ram_page();
2562 return postcopy_ram_incoming_init(mis, ram_pages);
2566 * ram_load_postcopy: load a page in postcopy case
2568 * Returns 0 for success or -errno in case of error
2570 * Called in postcopy mode by ram_load().
2571 * rcu_read_lock is taken prior to this being called.
2573 * @f: QEMUFile where to receive the data
2575 static int ram_load_postcopy(QEMUFile *f)
2577 int flags = 0, ret = 0;
2578 bool place_needed = false;
2579 bool matching_page_sizes = false;
2580 MigrationIncomingState *mis = migration_incoming_get_current();
2581 /* Temporary page that is later 'placed' */
2582 void *postcopy_host_page = postcopy_get_tmp_page(mis);
2583 void *last_host = NULL;
2584 bool all_zero = false;
2586 while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2587 ram_addr_t addr;
2588 void *host = NULL;
2589 void *page_buffer = NULL;
2590 void *place_source = NULL;
2591 RAMBlock *block = NULL;
2592 uint8_t ch;
2594 addr = qemu_get_be64(f);
2595 flags = addr & ~TARGET_PAGE_MASK;
2596 addr &= TARGET_PAGE_MASK;
2598 trace_ram_load_postcopy_loop((uint64_t)addr, flags);
2599 place_needed = false;
2600 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE)) {
2601 block = ram_block_from_stream(f, flags);
2603 host = host_from_ram_block_offset(block, addr);
2604 if (!host) {
2605 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2606 ret = -EINVAL;
2607 break;
2609 matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
2611 * Postcopy requires that we place whole host pages atomically;
2612 * these may be huge pages for RAMBlocks that are backed by
2613 * hugetlbfs.
2614 * To make it atomic, the data is read into a temporary page
2615 * that's moved into place later.
2616 * The migration protocol uses target pages, which may be smaller;
2617 * however, the source ensures it always sends all the components
2618 * of a host page in order.
2620 page_buffer = postcopy_host_page +
2621 ((uintptr_t)host & (block->page_size - 1));
2622 /* If all target pages of a host page are zero we can optimise the placement */
2623 if (!((uintptr_t)host & (block->page_size - 1))) {
2624 all_zero = true;
2625 } else {
2626 /* not the first target page within the host page */
2627 if (host != (last_host + TARGET_PAGE_SIZE)) {
2628 error_report("Non-sequential target page %p/%p",
2629 host, last_host);
2630 ret = -EINVAL;
2631 break;
2637 * If it's the last part of a host page, then we place the whole
2638 * host page
2640 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
2641 (block->page_size - 1)) == 0;
2642 place_source = postcopy_host_page;
2644 last_host = host;
2646 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2647 case RAM_SAVE_FLAG_ZERO:
2648 ch = qemu_get_byte(f);
2649 memset(page_buffer, ch, TARGET_PAGE_SIZE);
2650 if (ch) {
2651 all_zero = false;
2653 break;
2655 case RAM_SAVE_FLAG_PAGE:
2656 all_zero = false;
2657 if (!place_needed || !matching_page_sizes) {
2658 qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
2659 } else {
2660 /* Avoid the extra copy out of the QEMUFile buffer, since
2661 * placing the page will copy the data later anyway; this only
2662 * works when we can do the read in one go (matching page sizes)
2664 qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
2665 TARGET_PAGE_SIZE);
2667 break;
2668 case RAM_SAVE_FLAG_EOS:
2669 /* normal exit */
2670 break;
2671 default:
2672 error_report("Unknown combination of migration flags: %#x"
2673 " (postcopy mode)", flags);
2674 ret = -EINVAL;
2677 if (place_needed) {
2678 /* We get here at the last target page of the host page */
2679 void *place_dest = host + TARGET_PAGE_SIZE - block->page_size;
2681 if (all_zero) {
2682 ret = postcopy_place_page_zero(mis, place_dest,
2683 block->page_size);
2684 } else {
2685 ret = postcopy_place_page(mis, place_dest,
2686 place_source, block->page_size);
2689 if (!ret) {
2690 ret = qemu_file_get_error(f);
2694 return ret;
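/**
 * ram_load: load pages in precopy mode, or hand off to
 * ram_load_postcopy() once the destination is in postcopy mode
 *
 * Returns zero to indicate success and negative for error
 *
 * @f: QEMUFile where to receive the data
 * @opaque: RAMState pointer
 * @version_id: version of the stream format; only 4 is accepted
 */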
2697 static int ram_load(QEMUFile *f, void *opaque, int version_id)
2699 int flags = 0, ret = 0, invalid_flags = 0;
2700 static uint64_t seq_iter;
2701 int len = 0;
2703 * If the system is running in postcopy mode, page inserts into host memory
2704 * must be atomic
2706 bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
2707 /* ADVISE comes earlier; it indicates that the source has the postcopy capability enabled */
2708 bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
2710 seq_iter++;
2712 if (version_id != 4) {
2713 ret = -EINVAL;
2716 if (!migrate_use_compression()) {
2717 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
2719 /* This RCU critical section can be very long running.
2720 * When RCU reclaims in the code start to become numerous,
2721 * it will be necessary to reduce the granularity of this
2722 * critical section.
2724 rcu_read_lock();
2726 if (postcopy_running) {
2727 ret = ram_load_postcopy(f);
2730 while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
2731 ram_addr_t addr, total_ram_bytes;
2732 void *host = NULL;
2733 uint8_t ch;
2735 addr = qemu_get_be64(f);
2736 flags = addr & ~TARGET_PAGE_MASK;
2737 addr &= TARGET_PAGE_MASK;
2739 if (flags & invalid_flags) {
2740 if (flags & invalid_flags & RAM_SAVE_FLAG_COMPRESS_PAGE) {
2741 error_report("Received an unexpected compressed page");
2744 ret = -EINVAL;
2745 break;
2748 if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
2749 RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
2750 RAMBlock *block = ram_block_from_stream(f, flags);
2752 host = host_from_ram_block_offset(block, addr);
2753 if (!host) {
2754 error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
2755 ret = -EINVAL;
2756 break;
2758 trace_ram_load_loop(block->idstr, (uint64_t)addr, flags, host);
2761 switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
2762 case RAM_SAVE_FLAG_MEM_SIZE:
2763 /* Synchronize RAM block list */
2764 total_ram_bytes = addr;
2765 while (!ret && total_ram_bytes) {
2766 RAMBlock *block;
2767 char id[256];
2768 ram_addr_t length;
2770 len = qemu_get_byte(f);
2771 qemu_get_buffer(f, (uint8_t *)id, len);
2772 id[len] = 0;
2773 length = qemu_get_be64(f);
2775 block = qemu_ram_block_by_name(id);
2776 if (block) {
2777 if (length != block->used_length) {
2778 Error *local_err = NULL;
2780 ret = qemu_ram_resize(block, length,
2781 &local_err);
2782 if (local_err) {
2783 error_report_err(local_err);
2786 /* For postcopy we need to check that hugepage sizes match */
2787 if (postcopy_advised &&
2788 block->page_size != qemu_host_page_size) {
2789 uint64_t remote_page_size = qemu_get_be64(f);
2790 if (remote_page_size != block->page_size) {
2791 error_report("Mismatched RAM page size %s "
2792 "(local) %zd != %" PRId64,
2793 id, block->page_size,
2794 remote_page_size);
2795 ret = -EINVAL;
2798 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
2799 block->idstr);
2800 } else {
2801 error_report("Unknown ramblock \"%s\", cannot "
2802 "accept migration", id);
2803 ret = -EINVAL;
2806 total_ram_bytes -= length;
2808 break;
2810 case RAM_SAVE_FLAG_ZERO:
2811 ch = qemu_get_byte(f);
2812 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
2813 break;
2815 case RAM_SAVE_FLAG_PAGE:
2816 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
2817 break;
2819 case RAM_SAVE_FLAG_COMPRESS_PAGE:
2820 len = qemu_get_be32(f);
2821 if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
2822 error_report("Invalid compressed data length: %d", len);
2823 ret = -EINVAL;
2824 break;
2826 decompress_data_with_multi_threads(f, host, len);
2827 break;
2829 case RAM_SAVE_FLAG_XBZRLE:
2830 if (load_xbzrle(f, addr, host) < 0) {
2831 error_report("Failed to decompress XBZRLE page at "
2832 RAM_ADDR_FMT, addr);
2833 ret = -EINVAL;
2834 break;
2836 break;
2837 case RAM_SAVE_FLAG_EOS:
2838 /* normal exit */
2839 break;
2840 default:
2841 if (flags & RAM_SAVE_FLAG_HOOK) {
2842 ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL);
2843 } else {
2844 error_report("Unknown combination of migration flags: %#x",
2845 flags);
2846 ret = -EINVAL;
2849 if (!ret) {
2850 ret = qemu_file_get_error(f);
2854 wait_for_decompress_done();
2855 rcu_read_unlock();
2856 trace_ram_load_complete(ret, seq_iter);
2857 return ret;
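/* Whether RAM can be migrated with postcopy: true iff the postcopy-ram
 * capability is enabled */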
2860 static bool ram_has_postcopy(void *opaque)
2862 return migrate_postcopy_ram();
2865 static SaveVMHandlers savevm_ram_handlers = {
2866 .save_setup = ram_save_setup,
2867 .save_live_iterate = ram_save_iterate,
2868 .save_live_complete_postcopy = ram_save_complete,
2869 .save_live_complete_precopy = ram_save_complete,
2870 .has_postcopy = ram_has_postcopy,
2871 .save_live_pending = ram_save_pending,
2872 .load_state = ram_load,
2873 .save_cleanup = ram_save_cleanup,
2874 .load_setup = ram_load_setup,
2875 .load_cleanup = ram_load_cleanup,
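/* Register RAM's live-migration handlers with the savevm machinery */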
2878 void ram_mig_init(void)
2880 qemu_mutex_init(&XBZRLE.lock);
2881 register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, &ram_state);