4 * Copyright (c) 2019-2020 Red Hat Inc
7 * Juan Quintela <quintela@redhat.com>
9 * This work is licensed under the terms of the GNU GPL, version 2 or later.
10 * See the COPYING file in the top-level directory.
13 #include "qemu/osdep.h"
14 #include "qemu/cutils.h"
16 #include "exec/target_page.h"
17 #include "sysemu/sysemu.h"
18 #include "exec/ramblock.h"
19 #include "qemu/error-report.h"
20 #include "qapi/error.h"
22 #include "migration.h"
23 #include "migration-stats.h"
26 #include "qemu-file.h"
29 #include "threadinfo.h"
31 #include "qemu/yank.h"
32 #include "io/channel-file.h"
33 #include "io/channel-socket.h"
34 #include "yank_functions.h"
38 #define MULTIFD_MAGIC 0x11223344U
39 #define MULTIFD_VERSION 1
44 unsigned char uuid
[16]; /* QemuUUID */
46 uint8_t unused1
[7]; /* Reserved for future use */
47 uint64_t unused2
[4]; /* Reserved for future use */
48 } __attribute__((packed
)) MultiFDInit_t
;
51 MultiFDSendParams
*params
;
52 /* array of pages to sent */
53 MultiFDPages_t
*pages
;
55 * Global number of generated multifd packets.
57 * Note that we used 'uintptr_t' because it'll naturally support atomic
58 * operations on both 32bit / 64 bits hosts. It means on 32bit systems
59 * multifd will overflow the packet_num easier, but that should be
62 * Another option is to use QEMU's Stat64 then it'll be 64 bits on all
63 * hosts, however so far it does not support atomic fetch_add() yet.
64 * Make it easy for now.
68 * Synchronization point past which no more channels will be
71 QemuSemaphore channels_created
;
72 /* send channels ready */
73 QemuSemaphore channels_ready
;
75 * Have we already run terminate threads. There is a race when it
76 * happens that we got one error while we are exiting.
77 * We will use atomic operations. Only valid values are 0 and 1.
82 } *multifd_send_state
;
85 MultiFDRecvParams
*params
;
86 MultiFDRecvData
*data
;
87 /* number of created threads */
90 * This is always posted by the recv threads, the migration thread
91 * uses it to wait for recv threads to finish assigned tasks.
93 QemuSemaphore sem_sync
;
94 /* global number of generated multifd packets */
99 } *multifd_recv_state
;
static bool multifd_use_packets(void)
{
    /* Mapped-ram file migration writes raw pages; only socket-style
     * migration frames data in multifd packets. */
    return !migrate_mapped_ram();
}
106 void multifd_send_channel_created(void)
108 qemu_sem_post(&multifd_send_state
->channels_created
);
111 static void multifd_set_file_bitmap(MultiFDSendParams
*p
)
113 MultiFDPages_t
*pages
= p
->pages
;
115 assert(pages
->block
);
117 for (int i
= 0; i
< p
->pages
->normal_num
; i
++) {
118 ramblock_set_file_bmap_atomic(pages
->block
, pages
->offset
[i
], true);
121 for (int i
= p
->pages
->normal_num
; i
< p
->pages
->num
; i
++) {
122 ramblock_set_file_bmap_atomic(pages
->block
, pages
->offset
[i
], false);
126 /* Multifd without compression */
129 * nocomp_send_setup: setup send side
131 * @p: Params for the channel that we are using
132 * @errp: pointer to an error
134 static int nocomp_send_setup(MultiFDSendParams
*p
, Error
**errp
)
136 if (migrate_zero_copy_send()) {
137 p
->write_flags
|= QIO_CHANNEL_WRITE_FLAG_ZERO_COPY
;
144 * nocomp_send_cleanup: cleanup send side
146 * For no compression this function does nothing.
148 * @p: Params for the channel that we are using
149 * @errp: pointer to an error
151 static void nocomp_send_cleanup(MultiFDSendParams
*p
, Error
**errp
)
156 static void multifd_send_prepare_iovs(MultiFDSendParams
*p
)
158 MultiFDPages_t
*pages
= p
->pages
;
160 for (int i
= 0; i
< pages
->normal_num
; i
++) {
161 p
->iov
[p
->iovs_num
].iov_base
= pages
->block
->host
+ pages
->offset
[i
];
162 p
->iov
[p
->iovs_num
].iov_len
= p
->page_size
;
166 p
->next_packet_size
= pages
->normal_num
* p
->page_size
;
170 * nocomp_send_prepare: prepare date to be able to send
172 * For no compression we just have to calculate the size of the
175 * Returns 0 for success or -1 for error
177 * @p: Params for the channel that we are using
178 * @errp: pointer to an error
180 static int nocomp_send_prepare(MultiFDSendParams
*p
, Error
**errp
)
182 bool use_zero_copy_send
= migrate_zero_copy_send();
185 multifd_send_zero_page_detect(p
);
187 if (!multifd_use_packets()) {
188 multifd_send_prepare_iovs(p
);
189 multifd_set_file_bitmap(p
);
194 if (!use_zero_copy_send
) {
196 * Only !zerocopy needs the header in IOV; zerocopy will
197 * send it separately.
199 multifd_send_prepare_header(p
);
202 multifd_send_prepare_iovs(p
);
203 p
->flags
|= MULTIFD_FLAG_NOCOMP
;
205 multifd_send_fill_packet(p
);
207 if (use_zero_copy_send
) {
208 /* Send header first, without zerocopy */
209 ret
= qio_channel_write_all(p
->c
, (void *)p
->packet
,
210 p
->packet_len
, errp
);
220 * nocomp_recv_setup: setup receive side
222 * For no compression this function does nothing.
224 * Returns 0 for success or -1 for error
226 * @p: Params for the channel that we are using
227 * @errp: pointer to an error
229 static int nocomp_recv_setup(MultiFDRecvParams
*p
, Error
**errp
)
235 * nocomp_recv_cleanup: setup receive side
237 * For no compression this function does nothing.
239 * @p: Params for the channel that we are using
241 static void nocomp_recv_cleanup(MultiFDRecvParams
*p
)
246 * nocomp_recv: read the data from the channel
248 * For no compression we just need to read things into the correct place.
250 * Returns 0 for success or -1 for error
252 * @p: Params for the channel that we are using
253 * @errp: pointer to an error
255 static int nocomp_recv(MultiFDRecvParams
*p
, Error
**errp
)
259 if (!multifd_use_packets()) {
260 return multifd_file_recv_data(p
, errp
);
263 flags
= p
->flags
& MULTIFD_FLAG_COMPRESSION_MASK
;
265 if (flags
!= MULTIFD_FLAG_NOCOMP
) {
266 error_setg(errp
, "multifd %u: flags received %x flags expected %x",
267 p
->id
, flags
, MULTIFD_FLAG_NOCOMP
);
271 multifd_recv_zero_page_process(p
);
273 if (!p
->normal_num
) {
277 for (int i
= 0; i
< p
->normal_num
; i
++) {
278 p
->iov
[i
].iov_base
= p
->host
+ p
->normal
[i
];
279 p
->iov
[i
].iov_len
= p
->page_size
;
280 ramblock_recv_bitmap_set_offset(p
->block
, p
->normal
[i
]);
282 return qio_channel_readv_all(p
->c
, p
->iov
, p
->normal_num
, errp
);
285 static MultiFDMethods multifd_nocomp_ops
= {
286 .send_setup
= nocomp_send_setup
,
287 .send_cleanup
= nocomp_send_cleanup
,
288 .send_prepare
= nocomp_send_prepare
,
289 .recv_setup
= nocomp_recv_setup
,
290 .recv_cleanup
= nocomp_recv_cleanup
,
294 static MultiFDMethods
*multifd_ops
[MULTIFD_COMPRESSION__MAX
] = {
295 [MULTIFD_COMPRESSION_NONE
] = &multifd_nocomp_ops
,
298 void multifd_register_ops(int method
, MultiFDMethods
*ops
)
300 assert(0 < method
&& method
< MULTIFD_COMPRESSION__MAX
);
301 multifd_ops
[method
] = ops
;
304 /* Reset a MultiFDPages_t* object for the next use */
305 static void multifd_pages_reset(MultiFDPages_t
*pages
)
308 * We don't need to touch offset[] array, because it will be
309 * overwritten later when reused.
312 pages
->normal_num
= 0;
316 static int multifd_send_initial_packet(MultiFDSendParams
*p
, Error
**errp
)
318 MultiFDInit_t msg
= {};
319 size_t size
= sizeof(msg
);
322 msg
.magic
= cpu_to_be32(MULTIFD_MAGIC
);
323 msg
.version
= cpu_to_be32(MULTIFD_VERSION
);
325 memcpy(msg
.uuid
, &qemu_uuid
.data
, sizeof(msg
.uuid
));
327 ret
= qio_channel_write_all(p
->c
, (char *)&msg
, size
, errp
);
331 stat64_add(&mig_stats
.multifd_bytes
, size
);
335 static int multifd_recv_initial_packet(QIOChannel
*c
, Error
**errp
)
340 ret
= qio_channel_read_all(c
, (char *)&msg
, sizeof(msg
), errp
);
345 msg
.magic
= be32_to_cpu(msg
.magic
);
346 msg
.version
= be32_to_cpu(msg
.version
);
348 if (msg
.magic
!= MULTIFD_MAGIC
) {
349 error_setg(errp
, "multifd: received packet magic %x "
350 "expected %x", msg
.magic
, MULTIFD_MAGIC
);
354 if (msg
.version
!= MULTIFD_VERSION
) {
355 error_setg(errp
, "multifd: received packet version %u "
356 "expected %u", msg
.version
, MULTIFD_VERSION
);
360 if (memcmp(msg
.uuid
, &qemu_uuid
, sizeof(qemu_uuid
))) {
361 char *uuid
= qemu_uuid_unparse_strdup(&qemu_uuid
);
362 char *msg_uuid
= qemu_uuid_unparse_strdup((const QemuUUID
*)msg
.uuid
);
364 error_setg(errp
, "multifd: received uuid '%s' and expected "
365 "uuid '%s' for channel %hhd", msg_uuid
, uuid
, msg
.id
);
371 if (msg
.id
> migrate_multifd_channels()) {
372 error_setg(errp
, "multifd: received channel id %u is greater than "
373 "number of channels %u", msg
.id
, migrate_multifd_channels());
380 static MultiFDPages_t
*multifd_pages_init(uint32_t n
)
382 MultiFDPages_t
*pages
= g_new0(MultiFDPages_t
, 1);
384 pages
->allocated
= n
;
385 pages
->offset
= g_new0(ram_addr_t
, n
);
390 static void multifd_pages_clear(MultiFDPages_t
*pages
)
392 multifd_pages_reset(pages
);
393 pages
->allocated
= 0;
394 g_free(pages
->offset
);
395 pages
->offset
= NULL
;
399 void multifd_send_fill_packet(MultiFDSendParams
*p
)
401 MultiFDPacket_t
*packet
= p
->packet
;
402 MultiFDPages_t
*pages
= p
->pages
;
404 uint32_t zero_num
= pages
->num
- pages
->normal_num
;
407 packet
->flags
= cpu_to_be32(p
->flags
);
408 packet
->pages_alloc
= cpu_to_be32(p
->pages
->allocated
);
409 packet
->normal_pages
= cpu_to_be32(pages
->normal_num
);
410 packet
->zero_pages
= cpu_to_be32(zero_num
);
411 packet
->next_packet_size
= cpu_to_be32(p
->next_packet_size
);
413 packet_num
= qatomic_fetch_inc(&multifd_send_state
->packet_num
);
414 packet
->packet_num
= cpu_to_be64(packet_num
);
417 strncpy(packet
->ramblock
, pages
->block
->idstr
, 256);
420 for (i
= 0; i
< pages
->num
; i
++) {
421 /* there are architectures where ram_addr_t is 32 bit */
422 uint64_t temp
= pages
->offset
[i
];
424 packet
->offset
[i
] = cpu_to_be64(temp
);
428 p
->total_normal_pages
+= pages
->normal_num
;
429 p
->total_zero_pages
+= zero_num
;
431 trace_multifd_send(p
->id
, packet_num
, pages
->normal_num
, zero_num
,
432 p
->flags
, p
->next_packet_size
);
435 static int multifd_recv_unfill_packet(MultiFDRecvParams
*p
, Error
**errp
)
437 MultiFDPacket_t
*packet
= p
->packet
;
440 packet
->magic
= be32_to_cpu(packet
->magic
);
441 if (packet
->magic
!= MULTIFD_MAGIC
) {
442 error_setg(errp
, "multifd: received packet "
443 "magic %x and expected magic %x",
444 packet
->magic
, MULTIFD_MAGIC
);
448 packet
->version
= be32_to_cpu(packet
->version
);
449 if (packet
->version
!= MULTIFD_VERSION
) {
450 error_setg(errp
, "multifd: received packet "
451 "version %u and expected version %u",
452 packet
->version
, MULTIFD_VERSION
);
456 p
->flags
= be32_to_cpu(packet
->flags
);
458 packet
->pages_alloc
= be32_to_cpu(packet
->pages_alloc
);
460 * If we received a packet that is 100 times bigger than expected
461 * just stop migration. It is a magic number.
463 if (packet
->pages_alloc
> p
->page_count
) {
464 error_setg(errp
, "multifd: received packet "
465 "with size %u and expected a size of %u",
466 packet
->pages_alloc
, p
->page_count
) ;
470 p
->normal_num
= be32_to_cpu(packet
->normal_pages
);
471 if (p
->normal_num
> packet
->pages_alloc
) {
472 error_setg(errp
, "multifd: received packet "
473 "with %u normal pages and expected maximum pages are %u",
474 p
->normal_num
, packet
->pages_alloc
) ;
478 p
->zero_num
= be32_to_cpu(packet
->zero_pages
);
479 if (p
->zero_num
> packet
->pages_alloc
- p
->normal_num
) {
480 error_setg(errp
, "multifd: received packet "
481 "with %u zero pages and expected maximum zero pages are %u",
482 p
->zero_num
, packet
->pages_alloc
- p
->normal_num
) ;
486 p
->next_packet_size
= be32_to_cpu(packet
->next_packet_size
);
487 p
->packet_num
= be64_to_cpu(packet
->packet_num
);
489 p
->total_normal_pages
+= p
->normal_num
;
490 p
->total_zero_pages
+= p
->zero_num
;
492 trace_multifd_recv(p
->id
, p
->packet_num
, p
->normal_num
, p
->zero_num
,
493 p
->flags
, p
->next_packet_size
);
495 if (p
->normal_num
== 0 && p
->zero_num
== 0) {
499 /* make sure that ramblock is 0 terminated */
500 packet
->ramblock
[255] = 0;
501 p
->block
= qemu_ram_block_by_name(packet
->ramblock
);
503 error_setg(errp
, "multifd: unknown ram block %s",
508 p
->host
= p
->block
->host
;
509 for (i
= 0; i
< p
->normal_num
; i
++) {
510 uint64_t offset
= be64_to_cpu(packet
->offset
[i
]);
512 if (offset
> (p
->block
->used_length
- p
->page_size
)) {
513 error_setg(errp
, "multifd: offset too long %" PRIu64
514 " (max " RAM_ADDR_FMT
")",
515 offset
, p
->block
->used_length
);
518 p
->normal
[i
] = offset
;
521 for (i
= 0; i
< p
->zero_num
; i
++) {
522 uint64_t offset
= be64_to_cpu(packet
->offset
[p
->normal_num
+ i
]);
524 if (offset
> (p
->block
->used_length
- p
->page_size
)) {
525 error_setg(errp
, "multifd: offset too long %" PRIu64
526 " (max " RAM_ADDR_FMT
")",
527 offset
, p
->block
->used_length
);
536 static bool multifd_send_should_exit(void)
538 return qatomic_read(&multifd_send_state
->exiting
);
541 static bool multifd_recv_should_exit(void)
543 return qatomic_read(&multifd_recv_state
->exiting
);
547 * The migration thread can wait on either of the two semaphores. This
548 * function can be used to kick the main thread out of waiting on either of
549 * them. Should mostly only be called when something wrong happened with
550 * the current multifd send thread.
552 static void multifd_send_kick_main(MultiFDSendParams
*p
)
554 qemu_sem_post(&p
->sem_sync
);
555 qemu_sem_post(&multifd_send_state
->channels_ready
);
559 * How we use multifd_send_state->pages and channel->pages?
561 * We create a pages for each channel, and a main one. Each time that
562 * we need to send a batch of pages we interchange the ones between
563 * multifd_send_state and the channel that is sending it. There are
564 * two reasons for that:
565 * - to not have to do so many mallocs during migration
566 * - to make easier to know what to free at the end of migration
568 * This way we always know who is the owner of each "pages" struct,
569 * and we don't need any locking. It belongs to the migration thread
570 * or to the channel thread. Switching is safe because the migration
571 * thread is using the channel mutex when changing it, and the channel
572 * have to had finish with its own, otherwise pending_job can't be
575 * Returns true if succeed, false otherwise.
577 static bool multifd_send_pages(void)
580 static int next_channel
;
581 MultiFDSendParams
*p
= NULL
; /* make happy gcc */
582 MultiFDPages_t
*pages
= multifd_send_state
->pages
;
584 if (multifd_send_should_exit()) {
588 /* We wait here, until at least one channel is ready */
589 qemu_sem_wait(&multifd_send_state
->channels_ready
);
592 * next_channel can remain from a previous migration that was
593 * using more channels, so ensure it doesn't overflow if the
594 * limit is lower now.
596 next_channel
%= migrate_multifd_channels();
597 for (i
= next_channel
;; i
= (i
+ 1) % migrate_multifd_channels()) {
598 if (multifd_send_should_exit()) {
601 p
= &multifd_send_state
->params
[i
];
603 * Lockless read to p->pending_job is safe, because only multifd
604 * sender thread can clear it.
606 if (qatomic_read(&p
->pending_job
) == false) {
607 next_channel
= (i
+ 1) % migrate_multifd_channels();
613 * Make sure we read p->pending_job before all the rest. Pairs with
614 * qatomic_store_release() in multifd_send_thread().
617 assert(!p
->pages
->num
);
618 multifd_send_state
->pages
= p
->pages
;
621 * Making sure p->pages is setup before marking pending_job=true. Pairs
622 * with the qatomic_load_acquire() in multifd_send_thread().
624 qatomic_store_release(&p
->pending_job
, true);
625 qemu_sem_post(&p
->sem
);
630 static inline bool multifd_queue_empty(MultiFDPages_t
*pages
)
632 return pages
->num
== 0;
635 static inline bool multifd_queue_full(MultiFDPages_t
*pages
)
637 return pages
->num
== pages
->allocated
;
640 static inline void multifd_enqueue(MultiFDPages_t
*pages
, ram_addr_t offset
)
642 pages
->offset
[pages
->num
++] = offset
;
645 /* Returns true if enqueue successful, false otherwise */
646 bool multifd_queue_page(RAMBlock
*block
, ram_addr_t offset
)
648 MultiFDPages_t
*pages
;
651 pages
= multifd_send_state
->pages
;
653 /* If the queue is empty, we can already enqueue now */
654 if (multifd_queue_empty(pages
)) {
655 pages
->block
= block
;
656 multifd_enqueue(pages
, offset
);
661 * Not empty, meanwhile we need a flush. It can because of either:
663 * (1) The page is not on the same ramblock of previous ones, or,
664 * (2) The queue is full.
666 * After flush, always retry.
668 if (pages
->block
!= block
|| multifd_queue_full(pages
)) {
669 if (!multifd_send_pages()) {
675 /* Not empty, and we still have space, do it! */
676 multifd_enqueue(pages
, offset
);
680 /* Multifd send side hit an error; remember it and prepare to quit */
681 static void multifd_send_set_error(Error
*err
)
684 * We don't want to exit each threads twice. Depending on where
685 * we get the error, or if there are two independent errors in two
686 * threads at the same time, we can end calling this function
689 if (qatomic_xchg(&multifd_send_state
->exiting
, 1)) {
694 MigrationState
*s
= migrate_get_current();
695 migrate_set_error(s
, err
);
696 if (s
->state
== MIGRATION_STATUS_SETUP
||
697 s
->state
== MIGRATION_STATUS_PRE_SWITCHOVER
||
698 s
->state
== MIGRATION_STATUS_DEVICE
||
699 s
->state
== MIGRATION_STATUS_ACTIVE
) {
700 migrate_set_state(&s
->state
, s
->state
,
701 MIGRATION_STATUS_FAILED
);
706 static void multifd_send_terminate_threads(void)
710 trace_multifd_send_terminate_threads();
713 * Tell everyone we're quitting. No xchg() needed here; we simply
716 qatomic_set(&multifd_send_state
->exiting
, 1);
719 * Firstly, kick all threads out; no matter whether they are just idle,
720 * or blocked in an IO system call.
722 for (i
= 0; i
< migrate_multifd_channels(); i
++) {
723 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
725 qemu_sem_post(&p
->sem
);
727 qio_channel_shutdown(p
->c
, QIO_CHANNEL_SHUTDOWN_BOTH
, NULL
);
732 * Finally recycle all the threads.
734 for (i
= 0; i
< migrate_multifd_channels(); i
++) {
735 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
737 if (p
->tls_thread_created
) {
738 qemu_thread_join(&p
->tls_thread
);
741 if (p
->thread_created
) {
742 qemu_thread_join(&p
->thread
);
747 static bool multifd_send_cleanup_channel(MultiFDSendParams
*p
, Error
**errp
)
750 migration_ioc_unregister_yank(p
->c
);
752 * The object_unref() cannot guarantee the fd will always be
753 * released because finalize() of the iochannel is only
754 * triggered on the last reference and it's not guaranteed
755 * that we always hold the last refcount when reaching here.
757 * Closing the fd explicitly has the benefit that if there is any
758 * registered I/O handler callbacks on such fd, that will get a
759 * POLLNVAL event and will further trigger the cleanup to finally
762 * FIXME: It should logically be guaranteed that all multifd
763 * channels have no I/O handler callback registered when reaching
764 * here, because migration thread will wait for all multifd channel
765 * establishments to complete during setup. Since
766 * migrate_fd_cleanup() will be scheduled in main thread too, all
767 * previous callbacks should guarantee to be completed when
768 * reaching here. See multifd_send_state.channels_created and its
769 * usage. In the future, we could replace this with an assert
770 * making sure we're the last reference, or simply drop it if above
771 * is more clear to be justified.
773 qio_channel_close(p
->c
, &error_abort
);
774 object_unref(OBJECT(p
->c
));
777 qemu_sem_destroy(&p
->sem
);
778 qemu_sem_destroy(&p
->sem_sync
);
781 multifd_pages_clear(p
->pages
);
788 multifd_send_state
->ops
->send_cleanup(p
, errp
);
790 return *errp
== NULL
;
793 static void multifd_send_cleanup_state(void)
795 file_cleanup_outgoing_migration();
796 socket_cleanup_outgoing_migration();
797 qemu_sem_destroy(&multifd_send_state
->channels_created
);
798 qemu_sem_destroy(&multifd_send_state
->channels_ready
);
799 g_free(multifd_send_state
->params
);
800 multifd_send_state
->params
= NULL
;
801 multifd_pages_clear(multifd_send_state
->pages
);
802 multifd_send_state
->pages
= NULL
;
803 g_free(multifd_send_state
);
804 multifd_send_state
= NULL
;
807 void multifd_send_shutdown(void)
811 if (!migrate_multifd()) {
815 multifd_send_terminate_threads();
817 for (i
= 0; i
< migrate_multifd_channels(); i
++) {
818 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
819 Error
*local_err
= NULL
;
821 if (!multifd_send_cleanup_channel(p
, &local_err
)) {
822 migrate_set_error(migrate_get_current(), local_err
);
823 error_free(local_err
);
827 multifd_send_cleanup_state();
830 static int multifd_zero_copy_flush(QIOChannel
*c
)
835 ret
= qio_channel_flush(c
, &err
);
837 error_report_err(err
);
841 stat64_add(&mig_stats
.dirty_sync_missed_zero_copy
, 1);
847 int multifd_send_sync_main(void)
850 bool flush_zero_copy
;
852 if (!migrate_multifd()) {
855 if (multifd_send_state
->pages
->num
) {
856 if (!multifd_send_pages()) {
857 error_report("%s: multifd_send_pages fail", __func__
);
862 flush_zero_copy
= migrate_zero_copy_send();
864 for (i
= 0; i
< migrate_multifd_channels(); i
++) {
865 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
867 if (multifd_send_should_exit()) {
871 trace_multifd_send_sync_main_signal(p
->id
);
874 * We should be the only user so far, so not possible to be set by
875 * others concurrently.
877 assert(qatomic_read(&p
->pending_sync
) == false);
878 qatomic_set(&p
->pending_sync
, true);
879 qemu_sem_post(&p
->sem
);
881 for (i
= 0; i
< migrate_multifd_channels(); i
++) {
882 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
884 if (multifd_send_should_exit()) {
888 qemu_sem_wait(&multifd_send_state
->channels_ready
);
889 trace_multifd_send_sync_main_wait(p
->id
);
890 qemu_sem_wait(&p
->sem_sync
);
892 if (flush_zero_copy
&& p
->c
&& (multifd_zero_copy_flush(p
->c
) < 0)) {
896 trace_multifd_send_sync_main(multifd_send_state
->packet_num
);
901 static void *multifd_send_thread(void *opaque
)
903 MultiFDSendParams
*p
= opaque
;
904 MigrationThread
*thread
= NULL
;
905 Error
*local_err
= NULL
;
907 bool use_packets
= multifd_use_packets();
909 thread
= migration_threads_add(p
->name
, qemu_get_thread_id());
911 trace_multifd_send_thread_start(p
->id
);
912 rcu_register_thread();
915 if (multifd_send_initial_packet(p
, &local_err
) < 0) {
922 qemu_sem_post(&multifd_send_state
->channels_ready
);
923 qemu_sem_wait(&p
->sem
);
925 if (multifd_send_should_exit()) {
930 * Read pending_job flag before p->pages. Pairs with the
931 * qatomic_store_release() in multifd_send_pages().
933 if (qatomic_load_acquire(&p
->pending_job
)) {
934 MultiFDPages_t
*pages
= p
->pages
;
939 ret
= multifd_send_state
->ops
->send_prepare(p
, &local_err
);
944 if (migrate_mapped_ram()) {
945 ret
= file_write_ramblock_iov(p
->c
, p
->iov
, p
->iovs_num
,
946 p
->pages
->block
, &local_err
);
948 ret
= qio_channel_writev_full_all(p
->c
, p
->iov
, p
->iovs_num
,
949 NULL
, 0, p
->write_flags
,
957 stat64_add(&mig_stats
.multifd_bytes
,
958 p
->next_packet_size
+ p
->packet_len
);
959 stat64_add(&mig_stats
.normal_pages
, pages
->normal_num
);
960 stat64_add(&mig_stats
.zero_pages
, pages
->num
- pages
->normal_num
);
962 multifd_pages_reset(p
->pages
);
963 p
->next_packet_size
= 0;
966 * Making sure p->pages is published before saying "we're
967 * free". Pairs with the smp_mb_acquire() in
968 * multifd_send_pages().
970 qatomic_store_release(&p
->pending_job
, false);
973 * If not a normal job, must be a sync request. Note that
974 * pending_sync is a standalone flag (unlike pending_job), so
975 * it doesn't require explicit memory barriers.
977 assert(qatomic_read(&p
->pending_sync
));
980 p
->flags
= MULTIFD_FLAG_SYNC
;
981 multifd_send_fill_packet(p
);
982 ret
= qio_channel_write_all(p
->c
, (void *)p
->packet
,
983 p
->packet_len
, &local_err
);
987 /* p->next_packet_size will always be zero for a SYNC packet */
988 stat64_add(&mig_stats
.multifd_bytes
, p
->packet_len
);
992 qatomic_set(&p
->pending_sync
, false);
993 qemu_sem_post(&p
->sem_sync
);
1000 trace_multifd_send_error(p
->id
);
1001 multifd_send_set_error(local_err
);
1002 multifd_send_kick_main(p
);
1003 error_free(local_err
);
1006 rcu_unregister_thread();
1007 migration_threads_remove(thread
);
1008 trace_multifd_send_thread_end(p
->id
, p
->packets_sent
, p
->total_normal_pages
,
1009 p
->total_zero_pages
);
1014 static void multifd_new_send_channel_async(QIOTask
*task
, gpointer opaque
);
1017 MultiFDSendParams
*p
;
1018 QIOChannelTLS
*tioc
;
1019 } MultiFDTLSThreadArgs
;
1021 static void *multifd_tls_handshake_thread(void *opaque
)
1023 MultiFDTLSThreadArgs
*args
= opaque
;
1025 qio_channel_tls_handshake(args
->tioc
,
1026 multifd_new_send_channel_async
,
1035 static bool multifd_tls_channel_connect(MultiFDSendParams
*p
,
1039 MigrationState
*s
= migrate_get_current();
1040 const char *hostname
= s
->hostname
;
1041 MultiFDTLSThreadArgs
*args
;
1042 QIOChannelTLS
*tioc
;
1044 tioc
= migration_tls_client_create(ioc
, hostname
, errp
);
1050 * Ownership of the socket channel now transfers to the newly
1051 * created TLS channel, which has already taken a reference.
1053 object_unref(OBJECT(ioc
));
1054 trace_multifd_tls_outgoing_handshake_start(ioc
, tioc
, hostname
);
1055 qio_channel_set_name(QIO_CHANNEL(tioc
), "multifd-tls-outgoing");
1057 args
= g_new0(MultiFDTLSThreadArgs
, 1);
1061 p
->tls_thread_created
= true;
1062 qemu_thread_create(&p
->tls_thread
, "multifd-tls-handshake-worker",
1063 multifd_tls_handshake_thread
, args
,
1064 QEMU_THREAD_JOINABLE
);
1068 void multifd_channel_connect(MultiFDSendParams
*p
, QIOChannel
*ioc
)
1070 qio_channel_set_delay(ioc
, false);
1072 migration_ioc_register_yank(ioc
);
1073 /* Setup p->c only if the channel is completely setup */
1076 p
->thread_created
= true;
1077 qemu_thread_create(&p
->thread
, p
->name
, multifd_send_thread
, p
,
1078 QEMU_THREAD_JOINABLE
);
1082 * When TLS is enabled this function is called once to establish the
1083 * TLS connection and a second time after the TLS handshake to create
1084 * the multifd channel. Without TLS it goes straight into the channel
1087 static void multifd_new_send_channel_async(QIOTask
*task
, gpointer opaque
)
1089 MultiFDSendParams
*p
= opaque
;
1090 QIOChannel
*ioc
= QIO_CHANNEL(qio_task_get_source(task
));
1091 Error
*local_err
= NULL
;
1094 trace_multifd_new_send_channel_async(p
->id
);
1096 if (qio_task_propagate_error(task
, &local_err
)) {
1101 trace_multifd_set_outgoing_channel(ioc
, object_get_typename(OBJECT(ioc
)),
1102 migrate_get_current()->hostname
);
1104 if (migrate_channel_requires_tls_upgrade(ioc
)) {
1105 ret
= multifd_tls_channel_connect(p
, ioc
, &local_err
);
1110 multifd_channel_connect(p
, ioc
);
1116 * Here we're not interested whether creation succeeded, only that
1117 * it happened at all.
1119 multifd_send_channel_created();
1125 trace_multifd_new_send_channel_async_error(p
->id
, local_err
);
1126 multifd_send_set_error(local_err
);
1128 * For error cases (TLS or non-TLS), IO channel is always freed here
1129 * rather than when cleanup multifd: since p->c is not set, multifd
1130 * cleanup code doesn't even know its existence.
1132 object_unref(OBJECT(ioc
));
1133 error_free(local_err
);
1136 static bool multifd_new_send_channel_create(gpointer opaque
, Error
**errp
)
1138 if (!multifd_use_packets()) {
1139 return file_send_channel_create(opaque
, errp
);
1142 socket_send_channel_create(multifd_new_send_channel_async
, opaque
);
1146 bool multifd_send_setup(void)
1148 MigrationState
*s
= migrate_get_current();
1149 Error
*local_err
= NULL
;
1150 int thread_count
, ret
= 0;
1151 uint32_t page_count
= MULTIFD_PACKET_SIZE
/ qemu_target_page_size();
1152 bool use_packets
= multifd_use_packets();
1155 if (!migrate_multifd()) {
1159 thread_count
= migrate_multifd_channels();
1160 multifd_send_state
= g_malloc0(sizeof(*multifd_send_state
));
1161 multifd_send_state
->params
= g_new0(MultiFDSendParams
, thread_count
);
1162 multifd_send_state
->pages
= multifd_pages_init(page_count
);
1163 qemu_sem_init(&multifd_send_state
->channels_created
, 0);
1164 qemu_sem_init(&multifd_send_state
->channels_ready
, 0);
1165 qatomic_set(&multifd_send_state
->exiting
, 0);
1166 multifd_send_state
->ops
= multifd_ops
[migrate_multifd_compression()];
1168 for (i
= 0; i
< thread_count
; i
++) {
1169 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
1171 qemu_sem_init(&p
->sem
, 0);
1172 qemu_sem_init(&p
->sem_sync
, 0);
1174 p
->pages
= multifd_pages_init(page_count
);
1177 p
->packet_len
= sizeof(MultiFDPacket_t
)
1178 + sizeof(uint64_t) * page_count
;
1179 p
->packet
= g_malloc0(p
->packet_len
);
1180 p
->packet
->magic
= cpu_to_be32(MULTIFD_MAGIC
);
1181 p
->packet
->version
= cpu_to_be32(MULTIFD_VERSION
);
1183 /* We need one extra place for the packet header */
1184 p
->iov
= g_new0(struct iovec
, page_count
+ 1);
1186 p
->iov
= g_new0(struct iovec
, page_count
);
1188 p
->name
= g_strdup_printf("multifdsend_%d", i
);
1189 p
->page_size
= qemu_target_page_size();
1190 p
->page_count
= page_count
;
1193 if (!multifd_new_send_channel_create(p
, &local_err
)) {
1199 * Wait until channel creation has started for all channels. The
1200 * creation can still fail, but no more channels will be created
1203 for (i
= 0; i
< thread_count
; i
++) {
1204 qemu_sem_wait(&multifd_send_state
->channels_created
);
1207 for (i
= 0; i
< thread_count
; i
++) {
1208 MultiFDSendParams
*p
= &multifd_send_state
->params
[i
];
1210 ret
= multifd_send_state
->ops
->send_setup(p
, &local_err
);
1217 migrate_set_error(s
, local_err
);
1218 error_report_err(local_err
);
1219 migrate_set_state(&s
->state
, MIGRATION_STATUS_SETUP
,
1220 MIGRATION_STATUS_FAILED
);
1227 bool multifd_recv(void)
1230 static int next_recv_channel
;
1231 MultiFDRecvParams
*p
= NULL
;
1232 MultiFDRecvData
*data
= multifd_recv_state
->data
;
1235 * next_channel can remain from a previous migration that was
1236 * using more channels, so ensure it doesn't overflow if the
1237 * limit is lower now.
1239 next_recv_channel
%= migrate_multifd_channels();
1240 for (i
= next_recv_channel
;; i
= (i
+ 1) % migrate_multifd_channels()) {
1241 if (multifd_recv_should_exit()) {
1245 p
= &multifd_recv_state
->params
[i
];
1247 if (qatomic_read(&p
->pending_job
) == false) {
1248 next_recv_channel
= (i
+ 1) % migrate_multifd_channels();
1254 * Order pending_job read before manipulating p->data below. Pairs
1255 * with qatomic_store_release() at multifd_recv_thread().
1259 assert(!p
->data
->size
);
1260 multifd_recv_state
->data
= p
->data
;
1264 * Order p->data update before setting pending_job. Pairs with
1265 * qatomic_load_acquire() at multifd_recv_thread().
1267 qatomic_store_release(&p
->pending_job
, true);
1268 qemu_sem_post(&p
->sem
);
1273 MultiFDRecvData
*multifd_get_recv_data(void)
1275 return multifd_recv_state
->data
;
/*
 * Ask every recv channel thread to terminate.
 *
 * Safe against concurrent/repeated invocation: the atomic exchange on
 * 'exiting' guarantees the termination sequence runs exactly once.
 *
 * @err: the error that triggered termination, or NULL for a normal quit.
 */
static void multifd_recv_terminate_threads(Error *err)
{
    int i;

    trace_multifd_recv_terminate_threads(err != NULL);

    /* Only the first caller performs the teardown; later ones bail out. */
    if (qatomic_xchg(&multifd_recv_state->exiting, 1)) {
        return;
    }

    if (err) {
        MigrationState *s = migrate_get_current();
        migrate_set_error(s, err);
        if (s->state == MIGRATION_STATUS_SETUP ||
            s->state == MIGRATION_STATUS_ACTIVE) {
            migrate_set_state(&s->state, s->state,
                              MIGRATION_STATUS_FAILED);
        }
    }

    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        /*
         * The migration thread and channels interact differently
         * depending on the presence of packets.
         */
        if (multifd_use_packets()) {
            /*
             * The channel receives as long as there are packets. When
             * packets end (i.e. MULTIFD_FLAG_SYNC is reached), the
             * channel waits for the migration thread to sync. If the
             * sync never happens, do it here.
             */
            qemu_sem_post(&p->sem_sync);
        } else {
            /*
             * The channel waits for the migration thread to give it
             * work. When the migration thread runs out of work, it
             * releases the channel and waits for any pending work to
             * finish. If we reach here (e.g. due to error) before the
             * work runs out, release the channel.
             */
            qemu_sem_post(&p->sem);
        }

        /*
         * We could arrive here for two reasons:
         *  - normal quit, i.e. everything went fine, just finished
         *  - error quit: We close the channels so the channel threads
         *    finish the qio_channel_read_all_eof()
         */
        if (p->c) {
            qio_channel_shutdown(p->c, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        }
    }
}
1336 void multifd_recv_shutdown(void)
1338 if (migrate_multifd()) {
1339 multifd_recv_terminate_threads(NULL
);
1343 static void multifd_recv_cleanup_channel(MultiFDRecvParams
*p
)
1345 migration_ioc_unregister_yank(p
->c
);
1346 object_unref(OBJECT(p
->c
));
1348 qemu_mutex_destroy(&p
->mutex
);
1349 qemu_sem_destroy(&p
->sem_sync
);
1350 qemu_sem_destroy(&p
->sem
);
1362 multifd_recv_state
->ops
->recv_cleanup(p
);
/*
 * Free the global recv state.  Must only run after every channel has
 * been cleaned up (multifd_recv_cleanup_channel()); per-channel
 * resources are not touched here.
 */
static void multifd_recv_cleanup_state(void)
{
    qemu_sem_destroy(&multifd_recv_state->sem_sync);
    g_free(multifd_recv_state->params);
    multifd_recv_state->params = NULL;
    g_free(multifd_recv_state->data);
    multifd_recv_state->data = NULL;
    g_free(multifd_recv_state);
    /* NULL the global so multifd_recv_setup() can re-initialise later. */
    multifd_recv_state = NULL;
}
/*
 * Tear down the whole recv side: signal termination, join every
 * created thread, then free per-channel and global state.  The join
 * must complete before any cleanup so no thread touches freed memory.
 * No-op when multifd is disabled.
 */
void multifd_recv_cleanup(void)
{
    int i;

    if (!migrate_multifd()) {
        return;
    }
    multifd_recv_terminate_threads(NULL);
    for (i = 0; i < migrate_multifd_channels(); i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        /* Only join channels whose thread actually got created. */
        if (p->thread_created) {
            qemu_thread_join(&p->thread);
        }
    }
    for (i = 0; i < migrate_multifd_channels(); i++) {
        multifd_recv_cleanup_channel(&multifd_recv_state->params[i]);
    }
    multifd_recv_cleanup_state();
}
/*
 * Migration-thread side of the recv synchronization point.  Waits for
 * all channels to reach the sync, then (for packet-based migration)
 * releases them for the next iteration.
 */
void multifd_recv_sync_main(void)
{
    int thread_count = migrate_multifd_channels();
    bool file_based = !multifd_use_packets();
    int i;

    if (!migrate_multifd()) {
        return;
    }

    /*
     * File-based channels don't use packets and therefore need to
     * wait for more work. Release them to start the sync.
     */
    if (file_based) {
        for (i = 0; i < thread_count; i++) {
            MultiFDRecvParams *p = &multifd_recv_state->params[i];

            trace_multifd_recv_sync_main_signal(p->id);
            qemu_sem_post(&p->sem);
        }
    }

    /*
     * Initiate the synchronization by waiting for all channels.
     *
     * For socket-based migration this means each channel has received
     * the SYNC packet on the stream.
     *
     * For file-based migration this means each channel is done with
     * the work (pending_job=false).
     */
    for (i = 0; i < thread_count; i++) {
        trace_multifd_recv_sync_main_wait(i);
        qemu_sem_wait(&multifd_recv_state->sem_sync);
    }

    if (file_based) {
        /*
         * For file-based loading is done in one iteration. We're
         * done.
         */
        return;
    }

    /*
     * Sync done. Release the channels for the next iteration.
     */
    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        /* Track the highest packet number seen across all channels. */
        WITH_QEMU_LOCK_GUARD(&p->mutex) {
            if (multifd_recv_state->packet_num < p->packet_num) {
                multifd_recv_state->packet_num = p->packet_num;
            }
        }
        trace_multifd_recv_sync_main_signal(p->id);
        qemu_sem_post(&p->sem_sync);
    }
    trace_multifd_recv_sync_main(multifd_recv_state->packet_num);
}
/*
 * Per-channel recv thread body.
 *
 * Packet mode: reads packets from its own channel, decodes them and
 * honours the SYNC flag via sem_sync handshakes with the migration
 * thread.
 *
 * Packetless (file) mode: waits on p->sem for the migration thread to
 * hand it work through p->data / p->pending_job, using acquire/release
 * ordering to publish results back.
 */
static void *multifd_recv_thread(void *opaque)
{
    MultiFDRecvParams *p = opaque;
    Error *local_err = NULL;
    bool use_packets = multifd_use_packets();
    int ret;

    trace_multifd_recv_thread_start(p->id);
    rcu_register_thread();

    while (true) {
        uint32_t flags = 0;
        bool has_data = false;

        if (use_packets) {
            if (multifd_recv_should_exit()) {
                break;
            }

            ret = qio_channel_read_all_eof(p->c, (void *)p->packet,
                                           p->packet_len, &local_err);
            if (ret == 0 || ret == -1) {   /* 0: EOF -1: Error */
                break;
            }

            qemu_mutex_lock(&p->mutex);
            ret = multifd_recv_unfill_packet(p, &local_err);
            if (ret) {
                qemu_mutex_unlock(&p->mutex);
                break;
            }

            /* Snapshot flags under the lock; SYNC is handled below. */
            flags = p->flags;
            /* recv methods don't know how to handle the SYNC flag */
            p->flags &= ~MULTIFD_FLAG_SYNC;
            has_data = p->normal_num || p->zero_num;
            qemu_mutex_unlock(&p->mutex);
        } else {
            /*
             * No packets, so we need to wait for the vmstate code to
             * give us work.
             */
            qemu_sem_wait(&p->sem);

            if (multifd_recv_should_exit()) {
                break;
            }

            /* pairs with qatomic_store_release() at multifd_recv() */
            if (!qatomic_load_acquire(&p->pending_job)) {
                /*
                 * Migration thread did not send work, this is
                 * equivalent to pending_sync on the sending
                 * side. Post sem_sync to notify we reached this
                 * point.
                 */
                qemu_sem_post(&multifd_recv_state->sem_sync);
                continue;
            }

            has_data = !!p->data->size;
        }

        if (has_data) {
            /* Hand the payload to the compression backend. */
            ret = multifd_recv_state->ops->recv(p, &local_err);
            if (ret != 0) {
                break;
            }
        }

        if (use_packets) {
            if (flags & MULTIFD_FLAG_SYNC) {
                /* Tell the migration thread we reached the sync point,
                 * then wait for it to release us again. */
                qemu_sem_post(&multifd_recv_state->sem_sync);
                qemu_sem_wait(&p->sem_sync);
            }
        } else {
            p->total_normal_pages += p->data->size / qemu_target_page_size();
            p->data->size = 0;
            /*
             * Order data->size update before clearing
             * pending_job. Pairs with smp_mb_acquire() at
             * multifd_recv().
             */
            qatomic_store_release(&p->pending_job, false);
        }
    }

    if (local_err) {
        multifd_recv_terminate_threads(local_err);
        error_free(local_err);
    }

    rcu_unregister_thread();
    trace_multifd_recv_thread_end(p->id, p->packets_recved,
                                  p->total_normal_pages,
                                  p->total_zero_pages);

    return NULL;
}
/*
 * Allocate and initialise the global recv state and every per-channel
 * MultiFDRecvParams, then run the compression backend's per-channel
 * setup.  The global state must be fully initialised before any
 * channel thread can be created (multifd_recv_new_channel()).
 *
 * Returns 0 on success (including when already initialised or multifd
 * is disabled); a backend error code otherwise, with @errp set.
 */
int multifd_recv_setup(Error **errp)
{
    int thread_count;
    uint32_t page_count = MULTIFD_PACKET_SIZE / qemu_target_page_size();
    bool use_packets = multifd_use_packets();
    int i;

    /*
     * Return successfully if multiFD recv state is already initialised
     * or multiFD is not enabled.
     */
    if (multifd_recv_state || !migrate_multifd()) {
        return 0;
    }

    thread_count = migrate_multifd_channels();
    multifd_recv_state = g_malloc0(sizeof(*multifd_recv_state));
    multifd_recv_state->params = g_new0(MultiFDRecvParams, thread_count);

    /* Shared slot the migration thread uses to pass work (file mode). */
    multifd_recv_state->data = g_new0(MultiFDRecvData, 1);
    multifd_recv_state->data->size = 0;

    qatomic_set(&multifd_recv_state->count, 0);
    qatomic_set(&multifd_recv_state->exiting, 0);
    qemu_sem_init(&multifd_recv_state->sem_sync, 0);
    multifd_recv_state->ops = multifd_ops[migrate_multifd_compression()];

    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];

        qemu_mutex_init(&p->mutex);
        qemu_sem_init(&p->sem_sync, 0);
        qemu_sem_init(&p->sem, 0);
        p->pending_job = false;
        p->id = i;

        p->data = g_new0(MultiFDRecvData, 1);

        if (use_packets) {
            /* One header plus one u64 offset per page in a full packet. */
            p->packet_len = sizeof(MultiFDPacket_t)
                + sizeof(uint64_t) * page_count;
            p->packet = g_malloc0(p->packet_len);
        }
        p->name = g_strdup_printf("multifdrecv_%d", i);
        p->iov = g_new0(struct iovec, page_count);
        p->normal = g_new0(ram_addr_t, page_count);
        p->zero = g_new0(ram_addr_t, page_count);
        p->page_count = page_count;
        p->page_size = qemu_target_page_size();
    }

    for (i = 0; i < thread_count; i++) {
        MultiFDRecvParams *p = &multifd_recv_state->params[i];
        int ret;

        ret = multifd_recv_state->ops->recv_setup(p, errp);
        if (ret) {
            return ret;
        }
    }
    return 0;
}
/*
 * Return true once every expected recv channel has been created, or
 * trivially when multifd is disabled.
 */
bool multifd_recv_all_channels_created(void)
{
    int thread_count = migrate_multifd_channels();

    if (!migrate_multifd()) {
        return true;
    }

    if (!multifd_recv_state) {
        /* Called before any connections created */
        return false;
    }

    return thread_count == qatomic_read(&multifd_recv_state->count);
}
/*
 * Try to receive all multifd channels to get ready for the migration.
 * Sets @errp when failing to receive the current channel.
 *
 * In packet mode the channel identifies itself via an initial packet;
 * in packetless mode channels are numbered in arrival order.  On
 * success the channel's recv thread is started and the created-channel
 * count is bumped (consumed by multifd_recv_all_channels_created()).
 */
void multifd_recv_new_channel(QIOChannel *ioc, Error **errp)
{
    MultiFDRecvParams *p;
    Error *local_err = NULL;
    bool use_packets = multifd_use_packets();
    int id;

    if (use_packets) {
        id = multifd_recv_initial_packet(ioc, &local_err);
        if (id < 0) {
            multifd_recv_terminate_threads(local_err);
            error_propagate_prepend(errp, local_err,
                                    "failed to receive packet"
                                    " via multifd channel %d: ",
                                    qatomic_read(&multifd_recv_state->count));
            return;
        }
        trace_multifd_recv_new_channel(id);
    } else {
        /* Packetless channels carry no id; assign in arrival order. */
        id = qatomic_read(&multifd_recv_state->count);
    }

    p = &multifd_recv_state->params[id];
    if (p->c != NULL) {
        /* Two channels claimed the same id: fatal for the migration. */
        error_setg(&local_err, "multifd: received id '%d' already setup'",
                   id);
        multifd_recv_terminate_threads(local_err);
        error_propagate(errp, local_err);
        return;
    }
    p->c = ioc;
    object_ref(OBJECT(ioc));

    p->thread_created = true;
    qemu_thread_create(&p->thread, p->name, multifd_recv_thread, p,
                       QEMU_THREAD_JOINABLE);
    qatomic_inc(&multifd_recv_state->count);
}
1683 bool multifd_send_prepare_common(MultiFDSendParams
*p
)
1685 multifd_send_zero_page_detect(p
);
1687 if (!p
->pages
->normal_num
) {
1688 p
->next_packet_size
= 0;
1692 multifd_send_prepare_header(p
);