migration/rdma.c

   1 /*
   2  * RDMA protocol and interfaces
   3  *
   4  * Copyright IBM, Corp. 2010-2013
   5  * Copyright Red Hat, Inc. 2015-2016
   6  *
   7  * Authors:
   8  *  Michael R. Hines <mrhines@us.ibm.com>
   9  *  Jiuxing Liu <jl@us.ibm.com>
  10  *  Daniel P. Berrange <berrange@redhat.com>
  11  *
  12  * This work is licensed under the terms of the GNU GPL, version 2 or
  13  * later.  See the COPYING file in the top-level directory.
  14  *
  15  */
  16
  17 #include "qemu/osdep.h"
  18 #include "qapi/error.h"
  19 #include "qemu/cutils.h"
  20 #include "exec/target_page.h"
  21 #include "rdma.h"
  22 #include "migration.h"
  23 #include "migration-stats.h"
  24 #include "qemu-file.h"
  25 #include "ram.h"
  26 #include "qemu/error-report.h"
  27 #include "qemu/main-loop.h"
  28 #include "qemu/module.h"
  29 #include "qemu/rcu.h"
  30 #include "qemu/sockets.h"
  31 #include "qemu/bitmap.h"
  32 #include "qemu/coroutine.h"
  33 #include "exec/memory.h"
  34 #include <sys/socket.h>
  35 #include <netdb.h>
  36 #include <arpa/inet.h>
  37 #include <rdma/rdma_cma.h>
  38 #include "trace.h"
  39 #include "qom/object.h"
  40 #include "options.h"
  41 #include <poll.h>
  42
/*
 * Timeout in milliseconds.  Judging by the name it bounds RDMA address/route
 * resolution; the call sites are outside this excerpt -- confirm there.
 */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
/*
 * RDMA_MERGE_MAX expressed in 4096-byte units (512).
 * Also the capacity of RDMAContext.unregistrations[].
 */
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

/* log2 of the chunk size used by the ram_chunk_*() helpers. */
#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size of an infiniband SEND message.
 * This is the size of the control buffer in RDMAWorkRequestData.
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

/* Protocol version; presumably carried in RDMACapabilities.version. */
#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
  76
/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 *
 * NOTE(review): the shifts and masks below are 'unsigned long' constants,
 * so the 30-63 range is only fully covered where unsigned long is 64 bits.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

/* Bits below RDMA_WRID_BLOCK_SHIFT (0-15). */
#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

/* Bits below RDMA_WRID_CHUNK_SHIFT that are not type bits (16-29). */
#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

/* Every bit that is neither a type bit nor a block bit (30 and up). */
#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
 100
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 *
 * Base work request ID values for each kind of work request.
 * NOTE(review): the gaps between SEND_CONTROL and RECV_CONTROL presumably
 * leave room for adding a per-message index -- confirm at the users.
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};
 112
/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands)
 *
 * We could use more WRs, but we have enough for now.
 *
 * RDMA_WRID_MAX is the number of entries and sizes RDMAContext.wr_data[].
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};
 126
/*
 * SEND/RECV IB Control Messages.
 *
 * The values index the name table in control_desc(); keep that table in
 * sync (including its upper-bound check) when adding entries here.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};

 144
 145
/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 *
 * One instance exists per control slot (RDMAContext.wr_data[RDMA_WRID_MAX]).
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;
 156
/*
 * Negotiate RDMA capabilities during connection-setup time.
 *
 * Both fields are byte-swapped for the wire by caps_to_network() and
 * back by network_to_caps().
 */
typedef struct {
    uint32_t version;   /* presumably RDMA_CONTROL_VERSION_CURRENT -- confirm */
    uint32_t flags;     /* presumably RDMA_CAPABILITY_* bits -- confirm */
} RDMACapabilities;
 164
 165 static void caps_to_network(RDMACapabilities *cap)
 166 {
 167     cap->version = htonl(cap->version);
 168     cap->flags = htonl(cap->flags);
 169 }
 170
 171 static void network_to_caps(RDMACapabilities *cap)
 172 {
 173     cap->version = ntohl(cap->version);
 174     cap->flags = ntohl(cap->flags);
 175 }
 176
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 *
 * Entries live in the RDMALocalBlocks.block array, which is reallocated by
 * rdma_add_block() and rdma_delete_block(); pointers to entries are
 * therefore invalidated by those calls.
 */
typedef struct RDMALocalBlock {
    char          *block_name;      /* g_strdup()ed copy, owned by the entry */
    uint8_t       *local_host_addr; /* local virtual address */
    uint64_t       remote_host_addr; /* remote virtual address */
    uint64_t       offset;          /* block offset; also the blockmap key */
    uint64_t       length;          /* length in bytes */
    struct         ibv_mr **pmr;    /* MRs for chunk-level registration */
    struct         ibv_mr *mr;      /* MR for non-chunk-level registration */
    uint32_t      *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t       remote_rkey;     /* rkeys for non-chunk-level registration */
    int            index;           /* which block are we */
    unsigned int   src_index;       /* (Only used on dest) */
    bool           is_ram_block;    /* true if added before local->init was set */
    int            nb_chunks;       /* entries in pmr/remote_keys and bits below */
    unsigned long *transit_bitmap;    /* nb_chunks bits, one per chunk */
    unsigned long *unregister_bitmap; /* nb_chunks bits, one per chunk */
} RDMALocalBlock;
 201
/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 *
 * Packed wire format: do not reorder or resize fields.  Byte order is
 * converted by dest_block_to_network() / network_to_dest_block(), which
 * leave 'padding' untouched.
 */
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMADestBlock;
 216
 217 static const char *control_desc(unsigned int rdma_control)
 218 {
 219     static const char *strs[] = {
 220         [RDMA_CONTROL_NONE] = "NONE",
 221         [RDMA_CONTROL_ERROR] = "ERROR",
 222         [RDMA_CONTROL_READY] = "READY",
 223         [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
 224         [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
 225         [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
 226         [RDMA_CONTROL_COMPRESS] = "COMPRESS",
 227         [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
 228         [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
 229         [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
 230         [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
 231         [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
 232     };
 233
 234     if (rdma_control > RDMA_CONTROL_UNREGISTER_FINISHED) {
 235         return "??BAD CONTROL VALUE??";
 236     }
 237
 238     return strs[rdma_control];
 239 }
 240
 241 static uint64_t htonll(uint64_t v)
 242 {
 243     union { uint32_t lv[2]; uint64_t llv; } u;
 244     u.lv[0] = htonl(v >> 32);
 245     u.lv[1] = htonl(v & 0xFFFFFFFFULL);
 246     return u.llv;
 247 }
 248
 249 static uint64_t ntohll(uint64_t v)
 250 {
 251     union { uint32_t lv[2]; uint64_t llv; } u;
 252     u.llv = v;
 253     return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
 254 }
 255
 256 static void dest_block_to_network(RDMADestBlock *db)
 257 {
 258     db->remote_host_addr = htonll(db->remote_host_addr);
 259     db->offset = htonll(db->offset);
 260     db->length = htonll(db->length);
 261     db->remote_rkey = htonl(db->remote_rkey);
 262 }
 263
 264 static void network_to_dest_block(RDMADestBlock *db)
 265 {
 266     db->remote_host_addr = ntohll(db->remote_host_addr);
 267     db->offset = ntohll(db->offset);
 268     db->length = ntohll(db->length);
 269     db->remote_rkey = ntohl(db->remote_rkey);
 270 }
 271
/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 *
 * 'block' is a heap array of 'nb_blocks' entries that is reallocated
 * whenever a block is added or deleted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;             /* number of entries in 'block' */
    bool     init;             /* main memory init complete */
    RDMALocalBlock *block;     /* NULL when nb_blocks is 0 */
} RDMALocalBlocks;
 282
/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;
    char *host_port;

    /* One send/recv buffer per control slot (RDMA_WRID_READY/DATA/CONTROL) */
    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context          *verbs;
    struct rdma_event_channel   *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *recv_comp_channel;  /* recv completion channel */
    struct ibv_comp_channel *send_comp_channel;  /* send completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *recv_cq;                 /* receive completion queue */
    struct ibv_cq *send_cq;                 /* send completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    bool errored;
    bool error_reported;    /* set by rdma_errored() after the first report */
    bool received_error;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /* Index of the next RAMBlock received during block registration */
    unsigned int    next_src_index;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    /* total_registrations is decremented for every MR deregistered */
    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    /*
     * Optional map from block offset to RDMALocalBlock entry; when
     * non-NULL it is kept in sync by rdma_add_block()/rdma_delete_block().
     */
    GHashTable *blockmap;

    /* the RDMAContext for return path */
    struct RDMAContext *return_path;
    bool is_return_path;
} RDMAContext;
 376
/* QOM type name and declaration for the RDMA-backed QIOChannel. */
#define TYPE_QIO_CHANNEL_RDMA "qio-channel-rdma"
OBJECT_DECLARE_SIMPLE_TYPE(QIOChannelRDMA, QIO_CHANNEL_RDMA)



/*
 * QIOChannel subclass carrying the RDMA contexts.
 * NOTE(review): rdmain/rdmaout are presumably the contexts used for the
 * incoming and outgoing directions respectively -- confirm at the users.
 */
struct QIOChannelRDMA {
    QIOChannel parent;
    RDMAContext *rdmain;
    RDMAContext *rdmaout;
    QEMUFile *file;
    bool blocking; /* XXX we don't actually honour this yet */
};
 389
/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 *
 * Packed wire format: do not reorder or resize fields.  Byte order is
 * converted by control_to_network() / network_to_control(), which leave
 * 'padding' untouched.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;
 400
 401 static void control_to_network(RDMAControlHeader *control)
 402 {
 403     control->type = htonl(control->type);
 404     control->len = htonl(control->len);
 405     control->repeat = htonl(control->repeat);
 406 }
 407
 408 static void network_to_control(RDMAControlHeader *control)
 409 {
 410     control->type = ntohl(control->type);
 411     control->len = ntohl(control->len);
 412     control->repeat = ntohl(control->repeat);
 413 }
 414
/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 *
 * Packed wire format: do not reorder or resize fields.  Byte order is
 * converted by register_to_network() / network_to_register(); both union
 * members share the same 64-bit storage, so converting 'current_addr'
 * converts 'chunk' as well.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ram_addr_t space */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index; /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;
 430
 431 static bool rdma_errored(RDMAContext *rdma)
 432 {
 433     if (rdma->errored && !rdma->error_reported) {
 434         error_report("RDMA is in an error state waiting migration"
 435                      " to abort!");
 436         rdma->error_reported = true;
 437     }
 438     return rdma->errored;
 439 }
 440
 441 static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
 442 {
 443     RDMALocalBlock *local_block;
 444     local_block  = &rdma->local_ram_blocks.block[reg->current_index];
 445
 446     if (local_block->is_ram_block) {
 447         /*
 448          * current_addr as passed in is an address in the local ram_addr_t
 449          * space, we need to translate this for the destination
 450          */
 451         reg->key.current_addr -= local_block->offset;
 452         reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
 453     }
 454     reg->key.current_addr = htonll(reg->key.current_addr);
 455     reg->current_index = htonl(reg->current_index);
 456     reg->chunks = htonll(reg->chunks);
 457 }
 458
 459 static void network_to_register(RDMARegister *reg)
 460 {
 461     reg->key.current_addr = ntohll(reg->key.current_addr);
 462     reg->current_index = ntohl(reg->current_index);
 463     reg->chunks = ntohll(reg->chunks);
 464 }
 465
/*
 * Payload of a "compress" control command.
 *
 * Packed wire format: do not reorder or resize fields.  Byte order is
 * converted by compress_to_network() / network_to_compress().
 */
typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* Address in remote ram_addr_t space */
    uint64_t length;    /* length of the chunk */
} RDMACompress;
 472
 473 static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
 474 {
 475     comp->value = htonl(comp->value);
 476     /*
 477      * comp->offset as passed in is an address in the local ram_addr_t
 478      * space, we need to translate this for the destination
 479      */
 480     comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
 481     comp->offset += rdma->dest_blocks[comp->block_idx].offset;
 482     comp->block_idx = htonl(comp->block_idx);
 483     comp->offset = htonll(comp->offset);
 484     comp->length = htonll(comp->length);
 485 }
 486
 487 static void network_to_compress(RDMACompress *comp)
 488 {
 489     comp->value = ntohl(comp->value);
 490     comp->block_idx = ntohl(comp->block_idx);
 491     comp->offset = ntohll(comp->offset);
 492     comp->length = ntohll(comp->length);
 493 }
 494
/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 *
 * Packed wire format: do not reorder or resize fields.  Byte order is
 * converted by result_to_network() / network_to_result(), which leave
 * 'padding' untouched.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
 505
 506 static void result_to_network(RDMARegisterResult *result)
 507 {
 508     result->rkey = htonl(result->rkey);
 509     result->host_addr = htonll(result->host_addr);
 510 };
 511
 512 static void network_to_result(RDMARegisterResult *result)
 513 {
 514     result->rkey = ntohl(result->rkey);
 515     result->host_addr = ntohll(result->host_addr);
 516 };
 517
/*
 * Forward declaration; the definition is not part of this excerpt.
 * NOTE(review): by its signature this sends the control message @head
 * (with payload @data), optionally receives a response into @resp /
 * @resp_idx, and may invoke @callback -- confirm against the definition.
 */
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma,
                                                   Error **errp),
                                   Error **errp);
 524
 525 static inline uint64_t ram_chunk_index(const uint8_t *start,
 526                                        const uint8_t *host)
 527 {
 528     return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
 529 }
 530
 531 static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
 532                                        uint64_t i)
 533 {
 534     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
 535                                   (i << RDMA_REG_CHUNK_SHIFT));
 536 }
 537
 538 static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
 539                                      uint64_t i)
 540 {
 541     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
 542                                          (1UL << RDMA_REG_CHUNK_SHIFT);
 543
 544     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
 545         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
 546     }
 547
 548     return result;
 549 }
 550
 551 static void rdma_add_block(RDMAContext *rdma, const char *block_name,
 552                            void *host_addr,
 553                            ram_addr_t block_offset, uint64_t length)
 554 {
 555     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 556     RDMALocalBlock *block;
 557     RDMALocalBlock *old = local->block;
 558
 559     local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1);
 560
 561     if (local->nb_blocks) {
 562         if (rdma->blockmap) {
 563             for (int x = 0; x < local->nb_blocks; x++) {
 564                 g_hash_table_remove(rdma->blockmap,
 565                                     (void *)(uintptr_t)old[x].offset);
 566                 g_hash_table_insert(rdma->blockmap,
 567                                     (void *)(uintptr_t)old[x].offset,
 568                                     &local->block[x]);
 569             }
 570         }
 571         memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
 572         g_free(old);
 573     }
 574
 575     block = &local->block[local->nb_blocks];
 576
 577     block->block_name = g_strdup(block_name);
 578     block->local_host_addr = host_addr;
 579     block->offset = block_offset;
 580     block->length = length;
 581     block->index = local->nb_blocks;
 582     block->src_index = ~0U; /* Filled in by the receipt of the block list */
 583     block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
 584     block->transit_bitmap = bitmap_new(block->nb_chunks);
 585     bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
 586     block->unregister_bitmap = bitmap_new(block->nb_chunks);
 587     bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
 588     block->remote_keys = g_new0(uint32_t, block->nb_chunks);
 589
 590     block->is_ram_block = local->init ? false : true;
 591
 592     if (rdma->blockmap) {
 593         g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
 594     }
 595
 596     trace_rdma_add_block(block_name, local->nb_blocks,
 597                          (uintptr_t) block->local_host_addr,
 598                          block->offset, block->length,
 599                          (uintptr_t) (block->local_host_addr + block->length),
 600                          BITS_TO_LONGS(block->nb_chunks) *
 601                              sizeof(unsigned long) * 8,
 602                          block->nb_chunks);
 603
 604     local->nb_blocks++;
 605 }
 606
 607 /*
 608  * Memory regions need to be registered with the device and queue pairs setup
 609  * in advanced before the migration starts. This tells us where the RAM blocks
 610  * are so that we can register them individually.
 611  */
 612 static int qemu_rdma_init_one_block(RAMBlock *rb, void *opaque)
 613 {
 614     const char *block_name = qemu_ram_get_idstr(rb);
 615     void *host_addr = qemu_ram_get_host_addr(rb);
 616     ram_addr_t block_offset = qemu_ram_get_offset(rb);
 617     ram_addr_t length = qemu_ram_get_used_length(rb);
 618     rdma_add_block(opaque, block_name, host_addr, block_offset, length);
 619     return 0;
 620 }
 621
 622 /*
 623  * Identify the RAMBlocks and their quantity. They will be references to
 624  * identify chunk boundaries inside each RAMBlock and also be referenced
 625  * during dynamic page registration.
 626  */
 627 static void qemu_rdma_init_ram_blocks(RDMAContext *rdma)
 628 {
 629     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 630     int ret;
 631
 632     assert(rdma->blockmap == NULL);
 633     memset(local, 0, sizeof *local);
 634     ret = foreach_not_ignored_block(qemu_rdma_init_one_block, rdma);
 635     assert(!ret);
 636     trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
 637     rdma->dest_blocks = g_new0(RDMADestBlock,
 638                                rdma->local_ram_blocks.nb_blocks);
 639     local->init = true;
 640 }
 641
/*
 * Remove one entry from rdma->local_ram_blocks and release everything it
 * owns (memory registrations, bitmaps, remote keys, name).
 *
 * @block is expected to point into the current local->block array: its
 * 'index' field is used to slice that array, and the single-block case
 * asserts block == local->block.  The array is reallocated one element
 * smaller, so all RDMALocalBlock pointers (including @block) are invalid
 * on return.  Return values of ibv_dereg_mr() are not checked.
 *
 * Note: If used outside of cleanup, the caller must ensure that the destination
 * block structures are also updated
 */
static void rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *old = local->block;

    if (rdma->blockmap) {
        g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
    }
    /* Drop chunk-level registrations; unregistered chunks have a NULL slot */
    if (block->pmr) {
        for (int j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    /* Drop the whole-block registration, if any */
    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    g_free(block->block_name);
    block->block_name = NULL;

    /*
     * Every map value points into 'old', which is about to be replaced,
     * so empty the map now and repopulate it from the new array below.
     */
    if (rdma->blockmap) {
        for (int x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap,
                                (void *)(uintptr_t)old[x].offset);
        }
    }

    if (local->nb_blocks > 1) {

        local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1);

        /* Entries in front of the deleted one keep their position */
        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        /* Entries behind it move down one slot and get renumbered */
        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                sizeof(RDMALocalBlock) *
                    (local->nb_blocks - (block->index + 1)));
            for (int x = block->index; x < local->nb_blocks - 1; x++) {
                local->block[x].index--;
            }
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
    }

    /* 'block' still points into 'old' here, so trace before freeing it */
    trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
                           block->offset, block->length,
                            (uintptr_t)(block->local_host_addr + block->length),
                           BITS_TO_LONGS(block->nb_chunks) *
                               sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    /* Rebuild the offset -> entry map against the new array */
    if (local->nb_blocks && rdma->blockmap) {
        for (int x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)(uintptr_t)local->block[x].offset,
                                &local->block[x]);
        }
    }
}
 730
 731 /*
 732  * Trace RDMA device open, with device details.
 733  */
 734 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
 735 {
 736     struct ibv_port_attr port;
 737
 738     if (ibv_query_port(verbs, 1, &port)) {
 739         trace_qemu_rdma_dump_id_failed(who);
 740         return;
 741     }
 742
 743     trace_qemu_rdma_dump_id(who,
 744                 verbs->device->name,
 745                 verbs->device->dev_name,
 746                 verbs->device->dev_path,
 747                 verbs->device->ibdev_path,
 748                 port.link_layer,
 749                 port.link_layer == IBV_LINK_LAYER_INFINIBAND ? "Infiniband"
 750                 : port.link_layer == IBV_LINK_LAYER_ETHERNET ? "Ethernet"
 751                 : "Unknown");
 752 }
 753
 754 /*
 755  * Trace RDMA gid addressing information.
 756  * Useful for understanding the RDMA device hierarchy in the kernel.
 757  */
 758 static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
 759 {
 760     char sgid[33];
 761     char dgid[33];
 762     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
 763     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
 764     trace_qemu_rdma_dump_gid(who, sgid, dgid);
 765 }
 766
 767 /*
 768  * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 769  * We will try the next addrinfo struct, and fail if there are
 770  * no other valid addresses to bind against.
 771  *
 * If the user is listening on '[::]', then we will not have opened a device
 * yet and have no way of verifying if the device is RoCE or not.
 774  *
 775  * In this case, the source VM will throw an error for ALL types of
 776  * connections (both IPv4 and IPv6) if the destination machine does not have
 777  * a regular infiniband network available for use.
 778  *
 * The only way to guarantee that an error is thrown for broken kernels is
 * for the management software to choose a *specific* interface at bind time
 * and validate what type of hardware it is.
 782  *
 783  * Unfortunately, this puts the user in a fix:
 784  *
 785  *  If the source VM connects with an IPv4 address without knowing that the
 786  *  destination has bound to '[::]' the migration will unconditionally fail
 787  *  unless the management software is explicitly listening on the IPv4
 788  *  address while using a RoCE-based device.
 789  *
 790  *  If the source VM connects with an IPv6 address, then we're OK because we can
 791  *  throw an error on the source (and similarly on the destination).
 792  *
 793  *  But in mixed environments, this will be broken for a while until it is fixed
 794  *  inside linux.
 795  *
 796  * We do provide a *tiny* bit of help in this function: We can list all of the
 797  * devices in the system and check to see if all the devices are RoCE or
 798  * Infiniband.
 799  *
 * If we detect that we have a *pure* RoCE environment, then we can safely
 * throw an error even if the management software has specified '[::]' as the
 * bind address.
 803  *
 * However, if there are multiple heterogeneous devices, then we cannot make
 * this assumption and the user just has to be sure they know what they are
 * doing.
 807  *
 808  * Patches are being reviewed on linux-rdma.
 809  */
 810 static int qemu_rdma_broken_ipv6_kernel(struct ibv_context *verbs, Error **errp)
 811 {
 812     /* This bug only exists in linux, to our knowledge. */
 813 #ifdef CONFIG_LINUX
 814     struct ibv_port_attr port_attr;
 815
 816     /*
 817      * Verbs are only NULL if management has bound to '[::]'.
 818      *
 819      * Let's iterate through all the devices and see if there any pure IB
 820      * devices (non-ethernet).
 821      *
 822      * If not, then we can safely proceed with the migration.
 823      * Otherwise, there are no guarantees until the bug is fixed in linux.
 824      */
 825     if (!verbs) {
 826         int num_devices;
 827         struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
 828         bool roce_found = false;
 829         bool ib_found = false;
 830
 831         for (int x = 0; x < num_devices; x++) {
 832             verbs = ibv_open_device(dev_list[x]);
 833             /*
 834              * ibv_open_device() is not documented to set errno.  If
 835              * it does, it's somebody else's doc bug.  If it doesn't,
 836              * the use of errno below is wrong.
 837              * TODO Find out whether ibv_open_device() sets errno.
 838              */
 839             if (!verbs) {
 840                 if (errno == EPERM) {
 841                     continue;
 842                 } else {
 843                     error_setg_errno(errp, errno,
 844                                      "could not open RDMA device context");
 845                     return -1;
 846                 }
 847             }
 848
 849             if (ibv_query_port(verbs, 1, &port_attr)) {
 850                 ibv_close_device(verbs);
 851                 error_setg(errp,
 852                            "RDMA ERROR: Could not query initial IB port");
 853                 return -1;
 854             }
 855
 856             if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
 857                 ib_found = true;
 858             } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
 859                 roce_found = true;
 860             }
 861
 862             ibv_close_device(verbs);
 863
 864         }
 865
 866         if (roce_found) {
 867             if (ib_found) {
 868                 warn_report("migrations may fail:"
 869                             " IPv6 over RoCE / iWARP in linux"
 870                             " is broken. But since you appear to have a"
 871                             " mixed RoCE / IB environment, be sure to only"
 872                             " migrate over the IB fabric until the kernel "
 873                             " fixes the bug.");
 874             } else {
 875                 error_setg(errp, "RDMA ERROR: "
 876                            "You only have RoCE / iWARP devices in your systems"
 877                            " and your management software has specified '[::]'"
 878                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
 879                 return -1;
 880             }
 881         }
 882
 883         return 0;
 884     }
 885
 886     /*
 887      * If we have a verbs context, that means that some other than '[::]' was
 888      * used by the management software for binding. In which case we can
 889      * actually warn the user about a potentially broken kernel.
 890      */
 891
 892     /* IB ports start with 1, not 0 */
 893     if (ibv_query_port(verbs, 1, &port_attr)) {
 894         error_setg(errp, "RDMA ERROR: Could not query initial IB port");
 895         return -1;
 896     }
 897
 898     if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
 899         error_setg(errp, "RDMA ERROR: "
 900                    "Linux kernel's RoCE / iWARP does not support IPv6 "
 901                    "(but patches on linux-rdma in progress)");
 902         return -1;
 903     }
 904
 905 #endif
 906
 907     return 0;
 908 }
 909
 910 /*
 911  * Figure out which RDMA device corresponds to the requested IP hostname
 912  * Also create the initial connection manager identifiers for opening
 913  * the connection.
 914  */
 915 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
 916 {
 917     Error *err = NULL;
 918     int ret;
 919     struct rdma_addrinfo *res;
 920     char port_str[16];
 921     struct rdma_cm_event *cm_event;
 922     char ip[40] = "unknown";
 923
 924     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
 925         error_setg(errp, "RDMA ERROR: RDMA hostname has not been set");
 926         return -1;
 927     }
 928
 929     /* create CM channel */
 930     rdma->channel = rdma_create_event_channel();
 931     if (!rdma->channel) {
 932         error_setg(errp, "RDMA ERROR: could not create CM channel");
 933         return -1;
 934     }
 935
 936     /* create CM id */
 937     ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
 938     if (ret < 0) {
 939         error_setg(errp, "RDMA ERROR: could not create channel id");
 940         goto err_resolve_create_id;
 941     }
 942
 943     snprintf(port_str, 16, "%d", rdma->port);
 944     port_str[15] = '\0';
 945
 946     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
 947     if (ret) {
 948         error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
 949                    rdma->host);
 950         goto err_resolve_get_addr;
 951     }
 952
 953     /* Try all addresses, saving the first error in @err */
 954     for (struct rdma_addrinfo *e = res; e != NULL; e = e->ai_next) {
 955         Error **local_errp = err ? NULL : &err;
 956
 957         inet_ntop(e->ai_family,
 958             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
 959         trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
 960
 961         ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
 962                 RDMA_RESOLVE_TIMEOUT_MS);
 963         if (ret >= 0) {
 964             if (e->ai_family == AF_INET6) {
 965                 ret = qemu_rdma_broken_ipv6_kernel(rdma->cm_id->verbs,
 966                                                    local_errp);
 967                 if (ret < 0) {
 968                     continue;
 969                 }
 970             }
 971             error_free(err);
 972             goto route;
 973         }
 974     }
 975
 976     rdma_freeaddrinfo(res);
 977     if (err) {
 978         error_propagate(errp, err);
 979     } else {
 980         error_setg(errp, "RDMA ERROR: could not resolve address %s",
 981                    rdma->host);
 982     }
 983     goto err_resolve_get_addr;
 984
 985 route:
 986     rdma_freeaddrinfo(res);
 987     qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 988
 989     ret = rdma_get_cm_event(rdma->channel, &cm_event);
 990     if (ret < 0) {
 991         error_setg(errp, "RDMA ERROR: could not perform event_addr_resolved");
 992         goto err_resolve_get_addr;
 993     }
 994
 995     if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 996         error_setg(errp,
 997                    "RDMA ERROR: result not equal to event_addr_resolved %s",
 998                    rdma_event_str(cm_event->event));
 999         rdma_ack_cm_event(cm_event);
1000         goto err_resolve_get_addr;
1001     }
1002     rdma_ack_cm_event(cm_event);
1003
1004     /* resolve route */
1005     ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
1006     if (ret < 0) {
1007         error_setg(errp, "RDMA ERROR: could not resolve rdma route");
1008         goto err_resolve_get_addr;
1009     }
1010
1011     ret = rdma_get_cm_event(rdma->channel, &cm_event);
1012     if (ret < 0) {
1013         error_setg(errp, "RDMA ERROR: could not perform event_route_resolved");
1014         goto err_resolve_get_addr;
1015     }
1016     if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
1017         error_setg(errp, "RDMA ERROR: "
1018                    "result not equal to event_route_resolved: %s",
1019                    rdma_event_str(cm_event->event));
1020         rdma_ack_cm_event(cm_event);
1021         goto err_resolve_get_addr;
1022     }
1023     rdma_ack_cm_event(cm_event);
1024     rdma->verbs = rdma->cm_id->verbs;
1025     qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
1026     qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
1027     return 0;
1028
1029 err_resolve_get_addr:
1030     rdma_destroy_id(rdma->cm_id);
1031     rdma->cm_id = NULL;
1032 err_resolve_create_id:
1033     rdma_destroy_event_channel(rdma->channel);
1034     rdma->channel = NULL;
1035     return -1;
1036 }
1037
1038 /*
1039  * Create protection domain and completion queues
1040  */
1041 static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma, Error **errp)
1042 {
1043     /* allocate pd */
1044     rdma->pd = ibv_alloc_pd(rdma->verbs);
1045     if (!rdma->pd) {
1046         error_setg(errp, "failed to allocate protection domain");
1047         return -1;
1048     }
1049
1050     /* create receive completion channel */
1051     rdma->recv_comp_channel = ibv_create_comp_channel(rdma->verbs);
1052     if (!rdma->recv_comp_channel) {
1053         error_setg(errp, "failed to allocate receive completion channel");
1054         goto err_alloc_pd_cq;
1055     }
1056
1057     /*
1058      * Completion queue can be filled by read work requests.
1059      */
1060     rdma->recv_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1061                                   NULL, rdma->recv_comp_channel, 0);
1062     if (!rdma->recv_cq) {
1063         error_setg(errp, "failed to allocate receive completion queue");
1064         goto err_alloc_pd_cq;
1065     }
1066
1067     /* create send completion channel */
1068     rdma->send_comp_channel = ibv_create_comp_channel(rdma->verbs);
1069     if (!rdma->send_comp_channel) {
1070         error_setg(errp, "failed to allocate send completion channel");
1071         goto err_alloc_pd_cq;
1072     }
1073
1074     rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1075                                   NULL, rdma->send_comp_channel, 0);
1076     if (!rdma->send_cq) {
1077         error_setg(errp, "failed to allocate send completion queue");
1078         goto err_alloc_pd_cq;
1079     }
1080
1081     return 0;
1082
1083 err_alloc_pd_cq:
1084     if (rdma->pd) {
1085         ibv_dealloc_pd(rdma->pd);
1086     }
1087     if (rdma->recv_comp_channel) {
1088         ibv_destroy_comp_channel(rdma->recv_comp_channel);
1089     }
1090     if (rdma->send_comp_channel) {
1091         ibv_destroy_comp_channel(rdma->send_comp_channel);
1092     }
1093     if (rdma->recv_cq) {
1094         ibv_destroy_cq(rdma->recv_cq);
1095         rdma->recv_cq = NULL;
1096     }
1097     rdma->pd = NULL;
1098     rdma->recv_comp_channel = NULL;
1099     rdma->send_comp_channel = NULL;
1100     return -1;
1101
1102 }
1103
1104 /*
1105  * Create queue pairs.
1106  */
1107 static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1108 {
1109     struct ibv_qp_init_attr attr = { 0 };
1110
1111     attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1112     attr.cap.max_recv_wr = 3;
1113     attr.cap.max_send_sge = 1;
1114     attr.cap.max_recv_sge = 1;
1115     attr.send_cq = rdma->send_cq;
1116     attr.recv_cq = rdma->recv_cq;
1117     attr.qp_type = IBV_QPT_RC;
1118
1119     if (rdma_create_qp(rdma->cm_id, rdma->pd, &attr) < 0) {
1120         return -1;
1121     }
1122
1123     rdma->qp = rdma->cm_id->qp;
1124     return 0;
1125 }
1126
1127 /* Check whether On-Demand Paging is supported by RDAM device */
1128 static bool rdma_support_odp(struct ibv_context *dev)
1129 {
1130     struct ibv_device_attr_ex attr = {0};
1131
1132     if (ibv_query_device_ex(dev, NULL, &attr)) {
1133         return false;
1134     }
1135
1136     if (attr.odp_caps.general_caps & IBV_ODP_SUPPORT) {
1137         return true;
1138     }
1139
1140     return false;
1141 }
1142
/*
 * Ask the device to prefetch a memory region via ibv_advise_mr(), to
 * avoid RNR NAK errors as far as possible: a responder whose MR is
 * registered with ODP sends RNR NAK back to the requester when it hits
 * a page fault.
 *
 * @wr selects the write-prefetch advice over the plain prefetch advice.
 * The result is only traced; failures are deliberately ignored.
 * Does nothing when built without HAVE_IBV_ADVISE_MR.
 */
static void qemu_rdma_advise_prefetch_mr(struct ibv_pd *pd, uint64_t addr,
                                         uint32_t len,  uint32_t lkey,
                                         const char *name, bool wr)
{
#ifdef HAVE_IBV_ADVISE_MR
    struct ibv_sge sge = {
        .addr = addr,
        .length = len,
        .lkey = lkey,
    };
    int advice;
    int rc;

    if (wr) {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE;
    } else {
        advice = IBV_ADVISE_MR_ADVICE_PREFETCH;
    }

    rc = ibv_advise_mr(pd, advice, IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
    /* ignore the error */
    trace_qemu_rdma_advise_mr(name, len, addr, strerror(rc));
#endif
}
1164
1165 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma, Error **errp)
1166 {
1167     int i;
1168     RDMALocalBlocks *local = &rdma->local_ram_blocks;
1169
1170     for (i = 0; i < local->nb_blocks; i++) {
1171         int access = IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE;
1172
1173         local->block[i].mr =
1174             ibv_reg_mr(rdma->pd,
1175                     local->block[i].local_host_addr,
1176                     local->block[i].length, access
1177                     );
1178         /*
1179          * ibv_reg_mr() is not documented to set errno.  If it does,
1180          * it's somebody else's doc bug.  If it doesn't, the use of
1181          * errno below is wrong.
1182          * TODO Find out whether ibv_reg_mr() sets errno.
1183          */
1184         if (!local->block[i].mr &&
1185             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1186                 access |= IBV_ACCESS_ON_DEMAND;
1187                 /* register ODP mr */
1188                 local->block[i].mr =
1189                     ibv_reg_mr(rdma->pd,
1190                                local->block[i].local_host_addr,
1191                                local->block[i].length, access);
1192                 trace_qemu_rdma_register_odp_mr(local->block[i].block_name);
1193
1194                 if (local->block[i].mr) {
1195                     qemu_rdma_advise_prefetch_mr(rdma->pd,
1196                                     (uintptr_t)local->block[i].local_host_addr,
1197                                     local->block[i].length,
1198                                     local->block[i].mr->lkey,
1199                                     local->block[i].block_name,
1200                                     true);
1201                 }
1202         }
1203
1204         if (!local->block[i].mr) {
1205             error_setg_errno(errp, errno,
1206                              "Failed to register local dest ram block!");
1207             goto err;
1208         }
1209         rdma->total_registrations++;
1210     }
1211
1212     return 0;
1213
1214 err:
1215     for (i--; i >= 0; i--) {
1216         ibv_dereg_mr(local->block[i].mr);
1217         local->block[i].mr = NULL;
1218         rdma->total_registrations--;
1219     }
1220
1221     return -1;
1222
1223 }
1224
1225 /*
1226  * Find the ram block that corresponds to the page requested to be
1227  * transmitted by QEMU.
1228  *
1229  * Once the block is found, also identify which 'chunk' within that
1230  * block that the page belongs to.
1231  */
1232 static void qemu_rdma_search_ram_block(RDMAContext *rdma,
1233                                        uintptr_t block_offset,
1234                                        uint64_t offset,
1235                                        uint64_t length,
1236                                        uint64_t *block_index,
1237                                        uint64_t *chunk_index)
1238 {
1239     uint64_t current_addr = block_offset + offset;
1240     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1241                                                 (void *) block_offset);
1242     assert(block);
1243     assert(current_addr >= block->offset);
1244     assert((current_addr + length) <= (block->offset + block->length));
1245
1246     *block_index = block->index;
1247     *chunk_index = ram_chunk_index(block->local_host_addr,
1248                 block->local_host_addr + (current_addr - block->offset));
1249 }
1250
1251 /*
1252  * Register a chunk with IB. If the chunk was already registered
1253  * previously, then skip.
1254  *
1255  * Also return the keys associated with the registration needed
1256  * to perform the actual RDMA operation.
1257  */
1258 static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1259         RDMALocalBlock *block, uintptr_t host_addr,
1260         uint32_t *lkey, uint32_t *rkey, int chunk,
1261         uint8_t *chunk_start, uint8_t *chunk_end)
1262 {
1263     if (block->mr) {
1264         if (lkey) {
1265             *lkey = block->mr->lkey;
1266         }
1267         if (rkey) {
1268             *rkey = block->mr->rkey;
1269         }
1270         return 0;
1271     }
1272
1273     /* allocate memory to store chunk MRs */
1274     if (!block->pmr) {
1275         block->pmr = g_new0(struct ibv_mr *, block->nb_chunks);
1276     }
1277
1278     /*
1279      * If 'rkey', then we're the destination, so grant access to the source.
1280      *
1281      * If 'lkey', then we're the source VM, so grant access only to ourselves.
1282      */
1283     if (!block->pmr[chunk]) {
1284         uint64_t len = chunk_end - chunk_start;
1285         int access = rkey ? IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE :
1286                      0;
1287
1288         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1289
1290         block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1291         /*
1292          * ibv_reg_mr() is not documented to set errno.  If it does,
1293          * it's somebody else's doc bug.  If it doesn't, the use of
1294          * errno below is wrong.
1295          * TODO Find out whether ibv_reg_mr() sets errno.
1296          */
1297         if (!block->pmr[chunk] &&
1298             errno == ENOTSUP && rdma_support_odp(rdma->verbs)) {
1299             access |= IBV_ACCESS_ON_DEMAND;
1300             /* register ODP mr */
1301             block->pmr[chunk] = ibv_reg_mr(rdma->pd, chunk_start, len, access);
1302             trace_qemu_rdma_register_odp_mr(block->block_name);
1303
1304             if (block->pmr[chunk]) {
1305                 qemu_rdma_advise_prefetch_mr(rdma->pd, (uintptr_t)chunk_start,
1306                                             len, block->pmr[chunk]->lkey,
1307                                             block->block_name, rkey);
1308
1309             }
1310         }
1311     }
1312     if (!block->pmr[chunk]) {
1313         return -1;
1314     }
1315     rdma->total_registrations++;
1316
1317     if (lkey) {
1318         *lkey = block->pmr[chunk]->lkey;
1319     }
1320     if (rkey) {
1321         *rkey = block->pmr[chunk]->rkey;
1322     }
1323     return 0;
1324 }
1325
1326 /*
1327  * Register (at connection time) the memory used for control
1328  * channel messages.
1329  */
1330 static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1331 {
1332     rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1333             rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1334             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1335     if (rdma->wr_data[idx].control_mr) {
1336         rdma->total_registrations++;
1337         return 0;
1338     }
1339     return -1;
1340 }
1341
1342 /*
1343  * Perform a non-optimized memory unregistration after every transfer
1344  * for demonstration purposes, only if pin-all is not requested.
1345  *
1346  * Potential optimizations:
1347  * 1. Start a new thread to run this function continuously
1348         - for bit clearing
1349         - and for receipt of unregister messages
1350  * 2. Use an LRU.
1351  * 3. Use workload hints.
1352  */
1353 static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1354 {
1355     Error *err = NULL;
1356
1357     while (rdma->unregistrations[rdma->unregister_current]) {
1358         int ret;
1359         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1360         uint64_t chunk =
1361             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1362         uint64_t index =
1363             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1364         RDMALocalBlock *block =
1365             &(rdma->local_ram_blocks.block[index]);
1366         RDMARegister reg = { .current_index = index };
1367         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1368                                  };
1369         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1370                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1371                                    .repeat = 1,
1372                                  };
1373
1374         trace_qemu_rdma_unregister_waiting_proc(chunk,
1375                                                 rdma->unregister_current);
1376
1377         rdma->unregistrations[rdma->unregister_current] = 0;
1378         rdma->unregister_current++;
1379
1380         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1381             rdma->unregister_current = 0;
1382         }
1383
1384
1385         /*
1386          * Unregistration is speculative (because migration is single-threaded
1387          * and we cannot break the protocol's inifinband message ordering).
1388          * Thus, if the memory is currently being used for transmission,
1389          * then abort the attempt to unregister and try again
1390          * later the next time a completion is received for this memory.
1391          */
1392         clear_bit(chunk, block->unregister_bitmap);
1393
1394         if (test_bit(chunk, block->transit_bitmap)) {
1395             trace_qemu_rdma_unregister_waiting_inflight(chunk);
1396             continue;
1397         }
1398
1399         trace_qemu_rdma_unregister_waiting_send(chunk);
1400
1401         ret = ibv_dereg_mr(block->pmr[chunk]);
1402         block->pmr[chunk] = NULL;
1403         block->remote_keys[chunk] = 0;
1404
1405         if (ret != 0) {
1406             error_report("unregistration chunk failed: %s",
1407                          strerror(ret));
1408             return -1;
1409         }
1410         rdma->total_registrations--;
1411
1412         reg.key.chunk = chunk;
1413         register_to_network(rdma, &reg);
1414         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1415                                       &resp, NULL, NULL, &err);
1416         if (ret < 0) {
1417             error_report_err(err);
1418             return -1;
1419         }
1420
1421         trace_qemu_rdma_unregister_waiting_complete(chunk);
1422     }
1423
1424     return 0;
1425 }
1426
1427 static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1428                                          uint64_t chunk)
1429 {
1430     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1431
1432     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1433     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1434
1435     return result;
1436 }
1437
1438 /*
1439  * Consult the connection manager to see a work request
1440  * (of any kind) has completed.
1441  * Return the work request ID that completed.
1442  */
1443 static int qemu_rdma_poll(RDMAContext *rdma, struct ibv_cq *cq,
1444                           uint64_t *wr_id_out, uint32_t *byte_len)
1445 {
1446     int ret;
1447     struct ibv_wc wc;
1448     uint64_t wr_id;
1449
1450     ret = ibv_poll_cq(cq, 1, &wc);
1451
1452     if (!ret) {
1453         *wr_id_out = RDMA_WRID_NONE;
1454         return 0;
1455     }
1456
1457     if (ret < 0) {
1458         return -1;
1459     }
1460
1461     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1462
1463     if (wc.status != IBV_WC_SUCCESS) {
1464         return -1;
1465     }
1466
1467     if (rdma->control_ready_expected &&
1468         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1469         trace_qemu_rdma_poll_recv(wr_id - RDMA_WRID_RECV_CONTROL, wr_id,
1470                                   rdma->nb_sent);
1471         rdma->control_ready_expected = 0;
1472     }
1473
1474     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1475         uint64_t chunk =
1476             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1477         uint64_t index =
1478             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1479         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1480
1481         trace_qemu_rdma_poll_write(wr_id, rdma->nb_sent,
1482                                    index, chunk, block->local_host_addr,
1483                                    (void *)(uintptr_t)block->remote_host_addr);
1484
1485         clear_bit(chunk, block->transit_bitmap);
1486
1487         if (rdma->nb_sent > 0) {
1488             rdma->nb_sent--;
1489         }
1490     } else {
1491         trace_qemu_rdma_poll_other(wr_id, rdma->nb_sent);
1492     }
1493
1494     *wr_id_out = wc.wr_id;
1495     if (byte_len) {
1496         *byte_len = wc.byte_len;
1497     }
1498
1499     return  0;
1500 }
1501
1502 /* Wait for activity on the completion channel.
1503  * Returns 0 on success, none-0 on error.
1504  */
1505 static int qemu_rdma_wait_comp_channel(RDMAContext *rdma,
1506                                        struct ibv_comp_channel *comp_channel)
1507 {
1508     struct rdma_cm_event *cm_event;
1509
1510     /*
1511      * Coroutine doesn't start until migration_fd_process_incoming()
1512      * so don't yield unless we know we're running inside of a coroutine.
1513      */
1514     if (rdma->migration_started_on_destination &&
1515         migration_incoming_get_current()->state == MIGRATION_STATUS_ACTIVE) {
1516         yield_until_fd_readable(comp_channel->fd);
1517     } else {
1518         /* This is the source side, we're in a separate thread
1519          * or destination prior to migration_fd_process_incoming()
1520          * after postcopy, the destination also in a separate thread.
1521          * we can't yield; so we have to poll the fd.
1522          * But we need to be able to handle 'cancel' or an error
1523          * without hanging forever.
1524          */
1525         while (!rdma->errored && !rdma->received_error) {
1526             GPollFD pfds[2];
1527             pfds[0].fd = comp_channel->fd;
1528             pfds[0].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1529             pfds[0].revents = 0;
1530
1531             pfds[1].fd = rdma->channel->fd;
1532             pfds[1].events = G_IO_IN | G_IO_HUP | G_IO_ERR;
1533             pfds[1].revents = 0;
1534
1535             /* 0.1s timeout, should be fine for a 'cancel' */
1536             switch (qemu_poll_ns(pfds, 2, 100 * 1000 * 1000)) {
1537             case 2:
1538             case 1: /* fd active */
1539                 if (pfds[0].revents) {
1540                     return 0;
1541                 }
1542
1543                 if (pfds[1].revents) {
1544                     if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
1545                         return -1;
1546                     }
1547
1548                     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
1549                         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
1550                         rdma_ack_cm_event(cm_event);
1551                         return -1;
1552                     }
1553                     rdma_ack_cm_event(cm_event);
1554                 }
1555                 break;
1556
1557             case 0: /* Timeout, go around again */
1558                 break;
1559
1560             default: /* Error of some type -
1561                       * I don't trust errno from qemu_poll_ns
1562                      */
1563                 return -1;
1564             }
1565
1566             if (migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) {
1567                 /* Bail out and let the cancellation happen */
1568                 return -1;
1569             }
1570         }
1571     }
1572
1573     if (rdma->received_error) {
1574         return -1;
1575     }
1576     return -rdma->errored;
1577 }
1578
1579 static struct ibv_comp_channel *to_channel(RDMAContext *rdma, uint64_t wrid)
1580 {
1581     return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_comp_channel :
1582            rdma->recv_comp_channel;
1583 }
1584
1585 static struct ibv_cq *to_cq(RDMAContext *rdma, uint64_t wrid)
1586 {
1587     return wrid < RDMA_WRID_RECV_CONTROL ? rdma->send_cq : rdma->recv_cq;
1588 }
1589
1590 /*
1591  * Block until the next work request has completed.
1592  *
1593  * First poll to see if a work request has already completed,
1594  * otherwise block.
1595  *
1596  * If we encounter completed work requests for IDs other than
1597  * the one we're interested in, then that's generally an error.
1598  *
1599  * The only exception is actual RDMA Write completions. These
1600  * completions only need to be recorded, but do not actually
1601  * need further processing.
1602  */
1603 static int qemu_rdma_block_for_wrid(RDMAContext *rdma,
1604                                     uint64_t wrid_requested,
1605                                     uint32_t *byte_len)
1606 {
1607     int num_cq_events = 0, ret;
1608     struct ibv_cq *cq;
1609     void *cq_ctx;
1610     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1611     struct ibv_comp_channel *ch = to_channel(rdma, wrid_requested);
1612     struct ibv_cq *poll_cq = to_cq(rdma, wrid_requested);
1613
1614     if (ibv_req_notify_cq(poll_cq, 0)) {
1615         return -1;
1616     }
1617     /* poll cq first */
1618     while (wr_id != wrid_requested) {
1619         ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1620         if (ret < 0) {
1621             return -1;
1622         }
1623
1624         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1625
1626         if (wr_id == RDMA_WRID_NONE) {
1627             break;
1628         }
1629         if (wr_id != wrid_requested) {
1630             trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1631         }
1632     }
1633
1634     if (wr_id == wrid_requested) {
1635         return 0;
1636     }
1637
1638     while (1) {
1639         ret = qemu_rdma_wait_comp_channel(rdma, ch);
1640         if (ret < 0) {
1641             goto err_block_for_wrid;
1642         }
1643
1644         ret = ibv_get_cq_event(ch, &cq, &cq_ctx);
1645         if (ret < 0) {
1646             goto err_block_for_wrid;
1647         }
1648
1649         num_cq_events++;
1650
1651         if (ibv_req_notify_cq(cq, 0)) {
1652             goto err_block_for_wrid;
1653         }
1654
1655         while (wr_id != wrid_requested) {
1656             ret = qemu_rdma_poll(rdma, poll_cq, &wr_id_in, byte_len);
1657             if (ret < 0) {
1658                 goto err_block_for_wrid;
1659             }
1660
1661             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1662
1663             if (wr_id == RDMA_WRID_NONE) {
1664                 break;
1665             }
1666             if (wr_id != wrid_requested) {
1667                 trace_qemu_rdma_block_for_wrid_miss(wrid_requested, wr_id);
1668             }
1669         }
1670
1671         if (wr_id == wrid_requested) {
1672             goto success_block_for_wrid;
1673         }
1674     }
1675
1676 success_block_for_wrid:
1677     if (num_cq_events) {
1678         ibv_ack_cq_events(cq, num_cq_events);
1679     }
1680     return 0;
1681
1682 err_block_for_wrid:
1683     if (num_cq_events) {
1684         ibv_ack_cq_events(cq, num_cq_events);
1685     }
1686
1687     rdma->errored = true;
1688     return -1;
1689 }
1690
1691 /*
1692  * Post a SEND message work request for the control channel
1693  * containing some data and block until the post completes.
1694  */
1695 static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1696                                        RDMAControlHeader *head,
1697                                        Error **errp)
1698 {
1699     int ret;
1700     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1701     struct ibv_send_wr *bad_wr;
1702     struct ibv_sge sge = {
1703                            .addr = (uintptr_t)(wr->control),
1704                            .length = head->len + sizeof(RDMAControlHeader),
1705                            .lkey = wr->control_mr->lkey,
1706                          };
1707     struct ibv_send_wr send_wr = {
1708                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1709                                    .opcode = IBV_WR_SEND,
1710                                    .send_flags = IBV_SEND_SIGNALED,
1711                                    .sg_list = &sge,
1712                                    .num_sge = 1,
1713                                 };
1714
1715     trace_qemu_rdma_post_send_control(control_desc(head->type));
1716
1717     /*
1718      * We don't actually need to do a memcpy() in here if we used
1719      * the "sge" properly, but since we're only sending control messages
1720      * (not RAM in a performance-critical path), then its OK for now.
1721      *
1722      * The copy makes the RDMAControlHeader simpler to manipulate
1723      * for the time being.
1724      */
1725     assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1726     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1727     control_to_network((void *) wr->control);
1728
1729     if (buf) {
1730         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1731     }
1732
1733
1734     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1735
1736     if (ret > 0) {
1737         error_setg(errp, "Failed to use post IB SEND for control");
1738         return -1;
1739     }
1740
1741     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1742     if (ret < 0) {
1743         error_setg(errp, "rdma migration: send polling control error");
1744         return -1;
1745     }
1746
1747     return 0;
1748 }
1749
1750 /*
1751  * Post a RECV work request in anticipation of some future receipt
1752  * of data on the control channel.
1753  */
1754 static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx,
1755                                        Error **errp)
1756 {
1757     struct ibv_recv_wr *bad_wr;
1758     struct ibv_sge sge = {
1759                             .addr = (uintptr_t)(rdma->wr_data[idx].control),
1760                             .length = RDMA_CONTROL_MAX_BUFFER,
1761                             .lkey = rdma->wr_data[idx].control_mr->lkey,
1762                          };
1763
1764     struct ibv_recv_wr recv_wr = {
1765                                     .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1766                                     .sg_list = &sge,
1767                                     .num_sge = 1,
1768                                  };
1769
1770
1771     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1772         error_setg(errp, "error posting control recv");
1773         return -1;
1774     }
1775
1776     return 0;
1777 }
1778
1779 /*
1780  * Block and wait for a RECV control channel message to arrive.
1781  */
1782 static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1783                 RDMAControlHeader *head, uint32_t expecting, int idx,
1784                 Error **errp)
1785 {
1786     uint32_t byte_len;
1787     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1788                                        &byte_len);
1789
1790     if (ret < 0) {
1791         error_setg(errp, "rdma migration: recv polling control error!");
1792         return -1;
1793     }
1794
1795     network_to_control((void *) rdma->wr_data[idx].control);
1796     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1797
1798     trace_qemu_rdma_exchange_get_response_start(control_desc(expecting));
1799
1800     if (expecting == RDMA_CONTROL_NONE) {
1801         trace_qemu_rdma_exchange_get_response_none(control_desc(head->type),
1802                                              head->type);
1803     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1804         error_setg(errp, "Was expecting a %s (%d) control message"
1805                 ", but got: %s (%d), length: %d",
1806                 control_desc(expecting), expecting,
1807                 control_desc(head->type), head->type, head->len);
1808         if (head->type == RDMA_CONTROL_ERROR) {
1809             rdma->received_error = true;
1810         }
1811         return -1;
1812     }
1813     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1814         error_setg(errp, "too long length: %d", head->len);
1815         return -1;
1816     }
1817     if (sizeof(*head) + head->len != byte_len) {
1818         error_setg(errp, "Malformed length: %d byte_len %d",
1819                    head->len, byte_len);
1820         return -1;
1821     }
1822
1823     return 0;
1824 }
1825
1826 /*
1827  * When a RECV work request has completed, the work request's
1828  * buffer is pointed at the header.
1829  *
1830  * This will advance the pointer to the data portion
1831  * of the control message of the work request's buffer that
1832  * was populated after the work request finished.
1833  */
1834 static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1835                                   RDMAControlHeader *head)
1836 {
1837     rdma->wr_data[idx].control_len = head->len;
1838     rdma->wr_data[idx].control_curr =
1839         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1840 }
1841
1842 /*
1843  * This is an 'atomic' high-level operation to deliver a single, unified
1844  * control-channel message.
1845  *
1846  * Additionally, if the user is expecting some kind of reply to this message,
1847  * they can request a 'resp' response message be filled in by posting an
1848  * additional work request on behalf of the user and waiting for an additional
1849  * completion.
1850  *
1851  * The extra (optional) response is used during registration to us from having
1852  * to perform an *additional* exchange of message just to provide a response by
1853  * instead piggy-backing on the acknowledgement.
1854  */
1855 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1856                                    uint8_t *data, RDMAControlHeader *resp,
1857                                    int *resp_idx,
1858                                    int (*callback)(RDMAContext *rdma,
1859                                                    Error **errp),
1860                                    Error **errp)
1861 {
1862     int ret;
1863
1864     /*
1865      * Wait until the dest is ready before attempting to deliver the message
1866      * by waiting for a READY message.
1867      */
1868     if (rdma->control_ready_expected) {
1869         RDMAControlHeader resp_ignored;
1870
1871         ret = qemu_rdma_exchange_get_response(rdma, &resp_ignored,
1872                                               RDMA_CONTROL_READY,
1873                                               RDMA_WRID_READY, errp);
1874         if (ret < 0) {
1875             return -1;
1876         }
1877     }
1878
1879     /*
1880      * If the user is expecting a response, post a WR in anticipation of it.
1881      */
1882     if (resp) {
1883         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA, errp);
1884         if (ret < 0) {
1885             return -1;
1886         }
1887     }
1888
1889     /*
1890      * Post a WR to replace the one we just consumed for the READY message.
1891      */
1892     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
1893     if (ret < 0) {
1894         return -1;
1895     }
1896
1897     /*
1898      * Deliver the control message that was requested.
1899      */
1900     ret = qemu_rdma_post_send_control(rdma, data, head, errp);
1901
1902     if (ret < 0) {
1903         return -1;
1904     }
1905
1906     /*
1907      * If we're expecting a response, block and wait for it.
1908      */
1909     if (resp) {
1910         if (callback) {
1911             trace_qemu_rdma_exchange_send_issue_callback();
1912             ret = callback(rdma, errp);
1913             if (ret < 0) {
1914                 return -1;
1915             }
1916         }
1917
1918         trace_qemu_rdma_exchange_send_waiting(control_desc(resp->type));
1919         ret = qemu_rdma_exchange_get_response(rdma, resp,
1920                                               resp->type, RDMA_WRID_DATA,
1921                                               errp);
1922
1923         if (ret < 0) {
1924             return -1;
1925         }
1926
1927         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1928         if (resp_idx) {
1929             *resp_idx = RDMA_WRID_DATA;
1930         }
1931         trace_qemu_rdma_exchange_send_received(control_desc(resp->type));
1932     }
1933
1934     rdma->control_ready_expected = 1;
1935
1936     return 0;
1937 }
1938
1939 /*
1940  * This is an 'atomic' high-level operation to receive a single, unified
1941  * control-channel message.
1942  */
1943 static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1944                                    uint32_t expecting, Error **errp)
1945 {
1946     RDMAControlHeader ready = {
1947                                 .len = 0,
1948                                 .type = RDMA_CONTROL_READY,
1949                                 .repeat = 1,
1950                               };
1951     int ret;
1952
1953     /*
1954      * Inform the source that we're ready to receive a message.
1955      */
1956     ret = qemu_rdma_post_send_control(rdma, NULL, &ready, errp);
1957
1958     if (ret < 0) {
1959         return -1;
1960     }
1961
1962     /*
1963      * Block and wait for the message.
1964      */
1965     ret = qemu_rdma_exchange_get_response(rdma, head,
1966                                           expecting, RDMA_WRID_READY, errp);
1967
1968     if (ret < 0) {
1969         return -1;
1970     }
1971
1972     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1973
1974     /*
1975      * Post a new RECV work request to replace the one we just consumed.
1976      */
1977     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
1978     if (ret < 0) {
1979         return -1;
1980     }
1981
1982     return 0;
1983 }
1984
1985 /*
1986  * Write an actual chunk of memory using RDMA.
1987  *
1988  * If we're using dynamic registration on the dest-side, we have to
1989  * send a registration command first.
1990  */
1991 static int qemu_rdma_write_one(RDMAContext *rdma,
1992                                int current_index, uint64_t current_addr,
1993                                uint64_t length, Error **errp)
1994 {
1995     struct ibv_sge sge;
1996     struct ibv_send_wr send_wr = { 0 };
1997     struct ibv_send_wr *bad_wr;
1998     int reg_result_idx, ret, count = 0;
1999     uint64_t chunk, chunks;
2000     uint8_t *chunk_start, *chunk_end;
2001     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
2002     RDMARegister reg;
2003     RDMARegisterResult *reg_result;
2004     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
2005     RDMAControlHeader head = { .len = sizeof(RDMARegister),
2006                                .type = RDMA_CONTROL_REGISTER_REQUEST,
2007                                .repeat = 1,
2008                              };
2009
2010 retry:
2011     sge.addr = (uintptr_t)(block->local_host_addr +
2012                             (current_addr - block->offset));
2013     sge.length = length;
2014
2015     chunk = ram_chunk_index(block->local_host_addr,
2016                             (uint8_t *)(uintptr_t)sge.addr);
2017     chunk_start = ram_chunk_start(block, chunk);
2018
2019     if (block->is_ram_block) {
2020         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
2021
2022         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2023             chunks--;
2024         }
2025     } else {
2026         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
2027
2028         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
2029             chunks--;
2030         }
2031     }
2032
2033     trace_qemu_rdma_write_one_top(chunks + 1,
2034                                   (chunks + 1) *
2035                                   (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
2036
2037     chunk_end = ram_chunk_end(block, chunk + chunks);
2038
2039
2040     while (test_bit(chunk, block->transit_bitmap)) {
2041         (void)count;
2042         trace_qemu_rdma_write_one_block(count++, current_index, chunk,
2043                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
2044
2045         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2046
2047         if (ret < 0) {
2048             error_setg(errp, "Failed to Wait for previous write to complete "
2049                     "block %d chunk %" PRIu64
2050                     " current %" PRIu64 " len %" PRIu64 " %d",
2051                     current_index, chunk, sge.addr, length, rdma->nb_sent);
2052             return -1;
2053         }
2054     }
2055
2056     if (!rdma->pin_all || !block->is_ram_block) {
2057         if (!block->remote_keys[chunk]) {
2058             /*
2059              * This chunk has not yet been registered, so first check to see
2060              * if the entire chunk is zero. If so, tell the other size to
2061              * memset() + madvise() the entire chunk without RDMA.
2062              */
2063
2064             if (buffer_is_zero((void *)(uintptr_t)sge.addr, length)) {
2065                 RDMACompress comp = {
2066                                         .offset = current_addr,
2067                                         .value = 0,
2068                                         .block_idx = current_index,
2069                                         .length = length,
2070                                     };
2071
2072                 head.len = sizeof(comp);
2073                 head.type = RDMA_CONTROL_COMPRESS;
2074
2075                 trace_qemu_rdma_write_one_zero(chunk, sge.length,
2076                                                current_index, current_addr);
2077
2078                 compress_to_network(rdma, &comp);
2079                 ret = qemu_rdma_exchange_send(rdma, &head,
2080                                 (uint8_t *) &comp, NULL, NULL, NULL, errp);
2081
2082                 if (ret < 0) {
2083                     return -1;
2084                 }
2085
2086                 /*
2087                  * TODO: Here we are sending something, but we are not
2088                  * accounting for anything transferred.  The following is wrong:
2089                  *
2090                  * stat64_add(&mig_stats.rdma_bytes, sge.length);
2091                  *
2092                  * because we are using some kind of compression.  I
2093                  * would think that head.len would be the more similar
2094                  * thing to a correct value.
2095                  */
2096                 stat64_add(&mig_stats.zero_pages,
2097                            sge.length / qemu_target_page_size());
2098                 return 1;
2099             }
2100
2101             /*
2102              * Otherwise, tell other side to register.
2103              */
2104             reg.current_index = current_index;
2105             if (block->is_ram_block) {
2106                 reg.key.current_addr = current_addr;
2107             } else {
2108                 reg.key.chunk = chunk;
2109             }
2110             reg.chunks = chunks;
2111
2112             trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
2113                                               current_addr);
2114
2115             register_to_network(rdma, &reg);
2116             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
2117                                     &resp, &reg_result_idx, NULL, errp);
2118             if (ret < 0) {
2119                 return -1;
2120             }
2121
2122             /* try to overlap this single registration with the one we sent. */
2123             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2124                                                 &sge.lkey, NULL, chunk,
2125                                                 chunk_start, chunk_end)) {
2126                 error_setg(errp, "cannot get lkey");
2127                 return -1;
2128             }
2129
2130             reg_result = (RDMARegisterResult *)
2131                     rdma->wr_data[reg_result_idx].control_curr;
2132
2133             network_to_result(reg_result);
2134
2135             trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
2136                                                  reg_result->rkey, chunk);
2137
2138             block->remote_keys[chunk] = reg_result->rkey;
2139             block->remote_host_addr = reg_result->host_addr;
2140         } else {
2141             /* already registered before */
2142             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2143                                                 &sge.lkey, NULL, chunk,
2144                                                 chunk_start, chunk_end)) {
2145                 error_setg(errp, "cannot get lkey!");
2146                 return -1;
2147             }
2148         }
2149
2150         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2151     } else {
2152         send_wr.wr.rdma.rkey = block->remote_rkey;
2153
2154         if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2155                                                      &sge.lkey, NULL, chunk,
2156                                                      chunk_start, chunk_end)) {
2157             error_setg(errp, "cannot get lkey!");
2158             return -1;
2159         }
2160     }
2161
2162     /*
2163      * Encode the ram block index and chunk within this wrid.
2164      * We will use this information at the time of completion
2165      * to figure out which bitmap to check against and then which
2166      * chunk in the bitmap to look for.
2167      */
2168     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2169                                         current_index, chunk);
2170
2171     send_wr.opcode = IBV_WR_RDMA_WRITE;
2172     send_wr.send_flags = IBV_SEND_SIGNALED;
2173     send_wr.sg_list = &sge;
2174     send_wr.num_sge = 1;
2175     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2176                                 (current_addr - block->offset);
2177
2178     trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2179                                    sge.length);
2180
2181     /*
2182      * ibv_post_send() does not return negative error numbers,
2183      * per the specification they are positive - no idea why.
2184      */
2185     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2186
2187     if (ret == ENOMEM) {
2188         trace_qemu_rdma_write_one_queue_full();
2189         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2190         if (ret < 0) {
2191             error_setg(errp, "rdma migration: failed to make "
2192                          "room in full send queue!");
2193             return -1;
2194         }
2195
2196         goto retry;
2197
2198     } else if (ret > 0) {
2199         error_setg_errno(errp, ret,
2200                          "rdma migration: post rdma write failed");
2201         return -1;
2202     }
2203
2204     set_bit(chunk, block->transit_bitmap);
2205     stat64_add(&mig_stats.normal_pages, sge.length / qemu_target_page_size());
2206     /*
2207      * We are adding to transferred the amount of data written, but no
2208      * overhead at all.  I will asume that RDMA is magicaly and don't
2209      * need to transfer (at least) the addresses where it wants to
2210      * write the pages.  Here it looks like it should be something
2211      * like:
2212      *     sizeof(send_wr) + sge.length
2213      * but this being RDMA, who knows.
2214      */
2215     stat64_add(&mig_stats.rdma_bytes, sge.length);
2216     ram_transferred_add(sge.length);
2217     rdma->total_writes++;
2218
2219     return 0;
2220 }
2221
2222 /*
2223  * Push out any unwritten RDMA operations.
2224  *
2225  * We support sending out multiple chunks at the same time.
2226  * Not all of them need to get signaled in the completion queue.
2227  */
2228 static int qemu_rdma_write_flush(RDMAContext *rdma, Error **errp)
2229 {
2230     int ret;
2231
2232     if (!rdma->current_length) {
2233         return 0;
2234     }
2235
2236     ret = qemu_rdma_write_one(rdma, rdma->current_index, rdma->current_addr,
2237                               rdma->current_length, errp);
2238
2239     if (ret < 0) {
2240         return -1;
2241     }
2242
2243     if (ret == 0) {
2244         rdma->nb_sent++;
2245         trace_qemu_rdma_write_flush(rdma->nb_sent);
2246     }
2247
2248     rdma->current_length = 0;
2249     rdma->current_addr = 0;
2250
2251     return 0;
2252 }
2253
2254 static inline bool qemu_rdma_buffer_mergeable(RDMAContext *rdma,
2255                     uint64_t offset, uint64_t len)
2256 {
2257     RDMALocalBlock *block;
2258     uint8_t *host_addr;
2259     uint8_t *chunk_end;
2260
2261     if (rdma->current_index < 0) {
2262         return false;
2263     }
2264
2265     if (rdma->current_chunk < 0) {
2266         return false;
2267     }
2268
2269     block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2270     host_addr = block->local_host_addr + (offset - block->offset);
2271     chunk_end = ram_chunk_end(block, rdma->current_chunk);
2272
2273     if (rdma->current_length == 0) {
2274         return false;
2275     }
2276
2277     /*
2278      * Only merge into chunk sequentially.
2279      */
2280     if (offset != (rdma->current_addr + rdma->current_length)) {
2281         return false;
2282     }
2283
2284     if (offset < block->offset) {
2285         return false;
2286     }
2287
2288     if ((offset + len) > (block->offset + block->length)) {
2289         return false;
2290     }
2291
2292     if ((host_addr + len) > chunk_end) {
2293         return false;
2294     }
2295
2296     return true;
2297 }
2298
2299 /*
2300  * We're not actually writing here, but doing three things:
2301  *
2302  * 1. Identify the chunk the buffer belongs to.
2303  * 2. If the chunk is full or the buffer doesn't belong to the current
2304  *    chunk, then start a new chunk and flush() the old chunk.
2305  * 3. To keep the hardware busy, we also group chunks into batches
2306  *    and only require that a batch gets acknowledged in the completion
2307  *    queue instead of each individual chunk.
2308  */
2309 static int qemu_rdma_write(RDMAContext *rdma,
2310                            uint64_t block_offset, uint64_t offset,
2311                            uint64_t len, Error **errp)
2312 {
2313     uint64_t current_addr = block_offset + offset;
2314     uint64_t index = rdma->current_index;
2315     uint64_t chunk = rdma->current_chunk;
2316
2317     /* If we cannot merge it, we flush the current buffer first. */
2318     if (!qemu_rdma_buffer_mergeable(rdma, current_addr, len)) {
2319         if (qemu_rdma_write_flush(rdma, errp) < 0) {
2320             return -1;
2321         }
2322         rdma->current_length = 0;
2323         rdma->current_addr = current_addr;
2324
2325         qemu_rdma_search_ram_block(rdma, block_offset,
2326                                    offset, len, &index, &chunk);
2327         rdma->current_index = index;
2328         rdma->current_chunk = chunk;
2329     }
2330
2331     /* merge it */
2332     rdma->current_length += len;
2333
2334     /* flush it if buffer is too large */
2335     if (rdma->current_length >= RDMA_MERGE_MAX) {
2336         return qemu_rdma_write_flush(rdma, errp);
2337     }
2338
2339     return 0;
2340 }
2341
2342 static void qemu_rdma_cleanup(RDMAContext *rdma)
2343 {
2344     Error *err = NULL;
2345
2346     if (rdma->cm_id && rdma->connected) {
2347         if ((rdma->errored ||
2348              migrate_get_current()->state == MIGRATION_STATUS_CANCELLING) &&
2349             !rdma->received_error) {
2350             RDMAControlHeader head = { .len = 0,
2351                                        .type = RDMA_CONTROL_ERROR,
2352                                        .repeat = 1,
2353                                      };
2354             warn_report("Early error. Sending error.");
2355             if (qemu_rdma_post_send_control(rdma, NULL, &head, &err) < 0) {
2356                 warn_report_err(err);
2357             }
2358         }
2359
2360         rdma_disconnect(rdma->cm_id);
2361         trace_qemu_rdma_cleanup_disconnect();
2362         rdma->connected = false;
2363     }
2364
2365     if (rdma->channel) {
2366         qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2367     }
2368     g_free(rdma->dest_blocks);
2369     rdma->dest_blocks = NULL;
2370
2371     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2372         if (rdma->wr_data[i].control_mr) {
2373             rdma->total_registrations--;
2374             ibv_dereg_mr(rdma->wr_data[i].control_mr);
2375         }
2376         rdma->wr_data[i].control_mr = NULL;
2377     }
2378
2379     if (rdma->local_ram_blocks.block) {
2380         while (rdma->local_ram_blocks.nb_blocks) {
2381             rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2382         }
2383     }
2384
2385     if (rdma->qp) {
2386         rdma_destroy_qp(rdma->cm_id);
2387         rdma->qp = NULL;
2388     }
2389     if (rdma->recv_cq) {
2390         ibv_destroy_cq(rdma->recv_cq);
2391         rdma->recv_cq = NULL;
2392     }
2393     if (rdma->send_cq) {
2394         ibv_destroy_cq(rdma->send_cq);
2395         rdma->send_cq = NULL;
2396     }
2397     if (rdma->recv_comp_channel) {
2398         ibv_destroy_comp_channel(rdma->recv_comp_channel);
2399         rdma->recv_comp_channel = NULL;
2400     }
2401     if (rdma->send_comp_channel) {
2402         ibv_destroy_comp_channel(rdma->send_comp_channel);
2403         rdma->send_comp_channel = NULL;
2404     }
2405     if (rdma->pd) {
2406         ibv_dealloc_pd(rdma->pd);
2407         rdma->pd = NULL;
2408     }
2409     if (rdma->cm_id) {
2410         rdma_destroy_id(rdma->cm_id);
2411         rdma->cm_id = NULL;
2412     }
2413
2414     /* the destination side, listen_id and channel is shared */
2415     if (rdma->listen_id) {
2416         if (!rdma->is_return_path) {
2417             rdma_destroy_id(rdma->listen_id);
2418         }
2419         rdma->listen_id = NULL;
2420
2421         if (rdma->channel) {
2422             if (!rdma->is_return_path) {
2423                 rdma_destroy_event_channel(rdma->channel);
2424             }
2425             rdma->channel = NULL;
2426         }
2427     }
2428
2429     if (rdma->channel) {
2430         rdma_destroy_event_channel(rdma->channel);
2431         rdma->channel = NULL;
2432     }
2433     g_free(rdma->host);
2434     g_free(rdma->host_port);
2435     rdma->host = NULL;
2436     rdma->host_port = NULL;
2437 }
2438
2439
2440 static int qemu_rdma_source_init(RDMAContext *rdma, bool pin_all, Error **errp)
2441 {
2442     int ret;
2443
2444     /*
2445      * Will be validated against destination's actual capabilities
2446      * after the connect() completes.
2447      */
2448     rdma->pin_all = pin_all;
2449
2450     ret = qemu_rdma_resolve_host(rdma, errp);
2451     if (ret < 0) {
2452         goto err_rdma_source_init;
2453     }
2454
2455     ret = qemu_rdma_alloc_pd_cq(rdma, errp);
2456     if (ret < 0) {
2457         goto err_rdma_source_init;
2458     }
2459
2460     ret = qemu_rdma_alloc_qp(rdma);
2461     if (ret < 0) {
2462         error_setg(errp, "RDMA ERROR: rdma migration: error allocating qp!");
2463         goto err_rdma_source_init;
2464     }
2465
2466     qemu_rdma_init_ram_blocks(rdma);
2467
2468     /* Build the hash that maps from offset to RAMBlock */
2469     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2470     for (int i = 0; i < rdma->local_ram_blocks.nb_blocks; i++) {
2471         g_hash_table_insert(rdma->blockmap,
2472                 (void *)(uintptr_t)rdma->local_ram_blocks.block[i].offset,
2473                 &rdma->local_ram_blocks.block[i]);
2474     }
2475
2476     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2477         ret = qemu_rdma_reg_control(rdma, i);
2478         if (ret < 0) {
2479             error_setg(errp, "RDMA ERROR: rdma migration: error "
2480                        "registering %d control!", i);
2481             goto err_rdma_source_init;
2482         }
2483     }
2484
2485     return 0;
2486
2487 err_rdma_source_init:
2488     qemu_rdma_cleanup(rdma);
2489     return -1;
2490 }
2491
2492 static int qemu_get_cm_event_timeout(RDMAContext *rdma,
2493                                      struct rdma_cm_event **cm_event,
2494                                      long msec, Error **errp)
2495 {
2496     int ret;
2497     struct pollfd poll_fd = {
2498                                 .fd = rdma->channel->fd,
2499                                 .events = POLLIN,
2500                                 .revents = 0
2501                             };
2502
2503     do {
2504         ret = poll(&poll_fd, 1, msec);
2505     } while (ret < 0 && errno == EINTR);
2506
2507     if (ret == 0) {
2508         error_setg(errp, "RDMA ERROR: poll cm event timeout");
2509         return -1;
2510     } else if (ret < 0) {
2511         error_setg(errp, "RDMA ERROR: failed to poll cm event, errno=%i",
2512                    errno);
2513         return -1;
2514     } else if (poll_fd.revents & POLLIN) {
2515         if (rdma_get_cm_event(rdma->channel, cm_event) < 0) {
2516             error_setg(errp, "RDMA ERROR: failed to get cm event");
2517             return -1;
2518         }
2519         return 0;
2520     } else {
2521         error_setg(errp, "RDMA ERROR: no POLLIN event, revent=%x",
2522                    poll_fd.revents);
2523         return -1;
2524     }
2525 }
2526
2527 static int qemu_rdma_connect(RDMAContext *rdma, bool return_path,
2528                              Error **errp)
2529 {
2530     RDMACapabilities cap = {
2531                                 .version = RDMA_CONTROL_VERSION_CURRENT,
2532                                 .flags = 0,
2533                            };
2534     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2535                                           .retry_count = 5,
2536                                           .private_data = &cap,
2537                                           .private_data_len = sizeof(cap),
2538                                         };
2539     struct rdma_cm_event *cm_event;
2540     int ret;
2541
2542     /*
2543      * Only negotiate the capability with destination if the user
2544      * on the source first requested the capability.
2545      */
2546     if (rdma->pin_all) {
2547         trace_qemu_rdma_connect_pin_all_requested();
2548         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2549     }
2550
2551     caps_to_network(&cap);
2552
2553     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, errp);
2554     if (ret < 0) {
2555         goto err_rdma_source_connect;
2556     }
2557
2558     ret = rdma_connect(rdma->cm_id, &conn_param);
2559     if (ret < 0) {
2560         error_setg_errno(errp, errno,
2561                          "RDMA ERROR: connecting to destination!");
2562         goto err_rdma_source_connect;
2563     }
2564
2565     if (return_path) {
2566         ret = qemu_get_cm_event_timeout(rdma, &cm_event, 5000, errp);
2567     } else {
2568         ret = rdma_get_cm_event(rdma->channel, &cm_event);
2569         if (ret < 0) {
2570             error_setg_errno(errp, errno,
2571                              "RDMA ERROR: failed to get cm event");
2572         }
2573     }
2574     if (ret < 0) {
2575         goto err_rdma_source_connect;
2576     }
2577
2578     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2579         error_setg(errp, "RDMA ERROR: connecting to destination!");
2580         rdma_ack_cm_event(cm_event);
2581         goto err_rdma_source_connect;
2582     }
2583     rdma->connected = true;
2584
2585     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2586     network_to_caps(&cap);
2587
2588     /*
2589      * Verify that the *requested* capabilities are supported by the destination
2590      * and disable them otherwise.
2591      */
2592     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2593         warn_report("RDMA: Server cannot support pinning all memory. "
2594                     "Will register memory dynamically.");
2595         rdma->pin_all = false;
2596     }
2597
2598     trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2599
2600     rdma_ack_cm_event(cm_event);
2601
2602     rdma->control_ready_expected = 1;
2603     rdma->nb_sent = 0;
2604     return 0;
2605
2606 err_rdma_source_connect:
2607     qemu_rdma_cleanup(rdma);
2608     return -1;
2609 }
2610
2611 static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2612 {
2613     Error *err = NULL;
2614     int ret;
2615     struct rdma_cm_id *listen_id;
2616     char ip[40] = "unknown";
2617     struct rdma_addrinfo *res, *e;
2618     char port_str[16];
2619     int reuse = 1;
2620
2621     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2622         rdma->wr_data[i].control_len = 0;
2623         rdma->wr_data[i].control_curr = NULL;
2624     }
2625
2626     if (!rdma->host || !rdma->host[0]) {
2627         error_setg(errp, "RDMA ERROR: RDMA host is not set!");
2628         rdma->errored = true;
2629         return -1;
2630     }
2631     /* create CM channel */
2632     rdma->channel = rdma_create_event_channel();
2633     if (!rdma->channel) {
2634         error_setg(errp, "RDMA ERROR: could not create rdma event channel");
2635         rdma->errored = true;
2636         return -1;
2637     }
2638
2639     /* create CM id */
2640     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2641     if (ret < 0) {
2642         error_setg(errp, "RDMA ERROR: could not create cm_id!");
2643         goto err_dest_init_create_listen_id;
2644     }
2645
2646     snprintf(port_str, 16, "%d", rdma->port);
2647     port_str[15] = '\0';
2648
2649     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2650     if (ret) {
2651         error_setg(errp, "RDMA ERROR: could not rdma_getaddrinfo address %s",
2652                    rdma->host);
2653         goto err_dest_init_bind_addr;
2654     }
2655
2656     ret = rdma_set_option(listen_id, RDMA_OPTION_ID, RDMA_OPTION_ID_REUSEADDR,
2657                           &reuse, sizeof reuse);
2658     if (ret < 0) {
2659         error_setg(errp, "RDMA ERROR: Error: could not set REUSEADDR option");
2660         goto err_dest_init_bind_addr;
2661     }
2662
2663     /* Try all addresses, saving the first error in @err */
2664     for (e = res; e != NULL; e = e->ai_next) {
2665         Error **local_errp = err ? NULL : &err;
2666
2667         inet_ntop(e->ai_family,
2668             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2669         trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2670         ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2671         if (ret < 0) {
2672             continue;
2673         }
2674         if (e->ai_family == AF_INET6) {
2675             ret = qemu_rdma_broken_ipv6_kernel(listen_id->verbs,
2676                                                local_errp);
2677             if (ret < 0) {
2678                 continue;
2679             }
2680         }
2681         error_free(err);
2682         break;
2683     }
2684
2685     rdma_freeaddrinfo(res);
2686     if (!e) {
2687         if (err) {
2688             error_propagate(errp, err);
2689         } else {
2690             error_setg(errp, "RDMA ERROR: Error: could not rdma_bind_addr!");
2691         }
2692         goto err_dest_init_bind_addr;
2693     }
2694
2695     rdma->listen_id = listen_id;
2696     qemu_rdma_dump_gid("dest_init", listen_id);
2697     return 0;
2698
2699 err_dest_init_bind_addr:
2700     rdma_destroy_id(listen_id);
2701 err_dest_init_create_listen_id:
2702     rdma_destroy_event_channel(rdma->channel);
2703     rdma->channel = NULL;
2704     rdma->errored = true;
2705     return -1;
2706
2707 }
2708
2709 static void qemu_rdma_return_path_dest_init(RDMAContext *rdma_return_path,
2710                                             RDMAContext *rdma)
2711 {
2712     for (int i = 0; i < RDMA_WRID_MAX; i++) {
2713         rdma_return_path->wr_data[i].control_len = 0;
2714         rdma_return_path->wr_data[i].control_curr = NULL;
2715     }
2716
2717     /*the CM channel and CM id is shared*/
2718     rdma_return_path->channel = rdma->channel;
2719     rdma_return_path->listen_id = rdma->listen_id;
2720
2721     rdma->return_path = rdma_return_path;
2722     rdma_return_path->return_path = rdma;
2723     rdma_return_path->is_return_path = true;
2724 }
2725
2726 static RDMAContext *qemu_rdma_data_init(const char *host_port, Error **errp)
2727 {
2728     RDMAContext *rdma = NULL;
2729     InetSocketAddress *addr;
2730
2731     rdma = g_new0(RDMAContext, 1);
2732     rdma->current_index = -1;
2733     rdma->current_chunk = -1;
2734
2735     addr = g_new(InetSocketAddress, 1);
2736     if (!inet_parse(addr, host_port, NULL)) {
2737         rdma->port = atoi(addr->port);
2738         rdma->host = g_strdup(addr->host);
2739         rdma->host_port = g_strdup(host_port);
2740     } else {
2741         error_setg(errp, "RDMA ERROR: bad RDMA migration address '%s'",
2742                    host_port);
2743         g_free(rdma);
2744         rdma = NULL;
2745     }
2746
2747     qapi_free_InetSocketAddress(addr);
2748     return rdma;
2749 }
2750
2751 /*
2752  * QEMUFile interface to the control channel.
2753  * SEND messages for control only.
2754  * VM's ram is handled with regular RDMA messages.
2755  */
2756 static ssize_t qio_channel_rdma_writev(QIOChannel *ioc,
2757                                        const struct iovec *iov,
2758                                        size_t niov,
2759                                        int *fds,
2760                                        size_t nfds,
2761                                        int flags,
2762                                        Error **errp)
2763 {
2764     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2765     RDMAContext *rdma;
2766     int ret;
2767     ssize_t done = 0;
2768     size_t len;
2769
2770     RCU_READ_LOCK_GUARD();
2771     rdma = qatomic_rcu_read(&rioc->rdmaout);
2772
2773     if (!rdma) {
2774         error_setg(errp, "RDMA control channel output is not set");
2775         return -1;
2776     }
2777
2778     if (rdma->errored) {
2779         error_setg(errp,
2780                    "RDMA is in an error state waiting migration to abort!");
2781         return -1;
2782     }
2783
2784     /*
2785      * Push out any writes that
2786      * we're queued up for VM's ram.
2787      */
2788     ret = qemu_rdma_write_flush(rdma, errp);
2789     if (ret < 0) {
2790         rdma->errored = true;
2791         return -1;
2792     }
2793
2794     for (int i = 0; i < niov; i++) {
2795         size_t remaining = iov[i].iov_len;
2796         uint8_t * data = (void *)iov[i].iov_base;
2797         while (remaining) {
2798             RDMAControlHeader head = {};
2799
2800             len = MIN(remaining, RDMA_SEND_INCREMENT);
2801             remaining -= len;
2802
2803             head.len = len;
2804             head.type = RDMA_CONTROL_QEMU_FILE;
2805
2806             ret = qemu_rdma_exchange_send(rdma, &head,
2807                                           data, NULL, NULL, NULL, errp);
2808
2809             if (ret < 0) {
2810                 rdma->errored = true;
2811                 return -1;
2812             }
2813
2814             data += len;
2815             done += len;
2816         }
2817     }
2818
2819     return done;
2820 }
2821
2822 static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2823                              size_t size, int idx)
2824 {
2825     size_t len = 0;
2826
2827     if (rdma->wr_data[idx].control_len) {
2828         trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2829
2830         len = MIN(size, rdma->wr_data[idx].control_len);
2831         memcpy(buf, rdma->wr_data[idx].control_curr, len);
2832         rdma->wr_data[idx].control_curr += len;
2833         rdma->wr_data[idx].control_len -= len;
2834     }
2835
2836     return len;
2837 }
2838
2839 /*
2840  * QEMUFile interface to the control channel.
2841  * RDMA links don't use bytestreams, so we have to
2842  * return bytes to QEMUFile opportunistically.
2843  */
2844 static ssize_t qio_channel_rdma_readv(QIOChannel *ioc,
2845                                       const struct iovec *iov,
2846                                       size_t niov,
2847                                       int **fds,
2848                                       size_t *nfds,
2849                                       int flags,
2850                                       Error **errp)
2851 {
2852     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2853     RDMAContext *rdma;
2854     RDMAControlHeader head;
2855     int ret;
2856     ssize_t done = 0;
2857     size_t len;
2858
2859     RCU_READ_LOCK_GUARD();
2860     rdma = qatomic_rcu_read(&rioc->rdmain);
2861
2862     if (!rdma) {
2863         error_setg(errp, "RDMA control channel input is not set");
2864         return -1;
2865     }
2866
2867     if (rdma->errored) {
2868         error_setg(errp,
2869                    "RDMA is in an error state waiting migration to abort!");
2870         return -1;
2871     }
2872
2873     for (int i = 0; i < niov; i++) {
2874         size_t want = iov[i].iov_len;
2875         uint8_t *data = (void *)iov[i].iov_base;
2876
2877         /*
2878          * First, we hold on to the last SEND message we
2879          * were given and dish out the bytes until we run
2880          * out of bytes.
2881          */
2882         len = qemu_rdma_fill(rdma, data, want, 0);
2883         done += len;
2884         want -= len;
2885         /* Got what we needed, so go to next iovec */
2886         if (want == 0) {
2887             continue;
2888         }
2889
2890         /* If we got any data so far, then don't wait
2891          * for more, just return what we have */
2892         if (done > 0) {
2893             break;
2894         }
2895
2896
2897         /* We've got nothing at all, so lets wait for
2898          * more to arrive
2899          */
2900         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE,
2901                                       errp);
2902
2903         if (ret < 0) {
2904             rdma->errored = true;
2905             return -1;
2906         }
2907
2908         /*
2909          * SEND was received with new bytes, now try again.
2910          */
2911         len = qemu_rdma_fill(rdma, data, want, 0);
2912         done += len;
2913         want -= len;
2914
2915         /* Still didn't get enough, so lets just return */
2916         if (want) {
2917             if (done == 0) {
2918                 return QIO_CHANNEL_ERR_BLOCK;
2919             } else {
2920                 break;
2921             }
2922         }
2923     }
2924     return done;
2925 }
2926
2927 /*
2928  * Block until all the outstanding chunks have been delivered by the hardware.
2929  */
2930 static int qemu_rdma_drain_cq(RDMAContext *rdma)
2931 {
2932     Error *err = NULL;
2933
2934     if (qemu_rdma_write_flush(rdma, &err) < 0) {
2935         error_report_err(err);
2936         return -1;
2937     }
2938
2939     while (rdma->nb_sent) {
2940         if (qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL) < 0) {
2941             error_report("rdma migration: complete polling error!");
2942             return -1;
2943         }
2944     }
2945
2946     qemu_rdma_unregister_waiting(rdma);
2947
2948     return 0;
2949 }
2950
2951
2952 static int qio_channel_rdma_set_blocking(QIOChannel *ioc,
2953                                          bool blocking,
2954                                          Error **errp)
2955 {
2956     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
2957     /* XXX we should make readv/writev actually honour this :-) */
2958     rioc->blocking = blocking;
2959     return 0;
2960 }
2961
2962
2963 typedef struct QIOChannelRDMASource QIOChannelRDMASource;
2964 struct QIOChannelRDMASource {
2965     GSource parent;
2966     QIOChannelRDMA *rioc;
2967     GIOCondition condition;
2968 };
2969
2970 static gboolean
2971 qio_channel_rdma_source_prepare(GSource *source,
2972                                 gint *timeout)
2973 {
2974     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
2975     RDMAContext *rdma;
2976     GIOCondition cond = 0;
2977     *timeout = -1;
2978
2979     RCU_READ_LOCK_GUARD();
2980     if (rsource->condition == G_IO_IN) {
2981         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
2982     } else {
2983         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
2984     }
2985
2986     if (!rdma) {
2987         error_report("RDMAContext is NULL when prepare Gsource");
2988         return FALSE;
2989     }
2990
2991     if (rdma->wr_data[0].control_len) {
2992         cond |= G_IO_IN;
2993     }
2994     cond |= G_IO_OUT;
2995
2996     return cond & rsource->condition;
2997 }
2998
2999 static gboolean
3000 qio_channel_rdma_source_check(GSource *source)
3001 {
3002     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3003     RDMAContext *rdma;
3004     GIOCondition cond = 0;
3005
3006     RCU_READ_LOCK_GUARD();
3007     if (rsource->condition == G_IO_IN) {
3008         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3009     } else {
3010         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3011     }
3012
3013     if (!rdma) {
3014         error_report("RDMAContext is NULL when check Gsource");
3015         return FALSE;
3016     }
3017
3018     if (rdma->wr_data[0].control_len) {
3019         cond |= G_IO_IN;
3020     }
3021     cond |= G_IO_OUT;
3022
3023     return cond & rsource->condition;
3024 }
3025
3026 static gboolean
3027 qio_channel_rdma_source_dispatch(GSource *source,
3028                                  GSourceFunc callback,
3029                                  gpointer user_data)
3030 {
3031     QIOChannelFunc func = (QIOChannelFunc)callback;
3032     QIOChannelRDMASource *rsource = (QIOChannelRDMASource *)source;
3033     RDMAContext *rdma;
3034     GIOCondition cond = 0;
3035
3036     RCU_READ_LOCK_GUARD();
3037     if (rsource->condition == G_IO_IN) {
3038         rdma = qatomic_rcu_read(&rsource->rioc->rdmain);
3039     } else {
3040         rdma = qatomic_rcu_read(&rsource->rioc->rdmaout);
3041     }
3042
3043     if (!rdma) {
3044         error_report("RDMAContext is NULL when dispatch Gsource");
3045         return FALSE;
3046     }
3047
3048     if (rdma->wr_data[0].control_len) {
3049         cond |= G_IO_IN;
3050     }
3051     cond |= G_IO_OUT;
3052
3053     return (*func)(QIO_CHANNEL(rsource->rioc),
3054                    (cond & rsource->condition),
3055                    user_data);
3056 }
3057
3058 static void
3059 qio_channel_rdma_source_finalize(GSource *source)
3060 {
3061     QIOChannelRDMASource *ssource = (QIOChannelRDMASource *)source;
3062
3063     object_unref(OBJECT(ssource->rioc));
3064 }
3065
3066 static GSourceFuncs qio_channel_rdma_source_funcs = {
3067     qio_channel_rdma_source_prepare,
3068     qio_channel_rdma_source_check,
3069     qio_channel_rdma_source_dispatch,
3070     qio_channel_rdma_source_finalize
3071 };
3072
3073 static GSource *qio_channel_rdma_create_watch(QIOChannel *ioc,
3074                                               GIOCondition condition)
3075 {
3076     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3077     QIOChannelRDMASource *ssource;
3078     GSource *source;
3079
3080     source = g_source_new(&qio_channel_rdma_source_funcs,
3081                           sizeof(QIOChannelRDMASource));
3082     ssource = (QIOChannelRDMASource *)source;
3083
3084     ssource->rioc = rioc;
3085     object_ref(OBJECT(rioc));
3086
3087     ssource->condition = condition;
3088
3089     return source;
3090 }
3091
3092 static void qio_channel_rdma_set_aio_fd_handler(QIOChannel *ioc,
3093                                                 AioContext *read_ctx,
3094                                                 IOHandler *io_read,
3095                                                 AioContext *write_ctx,
3096                                                 IOHandler *io_write,
3097                                                 void *opaque)
3098 {
3099     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3100     if (io_read) {
3101         aio_set_fd_handler(read_ctx, rioc->rdmain->recv_comp_channel->fd,
3102                            io_read, io_write, NULL, NULL, opaque);
3103         aio_set_fd_handler(read_ctx, rioc->rdmain->send_comp_channel->fd,
3104                            io_read, io_write, NULL, NULL, opaque);
3105     } else {
3106         aio_set_fd_handler(write_ctx, rioc->rdmaout->recv_comp_channel->fd,
3107                            io_read, io_write, NULL, NULL, opaque);
3108         aio_set_fd_handler(write_ctx, rioc->rdmaout->send_comp_channel->fd,
3109                            io_read, io_write, NULL, NULL, opaque);
3110     }
3111 }
3112
/*
 * Deferred-cleanup request built by qio_channel_rdma_close() and consumed
 * by qio_channel_rdma_close_rcu().
 */
struct rdma_close_rcu {
    /* Handle named in the call_rcu() invocation */
    struct rcu_head rcu;
    /* Input context detached from the channel; may be NULL */
    RDMAContext *rdmain;
    /* Output context detached from the channel; may be NULL */
    RDMAContext *rdmaout;
};
3118
3119 /* callback from qio_channel_rdma_close via call_rcu */
3120 static void qio_channel_rdma_close_rcu(struct rdma_close_rcu *rcu)
3121 {
3122     if (rcu->rdmain) {
3123         qemu_rdma_cleanup(rcu->rdmain);
3124     }
3125
3126     if (rcu->rdmaout) {
3127         qemu_rdma_cleanup(rcu->rdmaout);
3128     }
3129
3130     g_free(rcu->rdmain);
3131     g_free(rcu->rdmaout);
3132     g_free(rcu);
3133 }
3134
3135 static int qio_channel_rdma_close(QIOChannel *ioc,
3136                                   Error **errp)
3137 {
3138     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3139     RDMAContext *rdmain, *rdmaout;
3140     struct rdma_close_rcu *rcu = g_new(struct rdma_close_rcu, 1);
3141
3142     trace_qemu_rdma_close();
3143
3144     rdmain = rioc->rdmain;
3145     if (rdmain) {
3146         qatomic_rcu_set(&rioc->rdmain, NULL);
3147     }
3148
3149     rdmaout = rioc->rdmaout;
3150     if (rdmaout) {
3151         qatomic_rcu_set(&rioc->rdmaout, NULL);
3152     }
3153
3154     rcu->rdmain = rdmain;
3155     rcu->rdmaout = rdmaout;
3156     call_rcu(rcu, qio_channel_rdma_close_rcu, rcu);
3157
3158     return 0;
3159 }
3160
3161 static int
3162 qio_channel_rdma_shutdown(QIOChannel *ioc,
3163                             QIOChannelShutdown how,
3164                             Error **errp)
3165 {
3166     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(ioc);
3167     RDMAContext *rdmain, *rdmaout;
3168
3169     RCU_READ_LOCK_GUARD();
3170
3171     rdmain = qatomic_rcu_read(&rioc->rdmain);
3172     rdmaout = qatomic_rcu_read(&rioc->rdmain);
3173
3174     switch (how) {
3175     case QIO_CHANNEL_SHUTDOWN_READ:
3176         if (rdmain) {
3177             rdmain->errored = true;
3178         }
3179         break;
3180     case QIO_CHANNEL_SHUTDOWN_WRITE:
3181         if (rdmaout) {
3182             rdmaout->errored = true;
3183         }
3184         break;
3185     case QIO_CHANNEL_SHUTDOWN_BOTH:
3186     default:
3187         if (rdmain) {
3188             rdmain->errored = true;
3189         }
3190         if (rdmaout) {
3191             rdmaout->errored = true;
3192         }
3193         break;
3194     }
3195
3196     return 0;
3197 }
3198
3199 /*
3200  * Parameters:
3201  *    @offset == 0 :
3202  *        This means that 'block_offset' is a full virtual address that does not
3203  *        belong to a RAMBlock of the virtual machine and instead
3204  *        represents a private malloc'd memory area that the caller wishes to
3205  *        transfer.
3206  *
3207  *    @offset != 0 :
3208  *        Offset is an offset to be added to block_offset and used
3209  *        to also lookup the corresponding RAMBlock.
3210  *
3211  *    @size : Number of bytes to transfer
3212  *
 *    @pages_sent : User-specified pointer to indicate how many pages were
3214  *                  sent. Usually, this will not be more than a few bytes of
3215  *                  the protocol because most transfers are sent asynchronously.
3216  */
3217 static int qemu_rdma_save_page(QEMUFile *f, ram_addr_t block_offset,
3218                                ram_addr_t offset, size_t size)
3219 {
3220     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3221     Error *err = NULL;
3222     RDMAContext *rdma;
3223     int ret;
3224
3225     RCU_READ_LOCK_GUARD();
3226     rdma = qatomic_rcu_read(&rioc->rdmaout);
3227
3228     if (!rdma) {
3229         return -1;
3230     }
3231
3232     if (rdma_errored(rdma)) {
3233         return -1;
3234     }
3235
3236     qemu_fflush(f);
3237
3238     /*
3239      * Add this page to the current 'chunk'. If the chunk
3240      * is full, or the page doesn't belong to the current chunk,
3241      * an actual RDMA write will occur and a new chunk will be formed.
3242      */
3243     ret = qemu_rdma_write(rdma, block_offset, offset, size, &err);
3244     if (ret < 0) {
3245         error_report_err(err);
3246         goto err;
3247     }
3248
3249     /*
3250      * Drain the Completion Queue if possible, but do not block,
3251      * just poll.
3252      *
3253      * If nothing to poll, the end of the iteration will do this
3254      * again to make sure we don't overflow the request queue.
3255      */
3256     while (1) {
3257         uint64_t wr_id, wr_id_in;
3258         ret = qemu_rdma_poll(rdma, rdma->recv_cq, &wr_id_in, NULL);
3259
3260         if (ret < 0) {
3261             error_report("rdma migration: polling error");
3262             goto err;
3263         }
3264
3265         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3266
3267         if (wr_id == RDMA_WRID_NONE) {
3268             break;
3269         }
3270     }
3271
3272     while (1) {
3273         uint64_t wr_id, wr_id_in;
3274         ret = qemu_rdma_poll(rdma, rdma->send_cq, &wr_id_in, NULL);
3275
3276         if (ret < 0) {
3277             error_report("rdma migration: polling error");
3278             goto err;
3279         }
3280
3281         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
3282
3283         if (wr_id == RDMA_WRID_NONE) {
3284             break;
3285         }
3286     }
3287
3288     return RAM_SAVE_CONTROL_DELAYED;
3289
3290 err:
3291     rdma->errored = true;
3292     return -1;
3293 }
3294
3295 int rdma_control_save_page(QEMUFile *f, ram_addr_t block_offset,
3296                            ram_addr_t offset, size_t size)
3297 {
3298     if (!migrate_rdma() || migration_in_postcopy()) {
3299         return RAM_SAVE_CONTROL_NOT_SUPP;
3300     }
3301
3302     int ret = qemu_rdma_save_page(f, block_offset, offset, size);
3303
3304     if (ret != RAM_SAVE_CONTROL_DELAYED &&
3305         ret != RAM_SAVE_CONTROL_NOT_SUPP) {
3306         if (ret < 0) {
3307             qemu_file_set_error(f, ret);
3308         }
3309     }
3310     return ret;
3311 }
3312
/*
 * Forward declaration: qemu_rdma_accept() installs this function as an fd
 * handler before its definition appears.
 */
static void rdma_accept_incoming_migration(void *opaque);
3314
3315 static void rdma_cm_poll_handler(void *opaque)
3316 {
3317     RDMAContext *rdma = opaque;
3318     struct rdma_cm_event *cm_event;
3319     MigrationIncomingState *mis = migration_incoming_get_current();
3320
3321     if (rdma_get_cm_event(rdma->channel, &cm_event) < 0) {
3322         error_report("get_cm_event failed %d", errno);
3323         return;
3324     }
3325
3326     if (cm_event->event == RDMA_CM_EVENT_DISCONNECTED ||
3327         cm_event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) {
3328         if (!rdma->errored &&
3329             migration_incoming_get_current()->state !=
3330               MIGRATION_STATUS_COMPLETED) {
3331             error_report("receive cm event, cm event is %d", cm_event->event);
3332             rdma->errored = true;
3333             if (rdma->return_path) {
3334                 rdma->return_path->errored = true;
3335             }
3336         }
3337         rdma_ack_cm_event(cm_event);
3338         if (mis->loadvm_co) {
3339             qemu_coroutine_enter(mis->loadvm_co);
3340         }
3341         return;
3342     }
3343     rdma_ack_cm_event(cm_event);
3344 }
3345
3346 static int qemu_rdma_accept(RDMAContext *rdma)
3347 {
3348     Error *err = NULL;
3349     RDMACapabilities cap;
3350     struct rdma_conn_param conn_param = {
3351                                             .responder_resources = 2,
3352                                             .private_data = &cap,
3353                                             .private_data_len = sizeof(cap),
3354                                          };
3355     RDMAContext *rdma_return_path = NULL;
3356     struct rdma_cm_event *cm_event;
3357     struct ibv_context *verbs;
3358     int ret;
3359
3360     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3361     if (ret < 0) {
3362         goto err_rdma_dest_wait;
3363     }
3364
3365     if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
3366         rdma_ack_cm_event(cm_event);
3367         goto err_rdma_dest_wait;
3368     }
3369
3370     /*
3371      * initialize the RDMAContext for return path for postcopy after first
3372      * connection request reached.
3373      */
3374     if ((migrate_postcopy() || migrate_return_path())
3375         && !rdma->is_return_path) {
3376         rdma_return_path = qemu_rdma_data_init(rdma->host_port, NULL);
3377         if (rdma_return_path == NULL) {
3378             rdma_ack_cm_event(cm_event);
3379             goto err_rdma_dest_wait;
3380         }
3381
3382         qemu_rdma_return_path_dest_init(rdma_return_path, rdma);
3383     }
3384
3385     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
3386
3387     network_to_caps(&cap);
3388
3389     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
3390         error_report("Unknown source RDMA version: %d, bailing...",
3391                      cap.version);
3392         rdma_ack_cm_event(cm_event);
3393         goto err_rdma_dest_wait;
3394     }
3395
3396     /*
3397      * Respond with only the capabilities this version of QEMU knows about.
3398      */
3399     cap.flags &= known_capabilities;
3400
3401     /*
3402      * Enable the ones that we do know about.
3403      * Add other checks here as new ones are introduced.
3404      */
3405     if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
3406         rdma->pin_all = true;
3407     }
3408
3409     rdma->cm_id = cm_event->id;
3410     verbs = cm_event->id->verbs;
3411
3412     rdma_ack_cm_event(cm_event);
3413
3414     trace_qemu_rdma_accept_pin_state(rdma->pin_all);
3415
3416     caps_to_network(&cap);
3417
3418     trace_qemu_rdma_accept_pin_verbsc(verbs);
3419
3420     if (!rdma->verbs) {
3421         rdma->verbs = verbs;
3422     } else if (rdma->verbs != verbs) {
3423         error_report("ibv context not matching %p, %p!", rdma->verbs,
3424                      verbs);
3425         goto err_rdma_dest_wait;
3426     }
3427
3428     qemu_rdma_dump_id("dest_init", verbs);
3429
3430     ret = qemu_rdma_alloc_pd_cq(rdma, &err);
3431     if (ret < 0) {
3432         error_report_err(err);
3433         goto err_rdma_dest_wait;
3434     }
3435
3436     ret = qemu_rdma_alloc_qp(rdma);
3437     if (ret < 0) {
3438         error_report("rdma migration: error allocating qp!");
3439         goto err_rdma_dest_wait;
3440     }
3441
3442     qemu_rdma_init_ram_blocks(rdma);
3443
3444     for (int i = 0; i < RDMA_WRID_MAX; i++) {
3445         ret = qemu_rdma_reg_control(rdma, i);
3446         if (ret < 0) {
3447             error_report("rdma: error registering %d control", i);
3448             goto err_rdma_dest_wait;
3449         }
3450     }
3451
3452     /* Accept the second connection request for return path */
3453     if ((migrate_postcopy() || migrate_return_path())
3454         && !rdma->is_return_path) {
3455         qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3456                             NULL,
3457                             (void *)(intptr_t)rdma->return_path);
3458     } else {
3459         qemu_set_fd_handler(rdma->channel->fd, rdma_cm_poll_handler,
3460                             NULL, rdma);
3461     }
3462
3463     ret = rdma_accept(rdma->cm_id, &conn_param);
3464     if (ret < 0) {
3465         error_report("rdma_accept failed");
3466         goto err_rdma_dest_wait;
3467     }
3468
3469     ret = rdma_get_cm_event(rdma->channel, &cm_event);
3470     if (ret < 0) {
3471         error_report("rdma_accept get_cm_event failed");
3472         goto err_rdma_dest_wait;
3473     }
3474
3475     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
3476         error_report("rdma_accept not event established");
3477         rdma_ack_cm_event(cm_event);
3478         goto err_rdma_dest_wait;
3479     }
3480
3481     rdma_ack_cm_event(cm_event);
3482     rdma->connected = true;
3483
3484     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY, &err);
3485     if (ret < 0) {
3486         error_report_err(err);
3487         goto err_rdma_dest_wait;
3488     }
3489
3490     qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
3491
3492     return 0;
3493
3494 err_rdma_dest_wait:
3495     rdma->errored = true;
3496     qemu_rdma_cleanup(rdma);
3497     g_free(rdma_return_path);
3498     return -1;
3499 }
3500
3501 static int dest_ram_sort_func(const void *a, const void *b)
3502 {
3503     unsigned int a_index = ((const RDMALocalBlock *)a)->src_index;
3504     unsigned int b_index = ((const RDMALocalBlock *)b)->src_index;
3505
3506     return (a_index < b_index) ? -1 : (a_index != b_index);
3507 }
3508
3509 /*
3510  * During each iteration of the migration, we listen for instructions
3511  * by the source VM to perform dynamic page registrations before they
3512  * can perform RDMA operations.
3513  *
3514  * We respond with the 'rkey'.
3515  *
3516  * Keep doing this until the source tells us to stop.
3517  */
3518 int rdma_registration_handle(QEMUFile *f)
3519 {
3520     RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
3521                                .type = RDMA_CONTROL_REGISTER_RESULT,
3522                                .repeat = 0,
3523                              };
3524     RDMAControlHeader unreg_resp = { .len = 0,
3525                                .type = RDMA_CONTROL_UNREGISTER_FINISHED,
3526                                .repeat = 0,
3527                              };
3528     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
3529                                  .repeat = 1 };
3530     QIOChannelRDMA *rioc;
3531     Error *err = NULL;
3532     RDMAContext *rdma;
3533     RDMALocalBlocks *local;
3534     RDMAControlHeader head;
3535     RDMARegister *reg, *registers;
3536     RDMACompress *comp;
3537     RDMARegisterResult *reg_result;
3538     static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
3539     RDMALocalBlock *block;
3540     void *host_addr;
3541     int ret;
3542     int idx = 0;
3543
3544     if (!migrate_rdma()) {
3545         return 0;
3546     }
3547
3548     RCU_READ_LOCK_GUARD();
3549     rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3550     rdma = qatomic_rcu_read(&rioc->rdmain);
3551
3552     if (!rdma) {
3553         return -1;
3554     }
3555
3556     if (rdma_errored(rdma)) {
3557         return -1;
3558     }
3559
3560     local = &rdma->local_ram_blocks;
3561     do {
3562         trace_rdma_registration_handle_wait();
3563
3564         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE, &err);
3565
3566         if (ret < 0) {
3567             error_report_err(err);
3568             break;
3569         }
3570
3571         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
3572             error_report("rdma: Too many requests in this message (%d)."
3573                             "Bailing.", head.repeat);
3574             break;
3575         }
3576
3577         switch (head.type) {
3578         case RDMA_CONTROL_COMPRESS:
3579             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
3580             network_to_compress(comp);
3581
3582             trace_rdma_registration_handle_compress(comp->length,
3583                                                     comp->block_idx,
3584                                                     comp->offset);
3585             if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) {
3586                 error_report("rdma: 'compress' bad block index %u (vs %d)",
3587                              (unsigned int)comp->block_idx,
3588                              rdma->local_ram_blocks.nb_blocks);
3589                 goto err;
3590             }
3591             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
3592
3593             host_addr = block->local_host_addr +
3594                             (comp->offset - block->offset);
3595             if (comp->value) {
3596                 error_report("rdma: Zero page with non-zero (%d) value",
3597                              comp->value);
3598                 goto err;
3599             }
3600             ram_handle_zero(host_addr, comp->length);
3601             break;
3602
3603         case RDMA_CONTROL_REGISTER_FINISHED:
3604             trace_rdma_registration_handle_finished();
3605             return 0;
3606
3607         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3608             trace_rdma_registration_handle_ram_blocks();
3609
3610             /* Sort our local RAM Block list so it's the same as the source,
3611              * we can do this since we've filled in a src_index in the list
3612              * as we received the RAMBlock list earlier.
3613              */
3614             qsort(rdma->local_ram_blocks.block,
3615                   rdma->local_ram_blocks.nb_blocks,
3616                   sizeof(RDMALocalBlock), dest_ram_sort_func);
3617             for (int i = 0; i < local->nb_blocks; i++) {
3618                 local->block[i].index = i;
3619             }
3620
3621             if (rdma->pin_all) {
3622                 ret = qemu_rdma_reg_whole_ram_blocks(rdma, &err);
3623                 if (ret < 0) {
3624                     error_report_err(err);
3625                     goto err;
3626                 }
3627             }
3628
3629             /*
3630              * Dest uses this to prepare to transmit the RAMBlock descriptions
3631              * to the source VM after connection setup.
3632              * Both sides use the "remote" structure to communicate and update
3633              * their "local" descriptions with what was sent.
3634              */
3635             for (int i = 0; i < local->nb_blocks; i++) {
3636                 rdma->dest_blocks[i].remote_host_addr =
3637                     (uintptr_t)(local->block[i].local_host_addr);
3638
3639                 if (rdma->pin_all) {
3640                     rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3641                 }
3642
3643                 rdma->dest_blocks[i].offset = local->block[i].offset;
3644                 rdma->dest_blocks[i].length = local->block[i].length;
3645
3646                 dest_block_to_network(&rdma->dest_blocks[i]);
3647                 trace_rdma_registration_handle_ram_blocks_loop(
3648                     local->block[i].block_name,
3649                     local->block[i].offset,
3650                     local->block[i].length,
3651                     local->block[i].local_host_addr,
3652                     local->block[i].src_index);
3653             }
3654
3655             blocks.len = rdma->local_ram_blocks.nb_blocks
3656                                                 * sizeof(RDMADestBlock);
3657
3658
3659             ret = qemu_rdma_post_send_control(rdma,
3660                                     (uint8_t *) rdma->dest_blocks, &blocks,
3661                                     &err);
3662
3663             if (ret < 0) {
3664                 error_report_err(err);
3665                 goto err;
3666             }
3667
3668             break;
3669         case RDMA_CONTROL_REGISTER_REQUEST:
3670             trace_rdma_registration_handle_register(head.repeat);
3671
3672             reg_resp.repeat = head.repeat;
3673             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3674
3675             for (int count = 0; count < head.repeat; count++) {
3676                 uint64_t chunk;
3677                 uint8_t *chunk_start, *chunk_end;
3678
3679                 reg = &registers[count];
3680                 network_to_register(reg);
3681
3682                 reg_result = &results[count];
3683
3684                 trace_rdma_registration_handle_register_loop(count,
3685                          reg->current_index, reg->key.current_addr, reg->chunks);
3686
3687                 if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
3688                     error_report("rdma: 'register' bad block index %u (vs %d)",
3689                                  (unsigned int)reg->current_index,
3690                                  rdma->local_ram_blocks.nb_blocks);
3691                     goto err;
3692                 }
3693                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3694                 if (block->is_ram_block) {
3695                     if (block->offset > reg->key.current_addr) {
3696                         error_report("rdma: bad register address for block %s"
3697                             " offset: %" PRIx64 " current_addr: %" PRIx64,
3698                             block->block_name, block->offset,
3699                             reg->key.current_addr);
3700                         goto err;
3701                     }
3702                     host_addr = (block->local_host_addr +
3703                                 (reg->key.current_addr - block->offset));
3704                     chunk = ram_chunk_index(block->local_host_addr,
3705                                             (uint8_t *) host_addr);
3706                 } else {
3707                     chunk = reg->key.chunk;
3708                     host_addr = block->local_host_addr +
3709                         (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3710                     /* Check for particularly bad chunk value */
3711                     if (host_addr < (void *)block->local_host_addr) {
3712                         error_report("rdma: bad chunk for block %s"
3713                             " chunk: %" PRIx64,
3714                             block->block_name, reg->key.chunk);
3715                         goto err;
3716                     }
3717                 }
3718                 chunk_start = ram_chunk_start(block, chunk);
3719                 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3720                 /* avoid "-Waddress-of-packed-member" warning */
3721                 uint32_t tmp_rkey = 0;
3722                 if (qemu_rdma_register_and_get_keys(rdma, block,
3723                             (uintptr_t)host_addr, NULL, &tmp_rkey,
3724                             chunk, chunk_start, chunk_end)) {
3725                     error_report("cannot get rkey");
3726                     goto err;
3727                 }
3728                 reg_result->rkey = tmp_rkey;
3729
3730                 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3731
3732                 trace_rdma_registration_handle_register_rkey(reg_result->rkey);
3733
3734                 result_to_network(reg_result);
3735             }
3736
3737             ret = qemu_rdma_post_send_control(rdma,
3738                             (uint8_t *) results, &reg_resp, &err);
3739
3740             if (ret < 0) {
3741                 error_report_err(err);
3742                 goto err;
3743             }
3744             break;
3745         case RDMA_CONTROL_UNREGISTER_REQUEST:
3746             trace_rdma_registration_handle_unregister(head.repeat);
3747             unreg_resp.repeat = head.repeat;
3748             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3749
3750             for (int count = 0; count < head.repeat; count++) {
3751                 reg = &registers[count];
3752                 network_to_register(reg);
3753
3754                 trace_rdma_registration_handle_unregister_loop(count,
3755                            reg->current_index, reg->key.chunk);
3756
3757                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3758
3759                 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3760                 block->pmr[reg->key.chunk] = NULL;
3761
3762                 if (ret != 0) {
3763                     error_report("rdma unregistration chunk failed: %s",
3764                                  strerror(errno));
3765                     goto err;
3766                 }
3767
3768                 rdma->total_registrations--;
3769
3770                 trace_rdma_registration_handle_unregister_success(reg->key.chunk);
3771             }
3772
3773             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp, &err);
3774
3775             if (ret < 0) {
3776                 error_report_err(err);
3777                 goto err;
3778             }
3779             break;
3780         case RDMA_CONTROL_REGISTER_RESULT:
3781             error_report("Invalid RESULT message at dest.");
3782             goto err;
3783         default:
3784             error_report("Unknown control message %s", control_desc(head.type));
3785             goto err;
3786         }
3787     } while (1);
3788
3789 err:
3790     rdma->errored = true;
3791     return -1;
3792 }
3793
3794 /* Destination:
3795  * Called during the initial RAM load section which lists the
3796  * RAMBlocks by name.  This lets us know the order of the RAMBlocks on
3797  * the source.  We've already built our local RAMBlock list, but not
3798  * yet sent the list to the source.
3799  */
3800 int rdma_block_notification_handle(QEMUFile *f, const char *name)
3801 {
3802     int curr;
3803     int found = -1;
3804
3805     if (!migrate_rdma()) {
3806         return 0;
3807     }
3808
3809     RCU_READ_LOCK_GUARD();
3810     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3811     RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmain);
3812
3813     if (!rdma) {
3814         return -1;
3815     }
3816
3817     /* Find the matching RAMBlock in our local list */
3818     for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) {
3819         if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) {
3820             found = curr;
3821             break;
3822         }
3823     }
3824
3825     if (found == -1) {
3826         error_report("RAMBlock '%s' not found on destination", name);
3827         return -1;
3828     }
3829
3830     rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index;
3831     trace_rdma_block_notification_handle(name, rdma->next_src_index);
3832     rdma->next_src_index++;
3833
3834     return 0;
3835 }
3836
3837 int rdma_registration_start(QEMUFile *f, uint64_t flags)
3838 {
3839     if (!migrate_rdma() || migration_in_postcopy()) {
3840         return 0;
3841     }
3842
3843     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3844     RCU_READ_LOCK_GUARD();
3845     RDMAContext *rdma = qatomic_rcu_read(&rioc->rdmaout);
3846     if (!rdma) {
3847         return -1;
3848     }
3849
3850     if (rdma_errored(rdma)) {
3851         return -1;
3852     }
3853
3854     trace_rdma_registration_start(flags);
3855     qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3856     return qemu_fflush(f);
3857 }
3858
3859 /*
3860  * Inform dest that dynamic registrations are done for now.
3861  * First, flush writes, if any.
3862  */
3863 int rdma_registration_stop(QEMUFile *f, uint64_t flags)
3864 {
3865     QIOChannelRDMA *rioc;
3866     Error *err = NULL;
3867     RDMAContext *rdma;
3868     RDMAControlHeader head = { .len = 0, .repeat = 1 };
3869     int ret;
3870
3871     if (!migrate_rdma() || migration_in_postcopy()) {
3872         return 0;
3873     }
3874
3875     RCU_READ_LOCK_GUARD();
3876     rioc = QIO_CHANNEL_RDMA(qemu_file_get_ioc(f));
3877     rdma = qatomic_rcu_read(&rioc->rdmaout);
3878     if (!rdma) {
3879         return -1;
3880     }
3881
3882     if (rdma_errored(rdma)) {
3883         return -1;
3884     }
3885
3886     qemu_fflush(f);
3887     ret = qemu_rdma_drain_cq(rdma);
3888
3889     if (ret < 0) {
3890         goto err;
3891     }
3892
3893     if (flags == RAM_CONTROL_SETUP) {
3894         RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3895         RDMALocalBlocks *local = &rdma->local_ram_blocks;
3896         int reg_result_idx, nb_dest_blocks;
3897
3898         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3899         trace_rdma_registration_stop_ram();
3900
3901         /*
3902          * Make sure that we parallelize the pinning on both sides.
3903          * For very large guests, doing this serially takes a really
3904          * long time, so we have to 'interleave' the pinning locally
3905          * with the control messages by performing the pinning on this
3906          * side before we receive the control response from the other
3907          * side that the pinning has completed.
3908          */
3909         ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3910                     &reg_result_idx, rdma->pin_all ?
3911                     qemu_rdma_reg_whole_ram_blocks : NULL,
3912                     &err);
3913         if (ret < 0) {
3914             error_report_err(err);
3915             return -1;
3916         }
3917
3918         nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3919
3920         /*
3921          * The protocol uses two different sets of rkeys (mutually exclusive):
3922          * 1. One key to represent the virtual address of the entire ram block.
3923          *    (dynamic chunk registration disabled - pin everything with one rkey.)
3924          * 2. One to represent individual chunks within a ram block.
3925          *    (dynamic chunk registration enabled - pin individual chunks.)
3926          *
3927          * Once the capability is successfully negotiated, the destination transmits
3928          * the keys to use (or sends them later) including the virtual addresses
3929          * and then propagates the remote ram block descriptions to his local copy.
3930          */
3931
3932         if (local->nb_blocks != nb_dest_blocks) {
3933             error_report("ram blocks mismatch (Number of blocks %d vs %d)",
3934                          local->nb_blocks, nb_dest_blocks);
3935             error_printf("Your QEMU command line parameters are probably "
3936                          "not identical on both the source and destination.");
3937             rdma->errored = true;
3938             return -1;
3939         }
3940
3941         qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3942         memcpy(rdma->dest_blocks,
3943             rdma->wr_data[reg_result_idx].control_curr, resp.len);
3944         for (int i = 0; i < nb_dest_blocks; i++) {
3945             network_to_dest_block(&rdma->dest_blocks[i]);
3946
3947             /* We require that the blocks are in the same order */
3948             if (rdma->dest_blocks[i].length != local->block[i].length) {
3949                 error_report("Block %s/%d has a different length %" PRIu64
3950                              "vs %" PRIu64,
3951                              local->block[i].block_name, i,
3952                              local->block[i].length,
3953                              rdma->dest_blocks[i].length);
3954                 rdma->errored = true;
3955                 return -1;
3956             }
3957             local->block[i].remote_host_addr =
3958                     rdma->dest_blocks[i].remote_host_addr;
3959             local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3960         }
3961     }
3962
3963     trace_rdma_registration_stop(flags);
3964
3965     head.type = RDMA_CONTROL_REGISTER_FINISHED;
3966     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL, &err);
3967
3968     if (ret < 0) {
3969         error_report_err(err);
3970         goto err;
3971     }
3972
3973     return 0;
3974 err:
3975     rdma->errored = true;
3976     return -1;
3977 }
3978
3979 static void qio_channel_rdma_finalize(Object *obj)
3980 {
3981     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(obj);
3982     if (rioc->rdmain) {
3983         qemu_rdma_cleanup(rioc->rdmain);
3984         g_free(rioc->rdmain);
3985         rioc->rdmain = NULL;
3986     }
3987     if (rioc->rdmaout) {
3988         qemu_rdma_cleanup(rioc->rdmaout);
3989         g_free(rioc->rdmaout);
3990         rioc->rdmaout = NULL;
3991     }
3992 }
3993
3994 static void qio_channel_rdma_class_init(ObjectClass *klass,
3995                                         void *class_data G_GNUC_UNUSED)
3996 {
3997     QIOChannelClass *ioc_klass = QIO_CHANNEL_CLASS(klass);
3998
3999     ioc_klass->io_writev = qio_channel_rdma_writev;
4000     ioc_klass->io_readv = qio_channel_rdma_readv;
4001     ioc_klass->io_set_blocking = qio_channel_rdma_set_blocking;
4002     ioc_klass->io_close = qio_channel_rdma_close;
4003     ioc_klass->io_create_watch = qio_channel_rdma_create_watch;
4004     ioc_klass->io_set_aio_fd_handler = qio_channel_rdma_set_aio_fd_handler;
4005     ioc_klass->io_shutdown = qio_channel_rdma_shutdown;
4006 }
4007
4008 static const TypeInfo qio_channel_rdma_info = {
4009     .parent = TYPE_QIO_CHANNEL,
4010     .name = TYPE_QIO_CHANNEL_RDMA,
4011     .instance_size = sizeof(QIOChannelRDMA),
4012     .instance_finalize = qio_channel_rdma_finalize,
4013     .class_init = qio_channel_rdma_class_init,
4014 };
4015
4016 static void qio_channel_rdma_register_types(void)
4017 {
4018     type_register_static(&qio_channel_rdma_info);
4019 }
4020
4021 type_init(qio_channel_rdma_register_types);
4022
4023 static QEMUFile *rdma_new_input(RDMAContext *rdma)
4024 {
4025     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4026
4027     rioc->file = qemu_file_new_input(QIO_CHANNEL(rioc));
4028     rioc->rdmain = rdma;
4029     rioc->rdmaout = rdma->return_path;
4030
4031     return rioc->file;
4032 }
4033
4034 static QEMUFile *rdma_new_output(RDMAContext *rdma)
4035 {
4036     QIOChannelRDMA *rioc = QIO_CHANNEL_RDMA(object_new(TYPE_QIO_CHANNEL_RDMA));
4037
4038     rioc->file = qemu_file_new_output(QIO_CHANNEL(rioc));
4039     rioc->rdmaout = rdma;
4040     rioc->rdmain = rdma->return_path;
4041
4042     return rioc->file;
4043 }
4044
4045 static void rdma_accept_incoming_migration(void *opaque)
4046 {
4047     RDMAContext *rdma = opaque;
4048     QEMUFile *f;
4049     Error *local_err = NULL;
4050
4051     trace_qemu_rdma_accept_incoming_migration();
4052     if (qemu_rdma_accept(rdma) < 0) {
4053         error_report("RDMA ERROR: Migration initialization failed");
4054         return;
4055     }
4056
4057     trace_qemu_rdma_accept_incoming_migration_accepted();
4058
4059     if (rdma->is_return_path) {
4060         return;
4061     }
4062
4063     f = rdma_new_input(rdma);
4064     if (f == NULL) {
4065         error_report("RDMA ERROR: could not open RDMA for input");
4066         qemu_rdma_cleanup(rdma);
4067         return;
4068     }
4069
4070     rdma->migration_started_on_destination = 1;
4071     migration_fd_process_incoming(f, &local_err);
4072     if (local_err) {
4073         error_reportf_err(local_err, "RDMA ERROR:");
4074     }
4075 }
4076
4077 void rdma_start_incoming_migration(const char *host_port, Error **errp)
4078 {
4079     MigrationState *s = migrate_get_current();
4080     int ret;
4081     RDMAContext *rdma;
4082
4083     trace_rdma_start_incoming_migration();
4084
4085     /* Avoid ram_block_discard_disable(), cannot change during migration. */
4086     if (ram_block_discard_is_required()) {
4087         error_setg(errp, "RDMA: cannot disable RAM discard");
4088         return;
4089     }
4090
4091     rdma = qemu_rdma_data_init(host_port, errp);
4092     if (rdma == NULL) {
4093         goto err;
4094     }
4095
4096     ret = qemu_rdma_dest_init(rdma, errp);
4097     if (ret < 0) {
4098         goto err;
4099     }
4100
4101     trace_rdma_start_incoming_migration_after_dest_init();
4102
4103     ret = rdma_listen(rdma->listen_id, 5);
4104
4105     if (ret < 0) {
4106         error_setg(errp, "RDMA ERROR: listening on socket!");
4107         goto cleanup_rdma;
4108     }
4109
4110     trace_rdma_start_incoming_migration_after_rdma_listen();
4111     s->rdma_migration = true;
4112     qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
4113                         NULL, (void *)(intptr_t)rdma);
4114     return;
4115
4116 cleanup_rdma:
4117     qemu_rdma_cleanup(rdma);
4118 err:
4119     if (rdma) {
4120         g_free(rdma->host);
4121         g_free(rdma->host_port);
4122     }
4123     g_free(rdma);
4124 }
4125
4126 void rdma_start_outgoing_migration(void *opaque,
4127                             const char *host_port, Error **errp)
4128 {
4129     MigrationState *s = opaque;
4130     RDMAContext *rdma_return_path = NULL;
4131     RDMAContext *rdma;
4132     int ret;
4133
4134     /* Avoid ram_block_discard_disable(), cannot change during migration. */
4135     if (ram_block_discard_is_required()) {
4136         error_setg(errp, "RDMA: cannot disable RAM discard");
4137         return;
4138     }
4139
4140     rdma = qemu_rdma_data_init(host_port, errp);
4141     if (rdma == NULL) {
4142         goto err;
4143     }
4144
4145     ret = qemu_rdma_source_init(rdma, migrate_rdma_pin_all(), errp);
4146
4147     if (ret < 0) {
4148         goto err;
4149     }
4150
4151     trace_rdma_start_outgoing_migration_after_rdma_source_init();
4152     ret = qemu_rdma_connect(rdma, false, errp);
4153
4154     if (ret < 0) {
4155         goto err;
4156     }
4157
4158     /* RDMA postcopy need a separate queue pair for return path */
4159     if (migrate_postcopy() || migrate_return_path()) {
4160         rdma_return_path = qemu_rdma_data_init(host_port, errp);
4161
4162         if (rdma_return_path == NULL) {
4163             goto return_path_err;
4164         }
4165
4166         ret = qemu_rdma_source_init(rdma_return_path,
4167                                     migrate_rdma_pin_all(), errp);
4168
4169         if (ret < 0) {
4170             goto return_path_err;
4171         }
4172
4173         ret = qemu_rdma_connect(rdma_return_path, true, errp);
4174
4175         if (ret < 0) {
4176             goto return_path_err;
4177         }
4178
4179         rdma->return_path = rdma_return_path;
4180         rdma_return_path->return_path = rdma;
4181         rdma_return_path->is_return_path = true;
4182     }
4183
4184     trace_rdma_start_outgoing_migration_after_rdma_connect();
4185
4186     s->to_dst_file = rdma_new_output(rdma);
4187     s->rdma_migration = true;
4188     migrate_fd_connect(s, NULL);
4189     return;
4190 return_path_err:
4191     qemu_rdma_cleanup(rdma);
4192 err:
4193     g_free(rdma);
4194     g_free(rdma_return_path);
4195 }