migration/rdma.c

   1 /*
   2  * RDMA protocol and interfaces
   3  *
   4  * Copyright IBM, Corp. 2010-2013
   5  *
   6  * Authors:
   7  *  Michael R. Hines <mrhines@us.ibm.com>
   8  *  Jiuxing Liu <jl@us.ibm.com>
   9  *
  10  * This work is licensed under the terms of the GNU GPL, version 2 or
  11  * later.  See the COPYING file in the top-level directory.
  12  *
  13  */
  14 #include "qemu-common.h"
  15 #include "migration/migration.h"
  16 #include "migration/qemu-file.h"
  17 #include "exec/cpu-common.h"
  18 #include "qemu/error-report.h"
  19 #include "qemu/main-loop.h"
  20 #include "qemu/sockets.h"
  21 #include "qemu/bitmap.h"
  22 #include "block/coroutine.h"
  23 #include <stdio.h>
  24 #include <sys/types.h>
  25 #include <sys/socket.h>
  26 #include <netdb.h>
  27 #include <arpa/inet.h>
  28 #include <string.h>
  29 #include <rdma/rdma_cma.h>
  30 #include "trace.h"
  31
  32 /*
  33  * Print and error on both the Monitor and the Log file.
  34  */
  35 #define ERROR(errp, fmt, ...) \
  36     do { \
  37         fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
  38         if (errp && (*(errp) == NULL)) { \
  39             error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
  40         } \
  41     } while (0)
  42
/*
 * NOTE(review): presumably the timeout, in milliseconds, used when resolving
 * the peer address/route; the users are outside this part of the file.
 */
#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
/* RDMA_MERGE_MAX in 4096-byte units; sizes RDMAContext.unregistrations[] */
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)

/* A registration chunk is (1 << RDMA_REG_CHUNK_SHIFT) bytes */
#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message.
 * Also the size of the control[] buffer in RDMAWorkRequestData.
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
  76
  77 #define CHECK_ERROR_STATE() \
  78     do { \
  79         if (rdma->error_state) { \
  80             if (!rdma->error_reported) { \
  81                 error_report("RDMA is in an error state waiting migration" \
  82                                 " to abort!"); \
  83                 rdma->error_reported = 1; \
  84             } \
  85             return rdma->error_state; \
  86         } \
  87     } while (0);
  88
/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 *
 * NOTE(review): the masks below are built from 'unsigned long' constants.
 * Where unsigned long is 32 bits wide, RDMA_WRID_CHUNK_MASK would only
 * cover bits 30-31 rather than 30-63 - confirm whether such hosts matter.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

/* Low 16 bits: everything below the block field */
#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

/* Bits 16-29: below the chunk field, minus the type field */
#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

/* Whatever is left above the type and block fields */
#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
 112
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 *
 * These values are also the indices used by the wrid_desc[] name table.
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};
 124
/*
 * Human-readable names for the RDMA_WRID_* values above.
 * The table is sparse: its size is set by the largest designated index
 * (RDMA_WRID_RECV_CONTROL), and every slot not listed here is NULL.
 */
static const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};
 131
/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands)
 *
 * We could use more WRs, but we have enough for now.
 *
 * RDMA_WRID_MAX is the number of entries in RDMAContext.wr_data[].
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};
 145
/*
 * SEND/RECV IB Control Messages.
 * These values index the control_desc[] name table; keep the two in sync
 * when adding a message type.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};
 163
/*
 * Human-readable names for the RDMA_CONTROL_* message types, indexed by
 * the enum value. Every enumerator currently has an entry.
 */
static const char *control_desc[] = {
    [RDMA_CONTROL_NONE] = "NONE",
    [RDMA_CONTROL_ERROR] = "ERROR",
    [RDMA_CONTROL_READY] = "READY",
    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
    [RDMA_CONTROL_COMPRESS] = "COMPRESS",
    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
};
 178
/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 *
 * One instance exists per RDMA_WRID_* send/recv slot (see
 * RDMAContext.wr_data[]). The buffer is embedded, so each instance is
 * a little over RDMA_CONTROL_MAX_BUFFER bytes.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;
 189
/*
 * Negotiate RDMA capabilities during connection-setup time.
 * Both fields are converted to/from network byte order with
 * caps_to_network() / network_to_caps().
 */
typedef struct {
    uint32_t version; /* NOTE(review): presumably RDMA_CONTROL_VERSION_* */
    uint32_t flags;   /* NOTE(review): presumably RDMA_CAPABILITY_* bits */
} RDMACapabilities;
 197
 198 static void caps_to_network(RDMACapabilities *cap)
 199 {
 200     cap->version = htonl(cap->version);
 201     cap->flags = htonl(cap->flags);
 202 }
 203
 204 static void network_to_caps(RDMACapabilities *cap)
 205 {
 206     cap->version = ntohl(cap->version);
 207     cap->flags = ntohl(cap->flags);
 208 }
 209
/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 *
 * Instances live in the RDMALocalBlocks.block[] array and are created by
 * rdma_add_block() / destroyed by rdma_delete_block().
 */
typedef struct RDMALocalBlock {
    char          *block_name;      /* heap copy of the name; owned here */
    uint8_t       *local_host_addr; /* local virtual address */
    uint64_t       remote_host_addr; /* remote virtual address */
    uint64_t       offset;          /* block offset; also the blockmap key */
    uint64_t       length;          /* size of the block in bytes */
    struct         ibv_mr **pmr;    /* MRs for chunk-level registration */
    struct         ibv_mr *mr;      /* MR for non-chunk-level registration */
    uint32_t      *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t       remote_rkey;     /* rkeys for non-chunk-level registration */
    int            index;           /* which block are we */
    bool           is_ram_block;    /* added before RDMALocalBlocks.init was set */
    int            nb_chunks;       /* number of registration chunks */
    unsigned long *transit_bitmap;    /* nb_chunks bits, allocated cleared */
    unsigned long *unregister_bitmap; /* nb_chunks bits, allocated cleared */
} RDMALocalBlock;
 233
/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 *
 * Packed because it is transmitted: field order and sizes must not change.
 * Byte order is handled by dest_block_to_network() and
 * network_to_dest_block(), which leave 'padding' untouched.
 */
typedef struct QEMU_PACKED RDMADestBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMADestBlock;
 248
 249 static uint64_t htonll(uint64_t v)
 250 {
 251     union { uint32_t lv[2]; uint64_t llv; } u;
 252     u.lv[0] = htonl(v >> 32);
 253     u.lv[1] = htonl(v & 0xFFFFFFFFULL);
 254     return u.llv;
 255 }
 256
 257 static uint64_t ntohll(uint64_t v) {
 258     union { uint32_t lv[2]; uint64_t llv; } u;
 259     u.llv = v;
 260     return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
 261 }
 262
 263 static void dest_block_to_network(RDMADestBlock *db)
 264 {
 265     db->remote_host_addr = htonll(db->remote_host_addr);
 266     db->offset = htonll(db->offset);
 267     db->length = htonll(db->length);
 268     db->remote_rkey = htonl(db->remote_rkey);
 269 }
 270
 271 static void network_to_dest_block(RDMADestBlock *db)
 272 {
 273     db->remote_host_addr = ntohll(db->remote_host_addr);
 274     db->offset = ntohll(db->offset);
 275     db->length = ntohll(db->length);
 276     db->remote_rkey = ntohl(db->remote_rkey);
 277 }
 278
/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 *
 * 'block' is a heap array of nb_blocks entries that is reallocated
 * whenever a block is added or deleted, so pointers into it do not
 * survive rdma_add_block() / rdma_delete_block().
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;             /* number of valid entries in block[] */
    bool     init;             /* main memory init complete */
    RDMALocalBlock *block;     /* array of nb_blocks entries, or NULL */
} RDMALocalBlocks;
 289
/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 *
 * Note that wr_data[] embeds RDMA_WRID_MAX control buffers of
 * RDMA_CONTROL_MAX_BUFFER bytes each, so this structure is large.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    /* One send/recv work request slot per RDMA_WRID_* value */
    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    /* NOTE(review): presumably mirrors RDMA_CAPABILITY_PIN_ALL - confirm */
    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;               /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context          *verbs;
    struct rdma_event_channel   *channel;
    struct ibv_qp *qp;                      /* queue pair */
    struct ibv_comp_channel *comp_channel;  /* completion channel */
    struct ibv_pd *pd;                      /* protection domain */
    struct ibv_cq *cq;                      /* completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration, then do not attempt any future work
     * and remember the error state.
     *
     * error_reported makes CHECK_ERROR_STATE() print its message only once.
     */
    int error_state;
    int error_reported;

    /*
     * Description of ram blocks used throughout the code.
     * dest_blocks is allocated with one entry per local block.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMADestBlock  *dest_blocks;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    /* total_registrations is decremented for every MR deregistered */
    int total_registrations;
    int total_writes;

    /* NOTE(review): presumably positions within unregistrations[] */
    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    /* Optional map from block offset to RDMALocalBlock *; may be NULL */
    GHashTable *blockmap;
} RDMAContext;
 372
/*
 * Interface to the rest of the migration call stack.
 */
typedef struct QEMUFileRDMA {
    RDMAContext *rdma; /* RDMA state this file object operates on */
    size_t len;
    void *file;        /* NOTE(review): presumably the QEMUFile - confirm */
} QEMUFileRDMA;
 381
/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 *
 * Packed because it is transmitted: field order and sizes must not change.
 * control_to_network() / network_to_control() byte-swap len, type and
 * repeat; padding is not converted.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;
 392
 393 static void control_to_network(RDMAControlHeader *control)
 394 {
 395     control->type = htonl(control->type);
 396     control->len = htonl(control->len);
 397     control->repeat = htonl(control->repeat);
 398 }
 399
 400 static void network_to_control(RDMAControlHeader *control)
 401 {
 402     control->type = ntohl(control->type);
 403     control->len = ntohl(control->len);
 404     control->repeat = ntohl(control->repeat);
 405 }
 406
/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 *
 * Packed because it is transmitted: field order and sizes must not change.
 * Both members of 'key' share the same 8 bytes, so the byte-order helpers
 * only convert key.current_addr.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ram_addr_t space */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index; /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;
 422
 423 static void register_to_network(RDMAContext *rdma, RDMARegister *reg)
 424 {
 425     RDMALocalBlock *local_block;
 426     local_block  = &rdma->local_ram_blocks.block[reg->current_index];
 427
 428     if (local_block->is_ram_block) {
 429         /*
 430          * current_addr as passed in is an address in the local ram_addr_t
 431          * space, we need to translate this for the destination
 432          */
 433         reg->key.current_addr -= local_block->offset;
 434         reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset;
 435     }
 436     reg->key.current_addr = htonll(reg->key.current_addr);
 437     reg->current_index = htonl(reg->current_index);
 438     reg->chunks = htonll(reg->chunks);
 439 }
 440
 441 static void network_to_register(RDMARegister *reg)
 442 {
 443     reg->key.current_addr = ntohll(reg->key.current_addr);
 444     reg->current_index = ntohl(reg->current_index);
 445     reg->chunks = ntohll(reg->chunks);
 446 }
 447
/*
 * Payload of a compress message (see RDMA_CONTROL_COMPRESS).
 * Packed because it is transmitted: field order and sizes must not change.
 * Byte order is handled by compress_to_network() / network_to_compress().
 */
typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* Address in remote ram_addr_t space */
    uint64_t length;    /* length of the chunk */
} RDMACompress;
 454
 455 static void compress_to_network(RDMAContext *rdma, RDMACompress *comp)
 456 {
 457     comp->value = htonl(comp->value);
 458     /*
 459      * comp->offset as passed in is an address in the local ram_addr_t
 460      * space, we need to translate this for the destination
 461      */
 462     comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset;
 463     comp->offset += rdma->dest_blocks[comp->block_idx].offset;
 464     comp->block_idx = htonl(comp->block_idx);
 465     comp->offset = htonll(comp->offset);
 466     comp->length = htonll(comp->length);
 467 }
 468
 469 static void network_to_compress(RDMACompress *comp)
 470 {
 471     comp->value = ntohl(comp->value);
 472     comp->block_idx = ntohl(comp->block_idx);
 473     comp->offset = ntohll(comp->offset);
 474     comp->length = ntohll(comp->length);
 475 }
 476
/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 *
 * Packed because it is transmitted: field order and sizes must not change.
 * Byte order is handled by result_to_network() / network_to_result(),
 * which leave 'padding' untouched.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;
 487
 488 static void result_to_network(RDMARegisterResult *result)
 489 {
 490     result->rkey = htonl(result->rkey);
 491     result->host_addr = htonll(result->host_addr);
 492 };
 493
 494 static void network_to_result(RDMARegisterResult *result)
 495 {
 496     result->rkey = ntohl(result->rkey);
 497     result->host_addr = ntohll(result->host_addr);
 498 };
 499
/*
 * Forward declarations for functions defined further down in this file.
 * Note that print_wrid() is declared with external linkage.
 */
const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));
 505
 506 static inline uint64_t ram_chunk_index(const uint8_t *start,
 507                                        const uint8_t *host)
 508 {
 509     return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
 510 }
 511
 512 static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
 513                                        uint64_t i)
 514 {
 515     return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr +
 516                                   (i << RDMA_REG_CHUNK_SHIFT));
 517 }
 518
 519 static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
 520                                      uint64_t i)
 521 {
 522     uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
 523                                          (1UL << RDMA_REG_CHUNK_SHIFT);
 524
 525     if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
 526         result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
 527     }
 528
 529     return result;
 530 }
 531
 532 static int rdma_add_block(RDMAContext *rdma, const char *block_name,
 533                          void *host_addr,
 534                          ram_addr_t block_offset, uint64_t length)
 535 {
 536     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 537     RDMALocalBlock *block;
 538     RDMALocalBlock *old = local->block;
 539
 540     local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));
 541
 542     if (local->nb_blocks) {
 543         int x;
 544
 545         if (rdma->blockmap) {
 546             for (x = 0; x < local->nb_blocks; x++) {
 547                 g_hash_table_remove(rdma->blockmap,
 548                                     (void *)(uintptr_t)old[x].offset);
 549                 g_hash_table_insert(rdma->blockmap,
 550                                     (void *)(uintptr_t)old[x].offset,
 551                                     &local->block[x]);
 552             }
 553         }
 554         memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
 555         g_free(old);
 556     }
 557
 558     block = &local->block[local->nb_blocks];
 559
 560     block->block_name = g_strdup(block_name);
 561     block->local_host_addr = host_addr;
 562     block->offset = block_offset;
 563     block->length = length;
 564     block->index = local->nb_blocks;
 565     block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
 566     block->transit_bitmap = bitmap_new(block->nb_chunks);
 567     bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
 568     block->unregister_bitmap = bitmap_new(block->nb_chunks);
 569     bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
 570     block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));
 571
 572     block->is_ram_block = local->init ? false : true;
 573
 574     if (rdma->blockmap) {
 575         g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
 576     }
 577
 578     trace_rdma_add_block(block_name, local->nb_blocks,
 579                          (uintptr_t) block->local_host_addr,
 580                          block->offset, block->length,
 581                          (uintptr_t) (block->local_host_addr + block->length),
 582                          BITS_TO_LONGS(block->nb_chunks) *
 583                              sizeof(unsigned long) * 8,
 584                          block->nb_chunks);
 585
 586     local->nb_blocks++;
 587
 588     return 0;
 589 }
 590
 591 /*
 592  * Memory regions need to be registered with the device and queue pairs setup
 593  * in advanced before the migration starts. This tells us where the RAM blocks
 594  * are so that we can register them individually.
 595  */
 596 static int qemu_rdma_init_one_block(const char *block_name, void *host_addr,
 597     ram_addr_t block_offset, ram_addr_t length, void *opaque)
 598 {
 599     return rdma_add_block(opaque, block_name, host_addr, block_offset, length);
 600 }
 601
 602 /*
 603  * Identify the RAMBlocks and their quantity. They will be references to
 604  * identify chunk boundaries inside each RAMBlock and also be referenced
 605  * during dynamic page registration.
 606  */
 607 static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
 608 {
 609     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 610
 611     assert(rdma->blockmap == NULL);
 612     memset(local, 0, sizeof *local);
 613     qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
 614     trace_qemu_rdma_init_ram_blocks(local->nb_blocks);
 615     rdma->dest_blocks = (RDMADestBlock *) g_malloc0(sizeof(RDMADestBlock) *
 616                         rdma->local_ram_blocks.nb_blocks);
 617     local->init = true;
 618     return 0;
 619 }
 620
 621 /*
 622  * Note: If used outside of cleanup, the caller must ensure that the destination
 623  * block structures are also updated
 624  */
 625 static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block)
 626 {
 627     RDMALocalBlocks *local = &rdma->local_ram_blocks;
 628     RDMALocalBlock *old = local->block;
 629     int x;
 630
 631     if (rdma->blockmap) {
 632         g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset);
 633     }
 634     if (block->pmr) {
 635         int j;
 636
 637         for (j = 0; j < block->nb_chunks; j++) {
 638             if (!block->pmr[j]) {
 639                 continue;
 640             }
 641             ibv_dereg_mr(block->pmr[j]);
 642             rdma->total_registrations--;
 643         }
 644         g_free(block->pmr);
 645         block->pmr = NULL;
 646     }
 647
 648     if (block->mr) {
 649         ibv_dereg_mr(block->mr);
 650         rdma->total_registrations--;
 651         block->mr = NULL;
 652     }
 653
 654     g_free(block->transit_bitmap);
 655     block->transit_bitmap = NULL;
 656
 657     g_free(block->unregister_bitmap);
 658     block->unregister_bitmap = NULL;
 659
 660     g_free(block->remote_keys);
 661     block->remote_keys = NULL;
 662
 663     g_free(block->block_name);
 664     block->block_name = NULL;
 665
 666     if (rdma->blockmap) {
 667         for (x = 0; x < local->nb_blocks; x++) {
 668             g_hash_table_remove(rdma->blockmap,
 669                                 (void *)(uintptr_t)old[x].offset);
 670         }
 671     }
 672
 673     if (local->nb_blocks > 1) {
 674
 675         local->block = g_malloc0(sizeof(RDMALocalBlock) *
 676                                     (local->nb_blocks - 1));
 677
 678         if (block->index) {
 679             memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
 680         }
 681
 682         if (block->index < (local->nb_blocks - 1)) {
 683             memcpy(local->block + block->index, old + (block->index + 1),
 684                 sizeof(RDMALocalBlock) *
 685                     (local->nb_blocks - (block->index + 1)));
 686         }
 687     } else {
 688         assert(block == local->block);
 689         local->block = NULL;
 690     }
 691
 692     trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr,
 693                            block->offset, block->length,
 694                             (uintptr_t)(block->local_host_addr + block->length),
 695                            BITS_TO_LONGS(block->nb_chunks) *
 696                                sizeof(unsigned long) * 8, block->nb_chunks);
 697
 698     g_free(old);
 699
 700     local->nb_blocks--;
 701
 702     if (local->nb_blocks && rdma->blockmap) {
 703         for (x = 0; x < local->nb_blocks; x++) {
 704             g_hash_table_insert(rdma->blockmap,
 705                                 (void *)(uintptr_t)local->block[x].offset,
 706                                 &local->block[x]);
 707         }
 708     }
 709
 710     return 0;
 711 }
 712
 713 /*
 714  * Put in the log file which RDMA device was opened and the details
 715  * associated with that device.
 716  */
 717 static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
 718 {
 719     struct ibv_port_attr port;
 720
 721     if (ibv_query_port(verbs, 1, &port)) {
 722         error_report("Failed to query port information");
 723         return;
 724     }
 725
 726     printf("%s RDMA Device opened: kernel name %s "
 727            "uverbs device name %s, "
 728            "infiniband_verbs class device path %s, "
 729            "infiniband class device path %s, "
 730            "transport: (%d) %s\n",
 731                 who,
 732                 verbs->device->name,
 733                 verbs->device->dev_name,
 734                 verbs->device->dev_path,
 735                 verbs->device->ibdev_path,
 736                 port.link_layer,
 737                 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
 738                  ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
 739                     ? "Ethernet" : "Unknown"));
 740 }
 741
 742 /*
 743  * Put in the log file the RDMA gid addressing information,
 744  * useful for folks who have trouble understanding the
 745  * RDMA device hierarchy in the kernel.
 746  */
 747 static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
 748 {
 749     char sgid[33];
 750     char dgid[33];
 751     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
 752     inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
 753     trace_qemu_rdma_dump_gid(who, sgid, dgid);
 754 }
 755
 756 /*
 757  * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 758  * We will try the next addrinfo struct, and fail if there are
 759  * no other valid addresses to bind against.
 760  *
 761  * If user is listening on '[::]', then we will not have a opened a device
 762  * yet and have no way of verifying if the device is RoCE or not.
 763  *
 764  * In this case, the source VM will throw an error for ALL types of
 765  * connections (both IPv4 and IPv6) if the destination machine does not have
 766  * a regular infiniband network available for use.
 767  *
 768  * The only way to guarantee that an error is thrown for broken kernels is
 769  * for the management software to choose a *specific* interface at bind time
 770  * and validate what time of hardware it is.
 771  *
 772  * Unfortunately, this puts the user in a fix:
 773  *
 774  *  If the source VM connects with an IPv4 address without knowing that the
 775  *  destination has bound to '[::]' the migration will unconditionally fail
 776  *  unless the management software is explicitly listening on the the IPv4
 777  *  address while using a RoCE-based device.
 778  *
 779  *  If the source VM connects with an IPv6 address, then we're OK because we can
 780  *  throw an error on the source (and similarly on the destination).
 781  *
 782  *  But in mixed environments, this will be broken for a while until it is fixed
 783  *  inside linux.
 784  *
 785  * We do provide a *tiny* bit of help in this function: We can list all of the
 786  * devices in the system and check to see if all the devices are RoCE or
 787  * Infiniband.
 788  *
 789  * If we detect that we have a *pure* RoCE environment, then we can safely
 790  * thrown an error even if the management software has specified '[::]' as the
 791  * bind address.
 792  *
 793  * However, if there is are multiple hetergeneous devices, then we cannot make
 794  * this assumption and the user just has to be sure they know what they are
 795  * doing.
 796  *
 797  * Patches are being reviewed on linux-rdma.
 798  */
 799 static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
 800 {
 801     struct ibv_port_attr port_attr;
 802
 803     /* This bug only exists in linux, to our knowledge. */
 804 #ifdef CONFIG_LINUX
 805
 806     /*
 807      * Verbs are only NULL if management has bound to '[::]'.
 808      *
 809      * Let's iterate through all the devices and see if there any pure IB
 810      * devices (non-ethernet).
 811      *
 812      * If not, then we can safely proceed with the migration.
 813      * Otherwise, there are no guarantees until the bug is fixed in linux.
 814      */
 815     if (!verbs) {
 816         int num_devices, x;
 817         struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
 818         bool roce_found = false;
 819         bool ib_found = false;
 820
 821         for (x = 0; x < num_devices; x++) {
 822             verbs = ibv_open_device(dev_list[x]);
 823             if (!verbs) {
 824                 if (errno == EPERM) {
 825                     continue;
 826                 } else {
 827                     return -EINVAL;
 828                 }
 829             }
 830
 831             if (ibv_query_port(verbs, 1, &port_attr)) {
 832                 ibv_close_device(verbs);
 833                 ERROR(errp, "Could not query initial IB port");
 834                 return -EINVAL;
 835             }
 836
 837             if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
 838                 ib_found = true;
 839             } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
 840                 roce_found = true;
 841             }
 842
 843             ibv_close_device(verbs);
 844
 845         }
 846
 847         if (roce_found) {
 848             if (ib_found) {
 849                 fprintf(stderr, "WARN: migrations may fail:"
 850                                 " IPv6 over RoCE / iWARP in linux"
 851                                 " is broken. But since you appear to have a"
 852                                 " mixed RoCE / IB environment, be sure to only"
 853                                 " migrate over the IB fabric until the kernel "
 854                                 " fixes the bug.\n");
 855             } else {
 856                 ERROR(errp, "You only have RoCE / iWARP devices in your systems"
 857                             " and your management software has specified '[::]'"
 858                             ", but IPv6 over RoCE / iWARP is not supported in Linux.");
 859                 return -ENONET;
 860             }
 861         }
 862
 863         return 0;
 864     }
 865
 866     /*
 867      * If we have a verbs context, that means that some other than '[::]' was
 868      * used by the management software for binding. In which case we can
 869      * actually warn the user about a potentially broken kernel.
 870      */
 871
 872     /* IB ports start with 1, not 0 */
 873     if (ibv_query_port(verbs, 1, &port_attr)) {
 874         ERROR(errp, "Could not query initial IB port");
 875         return -EINVAL;
 876     }
 877
 878     if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
 879         ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
 880                     "(but patches on linux-rdma in progress)");
 881         return -ENONET;
 882     }
 883
 884 #endif
 885
 886     return 0;
 887 }
 888
 889 /*
 890  * Figure out which RDMA device corresponds to the requested IP hostname
 891  * Also create the initial connection manager identifiers for opening
 892  * the connection.
 893  */
 894 static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
 895 {
 896     int ret;
 897     struct rdma_addrinfo *res;
 898     char port_str[16];
 899     struct rdma_cm_event *cm_event;
 900     char ip[40] = "unknown";
 901     struct rdma_addrinfo *e;
 902
 903     if (rdma->host == NULL || !strcmp(rdma->host, "")) {
 904         ERROR(errp, "RDMA hostname has not been set");
 905         return -EINVAL;
 906     }
 907
 908     /* create CM channel */
 909     rdma->channel = rdma_create_event_channel();
 910     if (!rdma->channel) {
 911         ERROR(errp, "could not create CM channel");
 912         return -EINVAL;
 913     }
 914
 915     /* create CM id */
 916     ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
 917     if (ret) {
 918         ERROR(errp, "could not create channel id");
 919         goto err_resolve_create_id;
 920     }
 921
 922     snprintf(port_str, 16, "%d", rdma->port);
 923     port_str[15] = '\0';
 924
 925     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
 926     if (ret < 0) {
 927         ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
 928         goto err_resolve_get_addr;
 929     }
 930
 931     for (e = res; e != NULL; e = e->ai_next) {
 932         inet_ntop(e->ai_family,
 933             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
 934         trace_qemu_rdma_resolve_host_trying(rdma->host, ip);
 935
 936         ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
 937                 RDMA_RESOLVE_TIMEOUT_MS);
 938         if (!ret) {
 939             if (e->ai_family == AF_INET6) {
 940                 ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
 941                 if (ret) {
 942                     continue;
 943                 }
 944             }
 945             goto route;
 946         }
 947     }
 948
 949     ERROR(errp, "could not resolve address %s", rdma->host);
 950     goto err_resolve_get_addr;
 951
 952 route:
 953     qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
 954
 955     ret = rdma_get_cm_event(rdma->channel, &cm_event);
 956     if (ret) {
 957         ERROR(errp, "could not perform event_addr_resolved");
 958         goto err_resolve_get_addr;
 959     }
 960
 961     if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
 962         ERROR(errp, "result not equal to event_addr_resolved %s",
 963                 rdma_event_str(cm_event->event));
 964         perror("rdma_resolve_addr");
 965         rdma_ack_cm_event(cm_event);
 966         ret = -EINVAL;
 967         goto err_resolve_get_addr;
 968     }
 969     rdma_ack_cm_event(cm_event);
 970
 971     /* resolve route */
 972     ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
 973     if (ret) {
 974         ERROR(errp, "could not resolve rdma route");
 975         goto err_resolve_get_addr;
 976     }
 977
 978     ret = rdma_get_cm_event(rdma->channel, &cm_event);
 979     if (ret) {
 980         ERROR(errp, "could not perform event_route_resolved");
 981         goto err_resolve_get_addr;
 982     }
 983     if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
 984         ERROR(errp, "result not equal to event_route_resolved: %s",
 985                         rdma_event_str(cm_event->event));
 986         rdma_ack_cm_event(cm_event);
 987         ret = -EINVAL;
 988         goto err_resolve_get_addr;
 989     }
 990     rdma_ack_cm_event(cm_event);
 991     rdma->verbs = rdma->cm_id->verbs;
 992     qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
 993     qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
 994     return 0;
 995
 996 err_resolve_get_addr:
 997     rdma_destroy_id(rdma->cm_id);
 998     rdma->cm_id = NULL;
 999 err_resolve_create_id:
1000     rdma_destroy_event_channel(rdma->channel);
1001     rdma->channel = NULL;
1002     return ret;
1003 }
1004
1005 /*
1006  * Create protection domain and completion queues
1007  */
1008 static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
1009 {
1010     /* allocate pd */
1011     rdma->pd = ibv_alloc_pd(rdma->verbs);
1012     if (!rdma->pd) {
1013         error_report("failed to allocate protection domain");
1014         return -1;
1015     }
1016
1017     /* create completion channel */
1018     rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
1019     if (!rdma->comp_channel) {
1020         error_report("failed to allocate completion channel");
1021         goto err_alloc_pd_cq;
1022     }
1023
1024     /*
1025      * Completion queue can be filled by both read and write work requests,
1026      * so must reflect the sum of both possible queue sizes.
1027      */
1028     rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1029             NULL, rdma->comp_channel, 0);
1030     if (!rdma->cq) {
1031         error_report("failed to allocate completion queue");
1032         goto err_alloc_pd_cq;
1033     }
1034
1035     return 0;
1036
1037 err_alloc_pd_cq:
1038     if (rdma->pd) {
1039         ibv_dealloc_pd(rdma->pd);
1040     }
1041     if (rdma->comp_channel) {
1042         ibv_destroy_comp_channel(rdma->comp_channel);
1043     }
1044     rdma->pd = NULL;
1045     rdma->comp_channel = NULL;
1046     return -1;
1047
1048 }
1049
1050 /*
1051  * Create queue pairs.
1052  */
1053 static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1054 {
1055     struct ibv_qp_init_attr attr = { 0 };
1056     int ret;
1057
1058     attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1059     attr.cap.max_recv_wr = 3;
1060     attr.cap.max_send_sge = 1;
1061     attr.cap.max_recv_sge = 1;
1062     attr.send_cq = rdma->cq;
1063     attr.recv_cq = rdma->cq;
1064     attr.qp_type = IBV_QPT_RC;
1065
1066     ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1067     if (ret) {
1068         return -1;
1069     }
1070
1071     rdma->qp = rdma->cm_id->qp;
1072     return 0;
1073 }
1074
1075 static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1076 {
1077     int i;
1078     RDMALocalBlocks *local = &rdma->local_ram_blocks;
1079
1080     for (i = 0; i < local->nb_blocks; i++) {
1081         local->block[i].mr =
1082             ibv_reg_mr(rdma->pd,
1083                     local->block[i].local_host_addr,
1084                     local->block[i].length,
1085                     IBV_ACCESS_LOCAL_WRITE |
1086                     IBV_ACCESS_REMOTE_WRITE
1087                     );
1088         if (!local->block[i].mr) {
1089             perror("Failed to register local dest ram block!\n");
1090             break;
1091         }
1092         rdma->total_registrations++;
1093     }
1094
1095     if (i >= local->nb_blocks) {
1096         return 0;
1097     }
1098
1099     for (i--; i >= 0; i--) {
1100         ibv_dereg_mr(local->block[i].mr);
1101         rdma->total_registrations--;
1102     }
1103
1104     return -1;
1105
1106 }
1107
1108 /*
1109  * Find the ram block that corresponds to the page requested to be
1110  * transmitted by QEMU.
1111  *
1112  * Once the block is found, also identify which 'chunk' within that
1113  * block that the page belongs to.
1114  *
1115  * This search cannot fail or the migration will fail.
1116  */
1117 static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1118                                       uintptr_t block_offset,
1119                                       uint64_t offset,
1120                                       uint64_t length,
1121                                       uint64_t *block_index,
1122                                       uint64_t *chunk_index)
1123 {
1124     uint64_t current_addr = block_offset + offset;
1125     RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1126                                                 (void *) block_offset);
1127     assert(block);
1128     assert(current_addr >= block->offset);
1129     assert((current_addr + length) <= (block->offset + block->length));
1130
1131     *block_index = block->index;
1132     *chunk_index = ram_chunk_index(block->local_host_addr,
1133                 block->local_host_addr + (current_addr - block->offset));
1134
1135     return 0;
1136 }
1137
1138 /*
1139  * Register a chunk with IB. If the chunk was already registered
1140  * previously, then skip.
1141  *
1142  * Also return the keys associated with the registration needed
1143  * to perform the actual RDMA operation.
1144  */
1145 static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1146         RDMALocalBlock *block, uintptr_t host_addr,
1147         uint32_t *lkey, uint32_t *rkey, int chunk,
1148         uint8_t *chunk_start, uint8_t *chunk_end)
1149 {
1150     if (block->mr) {
1151         if (lkey) {
1152             *lkey = block->mr->lkey;
1153         }
1154         if (rkey) {
1155             *rkey = block->mr->rkey;
1156         }
1157         return 0;
1158     }
1159
1160     /* allocate memory to store chunk MRs */
1161     if (!block->pmr) {
1162         block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
1163     }
1164
1165     /*
1166      * If 'rkey', then we're the destination, so grant access to the source.
1167      *
1168      * If 'lkey', then we're the source VM, so grant access only to ourselves.
1169      */
1170     if (!block->pmr[chunk]) {
1171         uint64_t len = chunk_end - chunk_start;
1172
1173         trace_qemu_rdma_register_and_get_keys(len, chunk_start);
1174
1175         block->pmr[chunk] = ibv_reg_mr(rdma->pd,
1176                 chunk_start, len,
1177                 (rkey ? (IBV_ACCESS_LOCAL_WRITE |
1178                         IBV_ACCESS_REMOTE_WRITE) : 0));
1179
1180         if (!block->pmr[chunk]) {
1181             perror("Failed to register chunk!");
1182             fprintf(stderr, "Chunk details: block: %d chunk index %d"
1183                             " start %" PRIuPTR " end %" PRIuPTR
1184                             " host %" PRIuPTR
1185                             " local %" PRIuPTR " registrations: %d\n",
1186                             block->index, chunk, (uintptr_t)chunk_start,
1187                             (uintptr_t)chunk_end, host_addr,
1188                             (uintptr_t)block->local_host_addr,
1189                             rdma->total_registrations);
1190             return -1;
1191         }
1192         rdma->total_registrations++;
1193     }
1194
1195     if (lkey) {
1196         *lkey = block->pmr[chunk]->lkey;
1197     }
1198     if (rkey) {
1199         *rkey = block->pmr[chunk]->rkey;
1200     }
1201     return 0;
1202 }
1203
1204 /*
1205  * Register (at connection time) the memory used for control
1206  * channel messages.
1207  */
1208 static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1209 {
1210     rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1211             rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1212             IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1213     if (rdma->wr_data[idx].control_mr) {
1214         rdma->total_registrations++;
1215         return 0;
1216     }
1217     error_report("qemu_rdma_reg_control failed");
1218     return -1;
1219 }
1220
1221 const char *print_wrid(int wrid)
1222 {
1223     if (wrid >= RDMA_WRID_RECV_CONTROL) {
1224         return wrid_desc[RDMA_WRID_RECV_CONTROL];
1225     }
1226     return wrid_desc[wrid];
1227 }
1228
1229 /*
1230  * RDMA requires memory registration (mlock/pinning), but this is not good for
1231  * overcommitment.
1232  *
1233  * In preparation for the future where LRU information or workload-specific
1234  * writable working set memory access behavior is available to QEMU
1235  * it would be nice to have in place the ability to UN-register/UN-pin
1236  * particular memory regions from the RDMA hardware when it is determined that
1237  * those regions of memory will likely not be accessed again in the near future.
1238  *
1239  * While we do not yet have such information right now, the following
1240  * compile-time option allows us to perform a non-optimized version of this
1241  * behavior.
1242  *
1243  * By uncommenting this option, you will cause *all* RDMA transfers to be
1244  * unregistered immediately after the transfer completes on both sides of the
1245  * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
1246  *
1247  * This will have a terrible impact on migration performance, so until future
1248  * workload information or LRU information is available, do not attempt to use
1249  * this feature except for basic testing.
1250  */
1251 //#define RDMA_UNREGISTRATION_EXAMPLE
1252
1253 /*
1254  * Perform a non-optimized memory unregistration after every transfer
1255  * for demonstration purposes, only if pin-all is not requested.
1256  *
1257  * Potential optimizations:
1258  * 1. Start a new thread to run this function continuously
1259         - for bit clearing
1260         - and for receipt of unregister messages
1261  * 2. Use an LRU.
1262  * 3. Use workload hints.
1263  */
1264 static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1265 {
1266     while (rdma->unregistrations[rdma->unregister_current]) {
1267         int ret;
1268         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1269         uint64_t chunk =
1270             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1271         uint64_t index =
1272             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1273         RDMALocalBlock *block =
1274             &(rdma->local_ram_blocks.block[index]);
1275         RDMARegister reg = { .current_index = index };
1276         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1277                                  };
1278         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1279                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1280                                    .repeat = 1,
1281                                  };
1282
1283         trace_qemu_rdma_unregister_waiting_proc(chunk,
1284                                                 rdma->unregister_current);
1285
1286         rdma->unregistrations[rdma->unregister_current] = 0;
1287         rdma->unregister_current++;
1288
1289         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1290             rdma->unregister_current = 0;
1291         }
1292
1293
1294         /*
1295          * Unregistration is speculative (because migration is single-threaded
1296          * and we cannot break the protocol's inifinband message ordering).
1297          * Thus, if the memory is currently being used for transmission,
1298          * then abort the attempt to unregister and try again
1299          * later the next time a completion is received for this memory.
1300          */
1301         clear_bit(chunk, block->unregister_bitmap);
1302
1303         if (test_bit(chunk, block->transit_bitmap)) {
1304             trace_qemu_rdma_unregister_waiting_inflight(chunk);
1305             continue;
1306         }
1307
1308         trace_qemu_rdma_unregister_waiting_send(chunk);
1309
1310         ret = ibv_dereg_mr(block->pmr[chunk]);
1311         block->pmr[chunk] = NULL;
1312         block->remote_keys[chunk] = 0;
1313
1314         if (ret != 0) {
1315             perror("unregistration chunk failed");
1316             return -ret;
1317         }
1318         rdma->total_registrations--;
1319
1320         reg.key.chunk = chunk;
1321         register_to_network(rdma, &reg);
1322         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1323                                 &resp, NULL, NULL);
1324         if (ret < 0) {
1325             return ret;
1326         }
1327
1328         trace_qemu_rdma_unregister_waiting_complete(chunk);
1329     }
1330
1331     return 0;
1332 }
1333
1334 static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1335                                          uint64_t chunk)
1336 {
1337     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1338
1339     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1340     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1341
1342     return result;
1343 }
1344
1345 /*
1346  * Set bit for unregistration in the next iteration.
1347  * We cannot transmit right here, but will unpin later.
1348  */
1349 static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1350                                         uint64_t chunk, uint64_t wr_id)
1351 {
1352     if (rdma->unregistrations[rdma->unregister_next] != 0) {
1353         error_report("rdma migration: queue is full");
1354     } else {
1355         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1356
1357         if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1358             trace_qemu_rdma_signal_unregister_append(chunk,
1359                                                      rdma->unregister_next);
1360
1361             rdma->unregistrations[rdma->unregister_next++] =
1362                     qemu_rdma_make_wrid(wr_id, index, chunk);
1363
1364             if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1365                 rdma->unregister_next = 0;
1366             }
1367         } else {
1368             trace_qemu_rdma_signal_unregister_already(chunk);
1369         }
1370     }
1371 }
1372
1373 /*
1374  * Consult the connection manager to see a work request
1375  * (of any kind) has completed.
1376  * Return the work request ID that completed.
1377  */
1378 static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
1379                                uint32_t *byte_len)
1380 {
1381     int ret;
1382     struct ibv_wc wc;
1383     uint64_t wr_id;
1384
1385     ret = ibv_poll_cq(rdma->cq, 1, &wc);
1386
1387     if (!ret) {
1388         *wr_id_out = RDMA_WRID_NONE;
1389         return 0;
1390     }
1391
1392     if (ret < 0) {
1393         error_report("ibv_poll_cq return %d", ret);
1394         return ret;
1395     }
1396
1397     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1398
1399     if (wc.status != IBV_WC_SUCCESS) {
1400         fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1401                         wc.status, ibv_wc_status_str(wc.status));
1402         fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1403
1404         return -1;
1405     }
1406
1407     if (rdma->control_ready_expected &&
1408         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1409         trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL],
1410                   wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1411         rdma->control_ready_expected = 0;
1412     }
1413
1414     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1415         uint64_t chunk =
1416             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1417         uint64_t index =
1418             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1419         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1420
1421         trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent,
1422                                    index, chunk, block->local_host_addr,
1423                                    (void *)(uintptr_t)block->remote_host_addr);
1424
1425         clear_bit(chunk, block->transit_bitmap);
1426
1427         if (rdma->nb_sent > 0) {
1428             rdma->nb_sent--;
1429         }
1430
1431         if (!rdma->pin_all) {
1432             /*
1433              * FYI: If one wanted to signal a specific chunk to be unregistered
1434              * using LRU or workload-specific information, this is the function
1435              * you would call to do so. That chunk would then get asynchronously
1436              * unregistered later.
1437              */
1438 #ifdef RDMA_UNREGISTRATION_EXAMPLE
1439             qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1440 #endif
1441         }
1442     } else {
1443         trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent);
1444     }
1445
1446     *wr_id_out = wc.wr_id;
1447     if (byte_len) {
1448         *byte_len = wc.byte_len;
1449     }
1450
1451     return  0;
1452 }
1453
1454 /*
1455  * Block until the next work request has completed.
1456  *
1457  * First poll to see if a work request has already completed,
1458  * otherwise block.
1459  *
1460  * If we encounter completed work requests for IDs other than
1461  * the one we're interested in, then that's generally an error.
1462  *
1463  * The only exception is actual RDMA Write completions. These
1464  * completions only need to be recorded, but do not actually
1465  * need further processing.
1466  */
1467 static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1468                                     uint32_t *byte_len)
1469 {
1470     int num_cq_events = 0, ret = 0;
1471     struct ibv_cq *cq;
1472     void *cq_ctx;
1473     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1474
1475     if (ibv_req_notify_cq(rdma->cq, 0)) {
1476         return -1;
1477     }
1478     /* poll cq first */
1479     while (wr_id != wrid_requested) {
1480         ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1481         if (ret < 0) {
1482             return ret;
1483         }
1484
1485         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1486
1487         if (wr_id == RDMA_WRID_NONE) {
1488             break;
1489         }
1490         if (wr_id != wrid_requested) {
1491             trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1492                        wrid_requested, print_wrid(wr_id), wr_id);
1493         }
1494     }
1495
1496     if (wr_id == wrid_requested) {
1497         return 0;
1498     }
1499
1500     while (1) {
1501         /*
1502          * Coroutine doesn't start until process_incoming_migration()
1503          * so don't yield unless we know we're running inside of a coroutine.
1504          */
1505         if (rdma->migration_started_on_destination) {
1506             yield_until_fd_readable(rdma->comp_channel->fd);
1507         }
1508
1509         if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
1510             perror("ibv_get_cq_event");
1511             goto err_block_for_wrid;
1512         }
1513
1514         num_cq_events++;
1515
1516         if (ibv_req_notify_cq(cq, 0)) {
1517             goto err_block_for_wrid;
1518         }
1519
1520         while (wr_id != wrid_requested) {
1521             ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1522             if (ret < 0) {
1523                 goto err_block_for_wrid;
1524             }
1525
1526             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1527
1528             if (wr_id == RDMA_WRID_NONE) {
1529                 break;
1530             }
1531             if (wr_id != wrid_requested) {
1532                 trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested),
1533                                    wrid_requested, print_wrid(wr_id), wr_id);
1534             }
1535         }
1536
1537         if (wr_id == wrid_requested) {
1538             goto success_block_for_wrid;
1539         }
1540     }
1541
1542 success_block_for_wrid:
1543     if (num_cq_events) {
1544         ibv_ack_cq_events(cq, num_cq_events);
1545     }
1546     return 0;
1547
1548 err_block_for_wrid:
1549     if (num_cq_events) {
1550         ibv_ack_cq_events(cq, num_cq_events);
1551     }
1552     return ret;
1553 }
1554
1555 /*
1556  * Post a SEND message work request for the control channel
1557  * containing some data and block until the post completes.
1558  */
1559 static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1560                                        RDMAControlHeader *head)
1561 {
1562     int ret = 0;
1563     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1564     struct ibv_send_wr *bad_wr;
1565     struct ibv_sge sge = {
1566                            .addr = (uintptr_t)(wr->control),
1567                            .length = head->len + sizeof(RDMAControlHeader),
1568                            .lkey = wr->control_mr->lkey,
1569                          };
1570     struct ibv_send_wr send_wr = {
1571                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1572                                    .opcode = IBV_WR_SEND,
1573                                    .send_flags = IBV_SEND_SIGNALED,
1574                                    .sg_list = &sge,
1575                                    .num_sge = 1,
1576                                 };
1577
1578     trace_qemu_rdma_post_send_control(control_desc[head->type]);
1579
1580     /*
1581      * We don't actually need to do a memcpy() in here if we used
1582      * the "sge" properly, but since we're only sending control messages
1583      * (not RAM in a performance-critical path), then its OK for now.
1584      *
1585      * The copy makes the RDMAControlHeader simpler to manipulate
1586      * for the time being.
1587      */
1588     assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1589     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1590     control_to_network((void *) wr->control);
1591
1592     if (buf) {
1593         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1594     }
1595
1596
1597     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
1598
1599     if (ret > 0) {
1600         error_report("Failed to use post IB SEND for control");
1601         return -ret;
1602     }
1603
1604     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1605     if (ret < 0) {
1606         error_report("rdma migration: send polling control error");
1607     }
1608
1609     return ret;
1610 }
1611
1612 /*
1613  * Post a RECV work request in anticipation of some future receipt
1614  * of data on the control channel.
1615  */
1616 static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1617 {
1618     struct ibv_recv_wr *bad_wr;
1619     struct ibv_sge sge = {
1620                             .addr = (uintptr_t)(rdma->wr_data[idx].control),
1621                             .length = RDMA_CONTROL_MAX_BUFFER,
1622                             .lkey = rdma->wr_data[idx].control_mr->lkey,
1623                          };
1624
1625     struct ibv_recv_wr recv_wr = {
1626                                     .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1627                                     .sg_list = &sge,
1628                                     .num_sge = 1,
1629                                  };
1630
1631
1632     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1633         return -1;
1634     }
1635
1636     return 0;
1637 }
1638
1639 /*
1640  * Block and wait for a RECV control channel message to arrive.
1641  */
1642 static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1643                 RDMAControlHeader *head, int expecting, int idx)
1644 {
1645     uint32_t byte_len;
1646     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1647                                        &byte_len);
1648
1649     if (ret < 0) {
1650         error_report("rdma migration: recv polling control error!");
1651         return ret;
1652     }
1653
1654     network_to_control((void *) rdma->wr_data[idx].control);
1655     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1656
1657     trace_qemu_rdma_exchange_get_response_start(control_desc[expecting]);
1658
1659     if (expecting == RDMA_CONTROL_NONE) {
1660         trace_qemu_rdma_exchange_get_response_none(control_desc[head->type],
1661                                              head->type);
1662     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1663         error_report("Was expecting a %s (%d) control message"
1664                 ", but got: %s (%d), length: %d",
1665                 control_desc[expecting], expecting,
1666                 control_desc[head->type], head->type, head->len);
1667         return -EIO;
1668     }
1669     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1670         error_report("too long length: %d", head->len);
1671         return -EINVAL;
1672     }
1673     if (sizeof(*head) + head->len != byte_len) {
1674         error_report("Malformed length: %d byte_len %d", head->len, byte_len);
1675         return -EINVAL;
1676     }
1677
1678     return 0;
1679 }
1680
1681 /*
1682  * When a RECV work request has completed, the work request's
1683  * buffer is pointed at the header.
1684  *
1685  * This will advance the pointer to the data portion
1686  * of the control message of the work request's buffer that
1687  * was populated after the work request finished.
1688  */
1689 static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1690                                   RDMAControlHeader *head)
1691 {
1692     rdma->wr_data[idx].control_len = head->len;
1693     rdma->wr_data[idx].control_curr =
1694         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1695 }
1696
1697 /*
1698  * This is an 'atomic' high-level operation to deliver a single, unified
1699  * control-channel message.
1700  *
1701  * Additionally, if the user is expecting some kind of reply to this message,
1702  * they can request a 'resp' response message be filled in by posting an
1703  * additional work request on behalf of the user and waiting for an additional
1704  * completion.
1705  *
1706  * The extra (optional) response is used during registration to us from having
1707  * to perform an *additional* exchange of message just to provide a response by
1708  * instead piggy-backing on the acknowledgement.
1709  */
1710 static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1711                                    uint8_t *data, RDMAControlHeader *resp,
1712                                    int *resp_idx,
1713                                    int (*callback)(RDMAContext *rdma))
1714 {
1715     int ret = 0;
1716
1717     /*
1718      * Wait until the dest is ready before attempting to deliver the message
1719      * by waiting for a READY message.
1720      */
1721     if (rdma->control_ready_expected) {
1722         RDMAControlHeader resp;
1723         ret = qemu_rdma_exchange_get_response(rdma,
1724                                     &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1725         if (ret < 0) {
1726             return ret;
1727         }
1728     }
1729
1730     /*
1731      * If the user is expecting a response, post a WR in anticipation of it.
1732      */
1733     if (resp) {
1734         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1735         if (ret) {
1736             error_report("rdma migration: error posting"
1737                     " extra control recv for anticipated result!");
1738             return ret;
1739         }
1740     }
1741
1742     /*
1743      * Post a WR to replace the one we just consumed for the READY message.
1744      */
1745     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1746     if (ret) {
1747         error_report("rdma migration: error posting first control recv!");
1748         return ret;
1749     }
1750
1751     /*
1752      * Deliver the control message that was requested.
1753      */
1754     ret = qemu_rdma_post_send_control(rdma, data, head);
1755
1756     if (ret < 0) {
1757         error_report("Failed to send control buffer!");
1758         return ret;
1759     }
1760
1761     /*
1762      * If we're expecting a response, block and wait for it.
1763      */
1764     if (resp) {
1765         if (callback) {
1766             trace_qemu_rdma_exchange_send_issue_callback();
1767             ret = callback(rdma);
1768             if (ret < 0) {
1769                 return ret;
1770             }
1771         }
1772
1773         trace_qemu_rdma_exchange_send_waiting(control_desc[resp->type]);
1774         ret = qemu_rdma_exchange_get_response(rdma, resp,
1775                                               resp->type, RDMA_WRID_DATA);
1776
1777         if (ret < 0) {
1778             return ret;
1779         }
1780
1781         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1782         if (resp_idx) {
1783             *resp_idx = RDMA_WRID_DATA;
1784         }
1785         trace_qemu_rdma_exchange_send_received(control_desc[resp->type]);
1786     }
1787
1788     rdma->control_ready_expected = 1;
1789
1790     return 0;
1791 }
1792
1793 /*
1794  * This is an 'atomic' high-level operation to receive a single, unified
1795  * control-channel message.
1796  */
1797 static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1798                                 int expecting)
1799 {
1800     RDMAControlHeader ready = {
1801                                 .len = 0,
1802                                 .type = RDMA_CONTROL_READY,
1803                                 .repeat = 1,
1804                               };
1805     int ret;
1806
1807     /*
1808      * Inform the source that we're ready to receive a message.
1809      */
1810     ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1811
1812     if (ret < 0) {
1813         error_report("Failed to send control buffer!");
1814         return ret;
1815     }
1816
1817     /*
1818      * Block and wait for the message.
1819      */
1820     ret = qemu_rdma_exchange_get_response(rdma, head,
1821                                           expecting, RDMA_WRID_READY);
1822
1823     if (ret < 0) {
1824         return ret;
1825     }
1826
1827     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1828
1829     /*
1830      * Post a new RECV work request to replace the one we just consumed.
1831      */
1832     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1833     if (ret) {
1834         error_report("rdma migration: error posting second control recv!");
1835         return ret;
1836     }
1837
1838     return 0;
1839 }
1840
1841 /*
1842  * Write an actual chunk of memory using RDMA.
1843  *
1844  * If we're using dynamic registration on the dest-side, we have to
1845  * send a registration command first.
1846  */
1847 static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1848                                int current_index, uint64_t current_addr,
1849                                uint64_t length)
1850 {
1851     struct ibv_sge sge;
1852     struct ibv_send_wr send_wr = { 0 };
1853     struct ibv_send_wr *bad_wr;
1854     int reg_result_idx, ret, count = 0;
1855     uint64_t chunk, chunks;
1856     uint8_t *chunk_start, *chunk_end;
1857     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1858     RDMARegister reg;
1859     RDMARegisterResult *reg_result;
1860     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1861     RDMAControlHeader head = { .len = sizeof(RDMARegister),
1862                                .type = RDMA_CONTROL_REGISTER_REQUEST,
1863                                .repeat = 1,
1864                              };
1865
1866 retry:
1867     sge.addr = (uintptr_t)(block->local_host_addr +
1868                             (current_addr - block->offset));
1869     sge.length = length;
1870
1871     chunk = ram_chunk_index(block->local_host_addr,
1872                             (uint8_t *)(uintptr_t)sge.addr);
1873     chunk_start = ram_chunk_start(block, chunk);
1874
1875     if (block->is_ram_block) {
1876         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
1877
1878         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1879             chunks--;
1880         }
1881     } else {
1882         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
1883
1884         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1885             chunks--;
1886         }
1887     }
1888
1889     trace_qemu_rdma_write_one_top(chunks + 1,
1890                                   (chunks + 1) *
1891                                   (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
1892
1893     chunk_end = ram_chunk_end(block, chunk + chunks);
1894
1895     if (!rdma->pin_all) {
1896 #ifdef RDMA_UNREGISTRATION_EXAMPLE
1897         qemu_rdma_unregister_waiting(rdma);
1898 #endif
1899     }
1900
1901     while (test_bit(chunk, block->transit_bitmap)) {
1902         (void)count;
1903         trace_qemu_rdma_write_one_block(count++, current_index, chunk,
1904                 sge.addr, length, rdma->nb_sent, block->nb_chunks);
1905
1906         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
1907
1908         if (ret < 0) {
1909             error_report("Failed to Wait for previous write to complete "
1910                     "block %d chunk %" PRIu64
1911                     " current %" PRIu64 " len %" PRIu64 " %d",
1912                     current_index, chunk, sge.addr, length, rdma->nb_sent);
1913             return ret;
1914         }
1915     }
1916
1917     if (!rdma->pin_all || !block->is_ram_block) {
1918         if (!block->remote_keys[chunk]) {
1919             /*
1920              * This chunk has not yet been registered, so first check to see
1921              * if the entire chunk is zero. If so, tell the other size to
1922              * memset() + madvise() the entire chunk without RDMA.
1923              */
1924
1925             if (can_use_buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
1926                                                    length)
1927                    && buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
1928                                                     length) == length) {
1929                 RDMACompress comp = {
1930                                         .offset = current_addr,
1931                                         .value = 0,
1932                                         .block_idx = current_index,
1933                                         .length = length,
1934                                     };
1935
1936                 head.len = sizeof(comp);
1937                 head.type = RDMA_CONTROL_COMPRESS;
1938
1939                 trace_qemu_rdma_write_one_zero(chunk, sge.length,
1940                                                current_index, current_addr);
1941
1942                 compress_to_network(rdma, &comp);
1943                 ret = qemu_rdma_exchange_send(rdma, &head,
1944                                 (uint8_t *) &comp, NULL, NULL, NULL);
1945
1946                 if (ret < 0) {
1947                     return -EIO;
1948                 }
1949
1950                 acct_update_position(f, sge.length, true);
1951
1952                 return 1;
1953             }
1954
1955             /*
1956              * Otherwise, tell other side to register.
1957              */
1958             reg.current_index = current_index;
1959             if (block->is_ram_block) {
1960                 reg.key.current_addr = current_addr;
1961             } else {
1962                 reg.key.chunk = chunk;
1963             }
1964             reg.chunks = chunks;
1965
1966             trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
1967                                               current_addr);
1968
1969             register_to_network(rdma, &reg);
1970             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1971                                     &resp, &reg_result_idx, NULL);
1972             if (ret < 0) {
1973                 return ret;
1974             }
1975
1976             /* try to overlap this single registration with the one we sent. */
1977             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
1978                                                 &sge.lkey, NULL, chunk,
1979                                                 chunk_start, chunk_end)) {
1980                 error_report("cannot get lkey");
1981                 return -EINVAL;
1982             }
1983
1984             reg_result = (RDMARegisterResult *)
1985                     rdma->wr_data[reg_result_idx].control_curr;
1986
1987             network_to_result(reg_result);
1988
1989             trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk],
1990                                                  reg_result->rkey, chunk);
1991
1992             block->remote_keys[chunk] = reg_result->rkey;
1993             block->remote_host_addr = reg_result->host_addr;
1994         } else {
1995             /* already registered before */
1996             if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
1997                                                 &sge.lkey, NULL, chunk,
1998                                                 chunk_start, chunk_end)) {
1999                 error_report("cannot get lkey!");
2000                 return -EINVAL;
2001             }
2002         }
2003
2004         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2005     } else {
2006         send_wr.wr.rdma.rkey = block->remote_rkey;
2007
2008         if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr,
2009                                                      &sge.lkey, NULL, chunk,
2010                                                      chunk_start, chunk_end)) {
2011             error_report("cannot get lkey!");
2012             return -EINVAL;
2013         }
2014     }
2015
2016     /*
2017      * Encode the ram block index and chunk within this wrid.
2018      * We will use this information at the time of completion
2019      * to figure out which bitmap to check against and then which
2020      * chunk in the bitmap to look for.
2021      */
2022     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2023                                         current_index, chunk);
2024
2025     send_wr.opcode = IBV_WR_RDMA_WRITE;
2026     send_wr.send_flags = IBV_SEND_SIGNALED;
2027     send_wr.sg_list = &sge;
2028     send_wr.num_sge = 1;
2029     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2030                                 (current_addr - block->offset);
2031
2032     trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2033                                    sge.length);
2034
2035     /*
2036      * ibv_post_send() does not return negative error numbers,
2037      * per the specification they are positive - no idea why.
2038      */
2039     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2040
2041     if (ret == ENOMEM) {
2042         trace_qemu_rdma_write_one_queue_full();
2043         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2044         if (ret < 0) {
2045             error_report("rdma migration: failed to make "
2046                          "room in full send queue! %d", ret);
2047             return ret;
2048         }
2049
2050         goto retry;
2051
2052     } else if (ret > 0) {
2053         perror("rdma migration: post rdma write failed");
2054         return -ret;
2055     }
2056
2057     set_bit(chunk, block->transit_bitmap);
2058     acct_update_position(f, sge.length, false);
2059     rdma->total_writes++;
2060
2061     return 0;
2062 }
2063
2064 /*
2065  * Push out any unwritten RDMA operations.
2066  *
2067  * We support sending out multiple chunks at the same time.
2068  * Not all of them need to get signaled in the completion queue.
2069  */
2070 static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2071 {
2072     int ret;
2073
2074     if (!rdma->current_length) {
2075         return 0;
2076     }
2077
2078     ret = qemu_rdma_write_one(f, rdma,
2079             rdma->current_index, rdma->current_addr, rdma->current_length);
2080
2081     if (ret < 0) {
2082         return ret;
2083     }
2084
2085     if (ret == 0) {
2086         rdma->nb_sent++;
2087         trace_qemu_rdma_write_flush(rdma->nb_sent);
2088     }
2089
2090     rdma->current_length = 0;
2091     rdma->current_addr = 0;
2092
2093     return 0;
2094 }
2095
2096 static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2097                     uint64_t offset, uint64_t len)
2098 {
2099     RDMALocalBlock *block;
2100     uint8_t *host_addr;
2101     uint8_t *chunk_end;
2102
2103     if (rdma->current_index < 0) {
2104         return 0;
2105     }
2106
2107     if (rdma->current_chunk < 0) {
2108         return 0;
2109     }
2110
2111     block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2112     host_addr = block->local_host_addr + (offset - block->offset);
2113     chunk_end = ram_chunk_end(block, rdma->current_chunk);
2114
2115     if (rdma->current_length == 0) {
2116         return 0;
2117     }
2118
2119     /*
2120      * Only merge into chunk sequentially.
2121      */
2122     if (offset != (rdma->current_addr + rdma->current_length)) {
2123         return 0;
2124     }
2125
2126     if (offset < block->offset) {
2127         return 0;
2128     }
2129
2130     if ((offset + len) > (block->offset + block->length)) {
2131         return 0;
2132     }
2133
2134     if ((host_addr + len) > chunk_end) {
2135         return 0;
2136     }
2137
2138     return 1;
2139 }
2140
2141 /*
2142  * We're not actually writing here, but doing three things:
2143  *
2144  * 1. Identify the chunk the buffer belongs to.
2145  * 2. If the chunk is full or the buffer doesn't belong to the current
2146  *    chunk, then start a new chunk and flush() the old chunk.
2147  * 3. To keep the hardware busy, we also group chunks into batches
2148  *    and only require that a batch gets acknowledged in the completion
2149  *    qeueue instead of each individual chunk.
2150  */
2151 static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2152                            uint64_t block_offset, uint64_t offset,
2153                            uint64_t len)
2154 {
2155     uint64_t current_addr = block_offset + offset;
2156     uint64_t index = rdma->current_index;
2157     uint64_t chunk = rdma->current_chunk;
2158     int ret;
2159
2160     /* If we cannot merge it, we flush the current buffer first. */
2161     if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2162         ret = qemu_rdma_write_flush(f, rdma);
2163         if (ret) {
2164             return ret;
2165         }
2166         rdma->current_length = 0;
2167         rdma->current_addr = current_addr;
2168
2169         ret = qemu_rdma_search_ram_block(rdma, block_offset,
2170                                          offset, len, &index, &chunk);
2171         if (ret) {
2172             error_report("ram block search failed");
2173             return ret;
2174         }
2175         rdma->current_index = index;
2176         rdma->current_chunk = chunk;
2177     }
2178
2179     /* merge it */
2180     rdma->current_length += len;
2181
2182     /* flush it if buffer is too large */
2183     if (rdma->current_length >= RDMA_MERGE_MAX) {
2184         return qemu_rdma_write_flush(f, rdma);
2185     }
2186
2187     return 0;
2188 }
2189
2190 static void qemu_rdma_cleanup(RDMAContext *rdma)
2191 {
2192     struct rdma_cm_event *cm_event;
2193     int ret, idx;
2194
2195     if (rdma->cm_id && rdma->connected) {
2196         if (rdma->error_state) {
2197             RDMAControlHeader head = { .len = 0,
2198                                        .type = RDMA_CONTROL_ERROR,
2199                                        .repeat = 1,
2200                                      };
2201             error_report("Early error. Sending error.");
2202             qemu_rdma_post_send_control(rdma, NULL, &head);
2203         }
2204
2205         ret = rdma_disconnect(rdma->cm_id);
2206         if (!ret) {
2207             trace_qemu_rdma_cleanup_waiting_for_disconnect();
2208             ret = rdma_get_cm_event(rdma->channel, &cm_event);
2209             if (!ret) {
2210                 rdma_ack_cm_event(cm_event);
2211             }
2212         }
2213         trace_qemu_rdma_cleanup_disconnect();
2214         rdma->connected = false;
2215     }
2216
2217     g_free(rdma->dest_blocks);
2218     rdma->dest_blocks = NULL;
2219
2220     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2221         if (rdma->wr_data[idx].control_mr) {
2222             rdma->total_registrations--;
2223             ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2224         }
2225         rdma->wr_data[idx].control_mr = NULL;
2226     }
2227
2228     if (rdma->local_ram_blocks.block) {
2229         while (rdma->local_ram_blocks.nb_blocks) {
2230             rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]);
2231         }
2232     }
2233
2234     if (rdma->qp) {
2235         rdma_destroy_qp(rdma->cm_id);
2236         rdma->qp = NULL;
2237     }
2238     if (rdma->cq) {
2239         ibv_destroy_cq(rdma->cq);
2240         rdma->cq = NULL;
2241     }
2242     if (rdma->comp_channel) {
2243         ibv_destroy_comp_channel(rdma->comp_channel);
2244         rdma->comp_channel = NULL;
2245     }
2246     if (rdma->pd) {
2247         ibv_dealloc_pd(rdma->pd);
2248         rdma->pd = NULL;
2249     }
2250     if (rdma->cm_id) {
2251         rdma_destroy_id(rdma->cm_id);
2252         rdma->cm_id = NULL;
2253     }
2254     if (rdma->listen_id) {
2255         rdma_destroy_id(rdma->listen_id);
2256         rdma->listen_id = NULL;
2257     }
2258     if (rdma->channel) {
2259         rdma_destroy_event_channel(rdma->channel);
2260         rdma->channel = NULL;
2261     }
2262     g_free(rdma->host);
2263     rdma->host = NULL;
2264 }
2265
2266
2267 static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
2268 {
2269     int ret, idx;
2270     Error *local_err = NULL, **temp = &local_err;
2271
2272     /*
2273      * Will be validated against destination's actual capabilities
2274      * after the connect() completes.
2275      */
2276     rdma->pin_all = pin_all;
2277
2278     ret = qemu_rdma_resolve_host(rdma, temp);
2279     if (ret) {
2280         goto err_rdma_source_init;
2281     }
2282
2283     ret = qemu_rdma_alloc_pd_cq(rdma);
2284     if (ret) {
2285         ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2286                     " limits may be too low. Please check $ ulimit -a # and "
2287                     "search for 'ulimit -l' in the output");
2288         goto err_rdma_source_init;
2289     }
2290
2291     ret = qemu_rdma_alloc_qp(rdma);
2292     if (ret) {
2293         ERROR(temp, "rdma migration: error allocating qp!");
2294         goto err_rdma_source_init;
2295     }
2296
2297     ret = qemu_rdma_init_ram_blocks(rdma);
2298     if (ret) {
2299         ERROR(temp, "rdma migration: error initializing ram blocks!");
2300         goto err_rdma_source_init;
2301     }
2302
2303     /* Build the hash that maps from offset to RAMBlock */
2304     rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
2305     for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) {
2306         g_hash_table_insert(rdma->blockmap,
2307                 (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset,
2308                 &rdma->local_ram_blocks.block[idx]);
2309     }
2310
2311     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2312         ret = qemu_rdma_reg_control(rdma, idx);
2313         if (ret) {
2314             ERROR(temp, "rdma migration: error registering %d control!",
2315                                                             idx);
2316             goto err_rdma_source_init;
2317         }
2318     }
2319
2320     return 0;
2321
2322 err_rdma_source_init:
2323     error_propagate(errp, local_err);
2324     qemu_rdma_cleanup(rdma);
2325     return -1;
2326 }
2327
2328 static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2329 {
2330     RDMACapabilities cap = {
2331                                 .version = RDMA_CONTROL_VERSION_CURRENT,
2332                                 .flags = 0,
2333                            };
2334     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2335                                           .retry_count = 5,
2336                                           .private_data = &cap,
2337                                           .private_data_len = sizeof(cap),
2338                                         };
2339     struct rdma_cm_event *cm_event;
2340     int ret;
2341
2342     /*
2343      * Only negotiate the capability with destination if the user
2344      * on the source first requested the capability.
2345      */
2346     if (rdma->pin_all) {
2347         trace_qemu_rdma_connect_pin_all_requested();
2348         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2349     }
2350
2351     caps_to_network(&cap);
2352
2353     ret = rdma_connect(rdma->cm_id, &conn_param);
2354     if (ret) {
2355         perror("rdma_connect");
2356         ERROR(errp, "connecting to destination!");
2357         goto err_rdma_source_connect;
2358     }
2359
2360     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2361     if (ret) {
2362         perror("rdma_get_cm_event after rdma_connect");
2363         ERROR(errp, "connecting to destination!");
2364         rdma_ack_cm_event(cm_event);
2365         goto err_rdma_source_connect;
2366     }
2367
2368     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2369         perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2370         ERROR(errp, "connecting to destination!");
2371         rdma_ack_cm_event(cm_event);
2372         goto err_rdma_source_connect;
2373     }
2374     rdma->connected = true;
2375
2376     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2377     network_to_caps(&cap);
2378
2379     /*
2380      * Verify that the *requested* capabilities are supported by the destination
2381      * and disable them otherwise.
2382      */
2383     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2384         ERROR(errp, "Server cannot support pinning all memory. "
2385                         "Will register memory dynamically.");
2386         rdma->pin_all = false;
2387     }
2388
2389     trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all);
2390
2391     rdma_ack_cm_event(cm_event);
2392
2393     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2394     if (ret) {
2395         ERROR(errp, "posting second control recv!");
2396         goto err_rdma_source_connect;
2397     }
2398
2399     rdma->control_ready_expected = 1;
2400     rdma->nb_sent = 0;
2401     return 0;
2402
2403 err_rdma_source_connect:
2404     qemu_rdma_cleanup(rdma);
2405     return -1;
2406 }
2407
2408 static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2409 {
2410     int ret, idx;
2411     struct rdma_cm_id *listen_id;
2412     char ip[40] = "unknown";
2413     struct rdma_addrinfo *res, *e;
2414     char port_str[16];
2415
2416     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2417         rdma->wr_data[idx].control_len = 0;
2418         rdma->wr_data[idx].control_curr = NULL;
2419     }
2420
2421     if (!rdma->host || !rdma->host[0]) {
2422         ERROR(errp, "RDMA host is not set!");
2423         rdma->error_state = -EINVAL;
2424         return -1;
2425     }
2426     /* create CM channel */
2427     rdma->channel = rdma_create_event_channel();
2428     if (!rdma->channel) {
2429         ERROR(errp, "could not create rdma event channel");
2430         rdma->error_state = -EINVAL;
2431         return -1;
2432     }
2433
2434     /* create CM id */
2435     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2436     if (ret) {
2437         ERROR(errp, "could not create cm_id!");
2438         goto err_dest_init_create_listen_id;
2439     }
2440
2441     snprintf(port_str, 16, "%d", rdma->port);
2442     port_str[15] = '\0';
2443
2444     ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2445     if (ret < 0) {
2446         ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2447         goto err_dest_init_bind_addr;
2448     }
2449
2450     for (e = res; e != NULL; e = e->ai_next) {
2451         inet_ntop(e->ai_family,
2452             &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2453         trace_qemu_rdma_dest_init_trying(rdma->host, ip);
2454         ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2455         if (ret) {
2456             continue;
2457         }
2458         if (e->ai_family == AF_INET6) {
2459             ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
2460             if (ret) {
2461                 continue;
2462             }
2463         }
2464         break;
2465     }
2466
2467     if (!e) {
2468         ERROR(errp, "Error: could not rdma_bind_addr!");
2469         goto err_dest_init_bind_addr;
2470     }
2471
2472     rdma->listen_id = listen_id;
2473     qemu_rdma_dump_gid("dest_init", listen_id);
2474     return 0;
2475
2476 err_dest_init_bind_addr:
2477     rdma_destroy_id(listen_id);
2478 err_dest_init_create_listen_id:
2479     rdma_destroy_event_channel(rdma->channel);
2480     rdma->channel = NULL;
2481     rdma->error_state = ret;
2482     return ret;
2483
2484 }
2485
2486 static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2487 {
2488     RDMAContext *rdma = NULL;
2489     InetSocketAddress *addr;
2490
2491     if (host_port) {
2492         rdma = g_malloc0(sizeof(RDMAContext));
2493         rdma->current_index = -1;
2494         rdma->current_chunk = -1;
2495
2496         addr = inet_parse(host_port, NULL);
2497         if (addr != NULL) {
2498             rdma->port = atoi(addr->port);
2499             rdma->host = g_strdup(addr->host);
2500         } else {
2501             ERROR(errp, "bad RDMA migration address '%s'", host_port);
2502             g_free(rdma);
2503             rdma = NULL;
2504         }
2505
2506         qapi_free_InetSocketAddress(addr);
2507     }
2508
2509     return rdma;
2510 }
2511
2512 /*
2513  * QEMUFile interface to the control channel.
2514  * SEND messages for control only.
2515  * VM's ram is handled with regular RDMA messages.
2516  */
2517 static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
2518                                 int64_t pos, int size)
2519 {
2520     QEMUFileRDMA *r = opaque;
2521     QEMUFile *f = r->file;
2522     RDMAContext *rdma = r->rdma;
2523     size_t remaining = size;
2524     uint8_t * data = (void *) buf;
2525     int ret;
2526
2527     CHECK_ERROR_STATE();
2528
2529     /*
2530      * Push out any writes that
2531      * we're queued up for VM's ram.
2532      */
2533     ret = qemu_rdma_write_flush(f, rdma);
2534     if (ret < 0) {
2535         rdma->error_state = ret;
2536         return ret;
2537     }
2538
2539     while (remaining) {
2540         RDMAControlHeader head;
2541
2542         r->len = MIN(remaining, RDMA_SEND_INCREMENT);
2543         remaining -= r->len;
2544
2545         head.len = r->len;
2546         head.type = RDMA_CONTROL_QEMU_FILE;
2547
2548         ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2549
2550         if (ret < 0) {
2551             rdma->error_state = ret;
2552             return ret;
2553         }
2554
2555         data += r->len;
2556     }
2557
2558     return size;
2559 }
2560
2561 static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2562                              int size, int idx)
2563 {
2564     size_t len = 0;
2565
2566     if (rdma->wr_data[idx].control_len) {
2567         trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size);
2568
2569         len = MIN(size, rdma->wr_data[idx].control_len);
2570         memcpy(buf, rdma->wr_data[idx].control_curr, len);
2571         rdma->wr_data[idx].control_curr += len;
2572         rdma->wr_data[idx].control_len -= len;
2573     }
2574
2575     return len;
2576 }
2577
2578 /*
2579  * QEMUFile interface to the control channel.
2580  * RDMA links don't use bytestreams, so we have to
2581  * return bytes to QEMUFile opportunistically.
2582  */
2583 static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
2584                                 int64_t pos, int size)
2585 {
2586     QEMUFileRDMA *r = opaque;
2587     RDMAContext *rdma = r->rdma;
2588     RDMAControlHeader head;
2589     int ret = 0;
2590
2591     CHECK_ERROR_STATE();
2592
2593     /*
2594      * First, we hold on to the last SEND message we
2595      * were given and dish out the bytes until we run
2596      * out of bytes.
2597      */
2598     r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
2599     if (r->len) {
2600         return r->len;
2601     }
2602
2603     /*
2604      * Once we run out, we block and wait for another
2605      * SEND message to arrive.
2606      */
2607     ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2608
2609     if (ret < 0) {
2610         rdma->error_state = ret;
2611         return ret;
2612     }
2613
2614     /*
2615      * SEND was received with new bytes, now try again.
2616      */
2617     return qemu_rdma_fill(r->rdma, buf, size, 0);
2618 }
2619
2620 /*
2621  * Block until all the outstanding chunks have been delivered by the hardware.
2622  */
2623 static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2624 {
2625     int ret;
2626
2627     if (qemu_rdma_write_flush(f, rdma) < 0) {
2628         return -EIO;
2629     }
2630
2631     while (rdma->nb_sent) {
2632         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2633         if (ret < 0) {
2634             error_report("rdma migration: complete polling error!");
2635             return -EIO;
2636         }
2637     }
2638
2639     qemu_rdma_unregister_waiting(rdma);
2640
2641     return 0;
2642 }
2643
2644 static int qemu_rdma_close(void *opaque)
2645 {
2646     trace_qemu_rdma_close();
2647     QEMUFileRDMA *r = opaque;
2648     if (r->rdma) {
2649         qemu_rdma_cleanup(r->rdma);
2650         g_free(r->rdma);
2651     }
2652     g_free(r);
2653     return 0;
2654 }
2655
2656 /*
2657  * Parameters:
2658  *    @offset == 0 :
2659  *        This means that 'block_offset' is a full virtual address that does not
2660  *        belong to a RAMBlock of the virtual machine and instead
2661  *        represents a private malloc'd memory area that the caller wishes to
2662  *        transfer.
2663  *
2664  *    @offset != 0 :
2665  *        Offset is an offset to be added to block_offset and used
2666  *        to also lookup the corresponding RAMBlock.
2667  *
2668  *    @size > 0 :
 *        Initiate a transfer of this size.
2670  *
2671  *    @size == 0 :
2672  *        A 'hint' or 'advice' that means that we wish to speculatively
2673  *        and asynchronously unregister this memory. In this case, there is no
2674  *        guarantee that the unregister will actually happen, for example,
2675  *        if the memory is being actively transmitted. Additionally, the memory
2676  *        may be re-registered at any future time if a write within the same
2677  *        chunk was requested again, even if you attempted to unregister it
2678  *        here.
2679  *
2680  *    @size < 0 : TODO, not yet supported
2681  *        Unregister the memory NOW. This means that the caller does not
2682  *        expect there to be any future RDMA transfers and we just want to clean
2683  *        things up. This is used in case the upper layer owns the memory and
2684  *        cannot wait for qemu_fclose() to occur.
2685  *
2686  *    @bytes_sent : User-specificed pointer to indicate how many bytes were
2687  *                  sent. Usually, this will not be more than a few bytes of
2688  *                  the protocol because most transfers are sent asynchronously.
2689  */
2690 static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
2691                                   ram_addr_t block_offset, ram_addr_t offset,
2692                                   size_t size, uint64_t *bytes_sent)
2693 {
2694     QEMUFileRDMA *rfile = opaque;
2695     RDMAContext *rdma = rfile->rdma;
2696     int ret;
2697
2698     CHECK_ERROR_STATE();
2699
2700     qemu_fflush(f);
2701
2702     if (size > 0) {
2703         /*
2704          * Add this page to the current 'chunk'. If the chunk
2705          * is full, or the page doen't belong to the current chunk,
2706          * an actual RDMA write will occur and a new chunk will be formed.
2707          */
2708         ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
2709         if (ret < 0) {
2710             error_report("rdma migration: write error! %d", ret);
2711             goto err;
2712         }
2713
2714         /*
2715          * We always return 1 bytes because the RDMA
2716          * protocol is completely asynchronous. We do not yet know
2717          * whether an  identified chunk is zero or not because we're
2718          * waiting for other pages to potentially be merged with
2719          * the current chunk. So, we have to call qemu_update_position()
2720          * later on when the actual write occurs.
2721          */
2722         if (bytes_sent) {
2723             *bytes_sent = 1;
2724         }
2725     } else {
2726         uint64_t index, chunk;
2727
2728         /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
2729         if (size < 0) {
2730             ret = qemu_rdma_drain_cq(f, rdma);
2731             if (ret < 0) {
2732                 fprintf(stderr, "rdma: failed to synchronously drain"
2733                                 " completion queue before unregistration.\n");
2734                 goto err;
2735             }
2736         }
2737         */
2738
2739         ret = qemu_rdma_search_ram_block(rdma, block_offset,
2740                                          offset, size, &index, &chunk);
2741
2742         if (ret) {
2743             error_report("ram block search failed");
2744             goto err;
2745         }
2746
2747         qemu_rdma_signal_unregister(rdma, index, chunk, 0);
2748
2749         /*
2750          * TODO: Synchronous, guaranteed unregistration (should not occur during
2751          * fast-path). Otherwise, unregisters will process on the next call to
2752          * qemu_rdma_drain_cq()
2753         if (size < 0) {
2754             qemu_rdma_unregister_waiting(rdma);
2755         }
2756         */
2757     }
2758
2759     /*
2760      * Drain the Completion Queue if possible, but do not block,
2761      * just poll.
2762      *
2763      * If nothing to poll, the end of the iteration will do this
2764      * again to make sure we don't overflow the request queue.
2765      */
2766     while (1) {
2767         uint64_t wr_id, wr_id_in;
2768         int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
2769         if (ret < 0) {
2770             error_report("rdma migration: polling error! %d", ret);
2771             goto err;
2772         }
2773
2774         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
2775
2776         if (wr_id == RDMA_WRID_NONE) {
2777             break;
2778         }
2779     }
2780
2781     return RAM_SAVE_CONTROL_DELAYED;
2782 err:
2783     rdma->error_state = ret;
2784     return ret;
2785 }
2786
2787 static int qemu_rdma_accept(RDMAContext *rdma)
2788 {
2789     RDMACapabilities cap;
2790     struct rdma_conn_param conn_param = {
2791                                             .responder_resources = 2,
2792                                             .private_data = &cap,
2793                                             .private_data_len = sizeof(cap),
2794                                          };
2795     struct rdma_cm_event *cm_event;
2796     struct ibv_context *verbs;
2797     int ret = -EINVAL;
2798     int idx;
2799
2800     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2801     if (ret) {
2802         goto err_rdma_dest_wait;
2803     }
2804
2805     if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
2806         rdma_ack_cm_event(cm_event);
2807         goto err_rdma_dest_wait;
2808     }
2809
2810     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2811
2812     network_to_caps(&cap);
2813
2814     if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
2815             error_report("Unknown source RDMA version: %d, bailing...",
2816                             cap.version);
2817             rdma_ack_cm_event(cm_event);
2818             goto err_rdma_dest_wait;
2819     }
2820
2821     /*
2822      * Respond with only the capabilities this version of QEMU knows about.
2823      */
2824     cap.flags &= known_capabilities;
2825
2826     /*
2827      * Enable the ones that we do know about.
2828      * Add other checks here as new ones are introduced.
2829      */
2830     if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
2831         rdma->pin_all = true;
2832     }
2833
2834     rdma->cm_id = cm_event->id;
2835     verbs = cm_event->id->verbs;
2836
2837     rdma_ack_cm_event(cm_event);
2838
2839     trace_qemu_rdma_accept_pin_state(rdma->pin_all);
2840
2841     caps_to_network(&cap);
2842
2843     trace_qemu_rdma_accept_pin_verbsc(verbs);
2844
2845     if (!rdma->verbs) {
2846         rdma->verbs = verbs;
2847     } else if (rdma->verbs != verbs) {
2848             error_report("ibv context not matching %p, %p!", rdma->verbs,
2849                          verbs);
2850             goto err_rdma_dest_wait;
2851     }
2852
2853     qemu_rdma_dump_id("dest_init", verbs);
2854
2855     ret = qemu_rdma_alloc_pd_cq(rdma);
2856     if (ret) {
2857         error_report("rdma migration: error allocating pd and cq!");
2858         goto err_rdma_dest_wait;
2859     }
2860
2861     ret = qemu_rdma_alloc_qp(rdma);
2862     if (ret) {
2863         error_report("rdma migration: error allocating qp!");
2864         goto err_rdma_dest_wait;
2865     }
2866
2867     ret = qemu_rdma_init_ram_blocks(rdma);
2868     if (ret) {
2869         error_report("rdma migration: error initializing ram blocks!");
2870         goto err_rdma_dest_wait;
2871     }
2872
2873     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2874         ret = qemu_rdma_reg_control(rdma, idx);
2875         if (ret) {
2876             error_report("rdma: error registering %d control", idx);
2877             goto err_rdma_dest_wait;
2878         }
2879     }
2880
2881     qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL);
2882
2883     ret = rdma_accept(rdma->cm_id, &conn_param);
2884     if (ret) {
2885         error_report("rdma_accept returns %d", ret);
2886         goto err_rdma_dest_wait;
2887     }
2888
2889     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2890     if (ret) {
2891         error_report("rdma_accept get_cm_event failed %d", ret);
2892         goto err_rdma_dest_wait;
2893     }
2894
2895     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2896         error_report("rdma_accept not event established");
2897         rdma_ack_cm_event(cm_event);
2898         goto err_rdma_dest_wait;
2899     }
2900
2901     rdma_ack_cm_event(cm_event);
2902     rdma->connected = true;
2903
2904     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2905     if (ret) {
2906         error_report("rdma migration: error posting second control recv");
2907         goto err_rdma_dest_wait;
2908     }
2909
2910     qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
2911
2912     return 0;
2913
2914 err_rdma_dest_wait:
2915     rdma->error_state = ret;
2916     qemu_rdma_cleanup(rdma);
2917     return ret;
2918 }
2919
2920 /*
2921  * During each iteration of the migration, we listen for instructions
2922  * by the source VM to perform dynamic page registrations before they
2923  * can perform RDMA operations.
2924  *
2925  * We respond with the 'rkey'.
2926  *
2927  * Keep doing this until the source tells us to stop.
2928  */
2929 static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque)
2930 {
2931     RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
2932                                .type = RDMA_CONTROL_REGISTER_RESULT,
2933                                .repeat = 0,
2934                              };
2935     RDMAControlHeader unreg_resp = { .len = 0,
2936                                .type = RDMA_CONTROL_UNREGISTER_FINISHED,
2937                                .repeat = 0,
2938                              };
2939     RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
2940                                  .repeat = 1 };
2941     QEMUFileRDMA *rfile = opaque;
2942     RDMAContext *rdma = rfile->rdma;
2943     RDMALocalBlocks *local = &rdma->local_ram_blocks;
2944     RDMAControlHeader head;
2945     RDMARegister *reg, *registers;
2946     RDMACompress *comp;
2947     RDMARegisterResult *reg_result;
2948     static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
2949     RDMALocalBlock *block;
2950     void *host_addr;
2951     int ret = 0;
2952     int idx = 0;
2953     int count = 0;
2954     int i = 0;
2955
2956     CHECK_ERROR_STATE();
2957
2958     do {
2959         trace_qemu_rdma_registration_handle_wait();
2960
2961         ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
2962
2963         if (ret < 0) {
2964             break;
2965         }
2966
2967         if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
2968             error_report("rdma: Too many requests in this message (%d)."
2969                             "Bailing.", head.repeat);
2970             ret = -EIO;
2971             break;
2972         }
2973
2974         switch (head.type) {
2975         case RDMA_CONTROL_COMPRESS:
2976             comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
2977             network_to_compress(comp);
2978
2979             trace_qemu_rdma_registration_handle_compress(comp->length,
2980                                                          comp->block_idx,
2981                                                          comp->offset);
2982             block = &(rdma->local_ram_blocks.block[comp->block_idx]);
2983
2984             host_addr = block->local_host_addr +
2985                             (comp->offset - block->offset);
2986
2987             ram_handle_compressed(host_addr, comp->value, comp->length);
2988             break;
2989
2990         case RDMA_CONTROL_REGISTER_FINISHED:
2991             trace_qemu_rdma_registration_handle_finished();
2992             goto out;
2993
2994         case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
2995             trace_qemu_rdma_registration_handle_ram_blocks();
2996
2997             if (rdma->pin_all) {
2998                 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
2999                 if (ret) {
3000                     error_report("rdma migration: error dest "
3001                                     "registering ram blocks");
3002                     goto out;
3003                 }
3004             }
3005
3006             /*
3007              * Dest uses this to prepare to transmit the RAMBlock descriptions
3008              * to the source VM after connection setup.
3009              * Both sides use the "remote" structure to communicate and update
3010              * their "local" descriptions with what was sent.
3011              */
3012             for (i = 0; i < local->nb_blocks; i++) {
3013                 rdma->dest_blocks[i].remote_host_addr =
3014                     (uintptr_t)(local->block[i].local_host_addr);
3015
3016                 if (rdma->pin_all) {
3017                     rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
3018                 }
3019
3020                 rdma->dest_blocks[i].offset = local->block[i].offset;
3021                 rdma->dest_blocks[i].length = local->block[i].length;
3022
3023                 dest_block_to_network(&rdma->dest_blocks[i]);
3024             }
3025
3026             blocks.len = rdma->local_ram_blocks.nb_blocks
3027                                                 * sizeof(RDMADestBlock);
3028
3029
3030             ret = qemu_rdma_post_send_control(rdma,
3031                                         (uint8_t *) rdma->dest_blocks, &blocks);
3032
3033             if (ret < 0) {
3034                 error_report("rdma migration: error sending remote info");
3035                 goto out;
3036             }
3037
3038             break;
3039         case RDMA_CONTROL_REGISTER_REQUEST:
3040             trace_qemu_rdma_registration_handle_register(head.repeat);
3041
3042             reg_resp.repeat = head.repeat;
3043             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3044
3045             for (count = 0; count < head.repeat; count++) {
3046                 uint64_t chunk;
3047                 uint8_t *chunk_start, *chunk_end;
3048
3049                 reg = &registers[count];
3050                 network_to_register(reg);
3051
3052                 reg_result = &results[count];
3053
3054                 trace_qemu_rdma_registration_handle_register_loop(count,
3055                          reg->current_index, reg->key.current_addr, reg->chunks);
3056
3057                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3058                 if (block->is_ram_block) {
3059                     host_addr = (block->local_host_addr +
3060                                 (reg->key.current_addr - block->offset));
3061                     chunk = ram_chunk_index(block->local_host_addr,
3062                                             (uint8_t *) host_addr);
3063                 } else {
3064                     chunk = reg->key.chunk;
3065                     host_addr = block->local_host_addr +
3066                         (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
3067                 }
3068                 chunk_start = ram_chunk_start(block, chunk);
3069                 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3070                 if (qemu_rdma_register_and_get_keys(rdma, block,
3071                             (uintptr_t)host_addr, NULL, &reg_result->rkey,
3072                             chunk, chunk_start, chunk_end)) {
3073                     error_report("cannot get rkey");
3074                     ret = -EINVAL;
3075                     goto out;
3076                 }
3077
3078                 reg_result->host_addr = (uintptr_t)block->local_host_addr;
3079
3080                 trace_qemu_rdma_registration_handle_register_rkey(
3081                                                            reg_result->rkey);
3082
3083                 result_to_network(reg_result);
3084             }
3085
3086             ret = qemu_rdma_post_send_control(rdma,
3087                             (uint8_t *) results, &reg_resp);
3088
3089             if (ret < 0) {
3090                 error_report("Failed to send control buffer");
3091                 goto out;
3092             }
3093             break;
3094         case RDMA_CONTROL_UNREGISTER_REQUEST:
3095             trace_qemu_rdma_registration_handle_unregister(head.repeat);
3096             unreg_resp.repeat = head.repeat;
3097             registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3098
3099             for (count = 0; count < head.repeat; count++) {
3100                 reg = &registers[count];
3101                 network_to_register(reg);
3102
3103                 trace_qemu_rdma_registration_handle_unregister_loop(count,
3104                            reg->current_index, reg->key.chunk);
3105
3106                 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3107
3108                 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3109                 block->pmr[reg->key.chunk] = NULL;
3110
3111                 if (ret != 0) {
3112                     perror("rdma unregistration chunk failed");
3113                     ret = -ret;
3114                     goto out;
3115                 }
3116
3117                 rdma->total_registrations--;
3118
3119                 trace_qemu_rdma_registration_handle_unregister_success(
3120                                                        reg->key.chunk);
3121             }
3122
3123             ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);
3124
3125             if (ret < 0) {
3126                 error_report("Failed to send control buffer");
3127                 goto out;
3128             }
3129             break;
3130         case RDMA_CONTROL_REGISTER_RESULT:
3131             error_report("Invalid RESULT message at dest.");
3132             ret = -EIO;
3133             goto out;
3134         default:
3135             error_report("Unknown control message %s", control_desc[head.type]);
3136             ret = -EIO;
3137             goto out;
3138         }
3139     } while (1);
3140 out:
3141     if (ret < 0) {
3142         rdma->error_state = ret;
3143     }
3144     return ret;
3145 }
3146
3147 static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data)
3148 {
3149     switch (flags) {
3150     case RAM_CONTROL_BLOCK_REG:
3151         /* TODO A later patch */
3152         return 0;
3153         break;
3154
3155     case RAM_CONTROL_HOOK:
3156         return qemu_rdma_registration_handle(f, opaque);
3157
3158     default:
3159         /* Shouldn't be called with any other values */
3160         abort();
3161     }
3162 }
3163
3164 static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3165                                         uint64_t flags, void *data)
3166 {
3167     QEMUFileRDMA *rfile = opaque;
3168     RDMAContext *rdma = rfile->rdma;
3169
3170     CHECK_ERROR_STATE();
3171
3172     trace_qemu_rdma_registration_start(flags);
3173     qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
3174     qemu_fflush(f);
3175
3176     return 0;
3177 }
3178
3179 /*
3180  * Inform dest that dynamic registrations are done for now.
3181  * First, flush writes, if any.
3182  */
3183 static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3184                                        uint64_t flags, void *data)
3185 {
3186     Error *local_err = NULL, **errp = &local_err;
3187     QEMUFileRDMA *rfile = opaque;
3188     RDMAContext *rdma = rfile->rdma;
3189     RDMAControlHeader head = { .len = 0, .repeat = 1 };
3190     int ret = 0;
3191
3192     CHECK_ERROR_STATE();
3193
3194     qemu_fflush(f);
3195     ret = qemu_rdma_drain_cq(f, rdma);
3196
3197     if (ret < 0) {
3198         goto err;
3199     }
3200
3201     if (flags == RAM_CONTROL_SETUP) {
3202         RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3203         RDMALocalBlocks *local = &rdma->local_ram_blocks;
3204         int reg_result_idx, i, j, nb_dest_blocks;
3205
3206         head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3207         trace_qemu_rdma_registration_stop_ram();
3208
3209         /*
3210          * Make sure that we parallelize the pinning on both sides.
3211          * For very large guests, doing this serially takes a really
3212          * long time, so we have to 'interleave' the pinning locally
3213          * with the control messages by performing the pinning on this
3214          * side before we receive the control response from the other
3215          * side that the pinning has completed.
3216          */
3217         ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3218                     &reg_result_idx, rdma->pin_all ?
3219                     qemu_rdma_reg_whole_ram_blocks : NULL);
3220         if (ret < 0) {
3221             ERROR(errp, "receiving remote info!");
3222             return ret;
3223         }
3224
3225         nb_dest_blocks = resp.len / sizeof(RDMADestBlock);
3226
3227         /*
3228          * The protocol uses two different sets of rkeys (mutually exclusive):
3229          * 1. One key to represent the virtual address of the entire ram block.
3230          *    (dynamic chunk registration disabled - pin everything with one rkey.)
3231          * 2. One to represent individual chunks within a ram block.
3232          *    (dynamic chunk registration enabled - pin individual chunks.)
3233          *
3234          * Once the capability is successfully negotiated, the destination transmits
3235          * the keys to use (or sends them later) including the virtual addresses
3236          * and then propagates the remote ram block descriptions to his local copy.
3237          */
3238
3239         if (local->nb_blocks != nb_dest_blocks) {
3240             ERROR(errp, "ram blocks mismatch #1! "
3241                         "Your QEMU command line parameters are probably "
3242                         "not identical on both the source and destination.");
3243             return -EINVAL;
3244         }
3245
3246         qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3247         memcpy(rdma->dest_blocks,
3248             rdma->wr_data[reg_result_idx].control_curr, resp.len);
3249         for (i = 0; i < nb_dest_blocks; i++) {
3250             network_to_dest_block(&rdma->dest_blocks[i]);
3251
3252             /* search local ram blocks */
3253             for (j = 0; j < local->nb_blocks; j++) {
3254                 if (rdma->dest_blocks[i].offset != local->block[j].offset) {
3255                     continue;
3256                 }
3257
3258                 if (rdma->dest_blocks[i].length != local->block[j].length) {
3259                     ERROR(errp, "ram blocks mismatch #2! "
3260                         "Your QEMU command line parameters are probably "
3261                         "not identical on both the source and destination.");
3262                     return -EINVAL;
3263                 }
3264                 local->block[j].remote_host_addr =
3265                         rdma->dest_blocks[i].remote_host_addr;
3266                 local->block[j].remote_rkey = rdma->dest_blocks[i].remote_rkey;
3267                 break;
3268             }
3269
3270             if (j >= local->nb_blocks) {
3271                 ERROR(errp, "ram blocks mismatch #3! "
3272                         "Your QEMU command line parameters are probably "
3273                         "not identical on both the source and destination.");
3274                 return -EINVAL;
3275             }
3276         }
3277     }
3278
3279     trace_qemu_rdma_registration_stop(flags);
3280
3281     head.type = RDMA_CONTROL_REGISTER_FINISHED;
3282     ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3283
3284     if (ret < 0) {
3285         goto err;
3286     }
3287
3288     return 0;
3289 err:
3290     rdma->error_state = ret;
3291     return ret;
3292 }
3293
3294 static int qemu_rdma_get_fd(void *opaque)
3295 {
3296     QEMUFileRDMA *rfile = opaque;
3297     RDMAContext *rdma = rfile->rdma;
3298
3299     return rdma->comp_channel->fd;
3300 }
3301
3302 static const QEMUFileOps rdma_read_ops = {
3303     .get_buffer    = qemu_rdma_get_buffer,
3304     .get_fd        = qemu_rdma_get_fd,
3305     .close         = qemu_rdma_close,
3306     .hook_ram_load = rdma_load_hook,
3307 };
3308
3309 static const QEMUFileOps rdma_write_ops = {
3310     .put_buffer         = qemu_rdma_put_buffer,
3311     .close              = qemu_rdma_close,
3312     .before_ram_iterate = qemu_rdma_registration_start,
3313     .after_ram_iterate  = qemu_rdma_registration_stop,
3314     .save_page          = qemu_rdma_save_page,
3315 };
3316
3317 static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3318 {
3319     QEMUFileRDMA *r;
3320
3321     if (qemu_file_mode_is_not_valid(mode)) {
3322         return NULL;
3323     }
3324
3325     r = g_malloc0(sizeof(QEMUFileRDMA));
3326     r->rdma = rdma;
3327
3328     if (mode[0] == 'w') {
3329         r->file = qemu_fopen_ops(r, &rdma_write_ops);
3330     } else {
3331         r->file = qemu_fopen_ops(r, &rdma_read_ops);
3332     }
3333
3334     return r->file;
3335 }
3336
3337 static void rdma_accept_incoming_migration(void *opaque)
3338 {
3339     RDMAContext *rdma = opaque;
3340     int ret;
3341     QEMUFile *f;
3342     Error *local_err = NULL, **errp = &local_err;
3343
3344     trace_qemu_rdma_accept_incoming_migration();
3345     ret = qemu_rdma_accept(rdma);
3346
3347     if (ret) {
3348         ERROR(errp, "RDMA Migration initialization failed!");
3349         return;
3350     }
3351
3352     trace_qemu_rdma_accept_incoming_migration_accepted();
3353
3354     f = qemu_fopen_rdma(rdma, "rb");
3355     if (f == NULL) {
3356         ERROR(errp, "could not qemu_fopen_rdma!");
3357         qemu_rdma_cleanup(rdma);
3358         return;
3359     }
3360
3361     rdma->migration_started_on_destination = 1;
3362     process_incoming_migration(f);
3363 }
3364
3365 void rdma_start_incoming_migration(const char *host_port, Error **errp)
3366 {
3367     int ret;
3368     RDMAContext *rdma;
3369     Error *local_err = NULL;
3370
3371     trace_rdma_start_incoming_migration();
3372     rdma = qemu_rdma_data_init(host_port, &local_err);
3373
3374     if (rdma == NULL) {
3375         goto err;
3376     }
3377
3378     ret = qemu_rdma_dest_init(rdma, &local_err);
3379
3380     if (ret) {
3381         goto err;
3382     }
3383
3384     trace_rdma_start_incoming_migration_after_dest_init();
3385
3386     ret = rdma_listen(rdma->listen_id, 5);
3387
3388     if (ret) {
3389         ERROR(errp, "listening on socket!");
3390         goto err;
3391     }
3392
3393     trace_rdma_start_incoming_migration_after_rdma_listen();
3394
3395     qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration,
3396                         NULL, (void *)(intptr_t)rdma);
3397     return;
3398 err:
3399     error_propagate(errp, local_err);
3400     g_free(rdma);
3401 }
3402
3403 void rdma_start_outgoing_migration(void *opaque,
3404                             const char *host_port, Error **errp)
3405 {
3406     MigrationState *s = opaque;
3407     Error *local_err = NULL, **temp = &local_err;
3408     RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
3409     int ret = 0;
3410
3411     if (rdma == NULL) {
3412         ERROR(temp, "Failed to initialize RDMA data structures! %d", ret);
3413         goto err;
3414     }
3415
3416     ret = qemu_rdma_source_init(rdma, &local_err,
3417         s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]);
3418
3419     if (ret) {
3420         goto err;
3421     }
3422
3423     trace_rdma_start_outgoing_migration_after_rdma_source_init();
3424     ret = qemu_rdma_connect(rdma, &local_err);
3425
3426     if (ret) {
3427         goto err;
3428     }
3429
3430     trace_rdma_start_outgoing_migration_after_rdma_connect();
3431
3432     s->file = qemu_fopen_rdma(rdma, "wb");
3433     migrate_fd_connect(s);
3434     return;
3435 err:
3436     error_propagate(errp, local_err);
3437     g_free(rdma);
3438     migrate_fd_error(s);
3439 }