block/nvme.c

   1 /*
   2  * NVMe block driver based on vfio
   3  *
   4  * Copyright 2016 - 2018 Red Hat, Inc.
   5  *
   6  * Authors:
   7  *   Fam Zheng <famz@redhat.com>
   8  *   Paolo Bonzini <pbonzini@redhat.com>
   9  *
  10  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  11  * See the COPYING file in the top-level directory.
  12  */
  13
  14 #include "qemu/osdep.h"
  15 #include <linux/vfio.h>
  16 #include "qapi/error.h"
  17 #include "qapi/qmp/qdict.h"
  18 #include "qapi/qmp/qstring.h"
  19 #include "qemu/error-report.h"
  20 #include "qemu/main-loop.h"
  21 #include "qemu/module.h"
  22 #include "qemu/cutils.h"
  23 #include "qemu/option.h"
  24 #include "qemu/memalign.h"
  25 #include "qemu/vfio-helpers.h"
  26 #include "block/block_int.h"
  27 #include "sysemu/replay.h"
  28 #include "trace.h"
  29
  30 #include "block/nvme.h"
  31
/* Size in bytes of one submission queue entry (used to size and index SQs) */
#define NVME_SQ_ENTRY_BYTES 64
/* Size in bytes of one completion queue entry (used to size and index CQs) */
#define NVME_CQ_ENTRY_BYTES 16
/* Number of entries in each submission and completion queue */
#define NVME_QUEUE_SIZE 128
/* Bytes of doorbell registers mapped right after the NvmeBar registers */
#define NVME_DOORBELL_SIZE 4096

/*
 * We have to leave one slot empty as that is the full queue case where
 * head == tail + 1.
 */
#define NVME_NUM_REQS (NVME_QUEUE_SIZE - 1)
  42
  43 typedef struct BDRVNVMeState BDRVNVMeState;
  44
  45 /* Same index is used for queues and IRQs */
  46 #define INDEX_ADMIN     0
  47 #define INDEX_IO(n)     (1 + n)
  48
/*
 * This driver shares a single MSIX IRQ for the admin and I/O queues.
 *
 * MSIX_SHARED_IRQ_IDX is the slot of that IRQ in the irq_notifier[] array and
 * MSIX_IRQ_COUNT is the size of that array.
 */
enum {
    MSIX_SHARED_IRQ_IDX = 0,
    MSIX_IRQ_COUNT = 1
};
  54
/* One ring (submission or completion) together with its doorbell register. */
typedef struct {
    /* Ring indices; both are reset to 0 by nvme_init_queue() */
    int32_t  head, tail;
    /* Host buffer holding the ring entries */
    uint8_t  *queue;
    /* I/O virtual address that @queue was DMA-mapped at */
    uint64_t iova;
    /* Hardware MMIO register */
    volatile uint32_t *doorbell;
} NVMeQueue;
  62
/* Bookkeeping for one command slot of a queue pair. */
typedef struct {
    /* Completion callback and its argument; cb is NULL while the slot is idle */
    BlockCompletionFunc *cb;
    void *opaque;
    /* Command identifier; set to the q->reqs[] index + 1 at creation time */
    int cid;
    /* This request's slice of the queue pair's PRP list buffer (host address) */
    void *prp_list_page;
    /* I/O virtual address corresponding to @prp_list_page */
    uint64_t prp_list_iova;
    int free_req_next; /* q->reqs[] index of next free req */
} NVMeRequest;
  71
/* A submission queue, its completion queue and the requests in flight on them. */
typedef struct {
    QemuMutex   lock;

    /* Read from I/O code path, initialized under BQL */
    BDRVNVMeState   *s;
    int             index;

    /* Fields protected by BQL */
    /* One buffer of NVME_NUM_REQS PRP list pages, sliced up among reqs[] */
    uint8_t     *prp_list_pages;

    /* Fields protected by @lock */
    /* Coroutines waiting for a free request slot */
    CoQueue     free_req_queue;
    NVMeQueue   sq, cq;
    /* Phase bit value that marks a CQ entry as not yet posted by the device */
    int         cq_phase;
    /* reqs[] index of the first free request, -1 if there is none */
    int         free_req_head;
    NVMeRequest reqs[NVME_NUM_REQS];
    /* Commands copied into the SQ but not yet announced via the doorbell */
    int         need_kick;
    /* Commands announced to the device whose completion is still pending */
    int         inflight;

    /* Thread-safe, no lock necessary */
    QEMUBH      *completion_bh;
} NVMeQueuePair;
  94
/* Per-BlockDriverState data of the NVMe driver (bs->opaque). */
struct BDRVNVMeState {
    AioContext *aio_context;
    QEMUVFIOState *vfio;
    /* PROT_WRITE mapping of BAR 0 covering NvmeBar plus the doorbell area */
    void *bar0_wo_map;
    /* Memory mapped registers */
    volatile struct {
        uint32_t sq_tail;
        uint32_t cq_head;
    } *doorbells;
    /* The submission/completion queue pairs.
     * [0]: admin queue.
     * [1..]: io queues.
     */
    NVMeQueuePair **queues;
    /* Number of valid entries in @queues */
    unsigned queue_count;
    /* Device memory page size, derived from CAP.MPSMIN in nvme_init() */
    size_t page_size;
    /* How many uint32_t elements does each doorbell entry take. */
    size_t doorbell_scale;
    /* Set from the identify controller data in nvme_identify() */
    bool write_cache_supported;
    EventNotifier irq_notifier[MSIX_IRQ_COUNT];

    uint64_t nsze; /* Namespace size reported by identify command */
    int nsid;      /* The namespace id to read/write data. */
    /* log2 of the namespace block size, taken from the active LBA format */
    int blkshift;

    /* Largest transfer in bytes, computed in nvme_identify() */
    uint64_t max_transfer;
    /* While set, nvme_kick() and nvme_process_completion() return early */
    bool plugged;

    /* Derived from the controller's ONCS field in nvme_identify() */
    bool supports_write_zeroes;
    bool supports_discard;

    CoMutex dma_map_lock;
    CoQueue dma_flush_queue;

    /* Total size of mapped qiov, accessed under dma_map_lock */
    int dma_map_count;

    /* PCI address (required for nvme_refresh_filename()) */
    char *device;

    struct {
        /* Incremented for each completion that translates to an error */
        uint64_t completion_errors;
        uint64_t aligned_accesses;
        uint64_t unaligned_accesses;
    } stats;
};
 141
/* Names of the runtime options accepted by this driver */
#define NVME_BLOCK_OPT_DEVICE "device"
#define NVME_BLOCK_OPT_NAMESPACE "namespace"

/* Defined further down; needed by nvme_create_queue_pair() for the BH */
static void nvme_process_completion_bh(void *opaque);
 146
/*
 * Runtime options of the "nvme" driver: the PCI address of the controller
 * (string) and the namespace to use (number).
 */
static QemuOptsList runtime_opts = {
    .name = "nvme",
    .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
    .desc = {
        {
            .name = NVME_BLOCK_OPT_DEVICE,
            .type = QEMU_OPT_STRING,
            .help = "NVMe PCI device address",
        },
        {
            .name = NVME_BLOCK_OPT_NAMESPACE,
            .type = QEMU_OPT_NUMBER,
            .help = "NVMe namespace",
        },
        { /* end of list */ }
    },
};
 164
 165 /* Returns true on success, false on failure. */
 166 static bool nvme_init_queue(BDRVNVMeState *s, NVMeQueue *q,
 167                             unsigned nentries, size_t entry_bytes, Error **errp)
 168 {
 169     size_t bytes;
 170     int r;
 171
 172     bytes = ROUND_UP(nentries * entry_bytes, qemu_real_host_page_size());
 173     q->head = q->tail = 0;
 174     q->queue = qemu_try_memalign(qemu_real_host_page_size(), bytes);
 175     if (!q->queue) {
 176         error_setg(errp, "Cannot allocate queue");
 177         return false;
 178     }
 179     memset(q->queue, 0, bytes);
 180     r = qemu_vfio_dma_map(s->vfio, q->queue, bytes, false, &q->iova, errp);
 181     if (r) {
 182         error_prepend(errp, "Cannot map queue: ");
 183     }
 184     return r == 0;
 185 }
 186
 187 static void nvme_free_queue(NVMeQueue *q)
 188 {
 189     qemu_vfree(q->queue);
 190 }
 191
 192 static void nvme_free_queue_pair(NVMeQueuePair *q)
 193 {
 194     trace_nvme_free_queue_pair(q->index, q, &q->cq, &q->sq);
 195     if (q->completion_bh) {
 196         qemu_bh_delete(q->completion_bh);
 197     }
 198     nvme_free_queue(&q->sq);
 199     nvme_free_queue(&q->cq);
 200     qemu_vfree(q->prp_list_pages);
 201     qemu_mutex_destroy(&q->lock);
 202     g_free(q);
 203 }
 204
 205 static void nvme_free_req_queue_cb(void *opaque)
 206 {
 207     NVMeQueuePair *q = opaque;
 208
 209     qemu_mutex_lock(&q->lock);
 210     while (q->free_req_head != -1 &&
 211            qemu_co_enter_next(&q->free_req_queue, &q->lock)) {
 212         /* Retry waiting requests */
 213     }
 214     qemu_mutex_unlock(&q->lock);
 215 }
 216
 217 static NVMeQueuePair *nvme_create_queue_pair(BDRVNVMeState *s,
 218                                              AioContext *aio_context,
 219                                              unsigned idx, size_t size,
 220                                              Error **errp)
 221 {
 222     int i, r;
 223     NVMeQueuePair *q;
 224     uint64_t prp_list_iova;
 225     size_t bytes;
 226
 227     q = g_try_new0(NVMeQueuePair, 1);
 228     if (!q) {
 229         error_setg(errp, "Cannot allocate queue pair");
 230         return NULL;
 231     }
 232     trace_nvme_create_queue_pair(idx, q, size, aio_context,
 233                                  event_notifier_get_fd(s->irq_notifier));
 234     bytes = QEMU_ALIGN_UP(s->page_size * NVME_NUM_REQS,
 235                           qemu_real_host_page_size());
 236     q->prp_list_pages = qemu_try_memalign(qemu_real_host_page_size(), bytes);
 237     if (!q->prp_list_pages) {
 238         error_setg(errp, "Cannot allocate PRP page list");
 239         goto fail;
 240     }
 241     memset(q->prp_list_pages, 0, bytes);
 242     qemu_mutex_init(&q->lock);
 243     q->s = s;
 244     q->index = idx;
 245     qemu_co_queue_init(&q->free_req_queue);
 246     q->completion_bh = aio_bh_new(aio_context, nvme_process_completion_bh, q);
 247     r = qemu_vfio_dma_map(s->vfio, q->prp_list_pages, bytes,
 248                           false, &prp_list_iova, errp);
 249     if (r) {
 250         error_prepend(errp, "Cannot map buffer for DMA: ");
 251         goto fail;
 252     }
 253     q->free_req_head = -1;
 254     for (i = 0; i < NVME_NUM_REQS; i++) {
 255         NVMeRequest *req = &q->reqs[i];
 256         req->cid = i + 1;
 257         req->free_req_next = q->free_req_head;
 258         q->free_req_head = i;
 259         req->prp_list_page = q->prp_list_pages + i * s->page_size;
 260         req->prp_list_iova = prp_list_iova + i * s->page_size;
 261     }
 262
 263     if (!nvme_init_queue(s, &q->sq, size, NVME_SQ_ENTRY_BYTES, errp)) {
 264         goto fail;
 265     }
 266     q->sq.doorbell = &s->doorbells[idx * s->doorbell_scale].sq_tail;
 267
 268     if (!nvme_init_queue(s, &q->cq, size, NVME_CQ_ENTRY_BYTES, errp)) {
 269         goto fail;
 270     }
 271     q->cq.doorbell = &s->doorbells[idx * s->doorbell_scale].cq_head;
 272
 273     return q;
 274 fail:
 275     nvme_free_queue_pair(q);
 276     return NULL;
 277 }
 278
 279 /* With q->lock */
 280 static void nvme_kick(NVMeQueuePair *q)
 281 {
 282     BDRVNVMeState *s = q->s;
 283
 284     if (s->plugged || !q->need_kick) {
 285         return;
 286     }
 287     trace_nvme_kick(s, q->index);
 288     assert(!(q->sq.tail & 0xFF00));
 289     /* Fence the write to submission queue entry before notifying the device. */
 290     smp_wmb();
 291     *q->sq.doorbell = cpu_to_le32(q->sq.tail);
 292     q->inflight += q->need_kick;
 293     q->need_kick = 0;
 294 }
 295
 296 /* Find a free request element if any, otherwise:
 297  * a) if in coroutine context, try to wait for one to become available;
 298  * b) if not in coroutine, return NULL;
 299  */
 300 static NVMeRequest *nvme_get_free_req(NVMeQueuePair *q)
 301 {
 302     NVMeRequest *req;
 303
 304     qemu_mutex_lock(&q->lock);
 305
 306     while (q->free_req_head == -1) {
 307         if (qemu_in_coroutine()) {
 308             trace_nvme_free_req_queue_wait(q->s, q->index);
 309             qemu_co_queue_wait(&q->free_req_queue, &q->lock);
 310         } else {
 311             qemu_mutex_unlock(&q->lock);
 312             return NULL;
 313         }
 314     }
 315
 316     req = &q->reqs[q->free_req_head];
 317     q->free_req_head = req->free_req_next;
 318     req->free_req_next = -1;
 319
 320     qemu_mutex_unlock(&q->lock);
 321     return req;
 322 }
 323
 324 /* With q->lock */
 325 static void nvme_put_free_req_locked(NVMeQueuePair *q, NVMeRequest *req)
 326 {
 327     req->free_req_next = q->free_req_head;
 328     q->free_req_head = req - q->reqs;
 329 }
 330
 331 /* With q->lock */
 332 static void nvme_wake_free_req_locked(NVMeQueuePair *q)
 333 {
 334     if (!qemu_co_queue_empty(&q->free_req_queue)) {
 335         replay_bh_schedule_oneshot_event(q->s->aio_context,
 336                 nvme_free_req_queue_cb, q);
 337     }
 338 }
 339
/*
 * Insert a request in the freelist and wake waiters.
 *
 * Takes q->lock itself, so it must be called without the lock held.
 */
static void nvme_put_free_req_and_wake(NVMeQueuePair *q, NVMeRequest *req)
{
    qemu_mutex_lock(&q->lock);
    nvme_put_free_req_locked(q, req);
    nvme_wake_free_req_locked(q);
    qemu_mutex_unlock(&q->lock);
}
 348
 349 static inline int nvme_translate_error(const NvmeCqe *c)
 350 {
 351     uint16_t status = (le16_to_cpu(c->status) >> 1) & 0xFF;
 352     if (status) {
 353         trace_nvme_error(le32_to_cpu(c->result),
 354                          le16_to_cpu(c->sq_head),
 355                          le16_to_cpu(c->sq_id),
 356                          le16_to_cpu(c->cid),
 357                          le16_to_cpu(status));
 358     }
 359     switch (status) {
 360     case 0:
 361         return 0;
 362     case 1:
 363         return -ENOSYS;
 364     case 2:
 365         return -EINVAL;
 366     default:
 367         return -EIO;
 368     }
 369 }
 370
 371 /* With q->lock */
 372 static bool nvme_process_completion(NVMeQueuePair *q)
 373 {
 374     BDRVNVMeState *s = q->s;
 375     bool progress = false;
 376     NVMeRequest *preq;
 377     NVMeRequest req;
 378     NvmeCqe *c;
 379
 380     trace_nvme_process_completion(s, q->index, q->inflight);
 381     if (s->plugged) {
 382         trace_nvme_process_completion_queue_plugged(s, q->index);
 383         return false;
 384     }
 385
 386     /*
 387      * Support re-entrancy when a request cb() function invokes aio_poll().
 388      * Pending completions must be visible to aio_poll() so that a cb()
 389      * function can wait for the completion of another request.
 390      *
 391      * The aio_poll() loop will execute our BH and we'll resume completion
 392      * processing there.
 393      */
 394     qemu_bh_schedule(q->completion_bh);
 395
 396     assert(q->inflight >= 0);
 397     while (q->inflight) {
 398         int ret;
 399         int16_t cid;
 400
 401         c = (NvmeCqe *)&q->cq.queue[q->cq.head * NVME_CQ_ENTRY_BYTES];
 402         if ((le16_to_cpu(c->status) & 0x1) == q->cq_phase) {
 403             break;
 404         }
 405         ret = nvme_translate_error(c);
 406         if (ret) {
 407             s->stats.completion_errors++;
 408         }
 409         q->cq.head = (q->cq.head + 1) % NVME_QUEUE_SIZE;
 410         if (!q->cq.head) {
 411             q->cq_phase = !q->cq_phase;
 412         }
 413         cid = le16_to_cpu(c->cid);
 414         if (cid == 0 || cid > NVME_QUEUE_SIZE) {
 415             warn_report("NVMe: Unexpected CID in completion queue: %"PRIu32", "
 416                         "queue size: %u", cid, NVME_QUEUE_SIZE);
 417             continue;
 418         }
 419         trace_nvme_complete_command(s, q->index, cid);
 420         preq = &q->reqs[cid - 1];
 421         req = *preq;
 422         assert(req.cid == cid);
 423         assert(req.cb);
 424         nvme_put_free_req_locked(q, preq);
 425         preq->cb = preq->opaque = NULL;
 426         q->inflight--;
 427         qemu_mutex_unlock(&q->lock);
 428         req.cb(req.opaque, ret);
 429         qemu_mutex_lock(&q->lock);
 430         progress = true;
 431     }
 432     if (progress) {
 433         /* Notify the device so it can post more completions. */
 434         smp_mb_release();
 435         *q->cq.doorbell = cpu_to_le32(q->cq.head);
 436         nvme_wake_free_req_locked(q);
 437     }
 438
 439     qemu_bh_cancel(q->completion_bh);
 440
 441     return progress;
 442 }
 443
/*
 * Bottom half created per queue pair in nvme_create_queue_pair() and
 * scheduled at the start of nvme_process_completion().  It updates the
 * completion queue head doorbell, wakes coroutines waiting for a free
 * request and then continues completion processing.
 *
 * NOTE(review): q->lock is not taken in this function although both callees
 * are annotated "With q->lock" -- confirm the intended locking.
 */
static void nvme_process_completion_bh(void *opaque)
{
    NVMeQueuePair *q = opaque;

    /*
     * We're being invoked because a nvme_process_completion() cb() function
     * called aio_poll(). The callback may be waiting for further completions
     * so notify the device that it has space to fill in more completions now.
     */
    smp_mb_release();
    *q->cq.doorbell = cpu_to_le32(q->cq.head);
    nvme_wake_free_req_locked(q);

    nvme_process_completion(q);
}
 459
 460 static void nvme_trace_command(const NvmeCmd *cmd)
 461 {
 462     int i;
 463
 464     if (!trace_event_get_state_backends(TRACE_NVME_SUBMIT_COMMAND_RAW)) {
 465         return;
 466     }
 467     for (i = 0; i < 8; ++i) {
 468         uint8_t *cmdp = (uint8_t *)cmd + i * 8;
 469         trace_nvme_submit_command_raw(cmdp[0], cmdp[1], cmdp[2], cmdp[3],
 470                                       cmdp[4], cmdp[5], cmdp[6], cmdp[7]);
 471     }
 472 }
 473
 474 static void nvme_submit_command(NVMeQueuePair *q, NVMeRequest *req,
 475                                 NvmeCmd *cmd, BlockCompletionFunc cb,
 476                                 void *opaque)
 477 {
 478     assert(!req->cb);
 479     req->cb = cb;
 480     req->opaque = opaque;
 481     cmd->cid = cpu_to_le16(req->cid);
 482
 483     trace_nvme_submit_command(q->s, q->index, req->cid);
 484     nvme_trace_command(cmd);
 485     qemu_mutex_lock(&q->lock);
 486     memcpy((uint8_t *)q->sq.queue +
 487            q->sq.tail * NVME_SQ_ENTRY_BYTES, cmd, sizeof(*cmd));
 488     q->sq.tail = (q->sq.tail + 1) % NVME_QUEUE_SIZE;
 489     q->need_kick++;
 490     nvme_kick(q);
 491     nvme_process_completion(q);
 492     qemu_mutex_unlock(&q->lock);
 493 }
 494
 495 static void nvme_admin_cmd_sync_cb(void *opaque, int ret)
 496 {
 497     int *pret = opaque;
 498     *pret = ret;
 499     aio_wait_kick();
 500 }
 501
 502 static int nvme_admin_cmd_sync(BlockDriverState *bs, NvmeCmd *cmd)
 503 {
 504     BDRVNVMeState *s = bs->opaque;
 505     NVMeQueuePair *q = s->queues[INDEX_ADMIN];
 506     AioContext *aio_context = bdrv_get_aio_context(bs);
 507     NVMeRequest *req;
 508     int ret = -EINPROGRESS;
 509     req = nvme_get_free_req(q);
 510     if (!req) {
 511         return -EBUSY;
 512     }
 513     nvme_submit_command(q, req, cmd, nvme_admin_cmd_sync_cb, &ret);
 514
 515     AIO_WAIT_WHILE(aio_context, ret == -EINPROGRESS);
 516     return ret;
 517 }
 518
 519 /* Returns true on success, false on failure. */
 520 static bool nvme_identify(BlockDriverState *bs, int namespace, Error **errp)
 521 {
 522     BDRVNVMeState *s = bs->opaque;
 523     bool ret = false;
 524     QEMU_AUTO_VFREE union {
 525         NvmeIdCtrl ctrl;
 526         NvmeIdNs ns;
 527     } *id = NULL;
 528     NvmeLBAF *lbaf;
 529     uint16_t oncs;
 530     int r;
 531     uint64_t iova;
 532     NvmeCmd cmd = {
 533         .opcode = NVME_ADM_CMD_IDENTIFY,
 534         .cdw10 = cpu_to_le32(0x1),
 535     };
 536     size_t id_size = QEMU_ALIGN_UP(sizeof(*id), qemu_real_host_page_size());
 537
 538     id = qemu_try_memalign(qemu_real_host_page_size(), id_size);
 539     if (!id) {
 540         error_setg(errp, "Cannot allocate buffer for identify response");
 541         goto out;
 542     }
 543     r = qemu_vfio_dma_map(s->vfio, id, id_size, true, &iova, errp);
 544     if (r) {
 545         error_prepend(errp, "Cannot map buffer for DMA: ");
 546         goto out;
 547     }
 548
 549     memset(id, 0, id_size);
 550     cmd.dptr.prp1 = cpu_to_le64(iova);
 551     if (nvme_admin_cmd_sync(bs, &cmd)) {
 552         error_setg(errp, "Failed to identify controller");
 553         goto out;
 554     }
 555
 556     if (le32_to_cpu(id->ctrl.nn) < namespace) {
 557         error_setg(errp, "Invalid namespace");
 558         goto out;
 559     }
 560     s->write_cache_supported = le32_to_cpu(id->ctrl.vwc) & 0x1;
 561     s->max_transfer = (id->ctrl.mdts ? 1 << id->ctrl.mdts : 0) * s->page_size;
 562     /* For now the page list buffer per command is one page, to hold at most
 563      * s->page_size / sizeof(uint64_t) entries. */
 564     s->max_transfer = MIN_NON_ZERO(s->max_transfer,
 565                           s->page_size / sizeof(uint64_t) * s->page_size);
 566
 567     oncs = le16_to_cpu(id->ctrl.oncs);
 568     s->supports_write_zeroes = !!(oncs & NVME_ONCS_WRITE_ZEROES);
 569     s->supports_discard = !!(oncs & NVME_ONCS_DSM);
 570
 571     memset(id, 0, id_size);
 572     cmd.cdw10 = 0;
 573     cmd.nsid = cpu_to_le32(namespace);
 574     if (nvme_admin_cmd_sync(bs, &cmd)) {
 575         error_setg(errp, "Failed to identify namespace");
 576         goto out;
 577     }
 578
 579     s->nsze = le64_to_cpu(id->ns.nsze);
 580     lbaf = &id->ns.lbaf[NVME_ID_NS_FLBAS_INDEX(id->ns.flbas)];
 581
 582     if (NVME_ID_NS_DLFEAT_WRITE_ZEROES(id->ns.dlfeat) &&
 583             NVME_ID_NS_DLFEAT_READ_BEHAVIOR(id->ns.dlfeat) ==
 584                     NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES) {
 585         bs->supported_write_flags |= BDRV_REQ_MAY_UNMAP;
 586     }
 587
 588     if (lbaf->ms) {
 589         error_setg(errp, "Namespaces with metadata are not yet supported");
 590         goto out;
 591     }
 592
 593     if (lbaf->ds < BDRV_SECTOR_BITS || lbaf->ds > 12 ||
 594         (1 << lbaf->ds) > s->page_size)
 595     {
 596         error_setg(errp, "Namespace has unsupported block size (2^%d)",
 597                    lbaf->ds);
 598         goto out;
 599     }
 600
 601     ret = true;
 602     s->blkshift = lbaf->ds;
 603 out:
 604     qemu_vfio_dma_unmap(s->vfio, id);
 605
 606     return ret;
 607 }
 608
 609 static void nvme_poll_queue(NVMeQueuePair *q)
 610 {
 611     const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
 612     NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
 613
 614     trace_nvme_poll_queue(q->s, q->index);
 615     /*
 616      * Do an early check for completions. q->lock isn't needed because
 617      * nvme_process_completion() only runs in the event loop thread and
 618      * cannot race with itself.
 619      */
 620     if ((le16_to_cpu(cqe->status) & 0x1) == q->cq_phase) {
 621         return;
 622     }
 623
 624     qemu_mutex_lock(&q->lock);
 625     while (nvme_process_completion(q)) {
 626         /* Keep polling */
 627     }
 628     qemu_mutex_unlock(&q->lock);
 629 }
 630
 631 static void nvme_poll_queues(BDRVNVMeState *s)
 632 {
 633     int i;
 634
 635     for (i = 0; i < s->queue_count; i++) {
 636         nvme_poll_queue(s->queues[i]);
 637     }
 638 }
 639
 640 static void nvme_handle_event(EventNotifier *n)
 641 {
 642     BDRVNVMeState *s = container_of(n, BDRVNVMeState,
 643                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
 644
 645     trace_nvme_handle_event(s);
 646     event_notifier_test_and_clear(n);
 647     nvme_poll_queues(s);
 648 }
 649
 650 static bool nvme_add_io_queue(BlockDriverState *bs, Error **errp)
 651 {
 652     BDRVNVMeState *s = bs->opaque;
 653     unsigned n = s->queue_count;
 654     NVMeQueuePair *q;
 655     NvmeCmd cmd;
 656     unsigned queue_size = NVME_QUEUE_SIZE;
 657
 658     assert(n <= UINT16_MAX);
 659     q = nvme_create_queue_pair(s, bdrv_get_aio_context(bs),
 660                                n, queue_size, errp);
 661     if (!q) {
 662         return false;
 663     }
 664     cmd = (NvmeCmd) {
 665         .opcode = NVME_ADM_CMD_CREATE_CQ,
 666         .dptr.prp1 = cpu_to_le64(q->cq.iova),
 667         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
 668         .cdw11 = cpu_to_le32(NVME_CQ_IEN | NVME_CQ_PC),
 669     };
 670     if (nvme_admin_cmd_sync(bs, &cmd)) {
 671         error_setg(errp, "Failed to create CQ io queue [%u]", n);
 672         goto out_error;
 673     }
 674     cmd = (NvmeCmd) {
 675         .opcode = NVME_ADM_CMD_CREATE_SQ,
 676         .dptr.prp1 = cpu_to_le64(q->sq.iova),
 677         .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | n),
 678         .cdw11 = cpu_to_le32(NVME_SQ_PC | (n << 16)),
 679     };
 680     if (nvme_admin_cmd_sync(bs, &cmd)) {
 681         error_setg(errp, "Failed to create SQ io queue [%u]", n);
 682         goto out_error;
 683     }
 684     s->queues = g_renew(NVMeQueuePair *, s->queues, n + 1);
 685     s->queues[n] = q;
 686     s->queue_count++;
 687     return true;
 688 out_error:
 689     nvme_free_queue_pair(q);
 690     return false;
 691 }
 692
 693 static bool nvme_poll_cb(void *opaque)
 694 {
 695     EventNotifier *e = opaque;
 696     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
 697                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
 698     int i;
 699
 700     for (i = 0; i < s->queue_count; i++) {
 701         NVMeQueuePair *q = s->queues[i];
 702         const size_t cqe_offset = q->cq.head * NVME_CQ_ENTRY_BYTES;
 703         NvmeCqe *cqe = (NvmeCqe *)&q->cq.queue[cqe_offset];
 704
 705         /*
 706          * q->lock isn't needed because nvme_process_completion() only runs in
 707          * the event loop thread and cannot race with itself.
 708          */
 709         if ((le16_to_cpu(cqe->status) & 0x1) != q->cq_phase) {
 710             return true;
 711         }
 712     }
 713     return false;
 714 }
 715
 716 static void nvme_poll_ready(EventNotifier *e)
 717 {
 718     BDRVNVMeState *s = container_of(e, BDRVNVMeState,
 719                                     irq_notifier[MSIX_SHARED_IRQ_IDX]);
 720
 721     nvme_poll_queues(s);
 722 }
 723
 724 static int nvme_init(BlockDriverState *bs, const char *device, int namespace,
 725                      Error **errp)
 726 {
 727     BDRVNVMeState *s = bs->opaque;
 728     NVMeQueuePair *q;
 729     AioContext *aio_context = bdrv_get_aio_context(bs);
 730     int ret;
 731     uint64_t cap;
 732     uint32_t ver;
 733     uint64_t timeout_ms;
 734     uint64_t deadline, now;
 735     volatile NvmeBar *regs = NULL;
 736
 737     qemu_co_mutex_init(&s->dma_map_lock);
 738     qemu_co_queue_init(&s->dma_flush_queue);
 739     s->device = g_strdup(device);
 740     s->nsid = namespace;
 741     s->aio_context = bdrv_get_aio_context(bs);
 742     ret = event_notifier_init(&s->irq_notifier[MSIX_SHARED_IRQ_IDX], 0);
 743     if (ret) {
 744         error_setg(errp, "Failed to init event notifier");
 745         return ret;
 746     }
 747
 748     s->vfio = qemu_vfio_open_pci(device, errp);
 749     if (!s->vfio) {
 750         ret = -EINVAL;
 751         goto out;
 752     }
 753
 754     regs = qemu_vfio_pci_map_bar(s->vfio, 0, 0, sizeof(NvmeBar),
 755                                  PROT_READ | PROT_WRITE, errp);
 756     if (!regs) {
 757         ret = -EINVAL;
 758         goto out;
 759     }
 760     /* Perform initialize sequence as described in NVMe spec "7.6.1
 761      * Initialization". */
 762
 763     cap = le64_to_cpu(regs->cap);
 764     trace_nvme_controller_capability_raw(cap);
 765     trace_nvme_controller_capability("Maximum Queue Entries Supported",
 766                                      1 + NVME_CAP_MQES(cap));
 767     trace_nvme_controller_capability("Contiguous Queues Required",
 768                                      NVME_CAP_CQR(cap));
 769     trace_nvme_controller_capability("Doorbell Stride",
 770                                      1 << (2 + NVME_CAP_DSTRD(cap)));
 771     trace_nvme_controller_capability("Subsystem Reset Supported",
 772                                      NVME_CAP_NSSRS(cap));
 773     trace_nvme_controller_capability("Memory Page Size Minimum",
 774                                      1 << (12 + NVME_CAP_MPSMIN(cap)));
 775     trace_nvme_controller_capability("Memory Page Size Maximum",
 776                                      1 << (12 + NVME_CAP_MPSMAX(cap)));
 777     if (!NVME_CAP_CSS(cap)) {
 778         error_setg(errp, "Device doesn't support NVMe command set");
 779         ret = -EINVAL;
 780         goto out;
 781     }
 782
 783     s->page_size = 1u << (12 + NVME_CAP_MPSMIN(cap));
 784     s->doorbell_scale = (4 << NVME_CAP_DSTRD(cap)) / sizeof(uint32_t);
 785     bs->bl.opt_mem_alignment = s->page_size;
 786     bs->bl.request_alignment = s->page_size;
 787     timeout_ms = MIN(500 * NVME_CAP_TO(cap), 30000);
 788
 789     ver = le32_to_cpu(regs->vs);
 790     trace_nvme_controller_spec_version(extract32(ver, 16, 16),
 791                                        extract32(ver, 8, 8),
 792                                        extract32(ver, 0, 8));
 793
 794     /* Reset device to get a clean state. */
 795     regs->cc = cpu_to_le32(le32_to_cpu(regs->cc) & 0xFE);
 796     /* Wait for CSTS.RDY = 0. */
 797     deadline = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ms * SCALE_MS;
 798     while (NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
 799         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
 800             error_setg(errp, "Timeout while waiting for device to reset (%"
 801                              PRId64 " ms)",
 802                        timeout_ms);
 803             ret = -ETIMEDOUT;
 804             goto out;
 805         }
 806     }
 807
 808     s->bar0_wo_map = qemu_vfio_pci_map_bar(s->vfio, 0, 0,
 809                                            sizeof(NvmeBar) + NVME_DOORBELL_SIZE,
 810                                            PROT_WRITE, errp);
 811     s->doorbells = (void *)((uintptr_t)s->bar0_wo_map + sizeof(NvmeBar));
 812     if (!s->doorbells) {
 813         ret = -EINVAL;
 814         goto out;
 815     }
 816
 817     /* Set up admin queue. */
 818     s->queues = g_new(NVMeQueuePair *, 1);
 819     q = nvme_create_queue_pair(s, aio_context, 0, NVME_QUEUE_SIZE, errp);
 820     if (!q) {
 821         ret = -EINVAL;
 822         goto out;
 823     }
 824     s->queues[INDEX_ADMIN] = q;
 825     s->queue_count = 1;
 826     QEMU_BUILD_BUG_ON((NVME_QUEUE_SIZE - 1) & 0xF000);
 827     regs->aqa = cpu_to_le32(((NVME_QUEUE_SIZE - 1) << AQA_ACQS_SHIFT) |
 828                             ((NVME_QUEUE_SIZE - 1) << AQA_ASQS_SHIFT));
 829     regs->asq = cpu_to_le64(q->sq.iova);
 830     regs->acq = cpu_to_le64(q->cq.iova);
 831
 832     /* After setting up all control registers we can enable device now. */
 833     regs->cc = cpu_to_le32((ctz32(NVME_CQ_ENTRY_BYTES) << CC_IOCQES_SHIFT) |
 834                            (ctz32(NVME_SQ_ENTRY_BYTES) << CC_IOSQES_SHIFT) |
 835                            CC_EN_MASK);
 836     /* Wait for CSTS.RDY = 1. */
 837     now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 838     deadline = now + timeout_ms * SCALE_MS;
 839     while (!NVME_CSTS_RDY(le32_to_cpu(regs->csts))) {
 840         if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) > deadline) {
 841             error_setg(errp, "Timeout while waiting for device to start (%"
 842                              PRId64 " ms)",
 843                        timeout_ms);
 844             ret = -ETIMEDOUT;
 845             goto out;
 846         }
 847     }
 848
 849     ret = qemu_vfio_pci_init_irq(s->vfio, s->irq_notifier,
 850                                  VFIO_PCI_MSIX_IRQ_INDEX, errp);
 851     if (ret) {
 852         goto out;
 853     }
 854     aio_set_event_notifier(bdrv_get_aio_context(bs),
 855                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
 856                            false, nvme_handle_event, nvme_poll_cb,
 857                            nvme_poll_ready);
 858
 859     if (!nvme_identify(bs, namespace, errp)) {
 860         ret = -EIO;
 861         goto out;
 862     }
 863
 864     /* Set up command queues. */
 865     if (!nvme_add_io_queue(bs, errp)) {
 866         ret = -EIO;
 867     }
 868 out:
 869     if (regs) {
 870         qemu_vfio_pci_unmap_bar(s->vfio, 0, (void *)regs, 0, sizeof(NvmeBar));
 871     }
 872
 873     /* Cleaning up is done in nvme_file_open() upon error. */
 874     return ret;
 875 }
 876
 877 /* Parse a filename in the format of nvme://XXXX:XX:XX.X/X. Example:
 878  *
 879  *     nvme://0000:44:00.0/1
 880  *
 881  * where the "nvme://" is a fixed form of the protocol prefix, the middle part
 882  * is the PCI address, and the last part is the namespace number starting from
 883  * 1 according to the NVMe spec. */
 884 static void nvme_parse_filename(const char *filename, QDict *options,
 885                                 Error **errp)
 886 {
 887     int pref = strlen("nvme://");
 888
 889     if (strlen(filename) > pref && !strncmp(filename, "nvme://", pref)) {
 890         const char *tmp = filename + pref;
 891         char *device;
 892         const char *namespace;
 893         unsigned long ns;
 894         const char *slash = strchr(tmp, '/');
 895         if (!slash) {
 896             qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, tmp);
 897             return;
 898         }
 899         device = g_strndup(tmp, slash - tmp);
 900         qdict_put_str(options, NVME_BLOCK_OPT_DEVICE, device);
 901         g_free(device);
 902         namespace = slash + 1;
 903         if (*namespace && qemu_strtoul(namespace, NULL, 10, &ns)) {
 904             error_setg(errp, "Invalid namespace '%s', positive number expected",
 905                        namespace);
 906             return;
 907         }
 908         qdict_put_str(options, NVME_BLOCK_OPT_NAMESPACE,
 909                       *namespace ? namespace : "1");
 910     }
 911 }
 912
 913 static int nvme_enable_disable_write_cache(BlockDriverState *bs, bool enable,
 914                                            Error **errp)
 915 {
 916     int ret;
 917     BDRVNVMeState *s = bs->opaque;
 918     NvmeCmd cmd = {
 919         .opcode = NVME_ADM_CMD_SET_FEATURES,
 920         .nsid = cpu_to_le32(s->nsid),
 921         .cdw10 = cpu_to_le32(0x06),
 922         .cdw11 = cpu_to_le32(enable ? 0x01 : 0x00),
 923     };
 924
 925     ret = nvme_admin_cmd_sync(bs, &cmd);
 926     if (ret) {
 927         error_setg(errp, "Failed to configure NVMe write cache");
 928     }
 929     return ret;
 930 }
 931
 932 static void nvme_close(BlockDriverState *bs)
 933 {
 934     BDRVNVMeState *s = bs->opaque;
 935
 936     for (unsigned i = 0; i < s->queue_count; ++i) {
 937         nvme_free_queue_pair(s->queues[i]);
 938     }
 939     g_free(s->queues);
 940     aio_set_event_notifier(bdrv_get_aio_context(bs),
 941                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
 942                            false, NULL, NULL, NULL);
 943     event_notifier_cleanup(&s->irq_notifier[MSIX_SHARED_IRQ_IDX]);
 944     qemu_vfio_pci_unmap_bar(s->vfio, 0, s->bar0_wo_map,
 945                             0, sizeof(NvmeBar) + NVME_DOORBELL_SIZE);
 946     qemu_vfio_close(s->vfio);
 947
 948     g_free(s->device);
 949 }
 950
 951 static int nvme_file_open(BlockDriverState *bs, QDict *options, int flags,
 952                           Error **errp)
 953 {
 954     const char *device;
 955     QemuOpts *opts;
 956     int namespace;
 957     int ret;
 958     BDRVNVMeState *s = bs->opaque;
 959
 960     bs->supported_write_flags = BDRV_REQ_FUA;
 961
 962     opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
 963     qemu_opts_absorb_qdict(opts, options, &error_abort);
 964     device = qemu_opt_get(opts, NVME_BLOCK_OPT_DEVICE);
 965     if (!device) {
 966         error_setg(errp, "'" NVME_BLOCK_OPT_DEVICE "' option is required");
 967         qemu_opts_del(opts);
 968         return -EINVAL;
 969     }
 970
 971     namespace = qemu_opt_get_number(opts, NVME_BLOCK_OPT_NAMESPACE, 1);
 972     ret = nvme_init(bs, device, namespace, errp);
 973     qemu_opts_del(opts);
 974     if (ret) {
 975         goto fail;
 976     }
 977     if (flags & BDRV_O_NOCACHE) {
 978         if (!s->write_cache_supported) {
 979             error_setg(errp,
 980                        "NVMe controller doesn't support write cache configuration");
 981             ret = -EINVAL;
 982         } else {
 983             ret = nvme_enable_disable_write_cache(bs, !(flags & BDRV_O_NOCACHE),
 984                                                   errp);
 985         }
 986         if (ret) {
 987             goto fail;
 988         }
 989     }
 990     return 0;
 991 fail:
 992     nvme_close(bs);
 993     return ret;
 994 }
 995
 996 static int64_t nvme_getlength(BlockDriverState *bs)
 997 {
 998     BDRVNVMeState *s = bs->opaque;
 999     return s->nsze << s->blkshift;
1000 }
1001
1002 static uint32_t nvme_get_blocksize(BlockDriverState *bs)
1003 {
1004     BDRVNVMeState *s = bs->opaque;
1005     assert(s->blkshift >= BDRV_SECTOR_BITS && s->blkshift <= 12);
1006     return UINT32_C(1) << s->blkshift;
1007 }
1008
1009 static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
1010 {
1011     uint32_t blocksize = nvme_get_blocksize(bs);
1012     bsz->phys = blocksize;
1013     bsz->log = blocksize;
1014     return 0;
1015 }
1016
1017 /* Called with s->dma_map_lock */
1018 static coroutine_fn int nvme_cmd_unmap_qiov(BlockDriverState *bs,
1019                                             QEMUIOVector *qiov)
1020 {
1021     int r = 0;
1022     BDRVNVMeState *s = bs->opaque;
1023
1024     s->dma_map_count -= qiov->size;
1025     if (!s->dma_map_count && !qemu_co_queue_empty(&s->dma_flush_queue)) {
1026         r = qemu_vfio_dma_reset_temporary(s->vfio);
1027         if (!r) {
1028             qemu_co_queue_restart_all(&s->dma_flush_queue);
1029         }
1030     }
1031     return r;
1032 }
1033
/*
 * Map every element of @qiov for DMA and fill in the PRP fields of @cmd.
 *
 * Each iovec is mapped with qemu_vfio_dma_map() as a temporary mapping, and
 * one little-endian IOVA per s->page_size chunk is appended to the request's
 * PRP list page (@req->prp_list_page).  cmd->dptr.prp1/prp2 are then set
 * from that list: one entry uses prp1 only, two entries use prp1 and prp2
 * directly, and more entries make prp2 point at the second slot of the list
 * page.
 *
 * A mapping that fails with -ENOMEM (or -ENOSPC, which is folded into
 * -ENOMEM) is retried once per iovec after the temporary mappings have been
 * recycled.
 *
 * Returns 0 on success, after adding qiov->size to s->dma_map_count.  On
 * failure the negative error of the failing call is returned and
 * s->dma_map_count is left unchanged.
 *
 * Called with s->dma_map_lock
 */
static coroutine_fn int nvme_cmd_map_qiov(BlockDriverState *bs, NvmeCmd *cmd,
                                          NVMeRequest *req, QEMUIOVector *qiov)
{
    BDRVNVMeState *s = bs->opaque;
    uint64_t *pagelist = req->prp_list_page;
    int i, j, r;
    int entries = 0;
    /*
     * errp stays NULL for the first mapping attempt, so only an error from
     * the retry is collected in local_err and reported.
     */
    Error *local_err = NULL, **errp = NULL;

    assert(qiov->size);
    assert(QEMU_IS_ALIGNED(qiov->size, s->page_size));
    /* All PRP entries must fit into the single PRP list page. */
    assert(qiov->size / s->page_size <= s->page_size / sizeof(uint64_t));
    for (i = 0; i < qiov->niov; ++i) {
        bool retry = true;
        uint64_t iova;
        size_t len = QEMU_ALIGN_UP(qiov->iov[i].iov_len,
                                   qemu_real_host_page_size());
try_map:
        r = qemu_vfio_dma_map(s->vfio,
                              qiov->iov[i].iov_base,
                              len, true, &iova, errp);
        if (r == -ENOSPC) {
            /*
             * In addition to the -ENOMEM error, the VFIO_IOMMU_MAP_DMA
             * ioctl returns -ENOSPC to signal the user exhausted the DMA
             * mappings available for a container since Linux kernel commit
             * 492855939bdb ("vfio/type1: Limit DMA mappings per container",
             * April 2019, see CVE-2019-3882).
             *
             * This block driver already handles this error path by checking
             * for the -ENOMEM error, so we directly replace -ENOSPC by
             * -ENOMEM. Besides, -ENOSPC has a specific meaning for blockdev
             * coroutines: it triggers BLOCKDEV_ON_ERROR_ENOSPC and
             * BLOCK_ERROR_ACTION_STOP which stops the VM, asking the operator
             * to add more storage to the blockdev. Not something we can do
             * easily with an IOMMU :)
             */
            r = -ENOMEM;
        }
        if (r == -ENOMEM && retry) {
            /*
             * We exhausted the DMA mappings available for our container:
             * recycle the volatile IOVA mappings.
             */
            retry = false;
            trace_nvme_dma_flush_queue_wait(s);
            if (s->dma_map_count) {
                /*
                 * Other requests still account for mapped bytes: wait on the
                 * flush queue, which nvme_cmd_unmap_qiov() restarts after it
                 * has reset the temporary mappings.
                 */
                trace_nvme_dma_map_flush(s);
                qemu_co_queue_wait(&s->dma_flush_queue, &s->dma_map_lock);
            } else {
                /* Nothing is accounted as mapped: reset right away. */
                r = qemu_vfio_dma_reset_temporary(s->vfio);
                if (r) {
                    goto fail;
                }
            }
            errp = &local_err;

            goto try_map;
        }
        if (r) {
            goto fail;
        }

        /* One PRP entry for each s->page_size chunk of this iovec. */
        for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
            pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
        }
        trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
                                    qiov->iov[i].iov_len / s->page_size);
    }

    s->dma_map_count += qiov->size;

    assert(entries <= s->page_size / sizeof(uint64_t));
    switch (entries) {
    case 0:
        abort();
    case 1:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = 0;
        break;
    case 2:
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = pagelist[1];
        break;
    default:
        /*
         * prp1 carries the first entry itself, so prp2 points at the list
         * page starting from its second entry.
         */
        cmd->dptr.prp1 = pagelist[0];
        cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
        break;
    }
    trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
    for (i = 0; i < entries; ++i) {
        trace_nvme_cmd_map_qiov_pages(s, i, pagelist[i]);
    }
    return 0;
fail:
    /* No need to unmap [0 - i) iovs even if we've failed, since we don't
     * increment s->dma_map_count. This is okay for fixed mapping memory areas
     * because they are already mapped before calling this function; for
     * temporary mappings, a later nvme_cmd_(un)map_qiov will reclaim by
     * calling qemu_vfio_dma_reset_temporary when necessary. */
    if (local_err) {
        error_reportf_err(local_err, "Cannot map buffer for DMA: ");
    }
    return r;
}
1140
/*
 * Per-command completion state shared between a submitting coroutine and
 * nvme_rw_cb().  The submitters in this file keep it on their own stack.
 */
typedef struct {
    /*
     * Coroutine to re-enter on completion.  NULL until the submitter has
     * issued the command, which tells nvme_rw_cb() not to wake anything.
     */
    Coroutine *co;
    /* Command result; submitters preset it to -EINPROGRESS. */
    int ret;
    /* AioContext in which the wake-up bottom half is scheduled. */
    AioContext *ctx;
} NVMeCoData;
1146
1147 static void nvme_rw_cb_bh(void *opaque)
1148 {
1149     NVMeCoData *data = opaque;
1150     qemu_coroutine_enter(data->co);
1151 }
1152
1153 static void nvme_rw_cb(void *opaque, int ret)
1154 {
1155     NVMeCoData *data = opaque;
1156     data->ret = ret;
1157     if (!data->co) {
1158         /* The rw coroutine hasn't yielded, don't try to enter. */
1159         return;
1160     }
1161     replay_bh_schedule_oneshot_event(data->ctx, nvme_rw_cb_bh, data);
1162 }
1163
/*
 * Issue one NVMe read or write command for @qiov on the first I/O queue and
 * wait in the calling coroutine until it completes.
 *
 * The buffers of @qiov are mapped for DMA with nvme_cmd_map_qiov() before
 * submission and released with nvme_cmd_unmap_qiov() afterwards.
 *
 * Returns the error from mapping or unmapping if either fails, otherwise the
 * status stored by the completion callback.
 */
static coroutine_fn int nvme_co_prw_aligned(BlockDriverState *bs,
                                            uint64_t offset, uint64_t bytes,
                                            QEMUIOVector *qiov,
                                            bool is_write,
                                            int flags)
{
    int r;
    BDRVNVMeState *s = bs->opaque;
    NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
    NVMeRequest *req;

    /*
     * Low 16 bits: block count minus one.  Bit 30 is set when the caller
     * asked for BDRV_REQ_FUA.
     */
    uint32_t cdw12 = (((bytes >> s->blkshift) - 1) & 0xFFFF) |
                       (flags & BDRV_REQ_FUA ? 1 << 30 : 0);
    /* cdw10/cdw11 carry the low/high 32 bits of the starting block. */
    NvmeCmd cmd = {
        .opcode = is_write ? NVME_CMD_WRITE : NVME_CMD_READ,
        .nsid = cpu_to_le32(s->nsid),
        .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
        .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
        .cdw12 = cpu_to_le32(cdw12),
    };
    NVMeCoData data = {
        .ctx = bdrv_get_aio_context(bs),
        .ret = -EINPROGRESS,
    };

    trace_nvme_prw_aligned(s, is_write, offset, bytes, flags, qiov->niov);
    assert(s->queue_count > 1);
    req = nvme_get_free_req(ioq);
    assert(req);

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_map_qiov(bs, &cmd, req, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        /* The command was never submitted: hand the request slot back. */
        nvme_put_free_req_and_wake(ioq, req);
        return r;
    }
    nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);

    /*
     * data.co is set only after submission: while it is still NULL,
     * nvme_rw_cb() just records the result and does not try to re-enter
     * this coroutine.
     */
    data.co = qemu_coroutine_self();
    while (data.ret == -EINPROGRESS) {
        qemu_coroutine_yield();
    }

    qemu_co_mutex_lock(&s->dma_map_lock);
    r = nvme_cmd_unmap_qiov(bs, qiov);
    qemu_co_mutex_unlock(&s->dma_map_lock);
    if (r) {
        return r;
    }

    trace_nvme_rw_done(s, is_write, offset, bytes, data.ret);
    return data.ret;
}
1218
1219 static inline bool nvme_qiov_aligned(BlockDriverState *bs,
1220                                      const QEMUIOVector *qiov)
1221 {
1222     int i;
1223     BDRVNVMeState *s = bs->opaque;
1224
1225     for (i = 0; i < qiov->niov; ++i) {
1226         if (!QEMU_PTR_IS_ALIGNED(qiov->iov[i].iov_base,
1227                                  qemu_real_host_page_size()) ||
1228             !QEMU_IS_ALIGNED(qiov->iov[i].iov_len, qemu_real_host_page_size())) {
1229             trace_nvme_qiov_unaligned(qiov, i, qiov->iov[i].iov_base,
1230                                       qiov->iov[i].iov_len, s->page_size);
1231             return false;
1232         }
1233     }
1234     return true;
1235 }
1236
1237 static int nvme_co_prw(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
1238                        QEMUIOVector *qiov, bool is_write, int flags)
1239 {
1240     BDRVNVMeState *s = bs->opaque;
1241     int r;
1242     QEMU_AUTO_VFREE uint8_t *buf = NULL;
1243     QEMUIOVector local_qiov;
1244     size_t len = QEMU_ALIGN_UP(bytes, qemu_real_host_page_size());
1245     assert(QEMU_IS_ALIGNED(offset, s->page_size));
1246     assert(QEMU_IS_ALIGNED(bytes, s->page_size));
1247     assert(bytes <= s->max_transfer);
1248     if (nvme_qiov_aligned(bs, qiov)) {
1249         s->stats.aligned_accesses++;
1250         return nvme_co_prw_aligned(bs, offset, bytes, qiov, is_write, flags);
1251     }
1252     s->stats.unaligned_accesses++;
1253     trace_nvme_prw_buffered(s, offset, bytes, qiov->niov, is_write);
1254     buf = qemu_try_memalign(qemu_real_host_page_size(), len);
1255
1256     if (!buf) {
1257         return -ENOMEM;
1258     }
1259     qemu_iovec_init(&local_qiov, 1);
1260     if (is_write) {
1261         qemu_iovec_to_buf(qiov, 0, buf, bytes);
1262     }
1263     qemu_iovec_add(&local_qiov, buf, bytes);
1264     r = nvme_co_prw_aligned(bs, offset, bytes, &local_qiov, is_write, flags);
1265     qemu_iovec_destroy(&local_qiov);
1266     if (!r && !is_write) {
1267         qemu_iovec_from_buf(qiov, 0, buf, bytes);
1268     }
1269     return r;
1270 }
1271
1272 static coroutine_fn int nvme_co_preadv(BlockDriverState *bs,
1273                                        int64_t offset, int64_t bytes,
1274                                        QEMUIOVector *qiov,
1275                                        BdrvRequestFlags flags)
1276 {
1277     return nvme_co_prw(bs, offset, bytes, qiov, false, flags);
1278 }
1279
1280 static coroutine_fn int nvme_co_pwritev(BlockDriverState *bs,
1281                                         int64_t offset, int64_t bytes,
1282                                         QEMUIOVector *qiov,
1283                                         BdrvRequestFlags flags)
1284 {
1285     return nvme_co_prw(bs, offset, bytes, qiov, true, flags);
1286 }
1287
1288 static coroutine_fn int nvme_co_flush(BlockDriverState *bs)
1289 {
1290     BDRVNVMeState *s = bs->opaque;
1291     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1292     NVMeRequest *req;
1293     NvmeCmd cmd = {
1294         .opcode = NVME_CMD_FLUSH,
1295         .nsid = cpu_to_le32(s->nsid),
1296     };
1297     NVMeCoData data = {
1298         .ctx = bdrv_get_aio_context(bs),
1299         .ret = -EINPROGRESS,
1300     };
1301
1302     assert(s->queue_count > 1);
1303     req = nvme_get_free_req(ioq);
1304     assert(req);
1305     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1306
1307     data.co = qemu_coroutine_self();
1308     if (data.ret == -EINPROGRESS) {
1309         qemu_coroutine_yield();
1310     }
1311
1312     return data.ret;
1313 }
1314
1315
1316 static coroutine_fn int nvme_co_pwrite_zeroes(BlockDriverState *bs,
1317                                               int64_t offset,
1318                                               int64_t bytes,
1319                                               BdrvRequestFlags flags)
1320 {
1321     BDRVNVMeState *s = bs->opaque;
1322     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1323     NVMeRequest *req;
1324     uint32_t cdw12;
1325
1326     if (!s->supports_write_zeroes) {
1327         return -ENOTSUP;
1328     }
1329
1330     if (bytes == 0) {
1331         return 0;
1332     }
1333
1334     cdw12 = ((bytes >> s->blkshift) - 1) & 0xFFFF;
1335     /*
1336      * We should not lose information. pwrite_zeroes_alignment and
1337      * max_pwrite_zeroes guarantees it.
1338      */
1339     assert(((cdw12 + 1) << s->blkshift) == bytes);
1340
1341     NvmeCmd cmd = {
1342         .opcode = NVME_CMD_WRITE_ZEROES,
1343         .nsid = cpu_to_le32(s->nsid),
1344         .cdw10 = cpu_to_le32((offset >> s->blkshift) & 0xFFFFFFFF),
1345         .cdw11 = cpu_to_le32(((offset >> s->blkshift) >> 32) & 0xFFFFFFFF),
1346     };
1347
1348     NVMeCoData data = {
1349         .ctx = bdrv_get_aio_context(bs),
1350         .ret = -EINPROGRESS,
1351     };
1352
1353     if (flags & BDRV_REQ_MAY_UNMAP) {
1354         cdw12 |= (1 << 25);
1355     }
1356
1357     if (flags & BDRV_REQ_FUA) {
1358         cdw12 |= (1 << 30);
1359     }
1360
1361     cmd.cdw12 = cpu_to_le32(cdw12);
1362
1363     trace_nvme_write_zeroes(s, offset, bytes, flags);
1364     assert(s->queue_count > 1);
1365     req = nvme_get_free_req(ioq);
1366     assert(req);
1367
1368     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1369
1370     data.co = qemu_coroutine_self();
1371     while (data.ret == -EINPROGRESS) {
1372         qemu_coroutine_yield();
1373     }
1374
1375     trace_nvme_rw_done(s, true, offset, bytes, data.ret);
1376     return data.ret;
1377 }
1378
1379
1380 static int coroutine_fn nvme_co_pdiscard(BlockDriverState *bs,
1381                                          int64_t offset,
1382                                          int64_t bytes)
1383 {
1384     BDRVNVMeState *s = bs->opaque;
1385     NVMeQueuePair *ioq = s->queues[INDEX_IO(0)];
1386     NVMeRequest *req;
1387     QEMU_AUTO_VFREE NvmeDsmRange *buf = NULL;
1388     QEMUIOVector local_qiov;
1389     int ret;
1390
1391     NvmeCmd cmd = {
1392         .opcode = NVME_CMD_DSM,
1393         .nsid = cpu_to_le32(s->nsid),
1394         .cdw10 = cpu_to_le32(0), /*number of ranges - 0 based*/
1395         .cdw11 = cpu_to_le32(1 << 2), /*deallocate bit*/
1396     };
1397
1398     NVMeCoData data = {
1399         .ctx = bdrv_get_aio_context(bs),
1400         .ret = -EINPROGRESS,
1401     };
1402
1403     if (!s->supports_discard) {
1404         return -ENOTSUP;
1405     }
1406
1407     assert(s->queue_count > 1);
1408
1409     /*
1410      * Filling the @buf requires @offset and @bytes to satisfy restrictions
1411      * defined in nvme_refresh_limits().
1412      */
1413     assert(QEMU_IS_ALIGNED(bytes, 1UL << s->blkshift));
1414     assert(QEMU_IS_ALIGNED(offset, 1UL << s->blkshift));
1415     assert((bytes >> s->blkshift) <= UINT32_MAX);
1416
1417     buf = qemu_try_memalign(s->page_size, s->page_size);
1418     if (!buf) {
1419         return -ENOMEM;
1420     }
1421     memset(buf, 0, s->page_size);
1422     buf->nlb = cpu_to_le32(bytes >> s->blkshift);
1423     buf->slba = cpu_to_le64(offset >> s->blkshift);
1424     buf->cattr = 0;
1425
1426     qemu_iovec_init(&local_qiov, 1);
1427     qemu_iovec_add(&local_qiov, buf, 4096);
1428
1429     req = nvme_get_free_req(ioq);
1430     assert(req);
1431
1432     qemu_co_mutex_lock(&s->dma_map_lock);
1433     ret = nvme_cmd_map_qiov(bs, &cmd, req, &local_qiov);
1434     qemu_co_mutex_unlock(&s->dma_map_lock);
1435
1436     if (ret) {
1437         nvme_put_free_req_and_wake(ioq, req);
1438         goto out;
1439     }
1440
1441     trace_nvme_dsm(s, offset, bytes);
1442
1443     nvme_submit_command(ioq, req, &cmd, nvme_rw_cb, &data);
1444
1445     data.co = qemu_coroutine_self();
1446     while (data.ret == -EINPROGRESS) {
1447         qemu_coroutine_yield();
1448     }
1449
1450     qemu_co_mutex_lock(&s->dma_map_lock);
1451     ret = nvme_cmd_unmap_qiov(bs, &local_qiov);
1452     qemu_co_mutex_unlock(&s->dma_map_lock);
1453
1454     if (ret) {
1455         goto out;
1456     }
1457
1458     ret = data.ret;
1459     trace_nvme_dsm_done(s, offset, bytes, ret);
1460 out:
1461     qemu_iovec_destroy(&local_qiov);
1462     return ret;
1463
1464 }
1465
1466 static int coroutine_fn nvme_co_truncate(BlockDriverState *bs, int64_t offset,
1467                                          bool exact, PreallocMode prealloc,
1468                                          BdrvRequestFlags flags, Error **errp)
1469 {
1470     int64_t cur_length;
1471
1472     if (prealloc != PREALLOC_MODE_OFF) {
1473         error_setg(errp, "Unsupported preallocation mode '%s'",
1474                    PreallocMode_str(prealloc));
1475         return -ENOTSUP;
1476     }
1477
1478     cur_length = nvme_getlength(bs);
1479     if (offset != cur_length && exact) {
1480         error_setg(errp, "Cannot resize NVMe devices");
1481         return -ENOTSUP;
1482     } else if (offset > cur_length) {
1483         error_setg(errp, "Cannot grow NVMe devices");
1484         return -EINVAL;
1485     }
1486
1487     return 0;
1488 }
1489
/*
 * Reopen callback: accepts every request.  None of the arguments is
 * inspected and 0 is always returned.
 */
static int nvme_reopen_prepare(BDRVReopenState *reopen_state,
                               BlockReopenQueue *queue, Error **errp)
{
    return 0;
}
1495
1496 static void nvme_refresh_filename(BlockDriverState *bs)
1497 {
1498     BDRVNVMeState *s = bs->opaque;
1499
1500     snprintf(bs->exact_filename, sizeof(bs->exact_filename), "nvme://%s/%i",
1501              s->device, s->nsid);
1502 }
1503
1504 static void nvme_refresh_limits(BlockDriverState *bs, Error **errp)
1505 {
1506     BDRVNVMeState *s = bs->opaque;
1507
1508     bs->bl.opt_mem_alignment = s->page_size;
1509     bs->bl.request_alignment = s->page_size;
1510     bs->bl.max_transfer = s->max_transfer;
1511
1512     /*
1513      * Look at nvme_co_pwrite_zeroes: after shift and decrement we should get
1514      * at most 0xFFFF
1515      */
1516     bs->bl.max_pwrite_zeroes = 1ULL << (s->blkshift + 16);
1517     bs->bl.pwrite_zeroes_alignment = MAX(bs->bl.request_alignment,
1518                                          1UL << s->blkshift);
1519
1520     bs->bl.max_pdiscard = (uint64_t)UINT32_MAX << s->blkshift;
1521     bs->bl.pdiscard_alignment = MAX(bs->bl.request_alignment,
1522                                     1UL << s->blkshift);
1523 }
1524
1525 static void nvme_detach_aio_context(BlockDriverState *bs)
1526 {
1527     BDRVNVMeState *s = bs->opaque;
1528
1529     for (unsigned i = 0; i < s->queue_count; i++) {
1530         NVMeQueuePair *q = s->queues[i];
1531
1532         qemu_bh_delete(q->completion_bh);
1533         q->completion_bh = NULL;
1534     }
1535
1536     aio_set_event_notifier(bdrv_get_aio_context(bs),
1537                            &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1538                            false, NULL, NULL, NULL);
1539 }
1540
1541 static void nvme_attach_aio_context(BlockDriverState *bs,
1542                                     AioContext *new_context)
1543 {
1544     BDRVNVMeState *s = bs->opaque;
1545
1546     s->aio_context = new_context;
1547     aio_set_event_notifier(new_context, &s->irq_notifier[MSIX_SHARED_IRQ_IDX],
1548                            false, nvme_handle_event, nvme_poll_cb,
1549                            nvme_poll_ready);
1550
1551     for (unsigned i = 0; i < s->queue_count; i++) {
1552         NVMeQueuePair *q = s->queues[i];
1553
1554         q->completion_bh =
1555             aio_bh_new(new_context, nvme_process_completion_bh, q);
1556     }
1557 }
1558
/*
 * Plug callback: mark the driver state as plugged.  Plugging an already
 * plugged state is a programming error (asserted).
 */
static void nvme_aio_plug(BlockDriverState *bs)
{
    BDRVNVMeState *s = bs->opaque;
    assert(!s->plugged);
    s->plugged = true;
}
1565
1566 static void nvme_aio_unplug(BlockDriverState *bs)
1567 {
1568     BDRVNVMeState *s = bs->opaque;
1569     assert(s->plugged);
1570     s->plugged = false;
1571     for (unsigned i = INDEX_IO(0); i < s->queue_count; i++) {
1572         NVMeQueuePair *q = s->queues[i];
1573         qemu_mutex_lock(&q->lock);
1574         nvme_kick(q);
1575         nvme_process_completion(q);
1576         qemu_mutex_unlock(&q->lock);
1577     }
1578 }
1579
1580 static void nvme_register_buf(BlockDriverState *bs, void *host, size_t size)
1581 {
1582     int ret;
1583     Error *local_err = NULL;
1584     BDRVNVMeState *s = bs->opaque;
1585
1586     ret = qemu_vfio_dma_map(s->vfio, host, size, false, NULL, &local_err);
1587     if (ret) {
1588         /* FIXME: we may run out of IOVA addresses after repeated
1589          * bdrv_register_buf/bdrv_unregister_buf, because nvme_vfio_dma_unmap
1590          * doesn't reclaim addresses for fixed mappings. */
1591         error_reportf_err(local_err, "nvme_register_buf failed: ");
1592     }
1593 }
1594
1595 static void nvme_unregister_buf(BlockDriverState *bs, void *host)
1596 {
1597     BDRVNVMeState *s = bs->opaque;
1598
1599     qemu_vfio_dma_unmap(s->vfio, host);
1600 }
1601
1602 static BlockStatsSpecific *nvme_get_specific_stats(BlockDriverState *bs)
1603 {
1604     BlockStatsSpecific *stats = g_new(BlockStatsSpecific, 1);
1605     BDRVNVMeState *s = bs->opaque;
1606
1607     stats->driver = BLOCKDEV_DRIVER_NVME;
1608     stats->u.nvme = (BlockStatsSpecificNvme) {
1609         .completion_errors = s->stats.completion_errors,
1610         .aligned_accesses = s->stats.aligned_accesses,
1611         .unaligned_accesses = s->stats.unaligned_accesses,
1612     };
1613
1614     return stats;
1615 }
1616
/*
 * NULL-terminated list of option names (device and namespace) that
 * bdrv_nvme publishes through its .strong_runtime_opts field.
 */
static const char *const nvme_strong_runtime_opts[] = {
    NVME_BLOCK_OPT_DEVICE,
    NVME_BLOCK_OPT_NAMESPACE,

    NULL
};
1623
/*
 * Driver description registered by bdrv_nvme_init().  It wires the generic
 * BlockDriver callbacks to the nvme_* implementations in this file; image
 * creation uses the generic "simple" helpers.
 */
static BlockDriver bdrv_nvme = {
    .format_name              = "nvme",
    .protocol_name            = "nvme",
    .instance_size            = sizeof(BDRVNVMeState),

    .bdrv_co_create_opts      = bdrv_co_create_opts_simple,
    .create_opts              = &bdrv_create_opts_simple,

    /* Open/close and size queries */
    .bdrv_parse_filename      = nvme_parse_filename,
    .bdrv_file_open           = nvme_file_open,
    .bdrv_close               = nvme_close,
    .bdrv_getlength           = nvme_getlength,
    .bdrv_probe_blocksizes    = nvme_probe_blocksizes,
    .bdrv_co_truncate         = nvme_co_truncate,

    /* Data path */
    .bdrv_co_preadv           = nvme_co_preadv,
    .bdrv_co_pwritev          = nvme_co_pwritev,

    .bdrv_co_pwrite_zeroes    = nvme_co_pwrite_zeroes,
    .bdrv_co_pdiscard         = nvme_co_pdiscard,

    .bdrv_co_flush_to_disk    = nvme_co_flush,
    .bdrv_reopen_prepare      = nvme_reopen_prepare,

    .bdrv_refresh_filename    = nvme_refresh_filename,
    .bdrv_refresh_limits      = nvme_refresh_limits,
    .strong_runtime_opts      = nvme_strong_runtime_opts,
    .bdrv_get_specific_stats  = nvme_get_specific_stats,

    /* AioContext switching */
    .bdrv_detach_aio_context  = nvme_detach_aio_context,
    .bdrv_attach_aio_context  = nvme_attach_aio_context,

    /* Request batching */
    .bdrv_io_plug             = nvme_aio_plug,
    .bdrv_io_unplug           = nvme_aio_unplug,

    /* DMA mappings for registered buffers */
    .bdrv_register_buf        = nvme_register_buf,
    .bdrv_unregister_buf      = nvme_unregister_buf,
};
1662
/* Register the NVMe driver description with the block layer. */
static void bdrv_nvme_init(void)
{
    bdrv_register(&bdrv_nvme);
}

/* Hook bdrv_nvme_init() into the block_init() module initialisation. */
block_init(bdrv_nvme_init);