[qemu/ar7.git] / hw/block/nvme.c
/*
 * QEMU NVM Express Controller
 *
 * Copyright (c) 2012, Intel Corporation
 *
 * Written by Keith Busch <keith.busch@intel.com>
 *
 * This code is licensed under the GNU GPL v2 or later.
 */

/**
 * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
 *
 *  http://www.nvmexpress.org/resources/
 */

/**
 * Usage: add options:
 *      -drive file=<file>,if=none,id=<drive_id>
 *      -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>, \
 *              cmb_size_mb=<cmb_size_mb[optional]>
 *
 * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
 * offset 0 in BAR2 and supports SQS only for now.
 */
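/*
 * Example invocation (illustrative only; the image path, drive id and the
 * serial string below are arbitrary placeholders, not values required by
 * this device):
 *
 *     qemu-system-x86_64 \
 *         -drive file=nvme.img,if=none,id=nvme0 \
 *         -device nvme,drive=nvme0,serial=deadbeef,cmb_size_mb=64
 */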
#include "qemu/osdep.h"
#include "hw/block/block.h"
#include "hw/hw.h"
#include "hw/pci/msix.h"
#include "hw/pci/pci.h"
#include "sysemu/sysemu.h"
#include "qapi/error.h"
#include "qapi/visitor.h"
#include "sysemu/block-backend.h"

#include "nvme.h"

static void nvme_process_sq(void *opaque);
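/*
 * Read 'size' bytes of controller-visible memory at 'addr': served directly
 * from the Controller Memory Buffer when the address falls inside it,
 * otherwise through a normal PCI DMA read.
 */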
static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
    if (n->cmbsz && addr >= n->ctrl_mem.addr &&
                addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
        memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
    } else {
        pci_dma_read(&n->parent_obj, addr, buf, size);
    }
}

static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
{
    return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
}

static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
{
    return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
}

static void nvme_inc_cq_tail(NvmeCQueue *cq)
{
    cq->tail++;
    if (cq->tail >= cq->size) {
        cq->tail = 0;
        cq->phase = !cq->phase;
    }
}

static void nvme_inc_sq_head(NvmeSQueue *sq)
{
    sq->head = (sq->head + 1) % sq->size;
}

static uint8_t nvme_cq_full(NvmeCQueue *cq)
{
    return (cq->tail + 1) % cq->size == cq->head;
}

static uint8_t nvme_sq_empty(NvmeSQueue *sq)
{
    return sq->head == sq->tail;
}

static void nvme_isr_notify(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled) {
        if (msix_enabled(&(n->parent_obj))) {
            msix_notify(&(n->parent_obj), cq->vector);
        } else {
            pci_irq_pulse(&n->parent_obj);
        }
    }
}
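/*
 * Build a scatter/gather list from a PRP pair.  PRP1 covers the first,
 * possibly unaligned, chunk; PRP2 is either a second data pointer or, for
 * transfers larger than one page, the address of a PRP list that is walked
 * (and chained) page by page.  Returns NVME_SUCCESS or an error status.
 */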
static uint16_t nvme_map_prp(QEMUSGList *qsg, uint64_t prp1, uint64_t prp2,
                             uint32_t len, NvmeCtrl *n)
{
    hwaddr trans_len = n->page_size - (prp1 % n->page_size);
    trans_len = MIN(len, trans_len);
    int num_prps = (len >> n->page_bits) + 1;

    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
    qemu_sglist_add(qsg, prp1, trans_len);
    len -= trans_len;
    if (len) {
        if (!prp2) {
            goto unmap;
        }
        if (len > n->page_size) {
            uint64_t prp_list[n->max_prp_ents];
            uint32_t nents, prp_trans;
            int i = 0;

            nents = (len + n->page_size - 1) >> n->page_bits;
            prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
            pci_dma_read(&n->parent_obj, prp2, (void *)prp_list, prp_trans);
            while (len != 0) {
                uint64_t prp_ent = le64_to_cpu(prp_list[i]);

                if (i == n->max_prp_ents - 1 && len > n->page_size) {
                    if (!prp_ent || prp_ent & (n->page_size - 1)) {
                        goto unmap;
                    }

                    i = 0;
                    nents = (len + n->page_size - 1) >> n->page_bits;
                    prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
                    pci_dma_read(&n->parent_obj, prp_ent, (void *)prp_list,
                        prp_trans);
                    prp_ent = le64_to_cpu(prp_list[i]);
                }

                if (!prp_ent || prp_ent & (n->page_size - 1)) {
                    goto unmap;
                }

                trans_len = MIN(len, n->page_size);
                qemu_sglist_add(qsg, prp_ent, trans_len);
                len -= trans_len;
                i++;
            }
        } else {
            if (prp2 & (n->page_size - 1)) {
                goto unmap;
            }
            qemu_sglist_add(qsg, prp2, len);
        }
    }
    return NVME_SUCCESS;

 unmap:
    qemu_sglist_destroy(qsg);
    return NVME_INVALID_FIELD | NVME_DNR;
}

static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
    uint64_t prp1, uint64_t prp2)
{
    QEMUSGList qsg;

    if (nvme_map_prp(&qsg, prp1, prp2, len, n)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (dma_buf_read(ptr, len, &qsg)) {
        qemu_sglist_destroy(&qsg);
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    qemu_sglist_destroy(&qsg);
    return NVME_SUCCESS;
}
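/*
 * Completion queue timer callback: move finished requests into the CQ ring
 * in guest memory (stopping when the ring is full), recycle them onto their
 * submission queue's free list, and raise the interrupt if enabled.
 */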
static void nvme_post_cqes(void *opaque)
{
    NvmeCQueue *cq = opaque;
    NvmeCtrl *n = cq->ctrl;
    NvmeRequest *req, *next;

    QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
        NvmeSQueue *sq;
        hwaddr addr;

        if (nvme_cq_full(cq)) {
            break;
        }

        QTAILQ_REMOVE(&cq->req_list, req, entry);
        sq = req->sq;
        req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
        req->cqe.sq_id = cpu_to_le16(sq->sqid);
        req->cqe.sq_head = cpu_to_le16(sq->head);
        addr = cq->dma_addr + cq->tail * n->cqe_size;
        nvme_inc_cq_tail(cq);
        pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
            sizeof(req->cqe));
        QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
    }
    nvme_isr_notify(n, cq);
}

static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
    assert(cq->cqid == req->sq->cqid);
    QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
    QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
    timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}

static void nvme_rw_cb(void *opaque, int ret)
{
    NvmeRequest *req = opaque;
    NvmeSQueue *sq = req->sq;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    if (!ret) {
        block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
        req->status = NVME_SUCCESS;
    } else {
        block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
        req->status = NVME_INTERNAL_DEV_ERROR;
    }
    if (req->has_sg) {
        qemu_sglist_destroy(&req->qsg);
    }
    nvme_enqueue_req_completion(cq, req);
}

static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    req->has_sg = false;
    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
         BLOCK_ACCT_FLUSH);
    req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
}

static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
    const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
    uint64_t aio_slba = slba << (data_shift - BDRV_SECTOR_BITS);
    uint32_t aio_nlb = nlb << (data_shift - BDRV_SECTOR_BITS);

    if (slba + nlb > ns->id_ns.nsze) {
        return NVME_LBA_RANGE | NVME_DNR;
    }

    req->has_sg = false;
    block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
                     BLOCK_ACCT_WRITE);
    req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, aio_slba, aio_nlb,
                                        BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
    return NVME_NO_COMPLETE;
}
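/*
 * NVME_CMD_READ/NVME_CMD_WRITE handler: validate the LBA range, map the
 * PRPs into a QEMUSGList and issue the transfer through the dma_blk_*
 * helpers; completion is reported asynchronously via nvme_rw_cb().
 */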
static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
    NvmeRequest *req)
{
    NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
    uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
    uint64_t slba = le64_to_cpu(rw->slba);
    uint64_t prp1 = le64_to_cpu(rw->prp1);
    uint64_t prp2 = le64_to_cpu(rw->prp2);

    uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
    uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
    uint64_t data_size = (uint64_t)nlb << data_shift;
    uint64_t data_offset = slba << data_shift;
    int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;

    if ((slba + nlb) > ns->id_ns.nsze) {
        block_acct_invalid(blk_get_stats(n->conf.blk), acct);
        return NVME_LBA_RANGE | NVME_DNR;
    }

    if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
        block_acct_invalid(blk_get_stats(n->conf.blk), acct);
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    assert((nlb << data_shift) == req->qsg.size);

    req->has_sg = true;
    dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
    req->aiocb = is_write ?
        dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
                      nvme_rw_cb, req) :
        dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
                     nvme_rw_cb, req);

    return NVME_NO_COMPLETE;
}
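/* Dispatch an I/O queue command to its per-opcode handler. */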
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(cmd->nsid);

    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    switch (cmd->opcode) {
    case NVME_CMD_FLUSH:
        return nvme_flush(n, ns, cmd, req);
    case NVME_CMD_WRITE_ZEROS:
        return nvme_write_zeros(n, ns, cmd, req);
    case NVME_CMD_WRITE:
    case NVME_CMD_READ:
        return nvme_rw(n, ns, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}

static void nvme_free_sq(NvmeSQueue *sq, NvmeCtrl *n)
{
    n->sq[sq->sqid] = NULL;
    timer_del(sq->timer);
    timer_free(sq->timer);
    g_free(sq->io_req);
    if (sq->sqid) {
        g_free(sq);
    }
}

static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeRequest *req, *next;
    NvmeSQueue *sq;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_sqid(n, qid)) {
        return NVME_INVALID_QID | NVME_DNR;
    }

    sq = n->sq[qid];
    while (!QTAILQ_EMPTY(&sq->out_req_list)) {
        req = QTAILQ_FIRST(&sq->out_req_list);
        assert(req->aiocb);
        blk_aio_cancel(req->aiocb);
    }
    if (!nvme_check_cqid(n, sq->cqid)) {
        cq = n->cq[sq->cqid];
        QTAILQ_REMOVE(&cq->sq_list, sq, entry);

        nvme_post_cqes(cq);
        QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
            if (req->sq == sq) {
                QTAILQ_REMOVE(&cq->req_list, req, entry);
                QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
            }
        }
    }

    nvme_free_sq(sq, n);
    return NVME_SUCCESS;
}

static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t sqid, uint16_t cqid, uint16_t size)
{
    int i;
    NvmeCQueue *cq;

    sq->ctrl = n;
    sq->dma_addr = dma_addr;
    sq->sqid = sqid;
    sq->size = size;
    sq->cqid = cqid;
    sq->head = sq->tail = 0;
    sq->io_req = g_new(NvmeRequest, sq->size);

    QTAILQ_INIT(&sq->req_list);
    QTAILQ_INIT(&sq->out_req_list);
    for (i = 0; i < sq->size; i++) {
        sq->io_req[i].sq = sq;
        QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
    }
    sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);

    assert(n->cq[cqid]);
    cq = n->cq[cqid];
    QTAILQ_INSERT_TAIL(&(cq->sq_list), sq, entry);
    n->sq[sqid] = sq;
}

static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeSQueue *sq;
    NvmeCreateSq *c = (NvmeCreateSq *)cmd;

    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t sqid = le16_to_cpu(c->sqid);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->sq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || nvme_check_cqid(n, cqid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!sqid || !nvme_check_sqid(n, sqid)) {
        return NVME_INVALID_QID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1 || prp1 & (n->page_size - 1)) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (!(NVME_SQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    sq = g_malloc0(sizeof(*sq));
    nvme_init_sq(sq, n, prp1, sqid, cqid, qsize + 1);
    return NVME_SUCCESS;
}
static void nvme_free_cq(NvmeCQueue *cq, NvmeCtrl *n)
{
    n->cq[cq->cqid] = NULL;
    timer_del(cq->timer);
    timer_free(cq->timer);
    msix_vector_unuse(&n->parent_obj, cq->vector);
    if (cq->cqid) {
        g_free(cq);
    }
}

static uint16_t nvme_del_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeDeleteQ *c = (NvmeDeleteQ *)cmd;
    NvmeCQueue *cq;
    uint16_t qid = le16_to_cpu(c->qid);

    if (!qid || nvme_check_cqid(n, qid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }

    cq = n->cq[qid];
    if (!QTAILQ_EMPTY(&cq->sq_list)) {
        return NVME_INVALID_QUEUE_DEL;
    }
    nvme_free_cq(cq, n);
    return NVME_SUCCESS;
}

static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
    uint16_t cqid, uint16_t vector, uint16_t size, uint16_t irq_enabled)
{
    cq->ctrl = n;
    cq->cqid = cqid;
    cq->size = size;
    cq->dma_addr = dma_addr;
    cq->phase = 1;
    cq->irq_enabled = irq_enabled;
    cq->vector = vector;
    cq->head = cq->tail = 0;
    QTAILQ_INIT(&cq->req_list);
    QTAILQ_INIT(&cq->sq_list);
    msix_vector_use(&n->parent_obj, cq->vector);
    n->cq[cqid] = cq;
    cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
}

static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeCQueue *cq;
    NvmeCreateCq *c = (NvmeCreateCq *)cmd;
    uint16_t cqid = le16_to_cpu(c->cqid);
    uint16_t vector = le16_to_cpu(c->irq_vector);
    uint16_t qsize = le16_to_cpu(c->qsize);
    uint16_t qflags = le16_to_cpu(c->cq_flags);
    uint64_t prp1 = le64_to_cpu(c->prp1);

    if (!cqid || !nvme_check_cqid(n, cqid)) {
        return NVME_INVALID_CQID | NVME_DNR;
    }
    if (!qsize || qsize > NVME_CAP_MQES(n->bar.cap)) {
        return NVME_MAX_QSIZE_EXCEEDED | NVME_DNR;
    }
    if (!prp1) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    if (vector > n->num_queues) {
        return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
    }
    if (!(NVME_CQ_FLAGS_PC(qflags))) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    cq = g_malloc0(sizeof(*cq));
    nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
        NVME_CQ_FLAGS_IEN(qflags));
    return NVME_SUCCESS;
}
static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
{
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint64_t prp2 = le64_to_cpu(c->prp2);

    return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
        prp1, prp2);
}

static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
{
    NvmeNamespace *ns;
    uint32_t nsid = le32_to_cpu(c->nsid);
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint64_t prp2 = le64_to_cpu(c->prp2);

    if (nsid == 0 || nsid > n->num_namespaces) {
        return NVME_INVALID_NSID | NVME_DNR;
    }

    ns = &n->namespaces[nsid - 1];
    return nvme_dma_read_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
        prp1, prp2);
}

static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
{
    static const int data_len = 4096;
    uint32_t min_nsid = le32_to_cpu(c->nsid);
    uint64_t prp1 = le64_to_cpu(c->prp1);
    uint64_t prp2 = le64_to_cpu(c->prp2);
    uint32_t *list;
    uint16_t ret;
    int i, j = 0;

    list = g_malloc0(data_len);
    for (i = 0; i < n->num_namespaces; i++) {
        if (i < min_nsid) {
            continue;
        }
        list[j++] = cpu_to_le32(i + 1);
        if (j == data_len / sizeof(uint32_t)) {
            break;
        }
    }
    ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2);
    g_free(list);
    return ret;
}

static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
{
    NvmeIdentify *c = (NvmeIdentify *)cmd;

    switch (le32_to_cpu(c->cns)) {
    case 0x00:
        return nvme_identify_ns(n, c);
    case 0x01:
        return nvme_identify_ctrl(n, c);
    case 0x02:
        return nvme_identify_nslist(n, c);
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
}
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t result;

    switch (dw10) {
    case NVME_VOLATILE_WRITE_CACHE:
        result = blk_enable_write_cache(n->conf.blk);
        break;
    case NVME_NUMBER_OF_QUEUES:
        result = cpu_to_le32((n->num_queues - 1) | ((n->num_queues - 1) << 16));
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }

    req->cqe.result = result;
    return NVME_SUCCESS;
}

static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
    uint32_t dw11 = le32_to_cpu(cmd->cdw11);

    switch (dw10) {
    case NVME_VOLATILE_WRITE_CACHE:
        blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
        break;
    case NVME_NUMBER_OF_QUEUES:
        req->cqe.result =
            cpu_to_le32((n->num_queues - 1) | ((n->num_queues - 1) << 16));
        break;
    default:
        return NVME_INVALID_FIELD | NVME_DNR;
    }
    return NVME_SUCCESS;
}

static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
    switch (cmd->opcode) {
    case NVME_ADM_CMD_DELETE_SQ:
        return nvme_del_sq(n, cmd);
    case NVME_ADM_CMD_CREATE_SQ:
        return nvme_create_sq(n, cmd);
    case NVME_ADM_CMD_DELETE_CQ:
        return nvme_del_cq(n, cmd);
    case NVME_ADM_CMD_CREATE_CQ:
        return nvme_create_cq(n, cmd);
    case NVME_ADM_CMD_IDENTIFY:
        return nvme_identify(n, cmd);
    case NVME_ADM_CMD_SET_FEATURES:
        return nvme_set_feature(n, cmd, req);
    case NVME_ADM_CMD_GET_FEATURES:
        return nvme_get_feature(n, cmd, req);
    default:
        return NVME_INVALID_OPCODE | NVME_DNR;
    }
}
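/*
 * Submission queue timer callback: while the queue is non-empty and request
 * slots are free, fetch the next command from guest memory, dispatch it as
 * an admin or I/O command, and complete it immediately unless the handler
 * returned NVME_NO_COMPLETE.
 */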
static void nvme_process_sq(void *opaque)
{
    NvmeSQueue *sq = opaque;
    NvmeCtrl *n = sq->ctrl;
    NvmeCQueue *cq = n->cq[sq->cqid];

    uint16_t status;
    hwaddr addr;
    NvmeCmd cmd;
    NvmeRequest *req;

    while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
        addr = sq->dma_addr + sq->head * n->sqe_size;
        nvme_addr_read(n, addr, (void *)&cmd, sizeof(cmd));
        nvme_inc_sq_head(sq);

        req = QTAILQ_FIRST(&sq->req_list);
        QTAILQ_REMOVE(&sq->req_list, req, entry);
        QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
        memset(&req->cqe, 0, sizeof(req->cqe));
        req->cqe.cid = cmd.cid;

        status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
            nvme_admin_cmd(n, &cmd, req);
        if (status != NVME_NO_COMPLETE) {
            req->status = status;
            nvme_enqueue_req_completion(cq, req);
        }
    }
}

static void nvme_clear_ctrl(NvmeCtrl *n)
{
    int i;

    for (i = 0; i < n->num_queues; i++) {
        if (n->sq[i] != NULL) {
            nvme_free_sq(n->sq[i], n);
        }
    }
    for (i = 0; i < n->num_queues; i++) {
        if (n->cq[i] != NULL) {
            nvme_free_cq(n->cq[i], n);
        }
    }

    blk_flush(n->conf.blk);
    n->bar.cc = 0;
}
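/*
 * Called when CC.EN transitions to 1: reject the enable if the admin queue
 * configuration or the CC fields fall outside the limits advertised in CAP
 * and the identify data, otherwise derive the page and queue entry sizes
 * and create the admin submission and completion queues.
 */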
static int nvme_start_ctrl(NvmeCtrl *n)
{
    uint32_t page_bits = NVME_CC_MPS(n->bar.cc) + 12;
    uint32_t page_size = 1 << page_bits;

    if (n->cq[0] || n->sq[0] || !n->bar.asq || !n->bar.acq ||
            n->bar.asq & (page_size - 1) || n->bar.acq & (page_size - 1) ||
            NVME_CC_MPS(n->bar.cc) < NVME_CAP_MPSMIN(n->bar.cap) ||
            NVME_CC_MPS(n->bar.cc) > NVME_CAP_MPSMAX(n->bar.cap) ||
            NVME_CC_IOCQES(n->bar.cc) < NVME_CTRL_CQES_MIN(n->id_ctrl.cqes) ||
            NVME_CC_IOCQES(n->bar.cc) > NVME_CTRL_CQES_MAX(n->id_ctrl.cqes) ||
            NVME_CC_IOSQES(n->bar.cc) < NVME_CTRL_SQES_MIN(n->id_ctrl.sqes) ||
            NVME_CC_IOSQES(n->bar.cc) > NVME_CTRL_SQES_MAX(n->id_ctrl.sqes) ||
            !NVME_AQA_ASQS(n->bar.aqa) || !NVME_AQA_ACQS(n->bar.aqa)) {
        return -1;
    }

    n->page_bits = page_bits;
    n->page_size = page_size;
    n->max_prp_ents = n->page_size / sizeof(uint64_t);
    n->cqe_size = 1 << NVME_CC_IOCQES(n->bar.cc);
    n->sqe_size = 1 << NVME_CC_IOSQES(n->bar.cc);
    nvme_init_cq(&n->admin_cq, n, n->bar.acq, 0, 0,
        NVME_AQA_ACQS(n->bar.aqa) + 1, 1);
    nvme_init_sq(&n->admin_sq, n, n->bar.asq, 0, 0,
        NVME_AQA_ASQS(n->bar.aqa) + 1);

    return 0;
}
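/*
 * Handle guest writes to the memory-mapped controller registers:
 * interrupt mask set/clear, CC (enable/shutdown), AQA, ASQ and ACQ.
 */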
static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
    unsigned size)
{
    switch (offset) {
    case 0xc:
        n->bar.intms |= data & 0xffffffff;
        n->bar.intmc = n->bar.intms;
        break;
    case 0x10:
        n->bar.intms &= ~(data & 0xffffffff);
        n->bar.intmc = n->bar.intms;
        break;
    case 0x14:
        /* Windows first sends data, then sends enable bit */
        if (!NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc) &&
            !NVME_CC_SHN(data) && !NVME_CC_SHN(n->bar.cc))
        {
            n->bar.cc = data;
        }

        if (NVME_CC_EN(data) && !NVME_CC_EN(n->bar.cc)) {
            n->bar.cc = data;
            if (nvme_start_ctrl(n)) {
                n->bar.csts = NVME_CSTS_FAILED;
            } else {
                n->bar.csts = NVME_CSTS_READY;
            }
        } else if (!NVME_CC_EN(data) && NVME_CC_EN(n->bar.cc)) {
            nvme_clear_ctrl(n);
            n->bar.csts &= ~NVME_CSTS_READY;
        }
        if (NVME_CC_SHN(data) && !(NVME_CC_SHN(n->bar.cc))) {
            nvme_clear_ctrl(n);
            n->bar.cc = data;
            n->bar.csts |= NVME_CSTS_SHST_COMPLETE;
        } else if (!NVME_CC_SHN(data) && NVME_CC_SHN(n->bar.cc)) {
            n->bar.csts &= ~NVME_CSTS_SHST_COMPLETE;
            n->bar.cc = data;
        }
        break;
    case 0x24:
        n->bar.aqa = data & 0xffffffff;
        break;
    case 0x28:
        n->bar.asq = data;
        break;
    case 0x2c:
        n->bar.asq |= data << 32;
        break;
    case 0x30:
        n->bar.acq = data;
        break;
    case 0x34:
        n->bar.acq |= data << 32;
        break;
    default:
        break;
    }
}

static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    uint8_t *ptr = (uint8_t *)&n->bar;
    uint64_t val = 0;

    if (addr < sizeof(n->bar)) {
        memcpy(&val, ptr + addr, size);
    }
    return val;
}
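/*
 * Handle a doorbell write.  Doorbells start at offset 0x1000 with a stride
 * of 4 bytes: even dword slots are submission queue tail doorbells, odd
 * slots are completion queue head doorbells.  A valid update re-arms the
 * corresponding queue's timer so it gets processed shortly afterwards.
 */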
static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
{
    uint32_t qid;

    if (addr & ((1 << 2) - 1)) {
        return;
    }

    if (((addr - 0x1000) >> 2) & 1) {
        uint16_t new_head = val & 0xffff;
        int start_sqs;
        NvmeCQueue *cq;

        qid = (addr - (0x1000 + (1 << 2))) >> 3;
        if (nvme_check_cqid(n, qid)) {
            return;
        }

        cq = n->cq[qid];
        if (new_head >= cq->size) {
            return;
        }

        start_sqs = nvme_cq_full(cq) ? 1 : 0;
        cq->head = new_head;
        if (start_sqs) {
            NvmeSQueue *sq;
            QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
                timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
            }
            timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
        }

        if (cq->tail != cq->head) {
            nvme_isr_notify(n, cq);
        }
    } else {
        uint16_t new_tail = val & 0xffff;
        NvmeSQueue *sq;

        qid = (addr - 0x1000) >> 3;
        if (nvme_check_sqid(n, qid)) {
            return;
        }

        sq = n->sq[qid];
        if (new_tail >= sq->size) {
            return;
        }

        sq->tail = new_tail;
        timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
    }
}

static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
    unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    if (addr < sizeof(n->bar)) {
        nvme_write_bar(n, addr, data, size);
    } else if (addr >= 0x1000) {
        nvme_process_db(n, addr, data);
    }
}

static const MemoryRegionOps nvme_mmio_ops = {
    .read = nvme_mmio_read,
    .write = nvme_mmio_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};

static void nvme_cmb_write(void *opaque, hwaddr addr, uint64_t data,
    unsigned size)
{
    NvmeCtrl *n = (NvmeCtrl *)opaque;
    memcpy(&n->cmbuf[addr], &data, size);
}

static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, unsigned size)
{
    uint64_t val;
    NvmeCtrl *n = (NvmeCtrl *)opaque;

    memcpy(&val, &n->cmbuf[addr], size);
    return val;
}

static const MemoryRegionOps nvme_cmb_ops = {
    .read = nvme_cmb_read,
    .write = nvme_cmb_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 2,
        .max_access_size = 8,
    },
};
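/*
 * PCI init (realize) hook: validate the drive and serial properties, map
 * the register BAR and MSI-X, optionally expose a Controller Memory Buffer,
 * and fill in the identify controller and namespace data structures.
 */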
static int nvme_init(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);
    NvmeIdCtrl *id = &n->id_ctrl;

    int i;
    int64_t bs_size;
    uint8_t *pci_conf;
    Error *local_err = NULL;

    if (!n->conf.blk) {
        return -1;
    }

    bs_size = blk_getlength(n->conf.blk);
    if (bs_size < 0) {
        return -1;
    }

    blkconf_serial(&n->conf, &n->serial);
    if (!n->serial) {
        return -1;
    }
    blkconf_blocksizes(&n->conf);
    blkconf_apply_backend_options(&n->conf, blk_is_read_only(n->conf.blk),
                                  false, &local_err);
    if (local_err) {
        error_report_err(local_err);
        return -1;
    }

    pci_conf = pci_dev->config;
    pci_conf[PCI_INTERRUPT_PIN] = 1;
    pci_config_set_prog_interface(pci_dev->config, 0x2);
    pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
    pcie_endpoint_cap_init(&n->parent_obj, 0x80);

    n->num_namespaces = 1;
    n->num_queues = 64;
    n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
    n->ns_size = bs_size / (uint64_t)n->num_namespaces;

    n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
    n->sq = g_new0(NvmeSQueue *, n->num_queues);
    n->cq = g_new0(NvmeCQueue *, n->num_queues);

    memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
                          "nvme", n->reg_size);
    pci_register_bar(&n->parent_obj, 0,
        PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
        &n->iomem);
    msix_init_exclusive_bar(&n->parent_obj, n->num_queues, 4, NULL);

    id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
    id->ssvid = cpu_to_le16(pci_get_word(pci_conf + PCI_SUBSYSTEM_VENDOR_ID));
    strpadcpy((char *)id->mn, sizeof(id->mn), "QEMU NVMe Ctrl", ' ');
    strpadcpy((char *)id->fr, sizeof(id->fr), "1.0", ' ');
    strpadcpy((char *)id->sn, sizeof(id->sn), n->serial, ' ');
    id->rab = 6;
    id->ieee[0] = 0x00;
    id->ieee[1] = 0x02;
    id->ieee[2] = 0xb3;
    id->oacs = cpu_to_le16(0);
    id->frmw = 7 << 1;
    id->lpa = 1 << 0;
    id->sqes = (0x6 << 4) | 0x6;
    id->cqes = (0x4 << 4) | 0x4;
    id->nn = cpu_to_le32(n->num_namespaces);
    id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS);
    id->psd[0].mp = cpu_to_le16(0x9c4);
    id->psd[0].enlat = cpu_to_le32(0x10);
    id->psd[0].exlat = cpu_to_le32(0x4);
    if (blk_enable_write_cache(n->conf.blk)) {
        id->vwc = 1;
    }

    n->bar.cap = 0;
    NVME_CAP_SET_MQES(n->bar.cap, 0x7ff);
    NVME_CAP_SET_CQR(n->bar.cap, 1);
    NVME_CAP_SET_AMS(n->bar.cap, 1);
    NVME_CAP_SET_TO(n->bar.cap, 0xf);
    NVME_CAP_SET_CSS(n->bar.cap, 1);
    NVME_CAP_SET_MPSMAX(n->bar.cap, 4);

    n->bar.vs = 0x00010200;
    n->bar.intmc = n->bar.intms = 0;

    if (n->cmb_size_mb) {

        NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
        NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);

        NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
        NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
        NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
        NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 0);
        NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 0);
        NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2); /* MBs */
        NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->cmb_size_mb);

        n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
        memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
                              "nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
        pci_register_bar(&n->parent_obj, NVME_CMBLOC_BIR(n->bar.cmbloc),
            PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
            PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
    }

    for (i = 0; i < n->num_namespaces; i++) {
        NvmeNamespace *ns = &n->namespaces[i];
        NvmeIdNs *id_ns = &ns->id_ns;
        id_ns->nsfeat = 0;
        id_ns->nlbaf = 0;
        id_ns->flbas = 0;
        id_ns->mc = 0;
        id_ns->dpc = 0;
        id_ns->dps = 0;
        id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
        id_ns->ncap = id_ns->nuse = id_ns->nsze =
            cpu_to_le64(n->ns_size >>
                id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
    }
    return 0;
}
static void nvme_exit(PCIDevice *pci_dev)
{
    NvmeCtrl *n = NVME(pci_dev);

    nvme_clear_ctrl(n);
    g_free(n->namespaces);
    g_free(n->cq);
    g_free(n->sq);
    if (n->cmbsz) {
        memory_region_unref(&n->ctrl_mem);
    }

    msix_uninit_exclusive_bar(pci_dev);
}

static Property nvme_props[] = {
    DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
    DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
    DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
    DEFINE_PROP_END_OF_LIST(),
};

static const VMStateDescription nvme_vmstate = {
    .name = "nvme",
    .unmigratable = 1,
};

static void nvme_class_init(ObjectClass *oc, void *data)
{
    DeviceClass *dc = DEVICE_CLASS(oc);
    PCIDeviceClass *pc = PCI_DEVICE_CLASS(oc);

    pc->init = nvme_init;
    pc->exit = nvme_exit;
    pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
    pc->vendor_id = PCI_VENDOR_ID_INTEL;
    pc->device_id = 0x5845;
    pc->revision = 2;
    pc->is_express = 1;

    set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
    dc->desc = "Non-Volatile Memory Express";
    dc->props = nvme_props;
    dc->vmsd = &nvme_vmstate;
}

static void nvme_instance_init(Object *obj)
{
    NvmeCtrl *s = NVME(obj);

    device_add_bootindex_property(obj, &s->conf.bootindex,
                                  "bootindex", "/namespace@1,0",
                                  DEVICE(obj), &error_abort);
}

static const TypeInfo nvme_info = {
    .name = "nvme",
    .parent = TYPE_PCI_DEVICE,
    .instance_size = sizeof(NvmeCtrl),
    .class_init = nvme_class_init,
    .instance_init = nvme_instance_init,
};

static void nvme_register_types(void)
{
    type_register_static(&nvme_info);
}

type_init(nvme_register_types)