From 6c1feaf50952b51987811bfc4ec6466cfdaab389 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 20 Jul 2016 19:29:06 -0700 Subject: [PATCH] nvme - Fix BUF_KERNPROC() SMP race * BUF_KERNPROC() must be issued before we submit the request. The subq lock is not sufficient to interlock request completion (which only needs the comq lock). * Only occurs under extreme loads, probably due to an IPI or Xinvltlb causing enough of a pause that the completion can run. NVMe is so fast, probably no other controller would hit this particular race condition. * Also fix a bio queueing race which can leave a bio hanging. If no requests are available (which can only happen under very heavy I/O loads), the signaling to the admin thread on the next I/O completion can race the queueing of the bio. Fix the race by making sure the admin thread is signalled *after* queueing the bio. --- sys/dev/disk/nvme/nvme.c | 7 +++---- sys/dev/disk/nvme/nvme_disk.c | 18 ++++++++++++++++-- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/sys/dev/disk/nvme/nvme.c b/sys/dev/disk/nvme/nvme.c index 8d88f8fd16..04f0e2b4f4 100644 --- a/sys/dev/disk/nvme/nvme.c +++ b/sys/dev/disk/nvme/nvme.c @@ -344,9 +344,9 @@ nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode, */ if ((queue->subq_tail + queue->unsubmitted + 1) % queue->nqe == queue->subq_head) { - queue->signal_requeue = 1; lockmgr(&queue->lk, LK_RELEASE); KKASSERT(queue->qid != 0); + atomic_swap_int(&queue->signal_requeue, 1); return NULL; } @@ -360,9 +360,9 @@ nvme_get_request(nvme_subqueue_t *queue, uint8_t opcode, req = queue->first_avail; cpu_ccfence(); if (req == NULL) { - queue->signal_requeue = 1; lockmgr(&queue->lk, LK_RELEASE); KKASSERT(queue->qid != 0); + atomic_swap_int(&queue->signal_requeue, 1); return NULL; } @@ -527,8 +527,7 @@ nvme_put_request(nvme_request_t *req) * should not happen due to the large number of queue entries nvme * usually has. Let it race for now (admin has a 1hz tick). */ - if (queue->signal_requeue) { - queue->signal_requeue = 0; + if (atomic_swap_int(&queue->signal_requeue, 0)) { atomic_set_int(&queue->sc->admin_signal, ADMIN_SIG_REQUEUE); wakeup(&queue->sc->admin_signal); } diff --git a/sys/dev/disk/nvme/nvme_disk.c b/sys/dev/disk/nvme/nvme_disk.c index c351f939ab..292d3d3813 100644 --- a/sys/dev/disk/nvme/nvme_disk.c +++ b/sys/dev/disk/nvme/nvme_disk.c @@ -313,14 +313,17 @@ nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay) /* * Prevent callback from occurring if the synchronous * delay optimization is enabled. + * + * NOTE: subq lock does not protect the I/O (completion + * only needs the comq lock). */ if (delay == 0) req->callback = nvme_disk_callback; req->nsc = nsc; req->bio = bio; + BUF_KERNPROC(bp); /* do before submit */ lockmgr(&subq->lk, LK_EXCLUSIVE); nvme_submit_request(req); /* needs subq lock */ - BUF_KERNPROC(bp); /* do before lock release */ lockmgr(&subq->lk, LK_RELEASE); if (delay) { comq = req->comq; @@ -354,13 +357,24 @@ nvme_strategy_core(nvme_softns_t *nsc, struct bio *bio, int delay) return 0; /* - * No requests were available, requeue the bio + * No requests were available, requeue the bio. + * + * The nvme_get_request() call armed the requeue signal but + * it is possible that it was picked up too quickly. If it + * was, signal the admin thread ourselves. This case will occur + * relatively rarely and only under heavy I/O conditions so we + * don't have to be entirely efficient about dealing with it. */ requeue: BUF_KERNPROC(bp); lockmgr(&nsc->lk, LK_EXCLUSIVE); bioqdisksort(&nsc->bioq, bio); lockmgr(&nsc->lk, LK_RELEASE); + if (atomic_swap_int(&subq->signal_requeue, 1) == 0) { + atomic_swap_int(&subq->signal_requeue, 0); + atomic_set_int(&subq->sc->admin_signal, ADMIN_SIG_REQUEUE); + wakeup(&subq->sc->admin_signal); + } return 1; } -- 2.11.4.GIT