/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls as well as any BIOs which require more sophisticated processing
 *   are handed to this thread as well.
 *
 * - Can freeze/resume other queues for various purposes.
 */
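
/*
 * Probe overview (summarized from the state functions below): the admin
 * thread steps sc->admin_func through identify_ctlr -> make_queues ->
 * identify_ns -> operating, or drops into the failed state if queue
 * setup errors out.
 */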

#include "nvme.h"

static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);
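
/*
 * Note: each state function returns non-zero to be called again
 * immediately and 0 when it is idle, allowing the admin thread's
 * main loop to sleep (see nvme_admin_thread()).
 */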

/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
	int error, intr_flags;

	lockinit(&sc->admin_lk, "admlk", 0, 0);
	lockinit(&sc->ioctl_lk, "nvioc", 0, 0);

	intr_flags = INTR_MPSAFE;
	if (sc->nirqs == 1) {
		/* This interrupt processes data CQs too */
		intr_flags |= INTR_HIFREQ;
	}

	error = bus_setup_intr(sc->dev, sc->irq[0], intr_flags,
			       nvme_intr, &sc->comqueues[0],
			       &sc->irq_handle[0], NULL);
	if (error) {
		device_printf(sc->dev, "unable to install interrupt\n");
		return error;
	}
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
	while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	return 0;
}

/*
 * Stop the admin thread and block until it says it is done.
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
	uint32_t i;

	atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

	/*
	 * We have to wait for the admin thread to finish its probe
	 * before shutting it down.  Break out if the admin thread
	 * never managed to even start.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		if ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
			break;
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	}
	lockmgr(&sc->admin_lk, LK_RELEASE);

	/*
	 * Disconnect our disks while the admin thread is still running,
	 * ensuring that the poll works even if interrupts are broken.
	 * Otherwise we could deadlock in the devfs core.
	 */
	for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
		nvme_softns_t *nsc;

		if ((nsc = sc->nscary[i]) != NULL) {
			nvme_disk_detach(nsc);

			kfree(nsc, M_NVME);
			sc->nscary[i] = NULL;
		}
	}

	/*
	 * Ask the admin thread to shut-down.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	wakeup(&sc->admin_signal);
	while (sc->admin_signal & ADMIN_SIG_RUNNING)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);
	if (sc->irq_handle[0]) {
		bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
		sc->irq_handle[0] = NULL;
	}
	lockuninit(&sc->ioctl_lk);
	lockuninit(&sc->admin_lk);

	/*
	 * Thread might be running on another cpu, give it time to actually
	 * exit before returning in case the caller is about to unload the
	 * module.  Otherwise we don't need this.
	 */
	nvme_os_sleep(1);
}

static
void
nvme_admin_thread(void *arg)
{
	nvme_softc_t *sc = arg;
	uint32_t i;

	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);

	sc->admin_func = nvme_admin_state_identify_ctlr;

	while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
		for (i = 0; i <= sc->niocomqs; ++i) {
			nvme_comqueue_t *comq = &sc->comqueues[i];

			if (comq->nqe == 0)	/* not configured */
				continue;

			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			lockmgr(&comq->lk, LK_RELEASE);
		}
		if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
			atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
			nvme_disk_requeues(sc);
		}
		if (sc->admin_func(sc) == 0 &&
		    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
			lksleep(&sc->admin_signal, &sc->admin_lk, 0,
				"nvidle", hz);
		}
	}

	/*
	 * Cleanup state.
	 *
	 * Note that we actually issue delete queue commands here.  The NVME
	 * spec says that for a normal shutdown the I/O queues should be
	 * deleted prior to issuing the shutdown in the CONFIG register.
	 */
	for (i = 1; i <= sc->niosubqs; ++i) {
		nvme_delete_subqueue(sc, i);
		nvme_free_subqueue(sc, i);
	}
	for (i = 1; i <= sc->niocomqs; ++i) {
		nvme_delete_comqueue(sc, i);
		nvme_free_comqueue(sc, i);
	}
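
	/*
	 * (The deletions above go through the admin queue, which is not
	 *  deleted here -- only I/O queues 1..niosubqs and 1..niocomqs
	 *  are torn down.  Submission queues are deleted before
	 *  completion queues because a completion queue cannot be
	 *  deleted while a submission queue still references it.)
	 */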

	/*
	 * Signal that we are done.
	 */
	atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);
	lockmgr(&sc->admin_lk, LK_RELEASE);
}

/*
 * Identify the controller
 */
static
int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ctlr_data_t *rp;
	int status;
	uint32_t mempgsize;
	char serial[20+16];
	char model[40+16];

	/*
	 * Identify Controller
	 */
	mempgsize = NVME_CAP_MEMPG_MIN_GET(sc->cap);

	req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
	req->cmd.identify.cns = NVME_CNS_CTLR;
	req->cmd.identify.cntid = 0;
	bzero(req->info, sizeof(*req->info));
	nvme_submit_request(req);
	status = nvme_wait_request(req, hz);
	/* XXX handle status */

	sc->idctlr = req->info->idctlr;
	nvme_put_request(req);

	rp = &sc->idctlr;

	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	KKASSERT(sizeof(sc->idctlr.modelno) == 40);
	bzero(serial, sizeof(serial));
	bzero(model, sizeof(model));
	bcopy(rp->serialno, serial, sizeof(rp->serialno));
	bcopy(rp->modelno, model, sizeof(rp->modelno));
	string_cleanup(serial, 0);
	string_cleanup(model, 0);

	device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
		      model, serial, rp->ns_count);

	sc->admin_func = nvme_admin_state_make_queues;

	return 1;
}

#define COMQFIXUP(msix, ncomqs)	((((msix) - 1) % ncomqs) + 1)
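
/*
 * Example: with ncomqs = 4, MSI-X vectors 1,2,3,4,5,... map to
 * completion queues 1,2,3,4,1,... -- vectors are distributed
 * round-robin over the I/O completion queues, never landing on
 * comq 0 (the admin completion queue).
 */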

/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
static
int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
	nvme_request_t *req;
	uint16_t niosubqs;
	uint16_t niocomqs;
	uint32_t i;
	uint16_t qno;
	int status;
	int error;

	/*
	 * Calculate how many I/O queues (non-inclusive of admin queue)
	 * we want to have, up to 65535.  dw0 in the response returns the
	 * number of queues the controller gives us.  Submission and
	 * Completion queues are specified separately.
	 *
	 * This driver runs optimally with 4 submission queues and one
	 * completion queue per cpu (rdhipri, rdlopri, wrhipri, wrlopri),
	 *
	 * +1 for dumps			XXX future
	 * +1 for async events		XXX future
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	niosubqs = ncpus * 2 + 0;
	niocomqs = ncpus + 0;
	if (niosubqs > NVME_MAX_QUEUES)
		niosubqs = NVME_MAX_QUEUES;
	if (niocomqs > NVME_MAX_QUEUES)
		niocomqs = NVME_MAX_QUEUES;
	device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

	req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
	req->cmd.setfeat.numqs.nsqr = niosubqs - 1;	/* 0's based 0=1 */
	req->cmd.setfeat.numqs.ncqr = niocomqs - 1;	/* 0's based 0=1 */

	nvme_submit_request(req);

	/*
	 * Get response and set our operations mode.
	 */
	status = nvme_wait_request(req, hz);
	/* XXX handle status */

	sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
	sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
	if (sc->niosubqs > NVME_MAX_QUEUES)
		sc->niosubqs = NVME_MAX_QUEUES;
	if (sc->niocomqs > NVME_MAX_QUEUES)
		sc->niocomqs = NVME_MAX_QUEUES;
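
	/*
	 * dw0 of the Set Features (Number of Queues) completion encodes
	 * the 0's based counts the controller actually granted: bits
	 * 15:0 are the submission queues and bits 31:16 the completion
	 * queues, hence the +1 adjustments above.
	 */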
317 kprintf("Returns %u/%u queues, ", sc
->niosubqs
, sc
->niocomqs
);
319 nvme_put_request(req
);
	if (sc->niosubqs >= ncpus * 2 + 0 && sc->niocomqs >= ncpus + 0) {
		/*
		 * If we got all the queues we wanted do a full-bore setup of
		 * qmap[cpu][type].
		 *
		 * Remember that subq 0 / comq 0 is the admin queue.
		 */
		kprintf("optimal map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], ncpus);

			KKASSERT(cpuqno != 0);
			sc->qmap[i][0] = qno + 0;
			sc->qmap[i][1] = qno + 1;
			sc->subqueues[qno + 0].comqid = cpuqno;
			sc->subqueues[qno + 1].comqid = cpuqno;
			qno += 2;
		}
		sc->niosubqs = ncpus * 2 + 0;
		sc->niocomqs = ncpus + 0;
	} else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
		/*
		 * We have enough to give each cpu its own submission
		 * and completion queue.
		 *
		 * leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("nominal map 1:1 cpu\n");
		for (i = 0; i < ncpus; ++i) {
			qno = sc->cputovect[i];
			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;
			sc->qmap[i][1] = qno;
			sc->subqueues[qno].comqid = COMQFIXUP(qno, ncpus);
		}
		sc->niosubqs = ncpus;
		sc->niocomqs = ncpus;
	} else if (sc->niosubqs >= 2 && sc->niocomqs >= 2) {
		/*
		 * prioritize trying to distribute available queues to
		 * cpus, don't separate read and write.
		 *
		 * leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("rw-sep map (%d, %d)\n", sc->niosubqs, sc->niocomqs);
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], sc->niocomqs);
			int qno = COMQFIXUP((i + 1), sc->niosubqs);

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;		/* read */
			sc->qmap[i][1] = qno;		/* write */
			sc->subqueues[qno].comqid = cpuqno;
			/* do not increment qno */
		}
	} else if (sc->niosubqs >= 2) {
		/*
		 * We have enough to have separate read and write queues.
		 */
		kprintf("basic map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], 1);

			sc->qmap[i][0] = qno + 0;	/* read */
			sc->qmap[i][1] = qno + 1;	/* write */
			sc->subqueues[qno + 0].comqid = cpuqno;
			sc->subqueues[qno + 1].comqid = cpuqno;
		}
		sc->niosubqs = 2;
		sc->niocomqs = 1;
	} else {
		/*
		 * Minimal configuration, all cpus and I/O types use the
		 * same queue.  Sad day.
		 */
		kprintf("minimal map\n");
		for (i = 0; i < ncpus; ++i) {
			sc->qmap[i][0] = 1;
			sc->qmap[i][1] = 1;
		}
		sc->subqueues[1].comqid = 1;
		sc->niosubqs = 1;
		sc->niocomqs = 1;
	}
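
	/*
	 * Summary of the fallback tiers above, in decreasing order of
	 * what the controller granted:
	 *
	 *	optimal		2 subqs per cpu, 1 comq per cpu
	 *	nominal		1 subq + 1 comq per cpu
	 *	rw-sep		fewer queues than cpus, spread round-robin
	 *	basic		2 subqs total (read/write), 1 comq
	 *	minimal		1 subq + 1 comq shared by all cpus
	 */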

	/*
	 * Create all I/O submission and completion queues.  The I/O
	 * queues start at 1 and are inclusive of niosubqs and niocomqs.
	 *
	 * NOTE: Completion queues must be created before submission queues.
	 *	 That is, the completion queue specified when creating a
	 *	 submission queue must already exist.
	 */
	error = 0;
	for (i = 1; i <= sc->niocomqs; ++i) {
		error += nvme_alloc_comqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate comqs\n");
			break;
		}
		error += nvme_create_comqueue(sc, i);
	}
	for (i = 1; i <= sc->niosubqs; ++i) {
		error += nvme_alloc_subqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate subqs\n");
			break;
		}
		error += nvme_create_subqueue(sc, i);
	}

	if (error) {
		device_printf(sc->dev, "Failed to initialize device!\n");
		sc->admin_func = nvme_admin_state_failed;
	} else {
		sc->admin_func = nvme_admin_state_identify_ns;
	}

	/*
	 * Basically interrupt coalescing is worthless if we care about
	 * performance, at least on the Intel 750.  Setting the threshold
	 * has no effect if time is set to 0.  The smallest time that can
	 * be set is a value of 1 (== 100uS), which is much too long.  That
	 * is only 10,000 interrupts/sec/cpu and on the Intel 750 it totally
	 * destroys sequential performance.
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	device_printf(sc->dev, "Interrupt Coalesce: 100uS / 4 qentries\n");

	/*
	 * XXX the printf above does not match the values actually
	 *     programmed; thr=0 and time=0 disable coalescing entirely.
	 */
	req->cmd.setfeat.flags = NVME_FID_INTCOALESCE;
	req->cmd.setfeat.intcoal.thr = 0;
	req->cmd.setfeat.intcoal.time = 0;

	nvme_submit_request(req);
	status = nvme_wait_request(req, hz);
	if (status) {
		device_printf(sc->dev,
			      "Interrupt coalesce failed status=%d\n",
			      status);
	}
	nvme_put_request(req);

	return 1;
}

/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static
int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ns_list_t *rp;
	int status;
	int i;
	int j;

	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE)
		device_printf(sc->dev,
			      "Namespace management supported\n");
	else
		device_printf(sc->dev,
			      "Namespace management not supported\n");

	/*
	 * Identify Controllers		TODO TODO TODO
	 */
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ANY_CTLR_LIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		kprintf("nsquery status %08x\n", status);

		for (i = 0; i < req->info->ctlrlist.idcount; ++i) {
			kprintf("CTLR %04x\n", req->info->ctlrlist.ctlrids[i]);
		}
		nvme_put_request(req);
	}
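
	/*
	 * (Per the TODO above, the controller list is only printed;
	 *  nothing is done with it yet.)
	 */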

	rp = kmalloc(sizeof(*rp), M_NVME, M_WAITOK | M_ZERO);
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		/*
		 * Namespace management supported, query active namespaces.
		 */
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		kprintf("nsquery status %08x\n", status);
		/* XXX handle status */

		*rp = req->info->nslist;
		nvme_put_request(req);
	} else {
		/*
		 * Namespace management not supported, assume nsids 1..N.
		 */
		for (i = 1; i <= sc->idctlr.ns_count && i <= 1024; ++i)
			rp->nsids[i - 1] = i;
	}
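
	/*
	 * (The fallback above assumes namespace IDs are contiguous from 1,
	 *  capped at the 1024 entries an identify namespace list holds.)
	 */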

	/*
	 * Identify each Namespace
	 */
	for (i = 0; i < 1024; ++i) {
		nvme_softns_t *nsc;
		nvme_lba_fmt_data_t *lbafmt;

		if (rp->nsids[i] == 0)
			continue;
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NS;
		req->cmd.identify.cntid = 0;
		req->cmd.identify.head.nsid = rp->nsids[i];
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		if (status != 0) {
			kprintf("NS FAILED %08x\n", status);
			nvme_put_request(req);
			continue;
		}

		/*
		 * Find the namespace if it was previously loaded, else
		 * find a free slot (prefer index i, fall back to the
		 * highest free slot).
		 */
		for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
			if (sc->nscary[j] != NULL &&
			    sc->nscary[j]->nsid == rp->nsids[i])
				break;
		}
		if (j == NVME_MAX_NAMESPACES) {
			j = i;
			if (sc->nscary[j] != NULL) {
				for (j = NVME_MAX_NAMESPACES - 1; j >= 0; --j) {
					if (sc->nscary[j] == NULL)
						break;
				}
			}
		}
		if (j < 0) {
			device_printf(sc->dev, "not enough room in nscary for "
				      "namespace %08x\n", rp->nsids[i]);
			nvme_put_request(req);
			continue;
		}
		nsc = sc->nscary[j];
		if (nsc == NULL) {
			nsc = kmalloc(sizeof(*nsc), M_NVME, M_WAITOK | M_ZERO);
			nsc->unit = nvme_alloc_disk_unit();
			sc->nscary[j] = nsc;
		}

		nsc->sc = sc;
		nsc->nsid = rp->nsids[i];
		nsc->state = NVME_NSC_STATE_UNATTACHED;
		nsc->idns = req->info->idns;
		bioq_init(&nsc->bioq);
		lockinit(&nsc->lk, "nvnsc", 0, 0);

		nvme_put_request(req);

		j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
		lbafmt = &nsc->idns.lba_fmt[j];
		nsc->blksize = 1 << lbafmt->sect_size;
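
		/*
		 * lbafmt->sect_size is the log2 of the LBA data size, so
		 * the shift above yields the sector size in bytes
		 * (e.g. 9 -> 512, 12 -> 4096).
		 */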

		/*
		 * Attach the namespace
		 */
		nvme_disk_attach(nsc);
	}
	kfree(rp, M_NVME);

	sc->admin_func = nvme_admin_state_operating;

	return 1;
}

static
int
nvme_admin_state_operating(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}

static
int
nvme_admin_state_failed(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}