sys/dev/disk/nvme/nvme_admin.c
/*
 * Copyright (c) 2016 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Administration thread
 *
 * - Handles resetting, features, iteration of namespaces, and disk
 *   attachments.  Most admin operations are serialized by the admin thread.
 *
 * - Ioctls as well as any BIOs which require more sophisticated processing
 *   are handed to this thread as well.
 *
 * - Can freeze/resume other queues for various purposes.
 */

#include "nvme.h"
static void nvme_admin_thread(void *arg);
static int nvme_admin_state_identify_ctlr(nvme_softc_t *sc);
static int nvme_admin_state_make_queues(nvme_softc_t *sc);
static int nvme_admin_state_identify_ns(nvme_softc_t *sc);
static int nvme_admin_state_operating(nvme_softc_t *sc);
static int nvme_admin_state_failed(nvme_softc_t *sc);
/*
 * Start the admin thread and block until it says it is running.
 */
int
nvme_start_admin_thread(nvme_softc_t *sc)
{
	int error, intr_flags;

	lockinit(&sc->admin_lk, "admlk", 0, 0);
	lockinit(&sc->ioctl_lk, "nvioc", 0, 0);
	sc->admin_signal = 0;

	intr_flags = INTR_MPSAFE;
	if (sc->nirqs == 1) {
		/* This interrupt processes data CQs too */
		intr_flags |= INTR_HIFREQ;
	}

	error = bus_setup_intr(sc->dev, sc->irq[0], intr_flags,
			       nvme_intr, &sc->comqueues[0],
			       &sc->irq_handle[0], NULL);
	if (error) {
		device_printf(sc->dev, "unable to install interrupt\n");
		return error;
	}
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	kthread_create(nvme_admin_thread, sc, &sc->admintd, "nvme_admin");
	while ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwbeg", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);

	return 0;
}
/*
 * Stop the admin thread and block until it says it is done.
 */
void
nvme_stop_admin_thread(nvme_softc_t *sc)
{
	uint32_t i;

	atomic_set_int(&sc->admin_signal, ADMIN_SIG_STOP);

	/*
	 * We have to wait for the admin thread to finish its probe
	 * before shutting it down.  Break out if the admin thread
	 * never managed to even start.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	while ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		if ((sc->admin_signal & ADMIN_SIG_RUNNING) == 0)
			break;
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	}
	lockmgr(&sc->admin_lk, LK_RELEASE);

	/*
	 * Disconnect our disks while the admin thread is still running,
	 * ensuring that the poll works even if interrupts are broken.
	 * Otherwise we could deadlock in the devfs core.
	 */
	for (i = 0; i < NVME_MAX_NAMESPACES; ++i) {
		nvme_softns_t *nsc;

		if ((nsc = sc->nscary[i]) != NULL) {
			nvme_disk_detach(nsc);

			kfree(nsc, M_NVME);
			sc->nscary[i] = NULL;
		}
	}

	/*
	 * Ask the admin thread to shut-down.
	 */
	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	wakeup(&sc->admin_signal);
	while (sc->admin_signal & ADMIN_SIG_RUNNING)
		lksleep(&sc->admin_signal, &sc->admin_lk, 0, "nvwend", 0);
	lockmgr(&sc->admin_lk, LK_RELEASE);
	if (sc->irq_handle[0]) {
		bus_teardown_intr(sc->dev, sc->irq[0], sc->irq_handle[0]);
		sc->irq_handle[0] = NULL;
	}
	lockuninit(&sc->ioctl_lk);
	lockuninit(&sc->admin_lk);

	/*
	 * Thread might be running on another cpu, give it time to actually
	 * exit before returning in case the caller is about to unload the
	 * module.  Otherwise we don't need this.
	 */
	nvme_os_sleep(1);
}
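
/*
 * Admin thread main loop.
 *
 * Each pass polls all configured completion queues and then calls the
 * current admin state function (sc->admin_func).  The state functions
 * chain identify_ctlr -> make_queues -> identify_ns -> operating (or
 * failed); a state function returns non-zero while more work is pending
 * and 0 once the thread may sleep until signalled.
 */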
static
void
nvme_admin_thread(void *arg)
{
	nvme_softc_t *sc = arg;
	uint32_t i;

	lockmgr(&sc->admin_lk, LK_EXCLUSIVE);
	atomic_set_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);

	sc->admin_func = nvme_admin_state_identify_ctlr;

	while ((sc->admin_signal & ADMIN_SIG_STOP) == 0) {
		for (i = 0; i <= sc->niocomqs; ++i) {
			nvme_comqueue_t *comq = &sc->comqueues[i];

			if (comq->nqe == 0)	/* not configured */
				continue;

			lockmgr(&comq->lk, LK_EXCLUSIVE);
			nvme_poll_completions(comq, &comq->lk);
			lockmgr(&comq->lk, LK_RELEASE);
		}
		if (sc->admin_signal & ADMIN_SIG_REQUEUE) {
			atomic_clear_int(&sc->admin_signal, ADMIN_SIG_REQUEUE);
			nvme_disk_requeues(sc);
		}
		if (sc->admin_func(sc) == 0 &&
		    (sc->admin_signal & ADMIN_SIG_RUN_MASK) == 0) {
			lksleep(&sc->admin_signal, &sc->admin_lk, 0,
				"nvidle", hz);
		}
	}

	/*
	 * Cleanup state.
	 *
	 * Note that we actually issue delete queue commands here.  The NVME
	 * spec says that for a normal shutdown the I/O queues should be
	 * deleted prior to issuing the shutdown in the CONFIG register.
	 */
	for (i = 1; i <= sc->niosubqs; ++i) {
		nvme_delete_subqueue(sc, i);
		nvme_free_subqueue(sc, i);
	}
	for (i = 1; i <= sc->niocomqs; ++i) {
		nvme_delete_comqueue(sc, i);
		nvme_free_comqueue(sc, i);
	}

	/*
	 * Signal that we are done.
	 */
	atomic_clear_int(&sc->admin_signal, ADMIN_SIG_RUNNING);
	wakeup(&sc->admin_signal);
	lockmgr(&sc->admin_lk, LK_RELEASE);
}
/*
 * Identify the controller
 */
static
int
nvme_admin_state_identify_ctlr(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ctlr_data_t *rp;
	int status;
	uint64_t mempgsize;
	char serial[20+16];
	char model[40+16];

	/*
	 * Identify Controller
	 */
	mempgsize = NVME_CAP_MEMPG_MIN_GET(sc->cap);

	req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
	req->cmd.identify.cns = NVME_CNS_CTLR;
	req->cmd.identify.cntid = 0;
	bzero(req->info, sizeof(*req->info));
	nvme_submit_request(req);
	status = nvme_wait_request(req, hz);
	/* XXX handle status */

	sc->idctlr = req->info->idctlr;
	nvme_put_request(req);

	rp = &sc->idctlr;

	KKASSERT(sizeof(sc->idctlr.serialno) == 20);
	KKASSERT(sizeof(sc->idctlr.modelno) == 40);
	bzero(serial, sizeof(serial));
	bzero(model, sizeof(model));
	bcopy(rp->serialno, serial, sizeof(rp->serialno));
	bcopy(rp->modelno, model, sizeof(rp->modelno));
	string_cleanup(serial, 0);
	string_cleanup(model, 0);

	device_printf(sc->dev, "Model %s BaseSerial %s nscount=%d\n",
		      model, serial, rp->ns_count);

	sc->admin_func = nvme_admin_state_make_queues;

	return 1;
}
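
/*
 * COMQFIXUP() folds a 1-based MSI-X vector / submission queue number onto
 * an available completion queue id in the range 1..ncomqs.  Queue 0 is the
 * admin queue and is never returned.
 */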
#define COMQFIXUP(msix, ncomqs)	((((msix) - 1) % ncomqs) + 1)
/*
 * Request and create the I/O queues.  Figure out CPU mapping optimizations.
 */
static
int
nvme_admin_state_make_queues(nvme_softc_t *sc)
{
	nvme_request_t *req;
	uint16_t niosubqs;
	uint16_t niocomqs;
	uint32_t i;
	uint16_t qno;
	int status;
	int error;

	/*
	 * Calculate how many I/O queues (non-inclusive of admin queue)
	 * we want to have, up to 65535.  dw0 in the response returns the
	 * number of queues the controller gives us.  Submission and
	 * Completion queues are specified separately.
	 *
	 * This driver runs optimally with 4 submission queues and one
	 * completion queue per cpu (rdhipri, rdlopri, wrhipri, wrlopri),
	 *
	 * +1 for dumps		XXX future
	 * +1 for async events	XXX future
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	niosubqs = ncpus * 2 + 0;
	niocomqs = ncpus + 0;
	if (niosubqs > NVME_MAX_QUEUES)
		niosubqs = NVME_MAX_QUEUES;
	if (niocomqs > NVME_MAX_QUEUES)
		niocomqs = NVME_MAX_QUEUES;
	device_printf(sc->dev, "Request %u/%u queues, ", niosubqs, niocomqs);

	req->cmd.setfeat.flags = NVME_FID_NUMQUEUES;
	req->cmd.setfeat.numqs.nsqr = niosubqs - 1;	/* 0's based 0=1 */
	req->cmd.setfeat.numqs.ncqr = niocomqs - 1;	/* 0's based 0=1 */

	nvme_submit_request(req);

	/*
	 * Get response and set our operations mode.
	 */
	status = nvme_wait_request(req, hz);
	/* XXX handle status */
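
	/*
	 * dw0 of the completion encodes the number of queues the controller
	 * actually allocated, 0's based: submission queue count in the low
	 * 16 bits, completion queue count in the high 16 bits, hence the +1
	 * below.
	 */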
	if (status == 0) {
		sc->niosubqs = 1 + (req->res.setfeat.dw0 & 0xFFFFU);
		sc->niocomqs = 1 + ((req->res.setfeat.dw0 >> 16) & 0xFFFFU);
	} else {
		sc->niosubqs = 0;
		sc->niocomqs = 0;
	}
	kprintf("Returns %u/%u queues, ", sc->niosubqs, sc->niocomqs);

	nvme_put_request(req);

	sc->dumpqno = 0;
	sc->eventqno = 0;

	if (sc->niosubqs >= ncpus * 2 + 0 && sc->niocomqs >= ncpus + 0) {
		/*
		 * If we got all the queues we wanted do a full-bore setup of
		 * qmap[cpu][type].
		 *
		 * Remember that subq 0 / comq 0 is the admin queue.
		 */
		kprintf("optimal map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], ncpus);

			KKASSERT(cpuqno != 0);
			sc->qmap[i][0] = qno + 0;
			sc->qmap[i][1] = qno + 1;
			sc->subqueues[qno + 0].comqid = cpuqno;
			sc->subqueues[qno + 1].comqid = cpuqno;
			qno += 2;
		}
		sc->niosubqs = ncpus * 2 + 0;
		sc->niocomqs = ncpus + 0;
	} else if (sc->niosubqs >= ncpus && sc->niocomqs >= ncpus) {
		/*
		 * We have enough to give each cpu its own submission
		 * and completion queue.
		 *
		 * leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("nominal map 1:1 cpu\n");
		for (i = 0; i < ncpus; ++i) {
			qno = sc->cputovect[i];
			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;
			sc->qmap[i][1] = qno;
			sc->subqueues[qno].comqid = COMQFIXUP(qno, ncpus);
		}
		sc->niosubqs = ncpus;
		sc->niocomqs = ncpus;
	} else if (sc->niosubqs >= 2 && sc->niocomqs >= 2) {
		/*
		 * prioritize trying to distribute available queues to
		 * cpus, don't separate read and write.
		 *
		 * leave dumpqno and eventqno set to the admin queue.
		 */
		kprintf("rw-sep map (%d, %d)\n", sc->niosubqs, sc->niocomqs);
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], sc->niocomqs);
			int qno = COMQFIXUP((i + 1), sc->niosubqs);

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno;		/* read */
			sc->qmap[i][1] = qno;		/* write */
			sc->subqueues[qno].comqid = cpuqno;
			/* do not increment qno */
		}
#if 0
		sc->niosubqs = 2;
		sc->niocomqs = 2;
#endif
	} else if (sc->niosubqs >= 2) {
		/*
		 * We have enough to have separate read and write queues.
		 */
		kprintf("basic map\n");
		qno = 1;
		for (i = 0; i < ncpus; ++i) {
			int cpuqno = COMQFIXUP(sc->cputovect[i], 1);

			KKASSERT(qno != 0);
			sc->qmap[i][0] = qno + 0;	/* read */
			sc->qmap[i][1] = qno + 1;	/* write */
			if (i <= 0)
				sc->subqueues[qno + 0].comqid = cpuqno;
			if (i <= 1)
				sc->subqueues[qno + 1].comqid = cpuqno;
		}
		sc->niosubqs = 2;
		sc->niocomqs = 1;
	} else {
		/*
		 * Minimal configuration, all cpus and I/O types use the
		 * same queue.  Sad day.
		 */
		kprintf("minimal map\n");
		sc->dumpqno = 0;
		sc->eventqno = 0;
		for (i = 0; i < ncpus; ++i) {
			sc->qmap[i][0] = 1;
			sc->qmap[i][1] = 1;
		}
		sc->subqueues[1].comqid = 1;
		sc->niosubqs = 1;
		sc->niocomqs = 1;
	}
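
	/*
	 * At this point qmap[cpu][0] selects the submission queue used for
	 * reads and qmap[cpu][1] the one used for writes on that cpu, and
	 * each submission queue's comqid points at the completion queue
	 * that services it.
	 */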
	/*
	 * Create all I/O submission and completion queues.  The I/O
	 * queues start at 1 and are inclusive of niosubqs and niocomqs.
	 *
	 * NOTE: Completion queues must be created before submission queues.
	 *	 That is, the completion queue specified when creating a
	 *	 submission queue must already exist.
	 */
	error = 0;
	for (i = 1; i <= sc->niocomqs; ++i) {
		error += nvme_alloc_comqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate comqs\n");
			break;
		}
		error += nvme_create_comqueue(sc, i);
	}
	for (i = 1; i <= sc->niosubqs; ++i) {
		error += nvme_alloc_subqueue(sc, i);
		if (error) {
			device_printf(sc->dev, "Unable to allocate subqs\n");
			break;
		}
		error += nvme_create_subqueue(sc, i);
	}

	if (error) {
		device_printf(sc->dev, "Failed to initialize device!\n");
		sc->admin_func = nvme_admin_state_failed;
	} else {
		sc->admin_func = nvme_admin_state_identify_ns;
	}
	/*
	 * Basically interrupt coalescing is worthless if we care about
	 * performance, at least on the Intel 750.  Setting the threshold
	 * has no effect if time is set to 0.  The smallest time that can
	 * be set is a value of 1 (== 100uS), which is much too long.  That
	 * is only 10,000 interrupts/sec/cpu and on the Intel 750 it totally
	 * destroys sequential performance.
	 */
	req = nvme_get_admin_request(sc, NVME_OP_SET_FEATURES);

	device_printf(sc->dev, "Interrupt Coalesce: 100uS / 4 qentries\n");

	req->cmd.setfeat.flags = NVME_FID_INTCOALESCE;
	req->cmd.setfeat.intcoal.thr = 0;
	req->cmd.setfeat.intcoal.time = 0;
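
	/*
	 * thr = 0 / time = 0 requests no aggregation, leaving interrupt
	 * coalescing effectively disabled per the reasoning above (the
	 * printf text does not match the values actually programmed).
	 */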
	nvme_submit_request(req);
	status = nvme_wait_request(req, hz);
	if (status) {
		device_printf(sc->dev,
			      "Interrupt coalesce failed status=%d\n",
			      status);
	}
	nvme_put_request(req);

	return 1;
}
/*
 * Identify available namespaces, iterate, and attach to disks.
 */
static
int
nvme_admin_state_identify_ns(nvme_softc_t *sc)
{
	nvme_request_t *req;
	nvme_ident_ns_list_t *rp;
	int status;
	uint32_t i;
	int j;			/* must be signed: slot scan below can end at -1 */
	if (bootverbose) {
		if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE)
			device_printf(sc->dev,
				      "Namespace management supported\n");
		else
			device_printf(sc->dev,
				      "Namespace management not supported\n");
	}
#if 0
	/*
	 * Identify Controllers	TODO TODO TODO
	 */
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ANY_CTLR_LIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		kprintf("nsquery status %08x\n", status);

#if 0
		for (i = 0; i < req->info->ctlrlist.idcount; ++i) {
			kprintf("CTLR %04x\n", req->info->ctlrlist.ctlrids[i]);
		}
#endif
		nvme_put_request(req);
	}
#endif

	rp = kmalloc(sizeof(*rp), M_NVME, M_WAITOK | M_ZERO);
	if (sc->idctlr.admin_cap & NVME_ADMIN_NSMANAGE) {
		/*
		 * Namespace management supported, query active namespaces.
		 */
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NSLIST;
		req->cmd.identify.cntid = 0;
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		kprintf("nsquery status %08x\n", status);
		/* XXX handle status */

		cpu_lfence();
		*rp = req->info->nslist;
		nvme_put_request(req);
	} else {
		/*
		 * Namespace management not supported, assume nsids 1..N.
		 */
		for (i = 1; i <= sc->idctlr.ns_count && i <= 1024; ++i)
			rp->nsids[i-1] = i;
	}

	/*
	 * Identify each Namespace
	 */
	for (i = 0; i < 1024; ++i) {
		nvme_softns_t *nsc;
		nvme_lba_fmt_data_t *lbafmt;

		if (rp->nsids[i] == 0)
			continue;
		req = nvme_get_admin_request(sc, NVME_OP_IDENTIFY);
		req->cmd.identify.cns = NVME_CNS_ACT_NS;
		req->cmd.identify.cntid = 0;
		req->cmd.identify.head.nsid = rp->nsids[i];
		bzero(req->info, sizeof(*req->info));
		nvme_submit_request(req);
		status = nvme_wait_request(req, hz);
		if (status != 0) {
			kprintf("NS FAILED %08x\n", status);
			nvme_put_request(req);	/* return the request before skipping */
			continue;
		}
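
		/*
		 * Pick an nscary[] slot: reuse the slot already holding
		 * this nsid if there is one, otherwise prefer index i,
		 * otherwise fall back to the highest free slot.
		 */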
		for (j = 0; j < NVME_MAX_NAMESPACES; ++j) {
			if (sc->nscary[j] &&
			    sc->nscary[j]->nsid == rp->nsids[i])
				break;
		}
		if (j == NVME_MAX_NAMESPACES) {
			j = i;
			if (sc->nscary[j] != NULL) {
				for (j = NVME_MAX_NAMESPACES - 1; j >= 0; --j) {
					if (sc->nscary[j] == NULL)
						break;
				}
			}
		}
		if (j < 0) {
			device_printf(sc->dev, "not enough room in nscary for "
				      "namespace %08x\n", rp->nsids[i]);
			nvme_put_request(req);
			continue;
		}
		nsc = sc->nscary[j];
		if (nsc == NULL) {
			nsc = kmalloc(sizeof(*nsc), M_NVME, M_WAITOK | M_ZERO);
			nsc->unit = nvme_alloc_disk_unit();
			sc->nscary[j] = nsc;
		}
		if (sc->nscmax <= j)
			sc->nscmax = j + 1;
		nsc->sc = sc;
		nsc->nsid = rp->nsids[i];
		nsc->state = NVME_NSC_STATE_UNATTACHED;
		nsc->idns = req->info->idns;
		bioq_init(&nsc->bioq);
		lockinit(&nsc->lk, "nvnsc", 0, 0);

		nvme_put_request(req);
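
		/*
		 * The FLBAS field selects which LBA format is in use;
		 * sect_size is the log2 of the sector size, so the block
		 * size is 2^sect_size bytes.
		 */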
		j = NVME_FLBAS_SEL_GET(nsc->idns.flbas);
		lbafmt = &nsc->idns.lba_fmt[j];
		nsc->blksize = 1 << lbafmt->sect_size;

		/*
		 * Attach the namespace
		 */
		nvme_disk_attach(nsc);
	}
	kfree(rp, M_NVME);

	sc->admin_func = nvme_admin_state_operating;

	return 1;
}
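
/*
 * Terminal states: once the device is operating (or has failed) the state
 * function simply marks the probe as complete so nvme_stop_admin_thread()
 * can proceed, and returns 0 so the admin thread idles between polls.
 */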
static
int
nvme_admin_state_operating(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}
static
int
nvme_admin_state_failed(nvme_softc_t *sc)
{
	if ((sc->admin_signal & ADMIN_SIG_PROBED) == 0) {
		atomic_set_int(&sc->admin_signal, ADMIN_SIG_PROBED);
		wakeup(&sc->admin_signal);
	}

	return 0;
}