/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"
/* Scheduling domains. */
enum {
	KYBER_READ,
	KYBER_SYNC_WRITE,
	KYBER_OTHER, /* Async writes, discard, etc. */
	KYBER_NUM_DOMAINS,
};

enum {
	KYBER_MIN_DEPTH = 256,

	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};
/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_SYNC_WRITE] = 128,
	[KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_SYNC_WRITE] = 8,
	[KYBER_OTHER] = 1,
};
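/*
 * Taken together with kyber_depth[], these tables mean reads get both the
 * largest device-wide token pool and the largest per-queue dispatch share:
 * a busy hardware queue dispatches up to a full batch from one domain before
 * rotating to the next. (Summary of the tables above, not extra tuning
 * logic.)
 */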
/*
 * There is a one-to-one mapping between ctx & hctx and kcq & khd; we use
 * request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure that operations on rq_list and kcq_map are atomic.
	 * Also protects the requests on rq_list during merges.
	 */
	spinlock_t lock;
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;
struct kyber_queue_data {
	struct request_queue *q;

	struct blk_stat_callback *cb;

	/*
	 * The device is divided into multiple scheduling domains based on the
	 * request type. Each domain has a fixed number of in-flight requests of
	 * that type device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	/* Target latencies in nanoseconds. */
	u64 read_lat_nsec, write_lat_nsec;
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	struct kyber_ctx_queue *kcqs;
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key);

static unsigned int kyber_sched_domain(unsigned int op)
{
	if ((op & REQ_OP_MASK) == REQ_OP_READ)
		return KYBER_READ;
	else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
		return KYBER_SYNC_WRITE;
	else
		return KYBER_OTHER;
}
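/*
 * So, for example, a REQ_OP_READ maps to KYBER_READ, a REQ_OP_WRITE that
 * op_is_sync() considers synchronous (e.g. REQ_SYNC or REQ_FUA set) maps to
 * KYBER_SYNC_WRITE, and everything else, such as buffered writeback and
 * discards, falls into KYBER_OTHER.
 */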
enum {
	NONE = 0,
	GOOD = 1,
	GREAT = 2,
	BAD = -1,
	AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
			    unsigned int sched_domain, u64 target)
{
	u64 latency;

	if (!cb->stat[sched_domain].nr_samples)
		return NONE;

	latency = cb->stat[sched_domain].mean;
	if (latency >= 2 * target)
		return AWFUL;
	else if (latency > target)
		return BAD;
	else if (latency <= target / 2)
		return GREAT;
	else /* (latency <= target) */
		return GOOD;
}
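/*
 * Worked example with the default 2 ms read target (illustrative numbers):
 * a mean latency of 5 ms is AWFUL (>= 2 * target), 3 ms is BAD, 0.8 ms is
 * GREAT (<= target / 2), and 1.5 ms is GOOD.
 */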
/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
				  unsigned int sched_domain, int this_status,
				  int other_status)
{
	unsigned int orig_depth, depth;

	/*
	 * If this domain had no samples, or reads and writes are both good or
	 * both bad, don't adjust the depth.
	 */
	if (this_status == NONE ||
	    (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
	    (IS_BAD(this_status) && IS_BAD(other_status)))
		return;

	orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

	if (other_status == NONE) {
		depth++;
	} else {
		switch (this_status) {
		case GOOD:
			if (other_status == AWFUL)
				depth -= max(depth / 4, 1U);
			else
				depth -= max(depth / 8, 1U);
			break;
		case GREAT:
			if (other_status == AWFUL)
				depth /= 2;
			else
				depth -= max(depth / 4, 1U);
			break;
		case BAD:
			depth++;
			break;
		case AWFUL:
			if (other_status == GREAT)
				depth += 2;
			else
				depth++;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
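/*
 * In short: a domain that is meeting its target gives up depth, and does so
 * more aggressively the worse the other domain is doing, while a domain that
 * is missing its target slowly earns depth back. The clamp keeps every domain
 * between one token and its kyber_depth[] cap.
 */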
/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
				     int read_status, int write_status,
				     bool have_samples)
{
	unsigned int orig_depth, depth;
	int status;

	orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

	if (read_status == NONE && write_status == NONE) {
		depth += 2;
	} else if (have_samples) {
		if (read_status == NONE)
			status = write_status;
		else if (write_status == NONE)
			status = read_status;
		else
			status = max(read_status, write_status);
		switch (status) {
		case GREAT:
			depth += 2;
			break;
		case GOOD:
			depth++;
			break;
		case BAD:
			depth -= max(depth / 4, 1U);
			break;
		case AWFUL:
			depth /= 2;
			break;
		}
	}

	depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
	if (depth != orig_depth)
		sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}
/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
	struct kyber_queue_data *kqd = cb->data;
	int read_status, write_status;

	read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
	write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE,
					kqd->write_lat_nsec);

	kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
	kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
	kyber_adjust_other_depth(kqd, read_status, write_status,
				 cb->stat[KYBER_OTHER].nr_samples != 0);

	/*
	 * Continue monitoring latencies if we aren't hitting the targets or
	 * we're still throttling other requests.
	 */
	if (!blk_stat_is_active(kqd->cb) &&
	    ((IS_BAD(read_status) || IS_BAD(write_status) ||
	      kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
		blk_stat_activate_msecs(kqd->cb, 100);
}
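/*
 * Cadence of the feedback loop: kyber_completed_request() arms the callback
 * with a 10 ms window as soon as a request misses its target, and this timer
 * then re-arms itself every 100 ms for as long as latencies stay bad or
 * KYBER_OTHER remains throttled below its cap.
 */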
static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static int kyber_bucket_fn(const struct request *rq)
{
	return kyber_sched_domain(rq->cmd_flags);
}
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	unsigned int max_tokens;
	unsigned int shift;
	int ret = -ENOMEM;
	int i;

	kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;
	kqd->q = q;

	kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
					  KYBER_NUM_DOMAINS, kqd);
	if (!kqd->cb)
		goto err_kqd;

	/*
	 * The maximum number of tokens for any scheduling domain is at least
	 * the queue depth of a single hardware queue. If the hardware doesn't
	 * have many tags, still provide a reasonable number.
	 */
	max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
			   KYBER_MIN_DEPTH);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      max_tokens, -1, false, GFP_KERNEL,
					      q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_cb;
		}
		sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
	}
	shift = kyber_sched_tags_shift(kqd);
	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
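	/*
	 * Worked example (illustrative, not from the source): with a
	 * scheduler tag shift of 6, i.e. 64 tags per sbitmap word,
	 * async_depth = 64 * 75 / 100 = 48, leaving 16 tags of each word
	 * reserved for synchronous requests.
	 */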
	kqd->read_lat_nsec = 2000000ULL;
	kqd->write_lat_nsec = 10000000ULL;

	return kqd;

err_cb:
	blk_stat_free_callback(kqd->cb);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}
static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	eq->elevator_data = kqd;
	q->elevator = eq;

	blk_stat_add_callback(q, kqd->cb);

	return 0;
}
static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	struct request_queue *q = kqd->q;
	int i;

	blk_stat_remove_callback(q, kqd->cb);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	blk_stat_free_callback(kqd->cb);
	kfree(kqd);
}
static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
	unsigned int i;

	spin_lock_init(&kcq->lock);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		INIT_LIST_HEAD(&kcq->rq_list[i]);
}
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		init_waitqueue_func_entry(&khd->domain_wait[i],
					  kyber_domain_wake);
		khd->domain_wait[i].private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
					kqd->async_depth);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}
static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_free(&khd->kcq_map[i]);
	kfree(khd->kcqs);
	kfree(hctx->sched_data);
}
static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}
static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(op)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}
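/*
 * As the comment in struct kyber_queue_data notes, async_depth is a per-word
 * depth for sbitmap_get_shallow(): the tag allocator will not hand more than
 * async_depth bits of any sbitmap word to async requests, which preserves the
 * sync reserve established by KYBER_ASYNC_PERCENT.
 */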
static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	spin_lock(&kcq->lock);
	merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
	spin_unlock(&kcq->lock);
	blk_mq_put_ctx(ctx);

	return merged;
}
static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
	rq_set_domain_token(rq, -1);
}
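/*
 * The -1 sentinel means "no domain token held"; rq_clear_domain_token()
 * checks for it so that a request which is completed or requeued without
 * ever being dispatched does not release a token it never acquired.
 */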
static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list, bool at_head)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		if (at_head)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw);
		blk_mq_sched_request_inserted(rq);
		spin_unlock(&kcq->lock);
	}
}
static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}
static void kyber_completed_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;
	unsigned int sched_domain;
	u64 now, latency, target;

	/*
	 * Check if this request met our latency goal. If not, quickly gather
	 * some statistics and start throttling.
	 */
	sched_domain = kyber_sched_domain(rq->cmd_flags);
	switch (sched_domain) {
	case KYBER_READ:
		target = kqd->read_lat_nsec;
		break;
	case KYBER_SYNC_WRITE:
		target = kqd->write_lat_nsec;
		break;
	default:
		return;
	}

	/* If we are already monitoring latencies, don't check again. */
	if (blk_stat_is_active(kqd->cb))
		return;

	now = ktime_get_ns();
	if (now < rq->io_start_time_ns)
		return;

	latency = now - rq->io_start_time_ns;

	if (latency > target)
		blk_stat_activate_msecs(kqd->cb, 10);
}
struct flush_kcq_data {
	struct kyber_hctx_data *khd;
	unsigned int sched_domain;
	struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_kcq_data *flush_data = data;
	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

	spin_lock(&kcq->lock);
	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
			      flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&kcq->lock);

	return true;
}
static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
				  unsigned int sched_domain,
				  struct list_head *list)
{
	struct flush_kcq_data data = {
		.khd = khd,
		.sched_domain = sched_domain,
		.list = list,
	};

	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
			     flush_busy_kcq, &data);
}
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

	list_del_init(&wait->entry);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}
static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (nr < 0 && list_empty_careful(&wait->entry)) {
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		khd->domain_ws[sched_domain] = ws;
		add_wait_queue(&ws->wait, wait);

		/*
		 * Try again in case a token was freed before we got on the
		 * wait queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}

	/*
	 * If we got a token while we were on the wait queue, remove ourselves
	 * from the wait queue to ensure that all wake ups make forward
	 * progress. It's possible that the waker already deleted the entry
	 * between the !list_empty_careful() check and us grabbing the lock, but
	 * list_del_init() is okay with that.
	 */
	if (nr >= 0 && !list_empty_careful(&wait->entry)) {
		ws = khd->domain_ws[sched_domain];
		spin_lock_irq(&ws->wait.lock);
		list_del_init(&wait->entry);
		spin_unlock_irq(&ws->wait.lock);
	}

	return nr;
}
static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];

	/*
	 * If we already have a flushed request, then we just need to get a
	 * token for it. Otherwise, if there are pending requests in the kcqs,
	 * flush the kcqs, but only if we can get a token. If not, we should
	 * leave the requests in the kcqs so that they can be merged. Note that
	 * khd->lock serializes the flushes, so if we observed any bit set in
	 * the kcq_map, we will always get a request.
	 */
	rq = list_first_entry_or_null(rqs, struct request, queuelist);
	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
			rq = list_first_entry(rqs, struct request, queuelist);
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}
static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the current domain.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}
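/*
 * On a saturated queue the resulting dispatch pattern is round-robin in
 * batches: up to kyber_batch_size[] requests from the current domain (16
 * reads, 8 sync writes, or 1 other), then on to the next domain, skipping
 * any domain that is out of requests or out of device-wide tokens.
 */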
static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]) ||
		    sbitmap_any_bit_set(&khd->kcq_map[i]))
			return true;
	}

	return false;
}
#define KYBER_LAT_SHOW_STORE(op)					\
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e,		\
				     char *page)			\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
									\
	return sprintf(page, "%llu\n", kqd->op##_lat_nsec);		\
}									\
									\
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e,		\
				      const char *page, size_t count)	\
{									\
	struct kyber_queue_data *kqd = e->elevator_data;		\
	unsigned long long nsec;					\
	int ret;							\
									\
	ret = kstrtoull(page, 10, &nsec);				\
	if (ret)							\
		return ret;						\
									\
	kqd->op##_lat_nsec = nsec;					\
									\
	return count;							\
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE
#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
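/*
 * These attributes show up under /sys/block/<dev>/queue/iosched/ once kyber
 * is the active scheduler. Illustrative shell usage (hypothetical device
 * name):
 *
 *	echo kyber > /sys/block/nvme0n1/queue/scheduler
 *	echo 1000000 > /sys/block/nvme0n1/queue/iosched/read_lat_nsec
 *
 * The second command sets a 1 ms read latency target; values are parsed by
 * kstrtoull() as plain decimal nanoseconds.
 */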
#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name)			\
static int kyber_##name##_tokens_show(void *data, struct seq_file *m)	\
{									\
	struct request_queue *q = data;					\
	struct kyber_queue_data *kqd = q->elevator->elevator_data;	\
									\
	sbitmap_queue_show(&kqd->domain_tokens[domain], m);		\
	return 0;							\
}									\
									\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos)	\
	__acquires(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_lock(&khd->lock);						\
	return seq_list_start(&khd->rqs[domain], *pos);			\
}									\
									\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v,	\
				     loff_t *pos)			\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	return seq_list_next(v, &khd->rqs[domain], pos);		\
}									\
									\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v)	\
	__releases(&khd->lock)						\
{									\
	struct blk_mq_hw_ctx *hctx = m->private;			\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
									\
	spin_unlock(&khd->lock);					\
}									\
									\
static const struct seq_operations kyber_##name##_rqs_seq_ops = {	\
	.start	= kyber_##name##_rqs_start,				\
	.next	= kyber_##name##_rqs_next,				\
	.stop	= kyber_##name##_rqs_stop,				\
	.show	= blk_mq_debugfs_rq_show,				\
};									\
									\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m)	\
{									\
	struct blk_mq_hw_ctx *hctx = data;				\
	struct kyber_hctx_data *khd = hctx->sched_data;			\
	wait_queue_entry_t *wait = &khd->domain_wait[domain];		\
									\
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry));	\
	return 0;							\
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS
static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}
static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	switch (khd->cur_domain) {
	case KYBER_READ:
		seq_puts(m, "READ\n");
		break;
	case KYBER_SYNC_WRITE:
		seq_puts(m, "SYNC_WRITE\n");
		break;
	case KYBER_OTHER:
		seq_puts(m, "OTHER\n");
		break;
	default:
		seq_printf(m, "%u\n", khd->cur_domain);
		break;
	}
	return 0;
}
static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}
#define KYBER_QUEUE_DOMAIN_ATTRS(name)	\
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name)					\
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops},	\
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(sync_write),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif
static struct elevator_type kyber_sched = {
	.ops.mq = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
	},
	.uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};
static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);
MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");