/*
 * Copyright (c) 2009, 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Alex Hornung <ahornung@gmail.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/diskslice.h>
#include <sys/disk.h>
#include <sys/malloc.h>
#include <machine/md_var.h>
#include <sys/ctype.h>
#include <sys/syslog.h>
#include <sys/device.h>
#include <sys/msgport.h>
#include <sys/msgport2.h>
#include <sys/buf2.h>
#include <sys/dsched.h>
#include <sys/fcntl.h>
#include <machine/varargs.h>
TAILQ_HEAD(tdio_list_head, dsched_thread_io);

MALLOC_DEFINE(M_DSCHED, "dsched", "dsched allocs");

static dsched_prepare_t		noop_prepare;
static dsched_teardown_t	noop_teardown;
static dsched_cancel_t		noop_cancel;
static dsched_queue_t		noop_queue;

static void dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio);
static void dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name);
static void dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx);
static void dsched_thread_io_destroy(struct dsched_thread_io *tdio);
static void dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx);
static void policy_new(struct disk *dp, struct dsched_policy *pol);
static void policy_destroy(struct disk *dp);
static int	dsched_inited = 0;
static int	default_set = 0;

struct lock	dsched_lock;
static int	dsched_debug_enable = 0;

struct dsched_stats	dsched_stats;

struct objcache_malloc_args dsched_disk_ctx_malloc_args = {
	DSCHED_DISK_CTX_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_io_malloc_args = {
	DSCHED_THREAD_IO_MAX_SZ, M_DSCHED };
struct objcache_malloc_args dsched_thread_ctx_malloc_args = {
	DSCHED_THREAD_CTX_MAX_SZ, M_DSCHED };

static struct objcache	*dsched_diskctx_cache;
static struct objcache	*dsched_tdctx_cache;
static struct objcache	*dsched_tdio_cache;

TAILQ_HEAD(, dsched_thread_ctx)	dsched_tdctx_list =
		TAILQ_HEAD_INITIALIZER(dsched_tdctx_list);

struct lock	dsched_tdctx_lock;

static struct dsched_policy_head dsched_policy_list =
		TAILQ_HEAD_INITIALIZER(dsched_policy_list);
static struct dsched_policy dsched_noop_policy = {
	.name = "noop",

	.prepare = noop_prepare,
	.teardown = noop_teardown,
	.cancel_all = noop_cancel,
	.bio_queue = noop_queue
};

static struct dsched_policy *default_policy = &dsched_noop_policy;
/*
 * dsched_debug() is a SYSCTL and TUNABLE controlled debug output function.
 */
int
dsched_debug(int level, char *fmt, ...)
{
	__va_list ap;

	__va_start(ap, fmt);
	if (level <= dsched_debug_enable)
		kvprintf(fmt, ap);
	__va_end(ap);

	return 0;
}
/*
 * Called on disk_create(). Tries to read which policy to use from
 * loader.conf; if none is specified, the default policy is used.
 */
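/*
 * Illustrative loader.conf entries matching the tunable names constructed
 * below (the policy name "fq" and disk name "ad0" are placeholders, not
 * defined in this file):
 *
 *	dsched.policy.ad0="fq"		# per-device: head_name + unit
 *	dsched.policy.default="fq"	# fallback for all disks
 */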
void
dsched_disk_create_callback(struct disk *dp, const char *head_name, int unit)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	char *ptr;
	struct dsched_policy *policy = NULL;

	/* Also look for serno stuff? */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key),
		  "dsched.policy.%s%d", head_name, unit);
	if (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key),
		  "dsched.policy.%s", head_name);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	if (!policy && (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.default");
	if (!policy && !default_set &&
	    (TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (!policy) {
		if (!default_set && bootverbose) {
			dsched_debug(0,
			    "No policy for %s%d specified, "
			    "or policy not found\n",
			    head_name, unit);
		}
		dsched_set_policy(dp, default_policy);
	} else {
		dsched_set_policy(dp, policy);
	}

	if (strncmp(head_name, "mapper/", strlen("mapper/")) == 0)
		ksnprintf(tunable_key, sizeof(tunable_key), "%s", head_name);
	else
		ksnprintf(tunable_key, sizeof(tunable_key), "%s%d", head_name, unit);
	for (ptr = tunable_key; *ptr; ptr++) {
		if (*ptr == '/')
			*ptr = '-';
	}
	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    tunable_key);

	lockmgr(&dsched_lock, LK_RELEASE);
}
/*
 * Called from disk_setdiskinfo (or rather _setdiskinfo). This will check if
 * there's any policy associated with the serial number of the device.
 */
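/*
 * Illustrative tunable (assumed naming, not defined in this file): a policy
 * can be pinned to a device by serial number via loader.conf, e.g.
 *
 *	dsched.policy.<serialno>="fq"	# "fq" is a placeholder policy name
 */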
void
dsched_disk_update_callback(struct disk *dp, struct disk_info *info)
{
	char tunable_key[SPECNAMELEN + 48];
	char sched_policy[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *policy = NULL;

	if (info->d_serialno == NULL)
		return;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	ksnprintf(tunable_key, sizeof(tunable_key), "dsched.policy.%s",
		  info->d_serialno);

	if ((TUNABLE_STR_FETCH(tunable_key, sched_policy,
	    sizeof(sched_policy)) != 0)) {
		policy = dsched_find_policy(sched_policy);
	}

	if (policy != NULL)
		dsched_switch(dp, policy);

	dsched_sysctl_add_disk(
	    (struct dsched_disk_ctx *)dsched_get_disk_priv(dp),
	    info->d_serialno);

	lockmgr(&dsched_lock, LK_RELEASE);
}
/*
 * Called on disk_destroy(); shuts down the scheduler core and cancels all
 * remaining bios.
 */
void
dsched_disk_destroy_callback(struct disk *dp)
{
	struct dsched_policy *old_policy;
	struct dsched_disk_ctx *diskctx;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	diskctx = dsched_get_disk_priv(dp);

	old_policy = dp->d_sched_policy;
	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->cancel_all(dsched_get_disk_priv(dp));
	old_policy->teardown(dsched_get_disk_priv(dp));

	if (diskctx->flags & DSCHED_SYSCTL_CTX_INITED)
		sysctl_ctx_free(&diskctx->sysctl_ctx);

	policy_destroy(dp);
	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	lockmgr(&dsched_lock, LK_RELEASE);
}
int
dsched_queue(struct disk *dp, struct bio *bio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_thread_io *tdio;
	struct dsched_disk_ctx *diskctx;
	int found = 0;
	int error = 0;

	tdctx = dsched_get_buf_priv(bio->bio_buf);
	if (tdctx == NULL) {
		/* We don't handle this case, let dsched dispatch */
		atomic_add_int(&dsched_stats.no_tdctx, 1);
		dsched_strategy_raw(dp, bio);
		return 0;
	}

	DSCHED_THREAD_CTX_LOCK(tdctx);

	KKASSERT(!TAILQ_EMPTY(&tdctx->tdio_list));
	/*
	 * Iterate in reverse to make sure we find the most up-to-date
	 * tdio for a given disk. After a switch it may take some time
	 * for everything to clean up.
	 */
	TAILQ_FOREACH_REVERSE(tdio, &tdctx->tdio_list, tdio_list_head, link) {
		if (tdio->dp == dp) {
			dsched_thread_io_ref(tdio);
			found = 1;
			break;
		}
	}

	DSCHED_THREAD_CTX_UNLOCK(tdctx);
	dsched_clr_buf_priv(bio->bio_buf);
	dsched_thread_ctx_unref(tdctx); /* acquired on new_buf */

	KKASSERT(found == 1);
	diskctx = dsched_get_disk_priv(dp);
	dsched_disk_ctx_ref(diskctx);

	if (dp->d_sched_policy != &dsched_noop_policy)
		KKASSERT(tdio->debug_policy == dp->d_sched_policy);

	KKASSERT(tdio->debug_inited == 0xF00F1234);

	error = dp->d_sched_policy->bio_queue(diskctx, tdio, bio);

	if (error)
		dsched_strategy_raw(dp, bio);
	dsched_disk_ctx_unref(diskctx);
	dsched_thread_io_unref(tdio);

	return error;
}
/*
 * Called from each module_init or module_attach of each policy;
 * registers the policy in the local policy list.
 */
int
dsched_register(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;
	int error = 0;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	policy = dsched_find_policy(d_policy->name);

	if (!policy) {
		TAILQ_INSERT_TAIL(&dsched_policy_list, d_policy, link);
		atomic_add_int(&d_policy->ref_count, 1);
	} else {
		dsched_debug(LOG_ERR, "Policy with name %s already registered!\n",
		    d_policy->name);
		error = EEXIST;
	}

	lockmgr(&dsched_lock, LK_RELEASE);
	return (error);
}
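/*
 * Illustrative sketch (not part of this file): a scheduler policy module
 * would typically call dsched_register()/dsched_unregister() from its
 * module event handler.  All names below are hypothetical.
 *
 *	static struct dsched_policy hypothetical_policy = {
 *		.name = "hypothetical",
 *		.prepare = hypothetical_prepare,
 *		.teardown = hypothetical_teardown,
 *		.cancel_all = hypothetical_cancel,
 *		.bio_queue = hypothetical_queue,
 *	};
 *
 *	case MOD_LOAD:
 *		error = dsched_register(&hypothetical_policy);
 *		break;
 *	case MOD_UNLOAD:
 *		error = dsched_unregister(&hypothetical_policy);
 *		break;
 */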
/*
 * Called from each module_detach of each policy;
 * unregisters the policy.
 */
int
dsched_unregister(struct dsched_policy *d_policy)
{
	struct dsched_policy *policy;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	policy = dsched_find_policy(d_policy->name);

	if (policy) {
		if (policy->ref_count > 1) {
			lockmgr(&dsched_lock, LK_RELEASE);
			return 1;
		}
		TAILQ_REMOVE(&dsched_policy_list, policy, link);
		atomic_subtract_int(&policy->ref_count, 1);
		KKASSERT(policy->ref_count == 0);
	}
	lockmgr(&dsched_lock, LK_RELEASE);

	return 0;
}
/*
 * Switches the policy by first removing the old one and then
 * enabling the new one.
 */
void
dsched_switch(struct disk *dp, struct dsched_policy *new_policy)
{
	struct dsched_policy *old_policy;

	/* If we are asked to set the same policy, do nothing */
	if (dp->d_sched_policy == new_policy)
		return;

	/* lock everything down, diskwise */
	lockmgr(&dsched_lock, LK_EXCLUSIVE);
	old_policy = dp->d_sched_policy;

	atomic_subtract_int(&old_policy->ref_count, 1);
	KKASSERT(old_policy->ref_count >= 0);

	dp->d_sched_policy = &dsched_noop_policy;
	old_policy->teardown(dsched_get_disk_priv(dp));
	policy_destroy(dp);

	/* Bring everything back to life */
	dsched_set_policy(dp, new_policy);
	lockmgr(&dsched_lock, LK_RELEASE);
}
/*
 * Loads a given policy and attaches it to the specified disk.
 * Also initializes the core for the policy.
 */
void
dsched_set_policy(struct disk *dp, struct dsched_policy *new_policy)
{
	int locked = 0;

	/* Check if it is locked already; if not, we acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	policy_new(dp, new_policy);
	new_policy->prepare(dsched_get_disk_priv(dp));
	dp->d_sched_policy = new_policy;

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&new_policy->ref_count, 1);
	kprintf("disk scheduler: set policy of %s to %s\n",
	    dp->d_cdev->si_name, new_policy->name);

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);
}
struct dsched_policy *
dsched_find_policy(char *search)
{
	struct dsched_policy *policy;
	struct dsched_policy *policy_found = NULL;
	int locked = 0;

	/* Check if it is locked already; if not, we acquire the dsched lock */
	if (lockstatus(&dsched_lock, curthread) != LK_EXCLUSIVE) {
		lockmgr(&dsched_lock, LK_EXCLUSIVE);
		locked = 1;
	}

	TAILQ_FOREACH(policy, &dsched_policy_list, link) {
		if (!strcmp(policy->name, search)) {
			policy_found = policy;
			break;
		}
	}

	/* If we acquired the lock, we also get rid of it */
	if (locked)
		lockmgr(&dsched_lock, LK_RELEASE);

	return policy_found;
}
struct disk *
dsched_find_disk(char *search)
{
	struct disk marker;
	struct disk *dp = NULL;

	while ((dp = disk_enumerate(&marker, dp)) != NULL) {
		if (strcmp(dp->d_cdev->si_name, search) == 0) {
			disk_enumerate_stop(&marker, NULL);
			/* leave ref on dp */
			break;
		}
	}
	return dp;
}
struct disk *
dsched_disk_enumerate(struct disk *marker, struct disk *dp,
		      struct dsched_policy *policy)
{
	while ((dp = disk_enumerate(marker, dp)) != NULL) {
		if (dp->d_sched_policy == policy)
			break;
	}
	return dp;
}

struct dsched_policy *
dsched_policy_enumerate(struct dsched_policy *pol)
{
	if (!pol)
		return (TAILQ_FIRST(&dsched_policy_list));
	else
		return (TAILQ_NEXT(pol, link));
}
void
dsched_cancel_bio(struct bio *bp)
{
	bp->bio_buf->b_error = ENXIO;
	bp->bio_buf->b_flags |= B_ERROR;
	bp->bio_buf->b_resid = bp->bio_buf->b_bcount;

	biodone(bp);
}
void
dsched_strategy_raw(struct disk *dp, struct bio *bp)
{
	/*
	 * Ideally, this stuff shouldn't be needed... but just in case, we
	 * leave it in.
	 */
	KASSERT(dp->d_rawdev != NULL, ("dsched_strategy_raw sees NULL d_rawdev!!"));
	if (bp->bio_track != NULL) {
		dsched_debug(LOG_INFO,
		    "dsched_strategy_raw sees non-NULL bio_track!! "
		    "bio: %p\n", bp);
		bp->bio_track = NULL;
	}
	dev_dstrategy(dp->d_rawdev, bp);
}
void
dsched_strategy_sync(struct disk *dp, struct bio *bio)
{
	struct buf *bp, *nbp;
	struct bio *nbio;

	bp = bio->bio_buf;

	nbp = getpbuf(NULL);
	nbio = &nbp->b_bio1;

	nbp->b_cmd = bp->b_cmd;
	nbp->b_bufsize = bp->b_bufsize;
	nbp->b_runningbufspace = bp->b_runningbufspace;
	nbp->b_bcount = bp->b_bcount;
	nbp->b_resid = bp->b_resid;
	nbp->b_data = bp->b_data;
#if 0
	/*
	 * Buffers undergoing device I/O do not need a kvabase/size.
	 */
	nbp->b_kvabase = bp->b_kvabase;
	nbp->b_kvasize = bp->b_kvasize;
#endif
	nbp->b_dirtyend = bp->b_dirtyend;

	nbio->bio_done = biodone_sync;
	nbio->bio_flags |= BIO_SYNC;
	nbio->bio_track = NULL;

	nbio->bio_caller_info1.ptr = dp;
	nbio->bio_offset = bio->bio_offset;

	dev_dstrategy(dp->d_rawdev, nbio);
	biowait(nbio, "dschedsync");
	bp->b_resid = nbp->b_resid;
	bp->b_error = nbp->b_error;
	biodone(bio);
#if 0
	nbp->b_kvabase = NULL;
	nbp->b_kvasize = 0;
#endif
	relpbuf(nbp, NULL);
}
void
dsched_strategy_async(struct disk *dp, struct bio *bio, biodone_t *done, void *priv)
{
	struct bio *nbio;

	nbio = push_bio(bio);
	nbio->bio_done = done;
	nbio->bio_offset = bio->bio_offset;

	dsched_set_bio_dp(nbio, dp);
	dsched_set_bio_priv(nbio, priv);

	getmicrotime(&nbio->bio_caller_info3.tv);
	dev_dstrategy(dp->d_rawdev, nbio);
}
/*
 * A special bio done callback function,
 * used by policies that have request polling implemented.
 */
static void
request_polling_biodone(struct bio *bp)
{
	struct dsched_disk_ctx *diskctx = NULL;
	struct disk *dp = NULL;
	struct bio *obio;
	struct dsched_policy *policy;

	dp = dsched_get_bio_dp(bp);
	policy = dp->d_sched_policy;
	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx && policy);
	dsched_disk_ctx_ref(diskctx);

	/*
	 * XXX:
	 * the bio_done function should not be blocked!
	 */
	if (diskctx->dp->d_sched_policy->bio_done)
		diskctx->dp->d_sched_policy->bio_done(bp);

	obio = pop_bio(bp);
	biodone(obio);

	atomic_subtract_int(&diskctx->current_tag_queue_depth, 1);

	/*
	 * Call the polling function.
	 * XXX:
	 * the polling function should not be blocked!
	 */
	if (policy->polling_func)
		policy->polling_func(diskctx);
	else
		dsched_debug(0, "dsched: the policy uses request polling without a polling function!\n");
	dsched_disk_ctx_unref(diskctx);
}
/*
 * A special dsched strategy used by policies that have request polling
 * (a polling function) implemented.
 *
 * The strategy is just like dsched_strategy_async(), but
 * the biodone callback is set to a preset one.
 *
 * If the policy needs its own biodone callback, it should
 * register it in the policy structure (bio_done field).
 *
 * The current_tag_queue_depth is maintained by this function
 * and the request_polling_biodone() function.
 */
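/*
 * Illustrative sketch (assumptions, not part of this file): a policy's
 * polling function might keep dispatching queued bios while the device
 * still has tags available.  Names prefixed "hypothetical_" are made up.
 *
 *	static void
 *	hypothetical_polling_func(struct dsched_disk_ctx *diskctx)
 *	{
 *		struct bio *bio;
 *
 *		while (diskctx->current_tag_queue_depth <
 *		       diskctx->max_tag_queue_depth &&
 *		       (bio = hypothetical_dequeue(diskctx)) != NULL) {
 *			dsched_strategy_request_polling(diskctx->dp, bio,
 *							diskctx);
 *		}
 *	}
 */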
void
dsched_strategy_request_polling(struct disk *dp, struct bio *bio,
				struct dsched_disk_ctx *diskctx)
{
	atomic_add_int(&diskctx->current_tag_queue_depth, 1);
	dsched_strategy_async(dp, bio, request_polling_biodone,
			      dsched_get_bio_priv(bio));
}
/*
 * Ref and deref various structures.  The 1->0 transition of the reference
 * count actually transitions 1->0x80000000 and causes the object to be
 * destroyed.  It is possible for transitory references to occur on the
 * object while it is being destroyed.  We use bit 31 to indicate that
 * destruction is in progress and to prevent nested destructions.
 */
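/*
 * Example of the intended transitions (illustrative only):
 *
 *	2 -> 1 -> 0x80000000		destruction starts on the 1->0 drop
 *	0x80000000 -> 0x80000001 -> 0x80000000
 *					a transitory ref taken and released
 *					while destruction is in progress;
 *					bit 31 stays set, so destruction is
 *					never entered a second time.
 */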
void
dsched_disk_ctx_ref(struct dsched_disk_ctx *diskctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&diskctx->refcount, 1);
}

void
dsched_thread_io_ref(struct dsched_thread_io *tdio)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdio->refcount, 1);
}

void
dsched_thread_ctx_ref(struct dsched_thread_ctx *tdctx)
{
	int refcount;

	refcount = atomic_fetchadd_int(&tdctx->refcount, 1);
}
void
dsched_disk_ctx_unref(struct dsched_disk_ctx *diskctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for diskctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = diskctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs)) {
			dsched_disk_ctx_destroy(diskctx);
			break;
		}
	}
}
static void
dsched_disk_ctx_destroy(struct dsched_disk_ctx *diskctx)
{
	struct dsched_thread_io *tdio;
	int refs;
	int nrefs;

#if 0
	kprintf("diskctx (%p) destruction started, trace:\n", diskctx);
	print_backtrace(4);
#endif
	lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	while ((tdio = TAILQ_FIRST(&diskctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		/* XXX tdio->diskctx->dp->d_sched_policy->destroy_tdio(tdio);*/
		lockmgr(&diskctx->lock, LK_RELEASE);
		dsched_thread_io_unref_destroy(tdio);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
	}
	lockmgr(&diskctx->lock, LK_RELEASE);

	/*
	 * Expect diskctx->refcount to be 0x80000000. If it isn't someone
	 * else still has a temporary ref on the diskctx and we have to
	 * transition it back to an undestroyed-state (albeit without any
	 * associations), so the other user destroys it properly when the
	 * ref is released.
	 */
	while ((refs = diskctx->refcount) != 0x80000000) {
		kprintf("dsched_disk_ctx: destroy race diskctx=%p\n", diskctx);
		cpu_ccfence();
		KKASSERT(refs & 0x80000000);
		nrefs = refs & 0x7FFFFFFF;
		if (atomic_cmpset_int(&diskctx->refcount, refs, nrefs))
			break;
	}

	/*
	 * Really for sure now.
	 */
	if (diskctx->dp->d_sched_policy->destroy_diskctx)
		diskctx->dp->d_sched_policy->destroy_diskctx(diskctx);
	objcache_put(dsched_diskctx_cache, diskctx);
	atomic_subtract_int(&dsched_stats.diskctx_allocations, 1);
}
void
dsched_thread_io_unref(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdio and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}
/*
 * Unref and destroy the tdio even if additional refs are present.
 */
static void
dsched_thread_io_unref_destroy(struct dsched_thread_io *tdio)
{
	int refs;
	int nrefs;

	/*
	 * If not already transitioned to destroy-in-progress we transition
	 * to destroy-in-progress, cleanup our ref, and destroy the tdio.
	 */
	for (;;) {
		refs = tdio->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs & 0x80000000) {
			if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs |= 0x80000000;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs)) {
			dsched_thread_io_destroy(tdio);
			break;
		}
	}
}
static void
dsched_thread_io_destroy(struct dsched_thread_io *tdio)
{
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;
	int refs;
	int nrefs;

#if 0
	kprintf("tdio (%p) destruction started, trace:\n", tdio);
	print_backtrace(4);
#endif
	KKASSERT(tdio->qlength == 0);

	while ((diskctx = tdio->diskctx) != NULL) {
		dsched_disk_ctx_ref(diskctx);
		lockmgr(&diskctx->lock, LK_EXCLUSIVE);
		if (diskctx != tdio->diskctx) {
			lockmgr(&diskctx->lock, LK_RELEASE);
			dsched_disk_ctx_unref(diskctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_DISK_CTX);
		if (diskctx->dp->d_sched_policy->destroy_tdio)
			diskctx->dp->d_sched_policy->destroy_tdio(tdio);
		TAILQ_REMOVE(&diskctx->tdio_list, tdio, dlink);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
		tdio->diskctx = NULL;
		dsched_thread_io_unref(tdio);
		lockmgr(&diskctx->lock, LK_RELEASE);
		dsched_disk_ctx_unref(diskctx);
	}
	while ((tdctx = tdio->tdctx) != NULL) {
		dsched_thread_ctx_ref(tdctx);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
		if (tdctx != tdio->tdctx) {
			lockmgr(&tdctx->lock, LK_RELEASE);
			dsched_thread_ctx_unref(tdctx);
			continue;
		}
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		dsched_thread_io_unref(tdio);
		lockmgr(&tdctx->lock, LK_RELEASE);
		dsched_thread_ctx_unref(tdctx);
	}

	/*
	 * Expect tdio->refcount to be 0x80000000. If it isn't someone else
	 * still has a temporary ref on the tdio and we have to transition
	 * it back to an undestroyed-state (albeit without any associations)
	 * so the other user destroys it properly when the ref is released.
	 */
	while ((refs = tdio->refcount) != 0x80000000) {
		kprintf("dsched_thread_io: destroy race tdio=%p\n", tdio);
		cpu_ccfence();
		KKASSERT(refs & 0x80000000);
		nrefs = refs & 0x7FFFFFFF;
		if (atomic_cmpset_int(&tdio->refcount, refs, nrefs))
			break;
	}

	/*
	 * Really for sure now.
	 */
	objcache_put(dsched_tdio_cache, tdio);
	atomic_subtract_int(&dsched_stats.tdio_allocations, 1);
}
void
dsched_thread_ctx_unref(struct dsched_thread_ctx *tdctx)
{
	int refs;
	int nrefs;

	/*
	 * Handle 1->0 transitions for tdctx and nested destruction
	 * recursions.  If the refs are already in destruction mode (bit 31
	 * set) on the 1->0 transition we don't try to destruct it again.
	 *
	 * 0x80000001->0x80000000 transitions are handled normally and
	 * thus avoid nested destruction.
	 */
	for (;;) {
		refs = tdctx->refcount;
		cpu_ccfence();
		nrefs = refs - 1;

		KKASSERT(((refs ^ nrefs) & 0x80000000) == 0);
		if (nrefs) {
			if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs))
				break;
			continue;
		}
		nrefs = 0x80000000;
		if (atomic_cmpset_int(&tdctx->refcount, refs, nrefs)) {
			dsched_thread_ctx_destroy(tdctx);
			break;
		}
	}
}
static void
dsched_thread_ctx_destroy(struct dsched_thread_ctx *tdctx)
{
	struct dsched_thread_io *tdio;

#if 0
	kprintf("tdctx (%p) destruction started, trace:\n", tdctx);
	print_backtrace(4);
#endif
	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	lockmgr(&tdctx->lock, LK_EXCLUSIVE);

	while ((tdio = TAILQ_FIRST(&tdctx->tdio_list)) != NULL) {
		KKASSERT(tdio->flags & DSCHED_LINKED_THREAD_CTX);
		TAILQ_REMOVE(&tdctx->tdio_list, tdio, link);
		atomic_clear_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		tdio->tdctx = NULL;
		lockmgr(&tdctx->lock, LK_RELEASE);	/* avoid deadlock */
		dsched_thread_io_unref_destroy(tdio);
		lockmgr(&tdctx->lock, LK_EXCLUSIVE);
	}
	KKASSERT(tdctx->refcount == 0x80000000);
	TAILQ_REMOVE(&dsched_tdctx_list, tdctx, link);

	lockmgr(&tdctx->lock, LK_RELEASE);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	objcache_put(dsched_tdctx_cache, tdctx);
	atomic_subtract_int(&dsched_stats.tdctx_allocations, 1);
}
/*
 * Ensures that a tdio is assigned to tdctx and disk.
 */
void
dsched_thread_io_alloc(struct disk *dp, struct dsched_thread_ctx *tdctx,
		       struct dsched_policy *pol)
{
	struct dsched_thread_io *tdio;

	dsched_disk_ctx_ref(dsched_get_disk_priv(dp));

	tdio = objcache_get(dsched_tdio_cache, M_WAITOK);
	bzero(tdio, DSCHED_THREAD_IO_MAX_SZ);

	dsched_thread_io_ref(tdio);	/* prevent ripout */
	dsched_thread_io_ref(tdio);	/* for diskctx ref */

	DSCHED_THREAD_IO_LOCKINIT(tdio);
	tdio->dp = dp;

	tdio->diskctx = dsched_get_disk_priv(dp);
	TAILQ_INIT(&tdio->queue);

	lockmgr(&tdio->diskctx->lock, LK_EXCLUSIVE);
	TAILQ_INSERT_TAIL(&tdio->diskctx->tdio_list, tdio, dlink);
	atomic_set_int(&tdio->flags, DSCHED_LINKED_DISK_CTX);
	lockmgr(&tdio->diskctx->lock, LK_RELEASE);

	if (tdctx) {
		/*
		 * Put the tdio in the tdctx list.  Inherit the temporary
		 * ref (one ref for each list).
		 */
		DSCHED_THREAD_CTX_LOCK(tdctx);
		tdio->tdctx = tdctx;
		TAILQ_INSERT_TAIL(&tdctx->tdio_list, tdio, link);
		atomic_set_int(&tdio->flags, DSCHED_LINKED_THREAD_CTX);
		DSCHED_THREAD_CTX_UNLOCK(tdctx);
	} else {
		dsched_thread_io_unref(tdio);
	}

	tdio->debug_policy = pol;
	tdio->debug_inited = 0xF00F1234;

	atomic_add_int(&dsched_stats.tdio_allocations, 1);
}
struct dsched_disk_ctx *
dsched_disk_ctx_alloc(struct disk *dp, struct dsched_policy *pol)
{
	struct dsched_disk_ctx *diskctx;

	diskctx = objcache_get(dsched_diskctx_cache, M_WAITOK);
	bzero(diskctx, DSCHED_DISK_CTX_MAX_SZ);
	dsched_disk_ctx_ref(diskctx);
	diskctx->dp = dp;
	DSCHED_DISK_CTX_LOCKINIT(diskctx);
	TAILQ_INIT(&diskctx->tdio_list);
	/*
	 * XXX: magic number 32: most devices have a tag queue
	 * depth of 32.
	 * Better to retrieve the more precise value from the driver.
	 */
	diskctx->max_tag_queue_depth = 32;
	diskctx->current_tag_queue_depth = 0;

	atomic_add_int(&dsched_stats.diskctx_allocations, 1);
	if (pol->new_diskctx)
		pol->new_diskctx(diskctx);
	return diskctx;
}
struct dsched_thread_ctx *
dsched_thread_ctx_alloc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;
	struct disk marker;
	struct disk *dp = NULL;

	tdctx = objcache_get(dsched_tdctx_cache, M_WAITOK);
	bzero(tdctx, DSCHED_THREAD_CTX_MAX_SZ);
	dsched_thread_ctx_ref(tdctx);
#if 0
	kprintf("dsched_thread_ctx_alloc, new tdctx = %p\n", tdctx);
#endif
	DSCHED_THREAD_CTX_LOCKINIT(tdctx);
	TAILQ_INIT(&tdctx->tdio_list);

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	while ((dp = disk_enumerate(&marker, dp)) != NULL)
		dsched_thread_io_alloc(dp, tdctx, dp->d_sched_policy);

	TAILQ_INSERT_TAIL(&dsched_tdctx_list, tdctx, link);
	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();

	atomic_add_int(&dsched_stats.tdctx_allocations, 1);
	/* XXX: no callback here */
	return tdctx;
}
static void
policy_new(struct disk *dp, struct dsched_policy *pol) {
	struct dsched_thread_ctx *tdctx;
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_disk_ctx_alloc(dp, pol);
	dsched_disk_ctx_ref(diskctx);
	dsched_set_disk_priv(dp, diskctx);

	/*
	 * XXX: this is really, really expensive!
	 */
	TAILQ_FOREACH(tdctx, &dsched_tdctx_list, link)
		dsched_thread_io_alloc(dp, tdctx, pol);
}

static void
policy_destroy(struct disk *dp) {
	struct dsched_disk_ctx *diskctx;

	diskctx = dsched_get_disk_priv(dp);
	KKASSERT(diskctx != NULL);

	dsched_disk_ctx_unref(diskctx); /* from prepare */
	dsched_disk_ctx_unref(diskctx); /* from alloc */

	dsched_set_disk_priv(dp, NULL);
}
void
dsched_new_buf(struct buf *bp)
{
	struct dsched_thread_ctx *tdctx = NULL;

	if (dsched_inited == 0)
		return;

	if (curproc != NULL) {
		tdctx = dsched_get_proc_priv(curproc);
	} else {
		/* This is a kernel thread, so no proc info is available */
		tdctx = dsched_get_thread_priv(curthread);
	}

#if 0
	/*
	 * XXX: hack. We don't want this assert because we aren't catching all
	 * threads. mi_startup() is still getting away without a tdctx.
	 */

	/* By now we should have a tdctx. If not, something bad is going on. */
	KKASSERT(tdctx != NULL);
#endif

	if (tdctx != NULL)
		dsched_thread_ctx_ref(tdctx);
	dsched_set_buf_priv(bp, tdctx);
}
void
dsched_exit_buf(struct buf *bp)
{
	struct dsched_thread_ctx *tdctx;

	tdctx = dsched_get_buf_priv(bp);
	if (tdctx != NULL) {
		dsched_clr_buf_priv(bp);
		dsched_thread_ctx_unref(tdctx);
	}
}
void
dsched_new_proc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_thread_ctx_alloc(p);
	dsched_thread_ctx_ref(tdctx);

	dsched_set_proc_priv(p, tdctx);
	atomic_add_int(&dsched_stats.nprocs, 1);
}

void
dsched_new_thread(struct thread *td)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_thread_ctx_alloc(NULL);
	dsched_thread_ctx_ref(tdctx);

	dsched_set_thread_priv(td, tdctx);
	atomic_add_int(&dsched_stats.nthreads, 1);
}
void
dsched_exit_proc(struct proc *p)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(p != NULL);

	tdctx = dsched_get_proc_priv(p);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_proc_priv(p, NULL);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nprocs, 1);
}

void
dsched_exit_thread(struct thread *td)
{
	struct dsched_thread_ctx *tdctx;

	if (dsched_inited == 0)
		return;

	KKASSERT(td != NULL);

	tdctx = dsched_get_thread_priv(td);
	KKASSERT(tdctx != NULL);

	tdctx->dead = 0xDEAD;
	dsched_set_thread_priv(td, 0);

	dsched_thread_ctx_unref(tdctx); /* one for alloc, */
	dsched_thread_ctx_unref(tdctx); /* one for ref */
	atomic_subtract_int(&dsched_stats.nthreads, 1);
}
/*
 * Returns ref'd tdio.
 *
 * tdio may have additional refs for the diskctx and tdctx it resides on.
 */
void
dsched_new_policy_thread_tdio(struct dsched_disk_ctx *diskctx,
    struct dsched_policy *pol)
{
	struct dsched_thread_ctx *tdctx;

	DSCHED_GLOBAL_THREAD_CTX_LOCK();

	tdctx = dsched_get_thread_priv(curthread);
	KKASSERT(tdctx != NULL);
	dsched_thread_io_alloc(diskctx->dp, tdctx, pol);

	DSCHED_GLOBAL_THREAD_CTX_UNLOCK();
}
/* DEFAULT NOOP POLICY */

static int
noop_prepare(struct dsched_disk_ctx *diskctx)
{
	return 0;
}

static void
noop_teardown(struct dsched_disk_ctx *diskctx)
{
}

static void
noop_cancel(struct dsched_disk_ctx *diskctx)
{
}

static int
noop_queue(struct dsched_disk_ctx *diskctx, struct dsched_thread_io *tdio,
	   struct bio *bio)
{
	dsched_strategy_raw(diskctx->dp, bio);
#if 0
	dsched_strategy_async(diskctx->dp, bio, noop_completed, NULL);
#endif
	return 0;
}
static void
dsched_init(void *dummy)
{
	dsched_tdio_cache = objcache_create("dsched-tdio-cache", 0, 0,
					    NULL, NULL, NULL,
					    objcache_malloc_alloc,
					    objcache_malloc_free,
					    &dsched_thread_io_malloc_args);

	dsched_tdctx_cache = objcache_create("dsched-tdctx-cache", 0, 0,
					     NULL, NULL, NULL,
					     objcache_malloc_alloc,
					     objcache_malloc_free,
					     &dsched_thread_ctx_malloc_args);

	dsched_diskctx_cache = objcache_create("dsched-diskctx-cache", 0, 0,
					       NULL, NULL, NULL,
					       objcache_malloc_alloc,
					       objcache_malloc_free,
					       &dsched_disk_ctx_malloc_args);

	bzero(&dsched_stats, sizeof(struct dsched_stats));

	lockinit(&dsched_lock, "dsched lock", 0, LK_CANRECURSE);
	DSCHED_GLOBAL_THREAD_CTX_LOCKINIT();

	dsched_register(&dsched_noop_policy);

	dsched_inited = 1;
}

static void
dsched_uninit(void *dummy)
{
}
SYSINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_FIRST, dsched_init, NULL);
SYSUNINIT(subr_dsched_register, SI_SUB_CREATE_INIT-1, SI_ORDER_ANY, dsched_uninit, NULL);
static int
sysctl_dsched_stats(SYSCTL_HANDLER_ARGS)
{
	return (sysctl_handle_opaque(oidp, &dsched_stats,
	    sizeof(struct dsched_stats), req));
}
static int
sysctl_dsched_list_policies(SYSCTL_HANDLER_ARGS)
{
	struct dsched_policy *pol = NULL;
	int error, first = 1;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	while ((pol = dsched_policy_enumerate(pol))) {
		if (!first) {
			error = SYSCTL_OUT(req, " ", 1);
			if (error)
				break;
		} else {
			first = 0;
		}
		error = SYSCTL_OUT(req, pol->name, strlen(pol->name));
		if (error)
			break;
	}

	lockmgr(&dsched_lock, LK_RELEASE);

	error = SYSCTL_OUT(req, "", 1);

	return error;
}
static int
sysctl_dsched_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_disk_ctx *diskctx = arg1;
	struct dsched_policy *pol = NULL;
	int error;

	if (diskctx == NULL) {
		return 0;
	}

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = diskctx->dp->d_sched_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	dsched_switch(diskctx->dp, pol);

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}
static int
sysctl_dsched_default_policy(SYSCTL_HANDLER_ARGS)
{
	char buf[DSCHED_POLICY_NAME_LENGTH];
	struct dsched_policy *pol = NULL;
	int error;

	lockmgr(&dsched_lock, LK_EXCLUSIVE);

	pol = default_policy;
	memcpy(buf, pol->name, DSCHED_POLICY_NAME_LENGTH);

	error = sysctl_handle_string(oidp, buf, DSCHED_POLICY_NAME_LENGTH, req);
	if (error || req->newptr == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return (error);
	}

	pol = dsched_find_policy(buf);
	if (pol == NULL) {
		lockmgr(&dsched_lock, LK_RELEASE);
		return 0;
	}

	default_set = 1;
	default_policy = pol;

	lockmgr(&dsched_lock, LK_RELEASE);

	return error;
}
SYSCTL_NODE(, OID_AUTO, dsched, CTLFLAG_RD, NULL,
    "Disk Scheduler Framework (dsched) magic");
SYSCTL_NODE(_dsched, OID_AUTO, policy, CTLFLAG_RW, NULL,
    "List of disks and their policies");
SYSCTL_INT(_dsched, OID_AUTO, debug, CTLFLAG_RW, &dsched_debug_enable,
    0, "Enable dsched debugging");
SYSCTL_PROC(_dsched, OID_AUTO, stats, CTLTYPE_OPAQUE|CTLFLAG_RD,
    0, sizeof(struct dsched_stats), sysctl_dsched_stats, "dsched_stats",
    "dsched statistics");
SYSCTL_PROC(_dsched, OID_AUTO, policies, CTLTYPE_STRING|CTLFLAG_RD,
    NULL, 0, sysctl_dsched_list_policies, "A", "names of available policies");
SYSCTL_PROC(_dsched_policy, OID_AUTO, default, CTLTYPE_STRING|CTLFLAG_RW,
    NULL, 0, sysctl_dsched_default_policy, "A", "default dsched policy");
static void
dsched_sysctl_add_disk(struct dsched_disk_ctx *diskctx, char *name)
{
	if (!(diskctx->flags & DSCHED_SYSCTL_CTX_INITED)) {
		diskctx->flags |= DSCHED_SYSCTL_CTX_INITED;
		sysctl_ctx_init(&diskctx->sysctl_ctx);
	}

	SYSCTL_ADD_PROC(&diskctx->sysctl_ctx,
	    SYSCTL_STATIC_CHILDREN(_dsched_policy),
	    OID_AUTO, name, CTLTYPE_STRING|CTLFLAG_RW,
	    diskctx, 0, sysctl_dsched_policy, "A", "policy");
}