/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_ccms.c,v 1.4 2007/04/30 07:18:53 dillon Exp $
 */
/*
 * The Cache Coherency Management System (CCMS)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/objcache.h>
#include <sys/ccms.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <machine/limits.h>

#include <sys/spinlock2.h>

struct ccms_lock_scan_info {
	ccms_dataspace_t ds;
	ccms_lock_t	 lock;
	ccms_cst_t	 cst1;
	ccms_cst_t	 cst2;
	ccms_cst_t	 coll_cst;
};

static int ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2);
static int ccms_lock_scan_cmp(ccms_cst_t b1, void *arg);
static int ccms_lock_undo_cmp(ccms_cst_t b1, void *arg);
static int ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg);
static int ccms_lock_get_match(struct ccms_cst *cst, void *arg);
static int ccms_lock_undo_match(struct ccms_cst *cst, void *arg);
static int ccms_lock_redo_match(struct ccms_cst *cst, void *arg);
static int ccms_lock_put_match(struct ccms_cst *cst, void *arg);

RB_GENERATE3(ccms_rb_tree, ccms_cst, rbnode, ccms_cst_cmp,
	     off_t, beg_offset, end_offset);

static MALLOC_DEFINE(M_CCMS, "CCMS", "Cache Coherency Management System");

static int ccms_enable;
SYSCTL_INT(_kern, OID_AUTO, ccms_enable, CTLFLAG_RW, &ccms_enable, 0, "");

static struct objcache *ccms_oc;

/*
 * Initialize the CCMS subsystem
 */
static void
ccmsinit(void *dummy)
{
	ccms_oc = objcache_create_simple(M_CCMS, sizeof(struct ccms_cst));
}
SYSINIT(ccms, SI_BOOT2_MACHDEP, SI_ORDER_ANY, ccmsinit, NULL);
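
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * source): a subsystem that owns an offset-addressable object embeds a
 * ccms_dataspace and brackets each I/O with a CCMS range lock.  The
 * xyz_node/xyz_read names are hypothetical and the sketch assumes the
 * structure definitions from <sys/ccms.h>.
 */
#if 0
struct xyz_node {
	struct ccms_dataspace	ds;	/* set up once via ccms_dataspace_init(&node->ds) */
	/* ... */
};

static int
xyz_read(struct xyz_node *node, struct uio *uio)
{
	struct ccms_lock lock;
	int error;

	/* UIO_READ yields a shared range lock, writes a modifying lock */
	ccms_lock_get_uio(&node->ds, &lock, uio);
	error = 0;		/* ... perform the actual I/O here ... */
	ccms_lock_put(&node->ds, &lock);
	return (error);
}
#endif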

/*
 * Initialize a new CCMS dataspace.  Create a new RB tree with a single
 * element covering the entire 64 bit offset range.  This simplifies
 * algorithms enormously by removing a number of special cases.
 */
void
ccms_dataspace_init(ccms_dataspace_t ds)
{
	ccms_cst_t cst;

	RB_INIT(&ds->tree);
	cst = objcache_get(ccms_oc, M_WAITOK);
	bzero(cst, sizeof(*cst));
	cst->beg_offset = LLONG_MIN;
	cst->end_offset = LLONG_MAX;
	cst->state = CCMS_STATE_INVALID;
	RB_INSERT(ccms_rb_tree, &ds->tree, cst);
	spin_init(&ds->spin);
}
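
/*
 * Added commentary: the state set up by ccms_dataspace_init() is a tree
 * containing exactly one CST spanning the entire 64 bit offset range:
 *
 *	[LLONG_MIN ................................ LLONG_MAX]  (INVALID)
 *
 * Every subsequent lock therefore always finds covering CSTs and only
 * ever needs to split or merge entries, never to handle an uncovered
 * hole in the offset space.
 */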

/*
 * Helper to destroy deleted cst's.
 */
static void
ccms_delayed_free(ccms_cst_t cstn)
{
	ccms_cst_t cst;

	while ((cst = cstn) != NULL) {
		cstn = cst->delayed_next;
		objcache_put(ccms_oc, cst);
	}
}

/*
 * Destroy a CCMS dataspace.
 */
void
ccms_dataspace_destroy(ccms_dataspace_t ds)
{
	ccms_cst_t cst;

	spin_lock_wr(&ds->spin);
	RB_SCAN(ccms_rb_tree, &ds->tree, NULL,
		ccms_dataspace_destroy_match, ds);
	cst = ds->delayed_free;
	ds->delayed_free = NULL;
	spin_unlock_wr(&ds->spin);
	ccms_delayed_free(cst);
}

/*
 * Helper routine to delete matches during a destroy.
 *
 * NOTE: called with spinlock held.
 */
static int
ccms_dataspace_destroy_match(ccms_cst_t cst, void *arg)
{
	ccms_dataspace_t ds = arg;

	RB_REMOVE(ccms_rb_tree, &ds->tree, cst);
	cst->delayed_next = ds->delayed_free;
	ds->delayed_free = cst;
	return(0);
}

/*
 * Obtain a CCMS lock
 */
int
ccms_lock_get(ccms_dataspace_t ds, ccms_lock_t lock)
{
	struct ccms_lock_scan_info info;
	ccms_cst_t cst;

	if (ccms_enable == 0) {
		lock->ds = NULL;
		return(0);
	}

	/*
	 * Partition the CST space so the precise range is covered and
	 * attempt to obtain the requested local lock (ltype) at the same
	 * time.
	 */
	lock->ds = ds;
	info.lock = lock;
	info.ds = ds;
	info.coll_cst = NULL;
	info.cst1 = objcache_get(ccms_oc, M_WAITOK);
	info.cst2 = objcache_get(ccms_oc, M_WAITOK);

	spin_lock_wr(&ds->spin);
	RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
		ccms_lock_get_match, &info);

	/*
	 * If a collision occurred, undo the fragments we were able to obtain,
	 * block, and try again.
	 */
	while (info.coll_cst != NULL) {
		RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_undo_cmp,
			ccms_lock_undo_match, &info);
		info.coll_cst->blocked = 1;
		ssleep(info.coll_cst, &ds->spin, 0,
		       ((lock->ltype == CCMS_LTYPE_SHARED) ?
			"rngsh" : "rngex"), hz);
		info.coll_cst = NULL;
		RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
			ccms_lock_redo_match, &info);
	}
	cst = ds->delayed_free;
	ds->delayed_free = NULL;
	spin_unlock_wr(&ds->spin);

	/*
	 * Cleanup
	 */
	ccms_delayed_free(cst);
	if (info.cst1)
		objcache_put(ccms_oc, info.cst1);
	if (info.cst2)
		objcache_put(ccms_oc, info.cst2);

	return(0);
}
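
/*
 * Added commentary summarizing the acquisition protocol above: the scan
 * first partitions the CSTs so their boundaries line up exactly with the
 * lock, granting the requested ltype as it goes.  If any CST collides,
 * the fragments already granted are undone, the thread marks the
 * colliding CST blocked and sleeps on it (ssleep atomically releases the
 * dataspace spinlock), and the grant pass is then redone against the
 * already-partitioned range.
 */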

/*
 * Obtain a CCMS lock, initialize the lock structure from the uio.
 */
int
ccms_lock_get_uio(ccms_dataspace_t ds, ccms_lock_t lock, struct uio *uio)
{
	ccms_ltype_t ltype;
	off_t eoff;

	if (uio->uio_rw == UIO_READ)
		ltype = CCMS_LTYPE_SHARED;
	else
		ltype = CCMS_LTYPE_MODIFYING;

	/*
	 * Calculate the ending offset (byte inclusive), make sure a seek
	 * overflow does not blow us up.
	 */
	eoff = uio->uio_offset + uio->uio_resid - 1;
	if (eoff < uio->uio_offset)
		eoff = 0x7FFFFFFFFFFFFFFFLL;
	ccms_lock_init(lock, uio->uio_offset, eoff, ltype);
	return(ccms_lock_get(ds, lock));
}
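
/*
 * Added commentary, worked example of the overflow clamp above (numbers
 * are made up): uio_offset = 0x7FFFFFFFFFFFFFF0 with uio_resid = 0x100
 * gives eoff = 0x80000000000000EF, which is negative as a signed off_t
 * and therefore less than uio_offset, so eoff is clamped to
 * 0x7FFFFFFFFFFFFFFF and the lock simply extends to the end of the
 * offset space.
 */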

/*
 * NOTE: called with spinlock held.
 */
static int
ccms_lock_get_match(ccms_cst_t cst, void *arg)
{
	struct ccms_lock_scan_info *info = arg;
	ccms_lock_t lock = info->lock;
	ccms_cst_t ncst;

	/*
	 * If the lock's left edge is within the CST we must split the CST
	 * into two pieces [cst][ncst].  lrefs must be bumped on the CST
	 * containing the left edge.
	 *
	 * NOTE! cst->beg_offset may not be modified.  This allows us to avoid
	 * having to manipulate the cst's position in the tree.
	 */
	if (lock->beg_offset > cst->beg_offset) {
		ncst = info->cst1;
		info->cst1 = NULL;
		KKASSERT(ncst != NULL);
		*ncst = *cst;
		cst->end_offset = lock->beg_offset - 1;
		cst->rrefs = 0;
		ncst->beg_offset = lock->beg_offset;
		ncst->lrefs = 1;
		RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst);

		/*
		 * ncst becomes our 'matching' cst.
		 */
		cst = ncst;
	} else if (lock->beg_offset == cst->beg_offset) {
		++cst->lrefs;
	}

	/*
	 * If the lock's right edge is within the CST we must split the CST
	 * into two pieces [cst][ncst].  rrefs must be bumped on the CST
	 * containing the right edge.
	 *
	 * NOTE! cst->beg_offset may not be modified.  This allows us to avoid
	 * having to manipulate the cst's position in the tree.
	 */
	if (lock->end_offset < cst->end_offset) {
		ncst = info->cst2;
		info->cst2 = NULL;
		KKASSERT(ncst != NULL);
		*ncst = *cst;
		cst->end_offset = lock->end_offset;
		cst->rrefs = 1;
		ncst->beg_offset = lock->end_offset + 1;
		ncst->lrefs = 0;
		RB_INSERT(ccms_rb_tree, &info->ds->tree, ncst);
		/* cst remains our 'matching' cst */
	} else if (lock->end_offset == cst->end_offset) {
		++cst->rrefs;
	}

	/*
	 * The lock covers the CST, so increment the CST's coverage count.
	 * Then attempt to obtain the shared/exclusive ltype.
	 */
	++cst->xrefs;

	if (info->coll_cst == NULL) {
		switch(lock->ltype) {
		case CCMS_LTYPE_SHARED:
			if (cst->sharecount < 0) {
				info->coll_cst = cst;
			} else {
				++cst->sharecount;
				if (ccms_enable >= 9) {
					kprintf("CST SHARE %d %lld-%lld\n",
						cst->sharecount,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
			}
			break;
		case CCMS_LTYPE_EXCLUSIVE:
			if (cst->sharecount != 0) {
				info->coll_cst = cst;
			} else {
				--cst->sharecount;
				if (ccms_enable >= 9) {
					kprintf("CST EXCLS %d %lld-%lld\n",
						cst->sharecount,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
			}
			break;
		case CCMS_LTYPE_MODIFYING:
			if (cst->sharecount != 0) {
				info->coll_cst = cst;
			} else {
				--cst->sharecount;
				++cst->modifycount;
				if (ccms_enable >= 9) {
					kprintf("CST MODXL %d %lld-%lld\n",
						cst->sharecount,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
			}
			break;
		}
	}
	return(0);
}
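
/*
 * Added commentary illustrating the splits above: a lock over [B..E]
 * matched against a CST covering [cbeg..cend] with cbeg < B and E < cend
 * consumes the two preallocated CSTs from the scan info:
 *
 *	before:  [cbeg ........................... cend]
 *	step 1:  [cbeg .. B-1][B .................. cend]   lrefs bumped on [B..cend]
 *	step 2:  [cbeg .. B-1][B ......... E][E+1 .. cend]  rrefs bumped on [B..E]
 *
 * The piece exactly matching [B..E] then has its xrefs coverage count
 * and its share/exclusive counts adjusted.
 */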

/*
 * Undo a partially resolved ccms_ltype rangelock.  This is atomic with
 * the scan/redo code so there should not be any blocked locks when
 * transitioning to 0.
 *
 * NOTE: called with spinlock held.
 */
static int
ccms_lock_undo_match(ccms_cst_t cst, void *arg)
{
	struct ccms_lock_scan_info *info = arg;
	ccms_lock_t lock = info->lock;

	switch(lock->ltype) {
	case CCMS_LTYPE_SHARED:
		KKASSERT(cst->sharecount > 0);
		--cst->sharecount;
		KKASSERT(cst->sharecount || cst->blocked == 0);
		break;
	case CCMS_LTYPE_EXCLUSIVE:
		KKASSERT(cst->sharecount < 0);
		++cst->sharecount;
		KKASSERT(cst->sharecount || cst->blocked == 0);
		break;
	case CCMS_LTYPE_MODIFYING:
		KKASSERT(cst->sharecount < 0 && cst->modifycount > 0);
		++cst->sharecount;
		--cst->modifycount;
		KKASSERT(cst->sharecount || cst->blocked == 0);
		break;
	}
	return(0);
}

/*
 * Redo the local lock request for a range which has already been
 * partitioned.
 *
 * NOTE: called with spinlock held.
 */
static int
ccms_lock_redo_match(ccms_cst_t cst, void *arg)
{
	struct ccms_lock_scan_info *info = arg;
	ccms_lock_t lock = info->lock;

	if (info->coll_cst == NULL) {
		switch(lock->ltype) {
		case CCMS_LTYPE_SHARED:
			if (cst->sharecount < 0) {
				info->coll_cst = cst;
			} else {
				++cst->sharecount;
				if (ccms_enable >= 9) {
					kprintf("CST SHARE %d %lld-%lld\n",
						cst->sharecount,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
			}
			break;
		case CCMS_LTYPE_EXCLUSIVE:
			if (cst->sharecount != 0) {
				info->coll_cst = cst;
			} else {
				--cst->sharecount;
				if (ccms_enable >= 9) {
					kprintf("CST EXCLS %d %lld-%lld\n",
						cst->sharecount,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
			}
			break;
		case CCMS_LTYPE_MODIFYING:
			if (cst->sharecount != 0) {
				info->coll_cst = cst;
			} else {
				--cst->sharecount;
				++cst->modifycount;
				if (ccms_enable >= 9) {
					kprintf("CST MODXL %d %lld-%lld\n",
						cst->sharecount,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
			}
			break;
		}
	}
	return(0);
}

/*
 * Release a CCMS lock
 */
int
ccms_lock_put(ccms_dataspace_t ds, ccms_lock_t lock)
{
	struct ccms_lock_scan_info info;
	ccms_cst_t cst;

	if (lock->ds == NULL)
		return(0);

	lock->ds = NULL;
	info.lock = lock;
	info.ds = ds;
	info.cst1 = NULL;
	info.cst2 = NULL;

	spin_lock_wr(&ds->spin);
	RB_SCAN(ccms_rb_tree, &ds->tree, ccms_lock_scan_cmp,
		ccms_lock_put_match, &info);
	cst = ds->delayed_free;
	ds->delayed_free = NULL;
	spin_unlock_wr(&ds->spin);

	ccms_delayed_free(cst);
	if (info.cst1)
		objcache_put(ccms_oc, info.cst1);
	if (info.cst2)
		objcache_put(ccms_oc, info.cst2);
	return(0);
}

/*
 * NOTE: called with spinlock held.
 */
static int
ccms_lock_put_match(ccms_cst_t cst, void *arg)
{
	struct ccms_lock_scan_info *info = arg;
	ccms_lock_t lock = info->lock;
	ccms_cst_t ocst;

	/*
	 * Undo the local shared/exclusive rangelock.
	 */
	switch(lock->ltype) {
	case CCMS_LTYPE_SHARED:
		KKASSERT(cst->sharecount > 0);
		--cst->sharecount;
		if (ccms_enable >= 9) {
			kprintf("CST UNSHR %d %lld-%lld (%d)\n", cst->sharecount,
				(long long)cst->beg_offset,
				(long long)cst->end_offset,
				cst->blocked);
		}
		if (cst->blocked && cst->sharecount == 0) {
			cst->blocked = 0;
			wakeup(cst);
		}
		break;
	case CCMS_LTYPE_EXCLUSIVE:
		KKASSERT(cst->sharecount < 0);
		++cst->sharecount;
		if (ccms_enable >= 9) {
			kprintf("CST UNEXC %d %lld-%lld (%d)\n", cst->sharecount,
				(long long)cst->beg_offset,
				(long long)cst->end_offset,
				cst->blocked);
		}
		if (cst->blocked && cst->sharecount == 0) {
			cst->blocked = 0;
			wakeup(cst);
		}
		break;
	case CCMS_LTYPE_MODIFYING:
		KKASSERT(cst->sharecount < 0 && cst->modifycount > 0);
		++cst->sharecount;
		--cst->modifycount;
		if (ccms_enable >= 9) {
			kprintf("CST UNMOD %d %lld-%lld (%d)\n", cst->sharecount,
				(long long)cst->beg_offset,
				(long long)cst->end_offset,
				cst->blocked);
		}
		if (cst->blocked && cst->sharecount == 0) {
			cst->blocked = 0;
			wakeup(cst);
		}
		break;
	}

	/*
	 * Decrement the lock coverage count on the CST.  Decrement the left and
	 * right edge counts as appropriate.
	 *
	 * When lrefs or rrefs drops to zero we check the adjacent entry to
	 * determine whether a merge is possible.  If the appropriate refs field
	 * (rrefs for the entry to our left, lrefs for the entry to our right)
	 * is 0, then all covering locks must cover both entries and the xrefs
	 * field must match.  We can then merge the entries if they have
	 * compatible cache states.
	 *
	 * However, because we are cleaning up the shared/exclusive count at
	 * the same time, the sharecount field may be temporarily out of
	 * sync, so require that the sharecount field also match before doing
	 * a merge.
	 *
	 * When merging an element which is being blocked on, the blocking
	 * thread(s) will be woken up.
	 *
	 * If the dataspace has too many CSTs we may be able to merge the
	 * entries even if their cache states are not the same, by dropping
	 * both to a compatible (lower) cache state and performing the appropriate
	 * management operations.  XXX
	 */
	--cst->xrefs;
	if (lock->beg_offset == cst->beg_offset) {
		--cst->lrefs;
		if (cst->lrefs == 0) {
			if ((ocst = RB_PREV(ccms_rb_tree,
					    &info->ds->tree, cst)) != NULL &&
			    ocst->rrefs == 0 &&
			    ocst->state == cst->state &&
			    ocst->sharecount == cst->sharecount
			) {
				KKASSERT(ocst->xrefs == cst->xrefs);
				KKASSERT(ocst->end_offset + 1 == cst->beg_offset);
				RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst);
				cst->beg_offset = ocst->beg_offset;
				cst->lrefs = ocst->lrefs;
				if (ccms_enable >= 9) {
					kprintf("MERGELEFT %p %lld-%lld (%d)\n",
						ocst,
						(long long)cst->beg_offset,
						(long long)cst->end_offset,
						cst->blocked);
				}
				if (ocst->blocked) {
					ocst->blocked = 0;
					wakeup(ocst);
				}
				/*objcache_put(ccms_oc, ocst);*/
				ocst->delayed_next = info->ds->delayed_free;
				info->ds->delayed_free = ocst;
			}
		}
	}
	if (lock->end_offset == cst->end_offset) {
		--cst->rrefs;
		if (cst->rrefs == 0) {
			if ((ocst = RB_NEXT(ccms_rb_tree,
					    &info->ds->tree, cst)) != NULL &&
			    ocst->lrefs == 0 &&
			    ocst->state == cst->state &&
			    ocst->sharecount == cst->sharecount
			) {
				KKASSERT(ocst->xrefs == cst->xrefs);
				KKASSERT(cst->end_offset + 1 == ocst->beg_offset);
				RB_REMOVE(ccms_rb_tree, &info->ds->tree, ocst);
				cst->end_offset = ocst->end_offset;
				cst->rrefs = ocst->rrefs;
				if (ccms_enable >= 9) {
					kprintf("MERGERIGHT %p %lld-%lld\n",
						ocst,
						(long long)cst->beg_offset,
						(long long)cst->end_offset);
				}
				/*objcache_put(ccms_oc, ocst);*/
				ocst->delayed_next = info->ds->delayed_free;
				info->ds->delayed_free = ocst;
			}
		}
	}
	return(0);
}
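
/*
 * Added commentary illustrating the merges above: once a released lock
 * no longer pins an edge (lrefs or rrefs drops to 0) and the adjacent
 * CST agrees on state, sharecount and xrefs, the two entries collapse
 * back into one:
 *
 *	before:  [obeg ........ oend][cbeg ........ cend]
 *	after:   [obeg ............................ cend]
 *
 * The absorbed entry is queued on delayed_free and only returned to the
 * objcache after the dataspace spinlock has been dropped.
 */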

/*
 * RB tree compare function for insertions and deletions.  This function
 * compares two CSTs.
 */
static int
ccms_cst_cmp(ccms_cst_t b1, ccms_cst_t b2)
{
	if (b1->end_offset < b2->beg_offset)
		return(-1);
	if (b1->beg_offset > b2->end_offset)
		return(1);
	return(0);
}

/*
 * RB tree scanning compare function.  This function compares the CST
 * from the tree against the supplied ccms_lock and returns the CST's
 * placement relative to the lock.
 */
static int
ccms_lock_scan_cmp(ccms_cst_t cst, void *arg)
{
	struct ccms_lock_scan_info *info = arg;
	ccms_lock_t lock = info->lock;

	if (cst->end_offset < lock->beg_offset)
		return(-1);
	if (cst->beg_offset > lock->end_offset)
		return(1);
	return(0);
}

/*
 * This function works like ccms_lock_scan_cmp but terminates at the
 * collision point rather than at the lock's ending offset.  Only
 * the CSTs that were already partially resolved are returned by the scan.
 */
static int
ccms_lock_undo_cmp(ccms_cst_t cst, void *arg)
{
	struct ccms_lock_scan_info *info = arg;
	ccms_lock_t lock = info->lock;

	if (cst->end_offset < lock->beg_offset)
		return(-1);
	if (cst->beg_offset >= info->coll_cst->beg_offset)
		return(1);
	return(0);
}