2 * Copyright (c) 1998 Michael Smith <msmith@freebsd.org>
3 * Copyright 2015 Toomas Soome <tsoome@me.com>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 #include <sys/cdefs.h>
29 #include <sys/param.h>
32 * Simple hashed block cache
35 #include <sys/stdint.h>
41 #include "bootstrap.h"
/* #define BCACHE_DEBUG */

/*
 * Debug tracing: with BCACHE_DEBUG defined, DEBUG() prints the calling
 * function name plus a formatted message; otherwise it expands to nothing.
 * The #ifdef/#else guards are required -- without them the two definitions
 * conflict.  Uses the GNU named-variadic-macro extension (args... / ## args),
 * consistent with the rest of the loader.
 */
#ifdef BCACHE_DEBUG
# define DEBUG(fmt, args...)	printf("%s: " fmt "\n" , __func__ , ## args)
#else
# define DEBUG(fmt, args...)
#endif
58 * bcache per device node. cache is allocated on device first open and freed
59 * on last close, to save memory. The issue there is the size; biosdisk
60 * supports up to 31 (0x1f) devices. Classic setup would use single disk
61 * to boot from, but this has changed with zfs.
64 struct bcachectl
*bcache_ctl
;
70 static u_int bcache_total_nblks
; /* set by bcache_init */
71 static u_int bcache_blksize
; /* set by bcache_init */
72 static u_int bcache_numdev
; /* set by bcache_add_dev */
74 static u_int bcache_units
; /* number of devices with cache */
75 static u_int bcache_unit_nblks
; /* nblocks per unit */
76 static u_int bcache_hits
;
77 static u_int bcache_misses
;
78 static u_int bcache_ops
;
79 static u_int bcache_bypasses
;
80 static u_int bcache_bcount
;
81 static u_int bcache_rablks
;
/*
 * Map a block number onto a cache slot.  bcache_nblks is a power of 2,
 * so the modulo reduces to a mask.
 */
#define	BHASH(bc, blkno)	((blkno) & ((bc)->bcache_nblks - 1))

/*
 * Nonzero (miss) when the hashed slot does not currently hold `blkno`.
 */
#define	BCACHE_LOOKUP(bc, blkno)	\
	((bc)->bcache_ctl[BHASH((bc), (blkno))].bc_blkno != (blkno))

#define	BCACHE_READAHEAD	256	/* upper bound for dynamic read ahead */
#define	BCACHE_MINREADAHEAD	32	/* lower bound for dynamic read ahead */
89 static void bcache_invalidate(struct bcache
*bc
, daddr_t blkno
);
90 static void bcache_insert(struct bcache
*bc
, daddr_t blkno
);
91 static void bcache_free_instance(struct bcache
*bc
);
94 * Initialise the cache for (nblks) of (bsize).
97 bcache_init(size_t nblks
, size_t bsize
)
99 /* set up control data */
100 bcache_total_nblks
= nblks
;
101 bcache_blksize
= bsize
;
105 * add number of devices to bcache. we have to divide cache space
106 * between the devices, so bcache_add_dev() can be used to set up the
107 * number. The issue is, we need to get the number before actual allocations.
108 * bcache_add_dev() is supposed to be called from device init() call, so the
109 * assumption is, devsw dv_init is called for plain devices first, and
113 bcache_add_dev(int devices
)
115 bcache_numdev
+= devices
;
119 bcache_allocate(void)
122 struct bcache
*bc
= malloc(sizeof (struct bcache
));
123 int disks
= bcache_numdev
;
126 disks
= 1; /* safe guard */
134 * the bcache block count must be power of 2 for hash function
136 i
= fls(disks
) - 1; /* highbit - 1 */
137 if (disks
> (1 << i
)) /* next power of 2 */
140 bc
->bcache_nblks
= bcache_total_nblks
>> i
;
141 bcache_unit_nblks
= bc
->bcache_nblks
;
142 bc
->bcache_data
= malloc(bc
->bcache_nblks
* bcache_blksize
);
143 if (bc
->bcache_data
== NULL
) {
144 /* dont error out yet. fall back to 32 blocks and try again */
145 bc
->bcache_nblks
= 32;
146 bc
->bcache_data
= malloc(bc
->bcache_nblks
* bcache_blksize
+
150 bc
->bcache_ctl
= malloc(bc
->bcache_nblks
* sizeof(struct bcachectl
));
152 if ((bc
->bcache_data
== NULL
) || (bc
->bcache_ctl
== NULL
)) {
153 bcache_free_instance(bc
);
158 /* Flush the cache */
159 for (i
= 0; i
< bc
->bcache_nblks
; i
++) {
160 bc
->bcache_ctl
[i
].bc_count
= -1;
161 bc
->bcache_ctl
[i
].bc_blkno
= -1;
164 bc
->ra
= BCACHE_READAHEAD
; /* optimistic read ahead */
169 bcache_free(void *cache
)
171 struct bcache
*bc
= cache
;
176 bcache_free_instance(bc
);
181 * Handle a write request; write directly to the disk, and populate the
182 * cache with the new values.
185 write_strategy(void *devdata
, int rw
, daddr_t blk
, size_t size
,
186 char *buf
, size_t *rsize
)
188 struct bcache_devdata
*dd
= (struct bcache_devdata
*)devdata
;
189 struct bcache
*bc
= dd
->dv_cache
;
192 nblk
= size
/ bcache_blksize
;
194 /* Invalidate the blocks being written */
195 for (i
= 0; i
< nblk
; i
++) {
196 bcache_invalidate(bc
, blk
+ i
);
199 /* Write the blocks */
200 return (dd
->dv_strategy(dd
->dv_devdata
, rw
, blk
, size
, buf
, rsize
));
204 * Handle a read request; fill in parts of the request that can
205 * be satisfied by the cache, use the supplied strategy routine to do
206 * device I/O and then use the I/O results to populate the cache.
209 read_strategy(void *devdata
, int rw
, daddr_t blk
, size_t size
,
210 char *buf
, size_t *rsize
)
212 struct bcache_devdata
*dd
= (struct bcache_devdata
*)devdata
;
213 struct bcache
*bc
= dd
->dv_cache
;
214 size_t i
, nblk
, p_size
, r_size
, complete
, ra
;
227 nblk
= size
/ bcache_blksize
;
228 if (nblk
== 0 && size
!= 0)
233 /* Satisfy any cache hits up front, break on first miss */
234 for (i
= 0; i
< nblk
; i
++) {
235 if (BCACHE_LOOKUP(bc
, (daddr_t
)(blk
+ i
))) {
236 bcache_misses
+= (nblk
- i
);
238 if (nblk
- i
> BCACHE_MINREADAHEAD
&& bc
->ra
> BCACHE_MINREADAHEAD
)
239 bc
->ra
>>= 1; /* reduce read ahead */
246 if (complete
) { /* whole set was in cache, return it */
247 if (bc
->ra
< BCACHE_READAHEAD
)
248 bc
->ra
<<= 1; /* increase read ahead */
249 bcopy(bc
->bcache_data
+ (bcache_blksize
* BHASH(bc
, blk
)), buf
, size
);
254 * Fill in any misses. From check we have i pointing to first missing
255 * block, read in all remaining blocks + readahead.
256 * We have space at least for nblk - i before bcache wraps.
259 p_buf
= bc
->bcache_data
+ (bcache_blksize
* BHASH(bc
, p_blk
));
260 r_size
= bc
->bcache_nblks
- BHASH(bc
, p_blk
); /* remaining blocks */
262 p_size
= MIN(r_size
, nblk
- i
); /* read at least those blocks */
265 * The read ahead size setup.
266 * While the read ahead can save us IO, it also can complicate things:
267 * 1. We do not want to read ahead by wrapping around the
268 * bcache end - this would complicate the cache management.
269 * 2. We are using bc->ra as dynamic hint for read ahead size,
270 * detected cache hits will increase the read-ahead block count, and
271 * misses will decrease, see the code above.
272 * 3. The bcache is sized by 512B blocks, however, the underlying device
273 * may have a larger sector size, and we should perform the IO by
274 * taking into account these larger sector sizes. We could solve this by
275 * passing the sector size to bcache_allocate(), or by using ioctl(), but
276 * in this version we are using the constant, 16 blocks, and are rounding
277 * read ahead block count down to multiple of 16.
278 * Using the constant has two reasons, we are not entirely sure if the
279 * BIOS disk interface is providing the correct value for sector size.
280 * And secondly, this way we get the most conservative setup for the ra.
282 * The selection of multiple of 16 blocks (8KB) is quite arbitrary, however,
283 * we want to cover CDs (2K) and 4K disks.
284 * bcache_allocate() will always fall back to a minimum of 32 blocks.
285 * Our choice of 16 read ahead blocks will always fit inside the bcache.
288 if ((rw
& F_NORA
) == F_NORA
)
291 ra
= bc
->bcache_nblks
- BHASH(bc
, p_blk
+ p_size
);
293 if (ra
!= 0 && ra
!= bc
->bcache_nblks
) { /* do we have RA space? */
294 ra
= MIN(bc
->ra
, ra
- 1);
295 ra
= rounddown(ra
, 16); /* multiple of 16 blocks */
299 /* invalidate bcache */
300 for (i
= 0; i
< p_size
; i
++) {
301 bcache_invalidate(bc
, p_blk
+ i
);
306 * with read-ahead, it may happen we are attempting to read past
307 * disk end, as bcache has no information about disk size.
308 * in such case we should get partial read if some blocks can be
309 * read or error, if no blocks can be read.
310 * in either case we should return the data in bcache and only
311 * return error if there is no data.
314 result
= dd
->dv_strategy(dd
->dv_devdata
, rw
, p_blk
,
315 p_size
* bcache_blksize
, p_buf
, &r_size
);
317 r_size
/= bcache_blksize
;
318 for (i
= 0; i
< r_size
; i
++)
319 bcache_insert(bc
, p_blk
+ i
);
321 /* update ra statistics */
324 bcache_rablks
+= (p_size
- r_size
);
329 /* check how much data can we copy */
330 for (i
= 0; i
< nblk
; i
++) {
331 if (BCACHE_LOOKUP(bc
, (daddr_t
)(blk
+ i
)))
335 if (size
> i
* bcache_blksize
)
336 size
= i
* bcache_blksize
;
339 bcopy(bc
->bcache_data
+ (bcache_blksize
* BHASH(bc
, blk
)), buf
, size
);
344 if ((result
== 0) && (rsize
!= NULL
))
350 * Requests larger than 1/2 cache size will be bypassed and go
351 * directly to the disk. XXX tune this.
354 bcache_strategy(void *devdata
, int rw
, daddr_t blk
, size_t size
,
355 char *buf
, size_t *rsize
)
357 struct bcache_devdata
*dd
= (struct bcache_devdata
*)devdata
;
358 struct bcache
*bc
= dd
->dv_cache
;
359 u_int bcache_nblks
= 0;
361 size_t csize
, isize
, total
;
366 bcache_nblks
= bc
->bcache_nblks
;
368 /* bypass large requests, or when the cache is inactive */
370 ((size
* 2 / bcache_blksize
) > bcache_nblks
)) {
371 DEBUG("bypass %zu from %qu", size
/ bcache_blksize
, blk
);
374 return (dd
->dv_strategy(dd
->dv_devdata
, rw
, blk
, size
, buf
, rsize
));
377 switch (rw
& F_MASK
) {
379 nblk
= size
/ bcache_blksize
;
380 if (size
!= 0 && nblk
== 0)
381 nblk
++; /* read at least one block */
386 cblk
= bcache_nblks
- BHASH(bc
, blk
); /* # of blocks left */
387 cblk
= MIN(cblk
, nblk
);
389 if (size
<= bcache_blksize
)
392 csize
= cblk
* bcache_blksize
;
394 ret
= read_strategy(devdata
, rw
, blk
, csize
, buf
+total
, &isize
);
397 * we may have error from read ahead, if we have read some data
398 * return partial read.
400 if (ret
!= 0 || isize
== 0) {
405 blk
+= isize
/ bcache_blksize
;
408 nblk
= size
/ bcache_blksize
;
416 return write_strategy(devdata
, F_WRITE
, blk
, size
, buf
, rsize
);
422 * Free allocated bcache instance
425 bcache_free_instance(struct bcache
*bc
)
429 free(bc
->bcache_ctl
);
431 free(bc
->bcache_data
);
437 * Insert a block into the cache.
440 bcache_insert(struct bcache
*bc
, daddr_t blkno
)
444 cand
= BHASH(bc
, blkno
);
446 DEBUG("insert blk %llu -> %u # %d", blkno
, cand
, bcache_bcount
);
447 bc
->bcache_ctl
[cand
].bc_blkno
= blkno
;
448 bc
->bcache_ctl
[cand
].bc_count
= bcache_bcount
++;
452 * Invalidate a block from the cache.
455 bcache_invalidate(struct bcache
*bc
, daddr_t blkno
)
459 i
= BHASH(bc
, blkno
);
460 if (bc
->bcache_ctl
[i
].bc_blkno
== blkno
) {
461 bc
->bcache_ctl
[i
].bc_count
= -1;
462 bc
->bcache_ctl
[i
].bc_blkno
= -1;
463 DEBUG("invalidate blk %llu", blkno
);
468 COMMAND_SET(bcachestat
, "bcachestat", "get disk block cache stats", command_bcache
);
471 command_bcache(int argc
, char *argv
[] __attribute((unused
)))
474 command_errmsg
= "wrong number of arguments";
478 printf("\ncache blocks: %u\n", bcache_total_nblks
);
479 printf("cache blocksz: %u\n", bcache_blksize
);
480 printf("cache readahead: %u\n", bcache_rablks
);
481 printf("unit cache blocks: %u\n", bcache_unit_nblks
);
482 printf("cached units: %u\n", bcache_units
);
483 printf("%u ops %u bypasses %u hits %u misses\n", bcache_ops
,
484 bcache_bypasses
, bcache_hits
, bcache_misses
);