4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/zfs_context.h>
27 #include <sys/dnode.h>
28 #include <sys/dmu_objset.h>
29 #include <sys/dmu_zfetch.h>
32 #include <sys/kstat.h>
/*
 * I'm against tune-ables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

/* when non-zero, dmu_zfetch() tests this flag before any stream handling */
int zfs_prefetch_disable = 0;

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time (in seconds) before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1MB) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;
50 /* forward decls for static routines */
51 static int dmu_zfetch_colinear(zfetch_t
*, zstream_t
*);
52 static void dmu_zfetch_dofetch(zfetch_t
*, zstream_t
*);
53 static uint64_t dmu_zfetch_fetch(dnode_t
*, uint64_t, uint64_t);
54 static uint64_t dmu_zfetch_fetchsz(dnode_t
*, uint64_t, uint64_t);
55 static int dmu_zfetch_find(zfetch_t
*, zstream_t
*, int);
56 static int dmu_zfetch_stream_insert(zfetch_t
*, zstream_t
*);
57 static zstream_t
*dmu_zfetch_stream_reclaim(zfetch_t
*);
58 static void dmu_zfetch_stream_remove(zfetch_t
*, zstream_t
*);
59 static int dmu_zfetch_streams_equal(zstream_t
*, zstream_t
*);
61 typedef struct zfetch_stats
{
62 kstat_named_t zfetchstat_hits
;
63 kstat_named_t zfetchstat_misses
;
64 kstat_named_t zfetchstat_colinear_hits
;
65 kstat_named_t zfetchstat_colinear_misses
;
66 kstat_named_t zfetchstat_stride_hits
;
67 kstat_named_t zfetchstat_stride_misses
;
68 kstat_named_t zfetchstat_reclaim_successes
;
69 kstat_named_t zfetchstat_reclaim_failures
;
70 kstat_named_t zfetchstat_stream_resets
;
71 kstat_named_t zfetchstat_stream_noresets
;
72 kstat_named_t zfetchstat_bogus_streams
;
75 static zfetch_stats_t zfetch_stats
= {
76 { "hits", KSTAT_DATA_UINT64
},
77 { "misses", KSTAT_DATA_UINT64
},
78 { "colinear_hits", KSTAT_DATA_UINT64
},
79 { "colinear_misses", KSTAT_DATA_UINT64
},
80 { "stride_hits", KSTAT_DATA_UINT64
},
81 { "stride_misses", KSTAT_DATA_UINT64
},
82 { "reclaim_successes", KSTAT_DATA_UINT64
},
83 { "reclaim_failures", KSTAT_DATA_UINT64
},
84 { "streams_resets", KSTAT_DATA_UINT64
},
85 { "streams_noresets", KSTAT_DATA_UINT64
},
86 { "bogus_streams", KSTAT_DATA_UINT64
},
/*
 * Add `val` to the named counter of zfetch_stats via atomic_add_64().
 *
 * Both macros expand to a single expression with no trailing semicolon: the
 * caller's own ';' ends the statement, so an invocation can safely be the
 * unbraced body of an if/else.
 */
#define	ZFETCHSTAT_INCR(stat, val) \
	atomic_add_64(&zfetch_stats.stat.value.ui64, (val))

/* Add one to the named counter of zfetch_stats. */
#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1)
97 * Given a zfetch structure and a zstream structure, determine whether the
98 * blocks to be read are part of a co-linear pair of existing prefetch
99 * streams. If a set is found, coalesce the streams, removing one, and
100 * configure the prefetch so it looks for a strided access pattern.
102 * In other words: if we find two sequential access streams that are
103 * the same length and distance N apart, and this read is N from the
104 * last stream, then we are probably in a strided access pattern. So
105 * combine the two sequential streams into a single strided stream.
107 * If no co-linear streams are found, return NULL.
110 dmu_zfetch_colinear(zfetch_t
*zf
, zstream_t
*zh
)
115 if (! rw_tryenter(&zf
->zf_rwlock
, RW_WRITER
))
119 rw_exit(&zf
->zf_rwlock
);
123 for (z_walk
= list_head(&zf
->zf_stream
); z_walk
;
124 z_walk
= list_next(&zf
->zf_stream
, z_walk
)) {
125 for (z_comp
= list_next(&zf
->zf_stream
, z_walk
); z_comp
;
126 z_comp
= list_next(&zf
->zf_stream
, z_comp
)) {
129 if (z_walk
->zst_len
!= z_walk
->zst_stride
||
130 z_comp
->zst_len
!= z_comp
->zst_stride
) {
134 diff
= z_comp
->zst_offset
- z_walk
->zst_offset
;
135 if (z_comp
->zst_offset
+ diff
== zh
->zst_offset
) {
136 z_walk
->zst_offset
= zh
->zst_offset
;
137 z_walk
->zst_direction
= diff
< 0 ? -1 : 1;
139 diff
* z_walk
->zst_direction
;
140 z_walk
->zst_ph_offset
=
141 zh
->zst_offset
+ z_walk
->zst_stride
;
142 dmu_zfetch_stream_remove(zf
, z_comp
);
143 mutex_destroy(&z_comp
->zst_lock
);
144 kmem_free(z_comp
, sizeof (zstream_t
));
146 dmu_zfetch_dofetch(zf
, z_walk
);
148 rw_exit(&zf
->zf_rwlock
);
152 diff
= z_walk
->zst_offset
- z_comp
->zst_offset
;
153 if (z_walk
->zst_offset
+ diff
== zh
->zst_offset
) {
154 z_walk
->zst_offset
= zh
->zst_offset
;
155 z_walk
->zst_direction
= diff
< 0 ? -1 : 1;
157 diff
* z_walk
->zst_direction
;
158 z_walk
->zst_ph_offset
=
159 zh
->zst_offset
+ z_walk
->zst_stride
;
160 dmu_zfetch_stream_remove(zf
, z_comp
);
161 mutex_destroy(&z_comp
->zst_lock
);
162 kmem_free(z_comp
, sizeof (zstream_t
));
164 dmu_zfetch_dofetch(zf
, z_walk
);
166 rw_exit(&zf
->zf_rwlock
);
172 rw_exit(&zf
->zf_rwlock
);
177 * Given a zstream_t, determine the bounds of the prefetch. Then call the
178 * routine that actually prefetches the individual blocks.
181 dmu_zfetch_dofetch(zfetch_t
*zf
, zstream_t
*zs
)
183 uint64_t prefetch_tail
;
184 uint64_t prefetch_limit
;
185 uint64_t prefetch_ofst
;
186 uint64_t prefetch_len
;
187 uint64_t blocks_fetched
;
189 zs
->zst_stride
= MAX((int64_t)zs
->zst_stride
, zs
->zst_len
);
190 zs
->zst_cap
= MIN(zfetch_block_cap
, 2 * zs
->zst_cap
);
192 prefetch_tail
= MAX((int64_t)zs
->zst_ph_offset
,
193 (int64_t)(zs
->zst_offset
+ zs
->zst_stride
));
195 * XXX: use a faster division method?
197 prefetch_limit
= zs
->zst_offset
+ zs
->zst_len
+
198 (zs
->zst_cap
* zs
->zst_stride
) / zs
->zst_len
;
200 while (prefetch_tail
< prefetch_limit
) {
201 prefetch_ofst
= zs
->zst_offset
+ zs
->zst_direction
*
202 (prefetch_tail
- zs
->zst_offset
);
204 prefetch_len
= zs
->zst_len
;
207 * Don't prefetch beyond the end of the file, if working
210 if ((zs
->zst_direction
== ZFETCH_BACKWARD
) &&
211 (prefetch_ofst
> prefetch_tail
)) {
212 prefetch_len
+= prefetch_ofst
;
216 /* don't prefetch more than we're supposed to */
217 if (prefetch_len
> zs
->zst_len
)
220 blocks_fetched
= dmu_zfetch_fetch(zf
->zf_dnode
,
221 prefetch_ofst
, zs
->zst_len
);
223 prefetch_tail
+= zs
->zst_stride
;
224 /* stop if we've run out of stuff to prefetch */
225 if (blocks_fetched
< zs
->zst_len
)
228 zs
->zst_ph_offset
= prefetch_tail
;
229 zs
->zst_last
= ddi_get_lbolt();
236 zfetch_ksp
= kstat_create("zfs", 0, "zfetchstats", "misc",
237 KSTAT_TYPE_NAMED
, sizeof (zfetch_stats
) / sizeof (kstat_named_t
),
240 if (zfetch_ksp
!= NULL
) {
241 zfetch_ksp
->ks_data
= &zfetch_stats
;
242 kstat_install(zfetch_ksp
);
249 if (zfetch_ksp
!= NULL
) {
250 kstat_delete(zfetch_ksp
);
256 * This takes a pointer to a zfetch structure and a dnode. It performs the
257 * necessary setup for the zfetch structure, grokking data from the
261 dmu_zfetch_init(zfetch_t
*zf
, dnode_t
*dno
)
268 zf
->zf_stream_cnt
= 0;
269 zf
->zf_alloc_fail
= 0;
271 list_create(&zf
->zf_stream
, sizeof (zstream_t
),
272 offsetof(zstream_t
, zst_node
));
274 rw_init(&zf
->zf_rwlock
, NULL
, RW_DEFAULT
, NULL
);
278 * This function computes the actual size, in blocks, that can be prefetched,
282 dmu_zfetch_fetch(dnode_t
*dn
, uint64_t blkid
, uint64_t nblks
)
287 fetchsz
= dmu_zfetch_fetchsz(dn
, blkid
, nblks
);
289 for (i
= 0; i
< fetchsz
; i
++) {
290 dbuf_prefetch(dn
, blkid
+ i
);
297 * this function returns the number of blocks that would be prefetched, based
298 * upon the supplied dnode, blockid, and nblks. This is used so that we can
299 * update streams in place, and then prefetch with their old value after the
300 * fact. This way, we can delay the prefetch, but subsequent accesses to the
301 * stream won't result in the same data being prefetched multiple times.
304 dmu_zfetch_fetchsz(dnode_t
*dn
, uint64_t blkid
, uint64_t nblks
)
308 if (blkid
> dn
->dn_maxblkid
) {
312 /* compute fetch size */
313 if (blkid
+ nblks
+ 1 > dn
->dn_maxblkid
) {
314 fetchsz
= (dn
->dn_maxblkid
- blkid
) + 1;
315 ASSERT(blkid
+ fetchsz
- 1 <= dn
->dn_maxblkid
);
325 * Given a zfetch and a zstream structure, see if there is an associated zstream
326 * for this block read. If so, it starts a prefetch for the stream it
327 * located and returns true; otherwise it returns false.
330 dmu_zfetch_find(zfetch_t
*zf
, zstream_t
*zh
, int prefetched
)
334 int reset
= !prefetched
;
341 * XXX: This locking strategy is a bit coarse; however, its impact has
342 * yet to be tested. If this turns out to be an issue, it can be
343 * modified in a number of different ways.
346 rw_enter(&zf
->zf_rwlock
, RW_READER
);
349 for (zs
= list_head(&zf
->zf_stream
); zs
;
350 zs
= list_next(&zf
->zf_stream
, zs
)) {
353 * XXX - should this be an assert?
355 if (zs
->zst_len
== 0) {
357 ZFETCHSTAT_BUMP(zfetchstat_bogus_streams
);
362 * We hit this case when we are in a strided prefetch stream:
363 * we will read "len" blocks before "striding".
365 if (zh
->zst_offset
>= zs
->zst_offset
&&
366 zh
->zst_offset
< zs
->zst_offset
+ zs
->zst_len
) {
368 /* already fetched */
369 ZFETCHSTAT_BUMP(zfetchstat_stride_hits
);
373 ZFETCHSTAT_BUMP(zfetchstat_stride_misses
);
378 * This is the forward sequential read case: we increment
379 * len by one each time we hit here, so we will enter this
380 * case on every read.
382 if (zh
->zst_offset
== zs
->zst_offset
+ zs
->zst_len
) {
384 reset
= !prefetched
&& zs
->zst_len
> 1;
386 mutex_enter(&zs
->zst_lock
);
388 if (zh
->zst_offset
!= zs
->zst_offset
+ zs
->zst_len
) {
389 mutex_exit(&zs
->zst_lock
);
392 zs
->zst_len
+= zh
->zst_len
;
393 diff
= zs
->zst_len
- zfetch_block_cap
;
395 zs
->zst_offset
+= diff
;
396 zs
->zst_len
= zs
->zst_len
> diff
?
397 zs
->zst_len
- diff
: 0;
399 zs
->zst_direction
= ZFETCH_FORWARD
;
404 * Same as above, but reading backwards through the file.
406 } else if (zh
->zst_offset
== zs
->zst_offset
- zh
->zst_len
) {
407 /* backwards sequential access */
409 reset
= !prefetched
&& zs
->zst_len
> 1;
411 mutex_enter(&zs
->zst_lock
);
413 if (zh
->zst_offset
!= zs
->zst_offset
- zh
->zst_len
) {
414 mutex_exit(&zs
->zst_lock
);
418 zs
->zst_offset
= zs
->zst_offset
> zh
->zst_len
?
419 zs
->zst_offset
- zh
->zst_len
: 0;
420 zs
->zst_ph_offset
= zs
->zst_ph_offset
> zh
->zst_len
?
421 zs
->zst_ph_offset
- zh
->zst_len
: 0;
422 zs
->zst_len
+= zh
->zst_len
;
424 diff
= zs
->zst_len
- zfetch_block_cap
;
426 zs
->zst_ph_offset
= zs
->zst_ph_offset
> diff
?
427 zs
->zst_ph_offset
- diff
: 0;
428 zs
->zst_len
= zs
->zst_len
> diff
?
429 zs
->zst_len
- diff
: zs
->zst_len
;
431 zs
->zst_direction
= ZFETCH_BACKWARD
;
435 } else if ((zh
->zst_offset
- zs
->zst_offset
- zs
->zst_stride
<
436 zs
->zst_len
) && (zs
->zst_len
!= zs
->zst_stride
)) {
437 /* strided forward access */
439 mutex_enter(&zs
->zst_lock
);
441 if ((zh
->zst_offset
- zs
->zst_offset
- zs
->zst_stride
>=
442 zs
->zst_len
) || (zs
->zst_len
== zs
->zst_stride
)) {
443 mutex_exit(&zs
->zst_lock
);
447 zs
->zst_offset
+= zs
->zst_stride
;
448 zs
->zst_direction
= ZFETCH_FORWARD
;
452 } else if ((zh
->zst_offset
- zs
->zst_offset
+ zs
->zst_stride
<
453 zs
->zst_len
) && (zs
->zst_len
!= zs
->zst_stride
)) {
454 /* strided reverse access */
456 mutex_enter(&zs
->zst_lock
);
458 if ((zh
->zst_offset
- zs
->zst_offset
+ zs
->zst_stride
>=
459 zs
->zst_len
) || (zs
->zst_len
== zs
->zst_stride
)) {
460 mutex_exit(&zs
->zst_lock
);
464 zs
->zst_offset
= zs
->zst_offset
> zs
->zst_stride
?
465 zs
->zst_offset
- zs
->zst_stride
: 0;
466 zs
->zst_ph_offset
= (zs
->zst_ph_offset
>
467 (2 * zs
->zst_stride
)) ?
468 (zs
->zst_ph_offset
- (2 * zs
->zst_stride
)) : 0;
469 zs
->zst_direction
= ZFETCH_BACKWARD
;
477 zstream_t
*remove
= zs
;
479 ZFETCHSTAT_BUMP(zfetchstat_stream_resets
);
481 mutex_exit(&zs
->zst_lock
);
482 rw_exit(&zf
->zf_rwlock
);
483 rw_enter(&zf
->zf_rwlock
, RW_WRITER
);
485 * Relocate the stream, in case someone removes
486 * it while we were acquiring the WRITER lock.
488 for (zs
= list_head(&zf
->zf_stream
); zs
;
489 zs
= list_next(&zf
->zf_stream
, zs
)) {
491 dmu_zfetch_stream_remove(zf
, zs
);
492 mutex_destroy(&zs
->zst_lock
);
493 kmem_free(zs
, sizeof (zstream_t
));
498 ZFETCHSTAT_BUMP(zfetchstat_stream_noresets
);
500 dmu_zfetch_dofetch(zf
, zs
);
501 mutex_exit(&zs
->zst_lock
);
505 rw_exit(&zf
->zf_rwlock
);
510 * Clean-up state associated with a zfetch structure. This frees allocated
511 * structure members, empties the zf_stream list, and generally makes things
512 * nice. This doesn't free the zfetch_t itself, that's left to the caller.
515 dmu_zfetch_rele(zfetch_t
*zf
)
520 ASSERT(!RW_LOCK_HELD(&zf
->zf_rwlock
));
522 for (zs
= list_head(&zf
->zf_stream
); zs
; zs
= zs_next
) {
523 zs_next
= list_next(&zf
->zf_stream
, zs
);
525 list_remove(&zf
->zf_stream
, zs
);
526 mutex_destroy(&zs
->zst_lock
);
527 kmem_free(zs
, sizeof (zstream_t
));
529 list_destroy(&zf
->zf_stream
);
530 rw_destroy(&zf
->zf_rwlock
);
536 * Given a zfetch and zstream structure, insert the zstream structure into the
537 * stream list contained within the zfetch structure. Perform the appropriate
538 * book-keeping. It is possible that another thread has inserted a stream which
539 * matches one that we are about to insert, so we must be sure to check for this
540 * case. If one is found, return failure, and let the caller cleanup the
544 dmu_zfetch_stream_insert(zfetch_t
*zf
, zstream_t
*zs
)
549 ASSERT(RW_WRITE_HELD(&zf
->zf_rwlock
));
551 for (zs_walk
= list_head(&zf
->zf_stream
); zs_walk
; zs_walk
= zs_next
) {
552 zs_next
= list_next(&zf
->zf_stream
, zs_walk
);
554 if (dmu_zfetch_streams_equal(zs_walk
, zs
)) {
559 list_insert_head(&zf
->zf_stream
, zs
);
566 * Walk the list of zstreams in the given zfetch, find an old one (by time), and
567 * reclaim it for use by the caller.
570 dmu_zfetch_stream_reclaim(zfetch_t
*zf
)
574 if (! rw_tryenter(&zf
->zf_rwlock
, RW_WRITER
))
577 for (zs
= list_head(&zf
->zf_stream
); zs
;
578 zs
= list_next(&zf
->zf_stream
, zs
)) {
580 if (((ddi_get_lbolt() - zs
->zst_last
)/hz
) > zfetch_min_sec_reap
)
585 dmu_zfetch_stream_remove(zf
, zs
);
586 mutex_destroy(&zs
->zst_lock
);
587 bzero(zs
, sizeof (zstream_t
));
591 rw_exit(&zf
->zf_rwlock
);
597 * Given a zfetch and zstream structure, remove the zstream structure from its
598 * container in the zfetch structure. Perform the appropriate book-keeping.
601 dmu_zfetch_stream_remove(zfetch_t
*zf
, zstream_t
*zs
)
603 ASSERT(RW_WRITE_HELD(&zf
->zf_rwlock
));
605 list_remove(&zf
->zf_stream
, zs
);
610 dmu_zfetch_streams_equal(zstream_t
*zs1
, zstream_t
*zs2
)
612 if (zs1
->zst_offset
!= zs2
->zst_offset
)
615 if (zs1
->zst_len
!= zs2
->zst_len
)
618 if (zs1
->zst_stride
!= zs2
->zst_stride
)
621 if (zs1
->zst_ph_offset
!= zs2
->zst_ph_offset
)
624 if (zs1
->zst_cap
!= zs2
->zst_cap
)
627 if (zs1
->zst_direction
!= zs2
->zst_direction
)
634 * This is the prefetch entry point. It calls all of the other dmu_zfetch
635 * routines to create, delete, find, or operate upon prefetch streams.
638 dmu_zfetch(zfetch_t
*zf
, uint64_t offset
, uint64_t size
, int prefetched
)
641 zstream_t
*newstream
;
644 unsigned int blkshft
;
647 if (zfs_prefetch_disable
)
650 /* files that aren't ln2 blocksz are only one block -- nothing to do */
651 if (!zf
->zf_dnode
->dn_datablkshift
)
654 /* convert offset and size, into blockid and nblocks */
655 blkshft
= zf
->zf_dnode
->dn_datablkshift
;
656 blksz
= (1 << blkshft
);
658 bzero(&zst
, sizeof (zstream_t
));
659 zst
.zst_offset
= offset
>> blkshft
;
660 zst
.zst_len
= (P2ROUNDUP(offset
+ size
, blksz
) -
661 P2ALIGN(offset
, blksz
)) >> blkshft
;
663 fetched
= dmu_zfetch_find(zf
, &zst
, prefetched
);
665 ZFETCHSTAT_BUMP(zfetchstat_hits
);
667 ZFETCHSTAT_BUMP(zfetchstat_misses
);
668 if (fetched
= dmu_zfetch_colinear(zf
, &zst
)) {
669 ZFETCHSTAT_BUMP(zfetchstat_colinear_hits
);
671 ZFETCHSTAT_BUMP(zfetchstat_colinear_misses
);
676 newstream
= dmu_zfetch_stream_reclaim(zf
);
679 * we still couldn't find a stream, drop the lock, and allocate
680 * one if possible. Otherwise, give up and go home.
683 ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes
);
686 uint32_t max_streams
;
687 uint32_t cur_streams
;
689 ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures
);
690 cur_streams
= zf
->zf_stream_cnt
;
691 maxblocks
= zf
->zf_dnode
->dn_maxblkid
;
693 max_streams
= MIN(zfetch_max_streams
,
694 (maxblocks
/ zfetch_block_cap
));
695 if (max_streams
== 0) {
699 if (cur_streams
>= max_streams
) {
702 newstream
= kmem_zalloc(sizeof (zstream_t
), KM_SLEEP
);
705 newstream
->zst_offset
= zst
.zst_offset
;
706 newstream
->zst_len
= zst
.zst_len
;
707 newstream
->zst_stride
= zst
.zst_len
;
708 newstream
->zst_ph_offset
= zst
.zst_len
+ zst
.zst_offset
;
709 newstream
->zst_cap
= zst
.zst_len
;
710 newstream
->zst_direction
= ZFETCH_FORWARD
;
711 newstream
->zst_last
= ddi_get_lbolt();
713 mutex_init(&newstream
->zst_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
715 rw_enter(&zf
->zf_rwlock
, RW_WRITER
);
716 inserted
= dmu_zfetch_stream_insert(zf
, newstream
);
717 rw_exit(&zf
->zf_rwlock
);
720 mutex_destroy(&newstream
->zst_lock
);
721 kmem_free(newstream
, sizeof (zstream_t
));