4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright (c) 2013 by Delphix. All rights reserved.
30 #include <sys/zfs_context.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dmu_zfetch.h>
36 #include <sys/kstat.h>
/*
 * I'm against tune-ables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

/* non-zero disables prefetching entirely (checked on entry to dmu_zfetch) */
int zfs_prefetch_disable = 0;

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim, in seconds */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1MB) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;
54 /* forward decls for static routines */
55 static boolean_t
dmu_zfetch_colinear(zfetch_t
*, zstream_t
*);
56 static void dmu_zfetch_dofetch(zfetch_t
*, zstream_t
*);
57 static uint64_t dmu_zfetch_fetch(dnode_t
*, uint64_t, uint64_t);
58 static uint64_t dmu_zfetch_fetchsz(dnode_t
*, uint64_t, uint64_t);
59 static boolean_t
dmu_zfetch_find(zfetch_t
*, zstream_t
*, int);
60 static int dmu_zfetch_stream_insert(zfetch_t
*, zstream_t
*);
61 static zstream_t
*dmu_zfetch_stream_reclaim(zfetch_t
*);
62 static void dmu_zfetch_stream_remove(zfetch_t
*, zstream_t
*);
63 static int dmu_zfetch_streams_equal(zstream_t
*, zstream_t
*);
65 typedef struct zfetch_stats
{
66 kstat_named_t zfetchstat_hits
;
67 kstat_named_t zfetchstat_misses
;
68 kstat_named_t zfetchstat_colinear_hits
;
69 kstat_named_t zfetchstat_colinear_misses
;
70 kstat_named_t zfetchstat_stride_hits
;
71 kstat_named_t zfetchstat_stride_misses
;
72 kstat_named_t zfetchstat_reclaim_successes
;
73 kstat_named_t zfetchstat_reclaim_failures
;
74 kstat_named_t zfetchstat_stream_resets
;
75 kstat_named_t zfetchstat_stream_noresets
;
76 kstat_named_t zfetchstat_bogus_streams
;
79 static zfetch_stats_t zfetch_stats
= {
80 { "hits", KSTAT_DATA_UINT64
},
81 { "misses", KSTAT_DATA_UINT64
},
82 { "colinear_hits", KSTAT_DATA_UINT64
},
83 { "colinear_misses", KSTAT_DATA_UINT64
},
84 { "stride_hits", KSTAT_DATA_UINT64
},
85 { "stride_misses", KSTAT_DATA_UINT64
},
86 { "reclaim_successes", KSTAT_DATA_UINT64
},
87 { "reclaim_failures", KSTAT_DATA_UINT64
},
88 { "streams_resets", KSTAT_DATA_UINT64
},
89 { "streams_noresets", KSTAT_DATA_UINT64
},
90 { "bogus_streams", KSTAT_DATA_UINT64
},
/*
 * Atomically add "val" to the named counter in zfetch_stats; "stat" is the
 * member name (e.g. zfetchstat_hits).
 *
 * Neither macro ends in a semicolon: each expands to a single expression and
 * the caller supplies the terminator.  This keeps them usable as the sole
 * statement of an unbraced if/else.
 */
#define	ZFETCHSTAT_INCR(stat, val) \
	atomic_add_64(&zfetch_stats.stat.value.ui64, (val))

/* Increment the named counter by one. */
#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1)
/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N apart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * Returns whether co-linear streams were found.
 */
114 dmu_zfetch_colinear(zfetch_t
*zf
, zstream_t
*zh
)
119 if (! rw_tryenter(&zf
->zf_rwlock
, RW_WRITER
))
123 rw_exit(&zf
->zf_rwlock
);
127 for (z_walk
= list_head(&zf
->zf_stream
); z_walk
;
128 z_walk
= list_next(&zf
->zf_stream
, z_walk
)) {
129 for (z_comp
= list_next(&zf
->zf_stream
, z_walk
); z_comp
;
130 z_comp
= list_next(&zf
->zf_stream
, z_comp
)) {
133 if (z_walk
->zst_len
!= z_walk
->zst_stride
||
134 z_comp
->zst_len
!= z_comp
->zst_stride
) {
138 diff
= z_comp
->zst_offset
- z_walk
->zst_offset
;
139 if (z_comp
->zst_offset
+ diff
== zh
->zst_offset
) {
140 z_walk
->zst_offset
= zh
->zst_offset
;
141 z_walk
->zst_direction
= diff
< 0 ? -1 : 1;
143 diff
* z_walk
->zst_direction
;
144 z_walk
->zst_ph_offset
=
145 zh
->zst_offset
+ z_walk
->zst_stride
;
146 dmu_zfetch_stream_remove(zf
, z_comp
);
147 mutex_destroy(&z_comp
->zst_lock
);
148 kmem_free(z_comp
, sizeof (zstream_t
));
150 dmu_zfetch_dofetch(zf
, z_walk
);
152 rw_exit(&zf
->zf_rwlock
);
156 diff
= z_walk
->zst_offset
- z_comp
->zst_offset
;
157 if (z_walk
->zst_offset
+ diff
== zh
->zst_offset
) {
158 z_walk
->zst_offset
= zh
->zst_offset
;
159 z_walk
->zst_direction
= diff
< 0 ? -1 : 1;
161 diff
* z_walk
->zst_direction
;
162 z_walk
->zst_ph_offset
=
163 zh
->zst_offset
+ z_walk
->zst_stride
;
164 dmu_zfetch_stream_remove(zf
, z_comp
);
165 mutex_destroy(&z_comp
->zst_lock
);
166 kmem_free(z_comp
, sizeof (zstream_t
));
168 dmu_zfetch_dofetch(zf
, z_walk
);
170 rw_exit(&zf
->zf_rwlock
);
176 rw_exit(&zf
->zf_rwlock
);
181 * Given a zstream_t, determine the bounds of the prefetch. Then call the
182 * routine that actually prefetches the individual blocks.
185 dmu_zfetch_dofetch(zfetch_t
*zf
, zstream_t
*zs
)
187 uint64_t prefetch_tail
;
188 uint64_t prefetch_limit
;
189 uint64_t prefetch_ofst
;
190 uint64_t prefetch_len
;
191 uint64_t blocks_fetched
;
193 zs
->zst_stride
= MAX((int64_t)zs
->zst_stride
, zs
->zst_len
);
194 zs
->zst_cap
= MIN(zfetch_block_cap
, 2 * zs
->zst_cap
);
196 prefetch_tail
= MAX((int64_t)zs
->zst_ph_offset
,
197 (int64_t)(zs
->zst_offset
+ zs
->zst_stride
));
199 * XXX: use a faster division method?
201 prefetch_limit
= zs
->zst_offset
+ zs
->zst_len
+
202 (zs
->zst_cap
* zs
->zst_stride
) / zs
->zst_len
;
204 while (prefetch_tail
< prefetch_limit
) {
205 prefetch_ofst
= zs
->zst_offset
+ zs
->zst_direction
*
206 (prefetch_tail
- zs
->zst_offset
);
208 prefetch_len
= zs
->zst_len
;
211 * Don't prefetch beyond the end of the file, if working
214 if ((zs
->zst_direction
== ZFETCH_BACKWARD
) &&
215 (prefetch_ofst
> prefetch_tail
)) {
216 prefetch_len
+= prefetch_ofst
;
220 /* don't prefetch more than we're supposed to */
221 if (prefetch_len
> zs
->zst_len
)
224 blocks_fetched
= dmu_zfetch_fetch(zf
->zf_dnode
,
225 prefetch_ofst
, zs
->zst_len
);
227 prefetch_tail
+= zs
->zst_stride
;
228 /* stop if we've run out of stuff to prefetch */
229 if (blocks_fetched
< zs
->zst_len
)
232 zs
->zst_ph_offset
= prefetch_tail
;
233 zs
->zst_last
= ddi_get_lbolt();
240 zfetch_ksp
= kstat_create("zfs", 0, "zfetchstats", "misc",
241 KSTAT_TYPE_NAMED
, sizeof (zfetch_stats
) / sizeof (kstat_named_t
),
244 if (zfetch_ksp
!= NULL
) {
245 zfetch_ksp
->ks_data
= &zfetch_stats
;
246 kstat_install(zfetch_ksp
);
253 if (zfetch_ksp
!= NULL
) {
254 kstat_delete(zfetch_ksp
);
/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
265 dmu_zfetch_init(zfetch_t
*zf
, dnode_t
*dno
)
272 zf
->zf_stream_cnt
= 0;
273 zf
->zf_alloc_fail
= 0;
275 list_create(&zf
->zf_stream
, sizeof (zstream_t
),
276 offsetof(zstream_t
, zst_node
));
278 rw_init(&zf
->zf_rwlock
, NULL
, RW_DEFAULT
, NULL
);
/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and then issues a prefetch for each of those blocks.
 */
286 dmu_zfetch_fetch(dnode_t
*dn
, uint64_t blkid
, uint64_t nblks
)
291 fetchsz
= dmu_zfetch_fetchsz(dn
, blkid
, nblks
);
293 for (i
= 0; i
< fetchsz
; i
++) {
294 dbuf_prefetch(dn
, 0, blkid
+ i
, ZIO_PRIORITY_ASYNC_READ
,
302 * this function returns the number of blocks that would be prefetched, based
303 * upon the supplied dnode, blockid, and nblks. This is used so that we can
304 * update streams in place, and then prefetch with their old value after the
305 * fact. This way, we can delay the prefetch, but subsequent accesses to the
306 * stream won't result in the same data being prefetched multiple times.
309 dmu_zfetch_fetchsz(dnode_t
*dn
, uint64_t blkid
, uint64_t nblks
)
313 if (blkid
> dn
->dn_maxblkid
) {
317 /* compute fetch size */
318 if (blkid
+ nblks
+ 1 > dn
->dn_maxblkid
) {
319 fetchsz
= (dn
->dn_maxblkid
- blkid
) + 1;
320 ASSERT(blkid
+ fetchsz
- 1 <= dn
->dn_maxblkid
);
330 * given a zfetch and a zstream structure, see if there is an associated zstream
331 * for this block read. If so, it starts a prefetch for the stream it
332 * located and returns true, otherwise it returns false
335 dmu_zfetch_find(zfetch_t
*zf
, zstream_t
*zh
, int prefetched
)
339 int reset
= !prefetched
;
346 * XXX: This locking strategy is a bit coarse; however, it's impact has
347 * yet to be tested. If this turns out to be an issue, it can be
348 * modified in a number of different ways.
351 rw_enter(&zf
->zf_rwlock
, RW_READER
);
354 for (zs
= list_head(&zf
->zf_stream
); zs
;
355 zs
= list_next(&zf
->zf_stream
, zs
)) {
358 * XXX - should this be an assert?
360 if (zs
->zst_len
== 0) {
362 ZFETCHSTAT_BUMP(zfetchstat_bogus_streams
);
367 * We hit this case when we are in a strided prefetch stream:
368 * we will read "len" blocks before "striding".
370 if (zh
->zst_offset
>= zs
->zst_offset
&&
371 zh
->zst_offset
< zs
->zst_offset
+ zs
->zst_len
) {
373 /* already fetched */
374 ZFETCHSTAT_BUMP(zfetchstat_stride_hits
);
378 ZFETCHSTAT_BUMP(zfetchstat_stride_misses
);
383 * This is the forward sequential read case: we increment
384 * len by one each time we hit here, so we will enter this
385 * case on every read.
387 if (zh
->zst_offset
== zs
->zst_offset
+ zs
->zst_len
) {
389 reset
= !prefetched
&& zs
->zst_len
> 1;
391 mutex_enter(&zs
->zst_lock
);
393 if (zh
->zst_offset
!= zs
->zst_offset
+ zs
->zst_len
) {
394 mutex_exit(&zs
->zst_lock
);
397 zs
->zst_len
+= zh
->zst_len
;
398 diff
= zs
->zst_len
- zfetch_block_cap
;
400 zs
->zst_offset
+= diff
;
401 zs
->zst_len
= zs
->zst_len
> diff
?
402 zs
->zst_len
- diff
: 0;
404 zs
->zst_direction
= ZFETCH_FORWARD
;
409 * Same as above, but reading backwards through the file.
411 } else if (zh
->zst_offset
== zs
->zst_offset
- zh
->zst_len
) {
412 /* backwards sequential access */
414 reset
= !prefetched
&& zs
->zst_len
> 1;
416 mutex_enter(&zs
->zst_lock
);
418 if (zh
->zst_offset
!= zs
->zst_offset
- zh
->zst_len
) {
419 mutex_exit(&zs
->zst_lock
);
423 zs
->zst_offset
= zs
->zst_offset
> zh
->zst_len
?
424 zs
->zst_offset
- zh
->zst_len
: 0;
425 zs
->zst_ph_offset
= zs
->zst_ph_offset
> zh
->zst_len
?
426 zs
->zst_ph_offset
- zh
->zst_len
: 0;
427 zs
->zst_len
+= zh
->zst_len
;
429 diff
= zs
->zst_len
- zfetch_block_cap
;
431 zs
->zst_ph_offset
= zs
->zst_ph_offset
> diff
?
432 zs
->zst_ph_offset
- diff
: 0;
433 zs
->zst_len
= zs
->zst_len
> diff
?
434 zs
->zst_len
- diff
: zs
->zst_len
;
436 zs
->zst_direction
= ZFETCH_BACKWARD
;
440 } else if ((zh
->zst_offset
- zs
->zst_offset
- zs
->zst_stride
<
441 zs
->zst_len
) && (zs
->zst_len
!= zs
->zst_stride
)) {
442 /* strided forward access */
444 mutex_enter(&zs
->zst_lock
);
446 if ((zh
->zst_offset
- zs
->zst_offset
- zs
->zst_stride
>=
447 zs
->zst_len
) || (zs
->zst_len
== zs
->zst_stride
)) {
448 mutex_exit(&zs
->zst_lock
);
452 zs
->zst_offset
+= zs
->zst_stride
;
453 zs
->zst_direction
= ZFETCH_FORWARD
;
457 } else if ((zh
->zst_offset
- zs
->zst_offset
+ zs
->zst_stride
<
458 zs
->zst_len
) && (zs
->zst_len
!= zs
->zst_stride
)) {
459 /* strided reverse access */
461 mutex_enter(&zs
->zst_lock
);
463 if ((zh
->zst_offset
- zs
->zst_offset
+ zs
->zst_stride
>=
464 zs
->zst_len
) || (zs
->zst_len
== zs
->zst_stride
)) {
465 mutex_exit(&zs
->zst_lock
);
469 zs
->zst_offset
= zs
->zst_offset
> zs
->zst_stride
?
470 zs
->zst_offset
- zs
->zst_stride
: 0;
471 zs
->zst_ph_offset
= (zs
->zst_ph_offset
>
472 (2 * zs
->zst_stride
)) ?
473 (zs
->zst_ph_offset
- (2 * zs
->zst_stride
)) : 0;
474 zs
->zst_direction
= ZFETCH_BACKWARD
;
482 zstream_t
*remove
= zs
;
484 ZFETCHSTAT_BUMP(zfetchstat_stream_resets
);
486 mutex_exit(&zs
->zst_lock
);
487 rw_exit(&zf
->zf_rwlock
);
488 rw_enter(&zf
->zf_rwlock
, RW_WRITER
);
490 * Relocate the stream, in case someone removes
491 * it while we were acquiring the WRITER lock.
493 for (zs
= list_head(&zf
->zf_stream
); zs
;
494 zs
= list_next(&zf
->zf_stream
, zs
)) {
496 dmu_zfetch_stream_remove(zf
, zs
);
497 mutex_destroy(&zs
->zst_lock
);
498 kmem_free(zs
, sizeof (zstream_t
));
503 ZFETCHSTAT_BUMP(zfetchstat_stream_noresets
);
505 dmu_zfetch_dofetch(zf
, zs
);
506 mutex_exit(&zs
->zst_lock
);
510 rw_exit(&zf
->zf_rwlock
);
/*
 * Clean up the state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream list, and generally makes things
 * nice.  This doesn't free the zfetch_t itself; that's left to the caller.
 */
520 dmu_zfetch_rele(zfetch_t
*zf
)
525 ASSERT(!RW_LOCK_HELD(&zf
->zf_rwlock
));
527 for (zs
= list_head(&zf
->zf_stream
); zs
; zs
= zs_next
) {
528 zs_next
= list_next(&zf
->zf_stream
, zs
);
530 list_remove(&zf
->zf_stream
, zs
);
531 mutex_destroy(&zs
->zst_lock
);
532 kmem_free(zs
, sizeof (zstream_t
));
534 list_destroy(&zf
->zf_stream
);
535 rw_destroy(&zf
->zf_rwlock
);
/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * stream list contained within the zfetch structure.  Perform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream which
 * matches one that we are about to insert, so we must be sure to check for this
 * case.  If one is found, return failure, and let the caller clean up the
 * duplicate stream.
 */
549 dmu_zfetch_stream_insert(zfetch_t
*zf
, zstream_t
*zs
)
554 ASSERT(RW_WRITE_HELD(&zf
->zf_rwlock
));
556 for (zs_walk
= list_head(&zf
->zf_stream
); zs_walk
; zs_walk
= zs_next
) {
557 zs_next
= list_next(&zf
->zf_stream
, zs_walk
);
559 if (dmu_zfetch_streams_equal(zs_walk
, zs
)) {
564 list_insert_head(&zf
->zf_stream
, zs
);
571 * Walk the list of zstreams in the given zfetch, find an old one (by time), and
572 * reclaim it for use by the caller.
575 dmu_zfetch_stream_reclaim(zfetch_t
*zf
)
579 if (! rw_tryenter(&zf
->zf_rwlock
, RW_WRITER
))
582 for (zs
= list_head(&zf
->zf_stream
); zs
;
583 zs
= list_next(&zf
->zf_stream
, zs
)) {
585 if (((ddi_get_lbolt() - zs
->zst_last
)/hz
) > zfetch_min_sec_reap
)
590 dmu_zfetch_stream_remove(zf
, zs
);
591 mutex_destroy(&zs
->zst_lock
);
592 bzero(zs
, sizeof (zstream_t
));
596 rw_exit(&zf
->zf_rwlock
);
602 * Given a zfetch and zstream structure, remove the zstream structure from its
603 * container in the zfetch structure. Perform the appropriate book-keeping.
606 dmu_zfetch_stream_remove(zfetch_t
*zf
, zstream_t
*zs
)
608 ASSERT(RW_WRITE_HELD(&zf
->zf_rwlock
));
610 list_remove(&zf
->zf_stream
, zs
);
615 dmu_zfetch_streams_equal(zstream_t
*zs1
, zstream_t
*zs2
)
617 if (zs1
->zst_offset
!= zs2
->zst_offset
)
620 if (zs1
->zst_len
!= zs2
->zst_len
)
623 if (zs1
->zst_stride
!= zs2
->zst_stride
)
626 if (zs1
->zst_ph_offset
!= zs2
->zst_ph_offset
)
629 if (zs1
->zst_cap
!= zs2
->zst_cap
)
632 if (zs1
->zst_direction
!= zs2
->zst_direction
)
639 * This is the prefetch entry point. It calls all of the other dmu_zfetch
640 * routines to create, delete, find, or operate upon prefetch streams.
643 dmu_zfetch(zfetch_t
*zf
, uint64_t offset
, uint64_t size
, int prefetched
)
646 zstream_t
*newstream
;
649 unsigned int blkshft
;
652 if (zfs_prefetch_disable
)
655 /* files that aren't ln2 blocksz are only one block -- nothing to do */
656 if (!zf
->zf_dnode
->dn_datablkshift
)
659 /* convert offset and size, into blockid and nblocks */
660 blkshft
= zf
->zf_dnode
->dn_datablkshift
;
661 blksz
= (1 << blkshft
);
663 bzero(&zst
, sizeof (zstream_t
));
664 zst
.zst_offset
= offset
>> blkshft
;
665 zst
.zst_len
= (P2ROUNDUP(offset
+ size
, blksz
) -
666 P2ALIGN(offset
, blksz
)) >> blkshft
;
668 fetched
= dmu_zfetch_find(zf
, &zst
, prefetched
);
670 ZFETCHSTAT_BUMP(zfetchstat_hits
);
672 ZFETCHSTAT_BUMP(zfetchstat_misses
);
673 fetched
= dmu_zfetch_colinear(zf
, &zst
);
675 ZFETCHSTAT_BUMP(zfetchstat_colinear_hits
);
677 ZFETCHSTAT_BUMP(zfetchstat_colinear_misses
);
682 newstream
= dmu_zfetch_stream_reclaim(zf
);
685 * we still couldn't find a stream, drop the lock, and allocate
686 * one if possible. Otherwise, give up and go home.
689 ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes
);
692 uint32_t max_streams
;
693 uint32_t cur_streams
;
695 ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures
);
696 cur_streams
= zf
->zf_stream_cnt
;
697 maxblocks
= zf
->zf_dnode
->dn_maxblkid
;
699 max_streams
= MIN(zfetch_max_streams
,
700 (maxblocks
/ zfetch_block_cap
));
701 if (max_streams
== 0) {
705 if (cur_streams
>= max_streams
) {
708 newstream
= kmem_zalloc(sizeof (zstream_t
), KM_SLEEP
);
711 newstream
->zst_offset
= zst
.zst_offset
;
712 newstream
->zst_len
= zst
.zst_len
;
713 newstream
->zst_stride
= zst
.zst_len
;
714 newstream
->zst_ph_offset
= zst
.zst_len
+ zst
.zst_offset
;
715 newstream
->zst_cap
= zst
.zst_len
;
716 newstream
->zst_direction
= ZFETCH_FORWARD
;
717 newstream
->zst_last
= ddi_get_lbolt();
719 mutex_init(&newstream
->zst_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
721 rw_enter(&zf
->zf_rwlock
, RW_WRITER
);
722 inserted
= dmu_zfetch_stream_insert(zf
, newstream
);
723 rw_exit(&zf
->zf_rwlock
);
726 mutex_destroy(&newstream
->zst_lock
);
727 kmem_free(newstream
, sizeof (zstream_t
));