4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
25 #include <sys/zfs_context.h>
26 #include <sys/dmu_objset.h>
27 #include <sys/dmu_traverse.h>
28 #include <sys/dsl_dataset.h>
29 #include <sys/dsl_dir.h>
30 #include <sys/dsl_pool.h>
31 #include <sys/dnode.h>
34 #include <sys/dmu_impl.h>
36 #include <sys/sa_impl.h>
37 #include <sys/callb.h>
/* Maximum number of blocks the prefetch thread may hold un-consumed (tunable). */
int zfs_pd_blks_max = 100;
41 typedef struct prefetch_data
{
51 typedef struct traverse_data
{
57 prefetch_data_t
*td_pfd
;
/* Forward declaration: visit every block pointer (and spill) of one dnode. */
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object);
66 traverse_zil_block(zilog_t
*zilog
, blkptr_t
*bp
, void *arg
, uint64_t claim_txg
)
68 traverse_data_t
*td
= arg
;
71 if (bp
->blk_birth
== 0)
74 if (claim_txg
== 0 && bp
->blk_birth
>= spa_first_txg(td
->td_spa
))
77 SET_BOOKMARK(&zb
, td
->td_objset
, ZB_ZIL_OBJECT
, ZB_ZIL_LEVEL
,
78 bp
->blk_cksum
.zc_word
[ZIL_ZC_SEQ
]);
80 (void) td
->td_func(td
->td_spa
, zilog
, bp
, NULL
, &zb
, NULL
, td
->td_arg
);
86 traverse_zil_record(zilog_t
*zilog
, lr_t
*lrc
, void *arg
, uint64_t claim_txg
)
88 traverse_data_t
*td
= arg
;
90 if (lrc
->lrc_txtype
== TX_WRITE
) {
91 lr_write_t
*lr
= (lr_write_t
*)lrc
;
92 blkptr_t
*bp
= &lr
->lr_blkptr
;
95 if (bp
->blk_birth
== 0)
98 if (claim_txg
== 0 || bp
->blk_birth
< claim_txg
)
101 SET_BOOKMARK(&zb
, td
->td_objset
, lr
->lr_foid
,
102 ZB_ZIL_LEVEL
, lr
->lr_offset
/ BP_GET_LSIZE(bp
));
104 (void) td
->td_func(td
->td_spa
, zilog
, bp
, NULL
, &zb
, NULL
,
111 traverse_zil(traverse_data_t
*td
, zil_header_t
*zh
)
113 uint64_t claim_txg
= zh
->zh_claim_txg
;
117 * We only want to visit blocks that have been claimed but not yet
118 * replayed; plus, in read-only mode, blocks that are already stable.
120 if (claim_txg
== 0 && spa_writeable(td
->td_spa
))
123 zilog
= zil_alloc(spa_get_dsl(td
->td_spa
)->dp_meta_objset
, zh
);
125 (void) zil_parse(zilog
, traverse_zil_block
, traverse_zil_record
, td
,
132 traverse_visitbp(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
133 arc_buf_t
*pbuf
, blkptr_t
*bp
, const zbookmark_t
*zb
)
136 int err
= 0, lasterr
= 0;
137 arc_buf_t
*buf
= NULL
;
138 prefetch_data_t
*pd
= td
->td_pfd
;
139 boolean_t hard
= td
->td_flags
& TRAVERSE_HARD
;
141 if (bp
->blk_birth
== 0) {
142 err
= td
->td_func(td
->td_spa
, NULL
, NULL
, pbuf
, zb
, dnp
,
147 if (bp
->blk_birth
<= td
->td_min_txg
)
150 if (pd
&& !pd
->pd_exited
&&
151 ((pd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
152 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0)) {
153 mutex_enter(&pd
->pd_mtx
);
154 ASSERT(pd
->pd_blks_fetched
>= 0);
155 while (pd
->pd_blks_fetched
== 0 && !pd
->pd_exited
)
156 cv_wait(&pd
->pd_cv
, &pd
->pd_mtx
);
157 pd
->pd_blks_fetched
--;
158 cv_broadcast(&pd
->pd_cv
);
159 mutex_exit(&pd
->pd_mtx
);
162 if (td
->td_flags
& TRAVERSE_PRE
) {
163 err
= td
->td_func(td
->td_spa
, NULL
, bp
, pbuf
, zb
, dnp
,
165 if (err
== TRAVERSE_VISIT_NO_CHILDREN
)
171 if (BP_GET_LEVEL(bp
) > 0) {
172 uint32_t flags
= ARC_WAIT
;
175 int epb
= BP_GET_LSIZE(bp
) >> SPA_BLKPTRSHIFT
;
177 err
= dsl_read(NULL
, td
->td_spa
, bp
, pbuf
,
178 arc_getbuf_func
, &buf
,
179 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
183 /* recursively visitbp() blocks below this */
185 for (i
= 0; i
< epb
; i
++, cbp
++) {
186 SET_BOOKMARK(&czb
, zb
->zb_objset
, zb
->zb_object
,
188 zb
->zb_blkid
* epb
+ i
);
189 err
= traverse_visitbp(td
, dnp
, buf
, cbp
, &czb
);
196 } else if (BP_GET_TYPE(bp
) == DMU_OT_DNODE
) {
197 uint32_t flags
= ARC_WAIT
;
199 int epb
= BP_GET_LSIZE(bp
) >> DNODE_SHIFT
;
201 err
= dsl_read(NULL
, td
->td_spa
, bp
, pbuf
,
202 arc_getbuf_func
, &buf
,
203 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
207 /* recursively visitbp() blocks below this */
209 for (i
= 0; i
< epb
; i
++, dnp
++) {
210 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
211 zb
->zb_blkid
* epb
+ i
);
218 } else if (BP_GET_TYPE(bp
) == DMU_OT_OBJSET
) {
219 uint32_t flags
= ARC_WAIT
;
223 err
= dsl_read_nolock(NULL
, td
->td_spa
, bp
,
224 arc_getbuf_func
, &buf
,
225 ZIO_PRIORITY_ASYNC_READ
, ZIO_FLAG_CANFAIL
, &flags
, zb
);
230 dnp
= &osp
->os_meta_dnode
;
231 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
232 DMU_META_DNODE_OBJECT
);
237 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
238 dnp
= &osp
->os_userused_dnode
;
239 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
240 DMU_USERUSED_OBJECT
);
246 if (err
== 0 && arc_buf_size(buf
) >= sizeof (objset_phys_t
)) {
247 dnp
= &osp
->os_groupused_dnode
;
248 err
= traverse_dnode(td
, dnp
, buf
, zb
->zb_objset
,
249 DMU_GROUPUSED_OBJECT
);
254 (void) arc_buf_remove_ref(buf
, &buf
);
256 if (err
== 0 && lasterr
== 0 && (td
->td_flags
& TRAVERSE_POST
)) {
257 err
= td
->td_func(td
->td_spa
, NULL
, bp
, pbuf
, zb
, dnp
,
261 return (err
!= 0 ? err
: lasterr
);
265 traverse_dnode(traverse_data_t
*td
, const dnode_phys_t
*dnp
,
266 arc_buf_t
*buf
, uint64_t objset
, uint64_t object
)
268 int j
, err
= 0, lasterr
= 0;
270 boolean_t hard
= (td
->td_flags
& TRAVERSE_HARD
);
272 for (j
= 0; j
< dnp
->dn_nblkptr
; j
++) {
273 SET_BOOKMARK(&czb
, objset
, object
, dnp
->dn_nlevels
- 1, j
);
274 err
= traverse_visitbp(td
, dnp
, buf
,
275 (blkptr_t
*)&dnp
->dn_blkptr
[j
], &czb
);
283 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) {
284 SET_BOOKMARK(&czb
, objset
,
285 object
, 0, DMU_SPILL_BLKID
);
286 err
= traverse_visitbp(td
, dnp
, buf
,
287 (blkptr_t
*)&dnp
->dn_spill
, &czb
);
294 return (err
!= 0 ? err
: lasterr
);
299 traverse_prefetcher(spa_t
*spa
, zilog_t
*zilog
, const blkptr_t
*bp
,
300 arc_buf_t
*pbuf
, const zbookmark_t
*zb
, const dnode_phys_t
*dnp
,
303 prefetch_data_t
*pfd
= arg
;
304 uint32_t aflags
= ARC_NOWAIT
| ARC_PREFETCH
;
306 ASSERT(pfd
->pd_blks_fetched
>= 0);
310 if (bp
== NULL
|| !((pfd
->pd_flags
& TRAVERSE_PREFETCH_DATA
) ||
311 BP_GET_TYPE(bp
) == DMU_OT_DNODE
|| BP_GET_LEVEL(bp
) > 0) ||
312 BP_GET_TYPE(bp
) == DMU_OT_INTENT_LOG
)
315 mutex_enter(&pfd
->pd_mtx
);
316 while (!pfd
->pd_cancel
&& pfd
->pd_blks_fetched
>= pfd
->pd_blks_max
)
317 cv_wait(&pfd
->pd_cv
, &pfd
->pd_mtx
);
318 pfd
->pd_blks_fetched
++;
319 cv_broadcast(&pfd
->pd_cv
);
320 mutex_exit(&pfd
->pd_mtx
);
322 (void) dsl_read(NULL
, spa
, bp
, pbuf
, NULL
, NULL
,
323 ZIO_PRIORITY_ASYNC_READ
,
324 ZIO_FLAG_CANFAIL
| ZIO_FLAG_SPECULATIVE
,
331 traverse_prefetch_thread(void *arg
)
333 traverse_data_t
*td_main
= arg
;
334 traverse_data_t td
= *td_main
;
337 td
.td_func
= traverse_prefetcher
;
338 td
.td_arg
= td_main
->td_pfd
;
341 SET_BOOKMARK(&czb
, td
.td_objset
,
342 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
343 (void) traverse_visitbp(&td
, NULL
, NULL
, td
.td_rootbp
, &czb
);
345 mutex_enter(&td_main
->td_pfd
->pd_mtx
);
346 td_main
->td_pfd
->pd_exited
= B_TRUE
;
347 cv_broadcast(&td_main
->td_pfd
->pd_cv
);
348 mutex_exit(&td_main
->td_pfd
->pd_mtx
);
352 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
353 * in syncing context).
356 traverse_impl(spa_t
*spa
, dsl_dataset_t
*ds
, blkptr_t
*rootbp
,
357 uint64_t txg_start
, int flags
, blkptr_cb_t func
, void *arg
)
360 prefetch_data_t pd
= { 0 };
365 td
.td_objset
= ds
? ds
->ds_object
: 0;
366 td
.td_rootbp
= rootbp
;
367 td
.td_min_txg
= txg_start
;
373 pd
.pd_blks_max
= zfs_pd_blks_max
;
375 mutex_init(&pd
.pd_mtx
, NULL
, MUTEX_DEFAULT
, NULL
);
376 cv_init(&pd
.pd_cv
, NULL
, CV_DEFAULT
, NULL
);
378 /* See comment on ZIL traversal in dsl_scan_visitds. */
379 if (ds
!= NULL
&& !dsl_dataset_is_snapshot(ds
)) {
382 err
= dmu_objset_from_ds(ds
, &os
);
386 traverse_zil(&td
, &os
->os_zil_header
);
389 if (!(flags
& TRAVERSE_PREFETCH
) ||
390 0 == taskq_dispatch(system_taskq
, traverse_prefetch_thread
,
392 pd
.pd_exited
= B_TRUE
;
394 SET_BOOKMARK(&czb
, td
.td_objset
,
395 ZB_ROOT_OBJECT
, ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
396 err
= traverse_visitbp(&td
, NULL
, NULL
, rootbp
, &czb
);
398 mutex_enter(&pd
.pd_mtx
);
399 pd
.pd_cancel
= B_TRUE
;
400 cv_broadcast(&pd
.pd_cv
);
401 while (!pd
.pd_exited
)
402 cv_wait(&pd
.pd_cv
, &pd
.pd_mtx
);
403 mutex_exit(&pd
.pd_mtx
);
405 mutex_destroy(&pd
.pd_mtx
);
406 cv_destroy(&pd
.pd_cv
);
412 * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
413 * in syncing context).
416 traverse_dataset(dsl_dataset_t
*ds
, uint64_t txg_start
, int flags
,
417 blkptr_cb_t func
, void *arg
)
419 return (traverse_impl(ds
->ds_dir
->dd_pool
->dp_spa
, ds
,
420 &ds
->ds_phys
->ds_bp
, txg_start
, flags
, func
, arg
));
424 * NB: pool must not be changing on-disk (eg, from zdb or sync context).
427 traverse_pool(spa_t
*spa
, uint64_t txg_start
, int flags
,
428 blkptr_cb_t func
, void *arg
)
430 int err
, lasterr
= 0;
432 dsl_pool_t
*dp
= spa_get_dsl(spa
);
433 objset_t
*mos
= dp
->dp_meta_objset
;
434 boolean_t hard
= (flags
& TRAVERSE_HARD
);
437 err
= traverse_impl(spa
, NULL
, spa_get_rootblkptr(spa
),
438 txg_start
, flags
, func
, arg
);
442 /* visit each dataset */
443 for (obj
= 1; err
== 0 || (err
!= ESRCH
&& hard
);
444 err
= dmu_object_next(mos
, &obj
, FALSE
, txg_start
)) {
445 dmu_object_info_t doi
;
447 err
= dmu_object_info(mos
, obj
, &doi
);
455 if (doi
.doi_type
== DMU_OT_DSL_DATASET
) {
457 uint64_t txg
= txg_start
;
459 rw_enter(&dp
->dp_config_rwlock
, RW_READER
);
460 err
= dsl_dataset_hold_obj(dp
, obj
, FTAG
, &ds
);
461 rw_exit(&dp
->dp_config_rwlock
);
468 if (ds
->ds_phys
->ds_prev_snap_txg
> txg
)
469 txg
= ds
->ds_phys
->ds_prev_snap_txg
;
470 err
= traverse_dataset(ds
, txg
, flags
, func
, arg
);
471 dsl_dataset_rele(ds
, FTAG
);
481 return (err
!= 0 ? err
: lasterr
);