From 88b7b0f29b20b808b9e06071885b1d6a3ddb6328 Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Tue, 14 Oct 2008 15:57:18 -0700 Subject: [PATCH] 6333409 traversal code should be able to issue multiple reads in parallel 6418042 want traversal in depth-first pre-order for quicker 'zfs send' 6757112 zvol dump code is extra complicated 6725668 want ::zfs_blkstats to show block type stats after scrub 6725675 dmu traverse code has extraneous features 6725680 P2CROSS is confusing to use 6725698 zvol dump device should always be 128k 6729696 sync causes scrub or resilver to pause for up to 30s 6730101 online recv can cause scrub to miss some blocks 6752226 assertion failed in dbuf_verify: db->db.db_size >= dn->dn_datablksz 6577985 panic when zfs send a snapshot with i/o errors 6755042 zdb -Lbc counts block several times in case of checksum errors --- usr/src/cmd/mdb/common/modules/zfs/zfs.c | 171 +++ usr/src/cmd/zdb/zdb.c | 358 ++---- usr/src/cmd/ztest/ztest.c | 155 +-- usr/src/lib/libumem/common/vmem.c | 6 +- usr/src/lib/libzpool/common/kernel.c | 4 +- usr/src/lib/libzpool/common/llib-lzpool | 1 + usr/src/lib/libzpool/common/sys/zfs_context.h | 3 + usr/src/lib/libzpool/common/taskq.c | 12 +- usr/src/uts/common/fs/zfs/dbuf.c | 26 +- usr/src/uts/common/fs/zfs/dmu_send.c | 80 +- usr/src/uts/common/fs/zfs/dmu_traverse.c | 1331 ++++++-------------- usr/src/uts/common/fs/zfs/dsl_dataset.c | 19 +- usr/src/uts/common/fs/zfs/dsl_pool.c | 2 + usr/src/uts/common/fs/zfs/dsl_scrub.c | 85 ++ usr/src/uts/common/fs/zfs/spa.c | 4 - usr/src/uts/common/fs/zfs/spa_misc.c | 16 +- .../common/fs/zfs/sys/dmu_traverse.h} | 46 +- usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 24 + usr/src/uts/common/fs/zfs/sys/spa.h | 6 +- usr/src/uts/common/fs/zfs/sys/spa_impl.h | 4 +- usr/src/uts/common/fs/zfs/sys/txg_impl.h | 3 - usr/src/uts/common/fs/zfs/txg.c | 10 +- usr/src/uts/common/fs/zfs/vdev_cache.c | 2 +- usr/src/uts/common/fs/zfs/zvol.c | 421 +++---- usr/src/uts/common/os/vmem.c | 6 +- usr/src/uts/common/sys/sysmacros.h | 63 +- usr/src/uts/i86pc/os/ddi_impl.c | 4 +- 27 files changed, 1147 insertions(+), 1715 deletions(-) rewrite usr/src/uts/common/fs/zfs/dmu_traverse.c (89%) copy usr/src/{lib/libzpool/common/llib-lzpool => uts/common/fs/zfs/sys/dmu_traverse.h} (58%) diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index e415cac35a..c8d32b6455 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -1934,6 +1934,174 @@ zio_walk_root_step(mdb_walk_state_t *wsp) return (wsp->walk_callback(wsp->walk_addr, &zio, wsp->walk_cbdata)); } +#define NICENUM_BUFLEN 6 + +static int +snprintfloat(char *buf, int len, float f, int frac_digits) +{ + float mul = 1; + int whole, frac, i; + + for (i = frac_digits; i; i--) + mul *= 10; + whole = (int)f; + frac = (int)((f - whole) * mul); + return (mdb_snprintf(buf, len, "%u.%0*u", whole, frac_digits, frac)); +} + +static void +mdb_nicenum(uint64_t num, char *buf) +{ + uint64_t n = num; + int index = 0; + char *u; + + while (n >= 1024) { + n = (n + (1024 / 2)) / 1024; /* Round up or down */ + index++; + } + + u = &" \0K\0M\0G\0T\0P\0E\0"[index*2]; + + if (index == 0) { + (void) mdb_snprintf(buf, NICENUM_BUFLEN, "%llu", + (u_longlong_t)n); + } else if (n < 10 && (num & (num - 1)) != 0) { + (void) snprintfloat(buf, NICENUM_BUFLEN, + (float)num / (1ULL << 10 * index), 2); + strcat(buf, u); + } else if (n < 100 && (num & (num - 1)) != 0) { + (void) snprintfloat(buf, NICENUM_BUFLEN, + (float)num / (1ULL << 10 * 
index), 1); + strcat(buf, u); + } else { + (void) mdb_snprintf(buf, NICENUM_BUFLEN, "%llu%s", + (u_longlong_t)n, u); + } +} + +/* + * ::zfs_blkstats + * + * -v print verbose per-level information + * + */ +static int +zfs_blkstats(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + boolean_t verbose = B_FALSE; + zfs_all_blkstats_t stats; + dmu_object_type_t t; + zfs_blkstat_t *tzb; + uint64_t ditto; + dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES + 10]; + /* +10 in case it grew */ + + if (mdb_readvar(&dmu_ot, "dmu_ot") == -1) { + mdb_warn("failed to read 'dmu_ot'"); + return (DCMD_ERR); + } + + if (mdb_getopts(argc, argv, + 'v', MDB_OPT_SETBITS, TRUE, &verbose, + NULL) != argc) + return (DCMD_USAGE); + + if (!(flags & DCMD_ADDRSPEC)) + return (DCMD_USAGE); + + if (GETMEMB(addr, struct spa, spa_dsl_pool, addr) || + GETMEMB(addr, struct dsl_pool, dp_blkstats, addr) || + mdb_vread(&stats, sizeof (zfs_all_blkstats_t), addr) == -1) { + mdb_warn("failed to read data at %p;", addr); + mdb_printf("maybe no stats? run \"zpool scrub\" first."); + return (DCMD_ERR); + } + + tzb = &stats.zab_type[DN_MAX_LEVELS][DMU_OT_NUMTYPES]; + if (tzb->zb_gangs != 0) { + mdb_printf("Ganged blocks: %llu\n", + (longlong_t)tzb->zb_gangs); + } + + ditto = tzb->zb_ditto_2_of_2_samevdev + tzb->zb_ditto_2_of_3_samevdev + + tzb->zb_ditto_3_of_3_samevdev; + if (ditto != 0) { + mdb_printf("Dittoed blocks on same vdev: %llu\n", + (longlong_t)ditto); + } + + mdb_printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" + "\t avg\t comp\t%%Total\tType\n"); + + for (t = 0; t <= DMU_OT_NUMTYPES; t++) { + char csize[NICENUM_BUFLEN], lsize[NICENUM_BUFLEN]; + char psize[NICENUM_BUFLEN], asize[NICENUM_BUFLEN]; + char avg[NICENUM_BUFLEN]; + char comp[NICENUM_BUFLEN], pct[NICENUM_BUFLEN]; + char typename[64]; + int l; + + + if (t == DMU_OT_DEFERRED) + strcpy(typename, "deferred free"); + else if (t == DMU_OT_TOTAL) + strcpy(typename, "Total"); + else if (mdb_readstr(typename, sizeof (typename), + (uintptr_t)dmu_ot[t].ot_name) == -1) { + mdb_warn("failed to read type name"); + return (DCMD_ERR); + } + + if (stats.zab_type[DN_MAX_LEVELS][t].zb_asize == 0) + continue; + + for (l = -1; l < DN_MAX_LEVELS; l++) { + int level = (l == -1 ? DN_MAX_LEVELS : l); + zfs_blkstat_t *zb = &stats.zab_type[level][t]; + + if (zb->zb_asize == 0) + continue; + + /* + * Don't print each level unless requested. + */ + if (!verbose && level != DN_MAX_LEVELS) + continue; + + /* + * If all the space is level 0, don't print the + * level 0 separately. 
+ */ + if (level == 0 && zb->zb_asize == + stats.zab_type[DN_MAX_LEVELS][t].zb_asize) + continue; + + mdb_nicenum(zb->zb_count, csize); + mdb_nicenum(zb->zb_lsize, lsize); + mdb_nicenum(zb->zb_psize, psize); + mdb_nicenum(zb->zb_asize, asize); + mdb_nicenum(zb->zb_asize / zb->zb_count, avg); + (void) snprintfloat(comp, NICENUM_BUFLEN, + (float)zb->zb_lsize / zb->zb_psize, 2); + (void) snprintfloat(pct, NICENUM_BUFLEN, + 100.0 * zb->zb_asize / tzb->zb_asize, 2); + + mdb_printf("%6s\t%5s\t%5s\t%5s\t%5s" + "\t%5s\t%6s\t", + csize, lsize, psize, asize, avg, comp, pct); + + if (level == DN_MAX_LEVELS) + mdb_printf("%s\n", typename); + else + mdb_printf(" L%d %s\n", + level, typename); + } + } + + return (DCMD_OK); +} + /* * MDB module linkage information: * @@ -1967,6 +2135,9 @@ static const mdb_dcmd_t dcmds[] = { { "zio_state", "?", "print out all zio_t structures on system or " "for a particular pool", zio_state }, { "zio_pipeline", ":", "decode a zio pipeline", zio_pipeline }, + { "zfs_blkstats", ":[-v]", + "given a spa_t, print block type stats from last scrub", + zfs_blkstats }, { "zfs_params", "", "print zfs tunable parameters", zfs_params }, { NULL } }; diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 16b2787704..253a1346a4 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -50,6 +50,7 @@ #include #include #include +#include #undef ZFS_MAXNAMELEN #undef verify #include @@ -62,8 +63,6 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); extern void dump_intent_log(zilog_t *); uint64_t *zopt_object = NULL; int zopt_objects = 0; -int zdb_advance = ADVANCE_PRE; -zbookmark_t zdb_noread = { 0, 0, ZB_NO_LEVEL, 0 }; libzfs_handle_t *g_zfs; boolean_t zdb_sig_user_data = B_TRUE; int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256; @@ -88,8 +87,8 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-udibcsvL] [-U cachefile_path] [-O order] " - "[-B os:obj:level:blkid] [-S user:cksumalg] " + "Usage: %s [-udibcsv] [-U cachefile_path] " + "[-S user:cksumalg] " "dataset [object...]\n" " %s -C [pool]\n" " %s -l dev\n" @@ -109,13 +108,8 @@ usage(void) "dump blkptr signatures\n"); (void) fprintf(stderr, " -v verbose (applies to all others)\n"); (void) fprintf(stderr, " -l dump label contents\n"); - (void) fprintf(stderr, " -L live pool (allows some errors)\n"); - (void) fprintf(stderr, " -O [!] " - "visitation order\n"); (void) fprintf(stderr, " -U cachefile_path -- use alternate " "cachefile\n"); - (void) fprintf(stderr, " -B objset:object:level:blkid -- " - "simulate bad block\n"); (void) fprintf(stderr, " -R read and display block from a " "device\n"); (void) fprintf(stderr, " -e Pool is exported/destroyed/" @@ -138,7 +132,7 @@ fatal(const char *fmt, ...) 
va_end(ap); (void) fprintf(stderr, "\n"); - exit(1); + abort(); } static void @@ -571,7 +565,7 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) } static uint64_t -blkid2offset(dnode_phys_t *dnp, int level, uint64_t blkid) +blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid) { if (level < 0) return (blkid); @@ -602,115 +596,104 @@ sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas) (u_longlong_t)bp->blk_birth); } -/* ARGSUSED */ -static int -zdb_indirect_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) +static void +print_indirect(blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp) { - zbookmark_t *zb = &bc->bc_bookmark; - blkptr_t *bp = &bc->bc_blkptr; - void *data = bc->bc_data; - dnode_phys_t *dnp = bc->bc_dnode; - char blkbuf[BP_SPRINTF_LEN + 80]; + char blkbuf[BP_SPRINTF_LEN]; int l; - if (bc->bc_errno) { - (void) sprintf(blkbuf, - "Error %d reading <%llu, %llu, %lld, %llu>: ", - bc->bc_errno, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid); - goto out; - } - - if (zb->zb_level == -1) { - ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT3U(BP_GET_LEVEL(bp), ==, 0); - } else { - ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); - ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - } - - if (zb->zb_level > 0) { - uint64_t fill = 0; - blkptr_t *bpx, *bpend; - - for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx); - bpx < bpend; bpx++) { - if (bpx->blk_birth != 0) { - fill += bpx->blk_fill; - } else { - ASSERT(bpx->blk_fill == 0); - } - } - ASSERT3U(fill, ==, bp->blk_fill); - } - - if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) { - uint64_t fill = 0; - dnode_phys_t *dnx, *dnend; + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT); - dnx < dnend; dnx++) { - if (dnx->dn_type != DMU_OT_NONE) - fill++; - } - ASSERT3U(fill, ==, bp->blk_fill); - } - - (void) sprintf(blkbuf, "%16llx ", + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid)); ASSERT(zb->zb_level >= 0); for (l = dnp->dn_nlevels - 1; l >= -1; l--) { if (l == zb->zb_level) { - (void) sprintf(blkbuf + strlen(blkbuf), "L%llx", - (u_longlong_t)zb->zb_level); + (void) printf("L%llx", (u_longlong_t)zb->zb_level); } else { - (void) sprintf(blkbuf + strlen(blkbuf), " "); + (void) printf(" "); } } -out: - if (bp->blk_birth == 0) { - (void) sprintf(blkbuf + strlen(blkbuf), ""); - (void) printf("%s\n", blkbuf); - } else { - sprintf_blkptr_compact(blkbuf + strlen(blkbuf), bp, - dump_opt['d'] > 5 ? 1 : 0); - (void) printf("%s\n", blkbuf); + sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 
1 : 0); + (void) printf("%s\n", blkbuf); +} + +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +static int +visit_indirect(spa_t *spa, const dnode_phys_t *dnp, + blkptr_t *bp, const zbookmark_t *zb) +{ + int err; + + if (bp->blk_birth == 0) + return (0); + + print_indirect(bp, zb, dnp); + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + uint64_t fill = 0; + + err = arc_read_nolock(NULL, spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + + /* recursively visit blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = visit_indirect(spa, dnp, cbp, &czb); + if (err) + break; + fill += cbp->blk_fill; + } + ASSERT3U(fill, ==, bp->blk_fill); + (void) arc_buf_remove_ref(buf, &buf); } - return (bc->bc_errno ? ERESTART : 0); + return (err); } /*ARGSUSED*/ static void -dump_indirect(objset_t *os, uint64_t object, void *data, size_t size) +dump_indirect(dnode_t *dn) { - traverse_handle_t *th; - uint64_t objset = dmu_objset_id(os); - int advance = zdb_advance; + dnode_phys_t *dnp = dn->dn_phys; + int j; + zbookmark_t czb; (void) printf("Indirect blocks:\n"); - if (object == 0) - advance |= ADVANCE_DATA; - - th = traverse_init(dmu_objset_spa(os), zdb_indirect_cb, NULL, advance, - ZIO_FLAG_CANFAIL); - th->th_noread = zdb_noread; - - traverse_add_dnode(th, 0, -1ULL, objset, object); - - while (traverse_more(th) == EAGAIN) - continue; + SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os), + dn->dn_object, dnp->dn_nlevels - 1, 0); + for (j = 0; j < dnp->dn_nblkptr; j++) { + czb.zb_blkid = j; + (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp, + &dnp->dn_blkptr[j], &czb); + } (void) printf("\n"); - - traverse_fini(th); } /*ARGSUSED*/ @@ -1093,7 +1076,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } if (verbosity >= 5) - dump_indirect(os, object, NULL, 0); + dump_indirect(dn); if (verbosity >= 5) { /* @@ -1450,18 +1433,17 @@ typedef struct zdb_blkstats { #define DMU_OT_DEFERRED DMU_OT_NONE #define DMU_OT_TOTAL DMU_OT_NUMTYPES -#define ZB_TOTAL ZB_MAXLEVEL +#define ZB_TOTAL DN_MAX_LEVELS typedef struct zdb_cb { zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1]; uint64_t zcb_errors[256]; - traverse_blk_cache_t *zcb_cache; int zcb_readfails; int zcb_haderrors; } zdb_cb_t; static void -zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type) +zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) { for (int i = 0; i < 4; i++) { int l = (i < 2) ? 
BP_GET_LEVEL(bp) : ZB_TOTAL; @@ -1477,7 +1459,7 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type) if (dump_opt['S']) { boolean_t print_sig; - print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 && + print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS); if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg) @@ -1499,56 +1481,55 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, int type) } } - if (!dump_opt['L']) - VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, - NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); + VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, + NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); } static int -zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) { - zbookmark_t *zb = &bc->bc_bookmark; zdb_cb_t *zcb = arg; - blkptr_t *bp = &bc->bc_blkptr; - dmu_object_type_t type = BP_GET_TYPE(bp); char blkbuf[BP_SPRINTF_LEN]; - int error = 0; - ASSERT(!BP_IS_HOLE(bp)); + if (bp == NULL) + return (0); - zdb_count_block(spa, zcb, bp, type); + zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp)); - if (bc->bc_errno) { - if (zcb->zcb_readfails++ < 10 && dump_opt['L']) { - uberblock_t ub; - vdev_uberblock_load(NULL, spa->spa_root_vdev, &ub); - if (ub.ub_txg != 0) - spa->spa_ubsync = ub; - error = EAGAIN; - } else { + if (dump_opt['c'] || dump_opt['S']) { + int ioerr, size; + void *data; + + size = BP_GET_LSIZE(bp); + data = malloc(size); + ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb)); + free(data); + + /* We expect io errors on intent log */ + if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) { zcb->zcb_haderrors = 1; - zcb->zcb_errors[bc->bc_errno]++; - error = ERESTART; - } + zcb->zcb_errors[ioerr]++; - if (dump_opt['b'] >= 3 || (dump_opt['b'] >= 2 && bc->bc_errno)) - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); - else - blkbuf[0] = '\0'; - - if (!dump_opt['S']) { - (void) printf("zdb_blkptr_cb: Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- %s\n", - bc->bc_errno, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf, - error == EAGAIN ? 
"retrying" : "skipping"); + if (dump_opt['b'] >= 2) + sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); + else + blkbuf[0] = '\0'; + + if (!dump_opt['S']) { + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } } - - return (error); } zcb->zcb_readfails = 0; @@ -1558,8 +1539,8 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) (void) printf("objset %llu object %llu offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)blkid2offset(bc->bc_dnode, - zb->zb_level, zb->zb_blkid), blkbuf); + (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid), + blkbuf); } return (0); @@ -1568,22 +1549,12 @@ zdb_blkptr_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) static int dump_block_stats(spa_t *spa) { - traverse_handle_t *th; zdb_cb_t zcb = { 0 }; - traverse_blk_cache_t dummy_cache = { 0 }; zdb_blkstats_t *zb, *tzb; uint64_t alloc, space, logalloc; vdev_t *rvd = spa->spa_root_vdev; int leaks = 0; - int advance = zdb_advance; - int c, e, flags; - - zcb.zcb_cache = &dummy_cache; - - if (dump_opt['c'] || dump_opt['S']) - advance |= ADVANCE_DATA; - - advance |= ADVANCE_PRUNE | ADVANCE_ZIL; + int c, e; if (!dump_opt['S']) { (void) printf("\nTraversing all blocks to %sverify" @@ -1599,8 +1570,7 @@ dump_block_stats(spa_t *spa) * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. */ - if (!dump_opt['L']) - zdb_leak_init(spa); + zdb_leak_init(spa); /* * If there's a deferred-free bplist, process that first. @@ -1626,22 +1596,7 @@ dump_block_stats(spa_t *spa) bplist_close(bpl); } - /* - * Now traverse the pool. If we're reading all data to verify - * checksums, do a scrubbing read so that we validate all copies. - */ - flags = ZIO_FLAG_CANFAIL; - if (advance & ADVANCE_DATA) - flags |= ZIO_FLAG_SCRUB; - th = traverse_init(spa, zdb_blkptr_cb, &zcb, advance, flags); - th->th_noread = zdb_noread; - - traverse_add_pool(th, 0, spa_first_txg(spa) + TXG_CONCURRENT_STATES); - - while (traverse_more(th) == EAGAIN) - continue; - - traverse_fini(th); + zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb); if (zcb.zcb_haderrors && !dump_opt['S']) { (void) printf("\nError counts:\n\n"); @@ -1657,8 +1612,7 @@ dump_block_stats(spa_t *spa) /* * Report any leaked segments. */ - if (!dump_opt['L']) - zdb_leak_fini(spa); + zdb_leak_fini(spa); /* * If we're interested in printing out the blkptr signatures, @@ -1668,10 +1622,6 @@ dump_block_stats(spa_t *spa) if (dump_opt['S']) return (zcb.zcb_haderrors ? 
3 : 0); - if (dump_opt['L']) - (void) printf("\n\n *** Live pool traversal; " - "block counts are only approximate ***\n\n"); - alloc = spa_get_alloc(spa); space = spa_get_space(spa); @@ -2277,7 +2227,6 @@ main(int argc, char **argv) int dump_all = 1; int verbose = 0; int error; - int flag, set; int exported = 0; char *vdev_dir = NULL; @@ -2286,7 +2235,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcsvCLO:B:S:U:lRep:")) != -1) { + while ((c = getopt(argc, argv, "udibcsvCS:U:lRep:")) != -1) { switch (c) { case 'u': case 'd': @@ -2300,49 +2249,6 @@ main(int argc, char **argv) dump_opt[c]++; dump_all = 0; break; - case 'L': - dump_opt[c]++; - break; - case 'O': - endstr = optarg; - if (endstr[0] == '!') { - endstr++; - set = 0; - } else { - set = 1; - } - if (strcmp(endstr, "post") == 0) { - flag = ADVANCE_PRE; - set = !set; - } else if (strcmp(endstr, "pre") == 0) { - flag = ADVANCE_PRE; - } else if (strcmp(endstr, "prune") == 0) { - flag = ADVANCE_PRUNE; - } else if (strcmp(endstr, "data") == 0) { - flag = ADVANCE_DATA; - } else if (strcmp(endstr, "holes") == 0) { - flag = ADVANCE_HOLES; - } else { - usage(); - } - if (set) - zdb_advance |= flag; - else - zdb_advance &= ~flag; - break; - case 'B': - endstr = optarg - 1; - zdb_noread.zb_objset = strtoull(endstr + 1, &endstr, 0); - zdb_noread.zb_object = strtoull(endstr + 1, &endstr, 0); - zdb_noread.zb_level = strtol(endstr + 1, &endstr, 0); - zdb_noread.zb_blkid = strtoull(endstr + 1, &endstr, 16); - (void) printf("simulating bad block " - "<%llu, %llu, %lld, %llx>\n", - (u_longlong_t)zdb_noread.zb_objset, - (u_longlong_t)zdb_noread.zb_object, - (u_longlong_t)zdb_noread.zb_level, - (u_longlong_t)zdb_noread.zb_blkid); - break; case 'v': verbose++; break; @@ -2379,21 +2285,17 @@ main(int argc, char **argv) } } - if (vdev_dir != NULL && exported == 0) - (void) fatal("-p option requires use of -e\n"); + if (vdev_dir != NULL && exported == 0) { + (void) fprintf(stderr, "-p option requires use of -e\n"); + usage(); + } kernel_init(FREAD); g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); - /* - * Disable vdev caching. If we don't do this, live pool traversal - * won't make progress because it will never see disk updates. - */ - zfs_vdev_cache_size = 0; - for (c = 0; c < 256; c++) { - if (dump_all && c != 'L' && c != 'l' && c != 'R') + if (dump_all && c != 'l' && c != 'R') dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index e6960f0868..53cc6c7093 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -77,7 +77,6 @@ #include #include #include -#include #include #include #include @@ -148,7 +147,6 @@ typedef struct ztest_args { hrtime_t za_start; hrtime_t za_stop; hrtime_t za_kill; - traverse_handle_t *za_th; /* * Thread-local variables can go here to aid debugging. 
*/ @@ -202,7 +200,6 @@ ztest_info_t ztest_info[] = { { ztest_dmu_object_alloc_free, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, - { ztest_traverse, 1, &zopt_often }, { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes }, { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, @@ -1439,152 +1436,6 @@ ztest_dmu_snapshot_create_destroy(ztest_args_t *za) (void) rw_unlock(&ztest_shared->zs_name_lock); } -#define ZTEST_TRAVERSE_BLOCKS 1000 - -static int -ztest_blk_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) -{ - ztest_args_t *za = arg; - zbookmark_t *zb = &bc->bc_bookmark; - blkptr_t *bp = &bc->bc_blkptr; - dnode_phys_t *dnp = bc->bc_dnode; - traverse_handle_t *th = za->za_th; - uint64_t size = BP_GET_LSIZE(bp); - - /* - * Level -1 indicates the objset_phys_t or something in its intent log. - */ - if (zb->zb_level == -1) { - if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { - ASSERT3U(zb->zb_object, ==, 0); - ASSERT3U(zb->zb_blkid, ==, 0); - ASSERT3U(size, ==, sizeof (objset_phys_t)); - za->za_zil_seq = 0; - } else if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { - ASSERT3U(zb->zb_object, ==, 0); - ASSERT3U(zb->zb_blkid, >, za->za_zil_seq); - za->za_zil_seq = zb->zb_blkid; - } else { - ASSERT3U(zb->zb_object, !=, 0); /* lr_write_t */ - } - - return (0); - } - - ASSERT(dnp != NULL); - - if (bc->bc_errno) - return (ERESTART); - - /* - * Once in a while, abort the traverse. We only do this to odd - * instance numbers to ensure that even ones can run to completion. - */ - if ((za->za_instance & 1) && ztest_random(10000) == 0) - return (EINTR); - - if (bp->blk_birth == 0) { - ASSERT(th->th_advance & ADVANCE_HOLES); - return (0); - } - - if (zb->zb_level == 0 && !(th->th_advance & ADVANCE_DATA) && - bc == &th->th_cache[ZB_DN_CACHE][0]) { - ASSERT(bc->bc_data == NULL); - return (0); - } - - ASSERT(bc->bc_data != NULL); - - /* - * This is an expensive question, so don't ask it too often. - */ - if (((za->za_random ^ th->th_callbacks) & 0xff) == 0) { - void *xbuf = umem_alloc(size, UMEM_NOFAIL); - if (arc_tryread(spa, bp, xbuf) == 0) { - ASSERT(bcmp(bc->bc_data, xbuf, size) == 0); - } - umem_free(xbuf, size); - } - - if (zb->zb_level > 0) { - ASSERT3U(size, ==, 1ULL << dnp->dn_indblkshift); - return (0); - } - - ASSERT(zb->zb_level == 0); - ASSERT3U(size, ==, dnp->dn_datablkszsec << DEV_BSHIFT); - - return (0); -} - -/* - * Verify that live pool traversal works. - */ -void -ztest_traverse(ztest_args_t *za) -{ - spa_t *spa = za->za_spa; - traverse_handle_t *th = za->za_th; - int rc, advance; - uint64_t cbstart, cblimit; - - if (th == NULL) { - advance = 0; - - if (ztest_random(2) == 0) - advance |= ADVANCE_PRE; - - if (ztest_random(2) == 0) - advance |= ADVANCE_PRUNE; - - if (ztest_random(2) == 0) - advance |= ADVANCE_DATA; - - if (ztest_random(2) == 0) - advance |= ADVANCE_HOLES; - - if (ztest_random(2) == 0) - advance |= ADVANCE_ZIL; - - th = za->za_th = traverse_init(spa, ztest_blk_cb, za, advance, - ZIO_FLAG_CANFAIL); - - traverse_add_pool(th, 0, -1ULL); - } - - advance = th->th_advance; - cbstart = th->th_callbacks; - cblimit = cbstart + ((advance & ADVANCE_DATA) ? 100 : 1000); - - while ((rc = traverse_more(th)) == EAGAIN && th->th_callbacks < cblimit) - continue; - - if (zopt_verbose >= 5) - (void) printf("traverse %s%s%s%s %llu blocks to " - "<%llu, %llu, %lld, %llx>%s\n", - (advance & ADVANCE_PRE) ? "pre" : "post", - (advance & ADVANCE_PRUNE) ? "|prune" : "", - (advance & ADVANCE_DATA) ? 
"|data" : "", - (advance & ADVANCE_HOLES) ? "|holes" : "", - (u_longlong_t)(th->th_callbacks - cbstart), - (u_longlong_t)th->th_lastcb.zb_objset, - (u_longlong_t)th->th_lastcb.zb_object, - (u_longlong_t)th->th_lastcb.zb_level, - (u_longlong_t)th->th_lastcb.zb_blkid, - rc == 0 ? " [done]" : - rc == EINTR ? " [aborted]" : - rc == EAGAIN ? "" : - strerror(rc)); - - if (rc != EAGAIN) { - if (rc != 0 && rc != EINTR) - fatal(0, "traverse_more(%p) = %d", th, rc); - traverse_fini(th); - za->za_th = NULL; - } -} - /* * Verify that dmu_object_{alloc,free} work as expected. */ @@ -2955,12 +2806,12 @@ ztest_verify_blocks(char *pool) isa = strdup(isa); /* LINTED */ (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache -O %s %s", + "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s", isalen, isa, zopt_verbose >= 3 ? "s" : "", zopt_verbose >= 4 ? "v" : "", - ztest_random(2) == 0 ? "pre" : "post", pool); + pool); free(isa); if (zopt_verbose >= 5) @@ -3325,8 +3176,6 @@ ztest_run(char *pool) while (--t >= 0) { VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0); - if (za[t].za_th) - traverse_fini(za[t].za_th); if (t < zopt_datasets) { zil_close(za[t].za_zilog); dmu_objset_close(za[t].za_os); diff --git a/usr/src/lib/libumem/common/vmem.c b/usr/src/lib/libumem/common/vmem.c index 9a51d8c606..040517a78f 100644 --- a/usr/src/lib/libumem/common/vmem.c +++ b/usr/src/lib/libumem/common/vmem.c @@ -24,8 +24,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * For a more complete description of the main ideas, see: * @@ -918,7 +916,7 @@ vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase, start = MAX(vsp->vs_start, (uintptr_t)minaddr); end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1; taddr = P2PHASEUP(start, align, phase); - if (P2CROSS(taddr, taddr + size - 1, nocross)) + if (P2BOUNDARY(taddr, size, nocross)) taddr += P2ROUNDUP(P2NPHASE(taddr, nocross), align); if ((taddr - start) + size > end - start || @@ -985,7 +983,7 @@ vmem_xalloc(vmem_t *vmp, size_t size, size_t align, size_t phase, (void) vmem_seg_alloc(vmp, vbest, addr, size); (void) mutex_unlock(&vmp->vm_lock); ASSERT(P2PHASE(addr, align) == phase); - ASSERT(!P2CROSS(addr, addr + size - 1, nocross)); + ASSERT(!P2BOUNDARY(addr, size, nocross)); ASSERT(addr >= (uintptr_t)minaddr); ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1); return ((void *)addr); diff --git a/usr/src/lib/libzpool/common/kernel.c b/usr/src/lib/libzpool/common/kernel.c index 933409bebf..fe817cc64b 100644 --- a/usr/src/lib/libzpool/common/kernel.c +++ b/usr/src/lib/libzpool/common/kernel.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -785,6 +783,8 @@ kernel_init(int mode) VERIFY((random_fd = open("/dev/random", O_RDONLY)) != -1); VERIFY((urandom_fd = open("/dev/urandom", O_RDONLY)) != -1); + system_taskq_init(); + spa_init(mode); } diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/lib/libzpool/common/llib-lzpool index e715898a2f..ca3225393a 100644 --- a/usr/src/lib/libzpool/common/llib-lzpool +++ b/usr/src/lib/libzpool/common/llib-lzpool @@ -45,5 +45,6 @@ #include #include #include +#include extern uint64_t metaslab_gang_bang; diff --git a/usr/src/lib/libzpool/common/sys/zfs_context.h b/usr/src/lib/libzpool/common/sys/zfs_context.h index e065d7089d..0e7019bba8 100644 --- a/usr/src/lib/libzpool/common/sys/zfs_context.h +++ b/usr/src/lib/libzpool/common/sys/zfs_context.h @@ -321,11 +321,14 @@ typedef void (task_func_t)(void *); #define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +extern taskq_t *system_taskq; + extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); extern int taskq_member(taskq_t *, void *); +extern void system_taskq_init(void); #define XVA_MAPSIZE 3 #define XVA_MAGIC 0x78766174 diff --git a/usr/src/lib/libzpool/common/taskq.c b/usr/src/lib/libzpool/common/taskq.c index ccf5b4ded8..93acdcf8e4 100644 --- a/usr/src/lib/libzpool/common/taskq.c +++ b/usr/src/lib/libzpool/common/taskq.c @@ -19,15 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include int taskq_now; +taskq_t *system_taskq; typedef struct task { struct task *task_next; @@ -253,3 +252,10 @@ taskq_member(taskq_t *tq, void *t) return (0); } + +void +system_taskq_init(void) +{ + system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); +} diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index ff6030bf31..9c54431181 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -308,20 +308,18 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } - if (db->db_level == 0) { - /* we can be momentarily larger in dnode_set_blksz() */ - if (db->db_blkid != DB_BONUS_BLKID && dn) { - ASSERT3U(db->db.db_size, >=, dn->dn_datablksz); - } - if (db->db.db_object == DMU_META_DNODE_OBJECT) { - dbuf_dirty_record_t *dr = db->db_data_pending; - /* - * it should only be modified in syncing - * context, so make sure we only have - * one copy of the data. - */ - ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); - } + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). + */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dbuf_dirty_record_t *dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. 
+ */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index c4223e4aab..626d3be890 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -153,66 +151,59 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) static int -backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) +backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, + const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; - uint64_t object = bc->bc_bookmark.zb_object; - int level = bc->bc_bookmark.zb_level; - uint64_t blkid = bc->bc_bookmark.zb_blkid; - blkptr_t *bp = bc->bc_blkptr.blk_birth ? &bc->bc_blkptr : NULL; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; - void *data = bc->bc_data; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - ASSERT(data || bp == NULL); - - if (bp == NULL && object == 0) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - uint64_t dnobj = (blkid * span) >> DNODE_SHIFT; + if (bp == NULL && zb->zb_object == 0) { + uint64_t span = BP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); } else if (bp == NULL) { - uint64_t span = BP_SPAN(bc->bc_dnode, level); - err = dump_free(ba, object, blkid * span, span); - } else if (data && level == 0 && type == DMU_OT_DNODE) { - dnode_phys_t *blk = data; + uint64_t span = BP_SPAN(dnp, zb->zb_level); + err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span); + } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { + return (0); + } else if (type == DMU_OT_DNODE) { + dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + blk = abuf->b_data; for (i = 0; i < blksz >> DNODE_SHIFT; i++) { - uint64_t dnobj = - (blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(ba, dnobj, blk+i); if (err) break; } - } else if (level == 0 && - type != DMU_OT_DNODE && type != DMU_OT_OBJSET) { + (void) arc_buf_remove_ref(abuf, &abuf); + } else { /* it's a level-0 block of a regular object */ + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); - if (data == NULL) { - uint32_t aflags = ARC_WAIT; - arc_buf_t *abuf; - zbookmark_t zb; - - zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; - zb.zb_object = object; - zb.zb_level = level; - zb.zb_blkid = blkid; - (void) arc_read_nolock(NULL, spa, bp, - arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); - - if (abuf) { - err = dump_data(ba, type, object, blkid * blksz, - blksz, abuf->b_data); - (void) arc_buf_remove_ref(abuf, &abuf); - } - } else { - err = dump_data(ba, type, object, blkid * blksz, - blksz, data); - } + + if (arc_read_nolock(NULL, spa, bp, + arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL, &aflags, zb) != 0) + return (EIO); + + err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, + blksz, abuf->b_data); + (void) arc_buf_remove_ref(abuf, &abuf); } 
ASSERT(err == 0 || err == EINTR); @@ -291,8 +282,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, return (ba.err); } - err = traverse_dsl_dataset(ds, fromtxg, - ADVANCE_PRE | ADVANCE_HOLES | ADVANCE_DATA | ADVANCE_NOLOCK, + err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, &ba); if (err) { diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c dissimilarity index 89% index 43bf82e7a6..5e177c5dbe 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -1,920 +1,411 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#pragma ident "%Z%%M% %I% %E% SMI" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define BP_SPAN_SHIFT(level, width) ((level) * (width)) - -#define BP_EQUAL(b1, b2) \ - (DVA_EQUAL(BP_IDENTITY(b1), BP_IDENTITY(b2)) && \ - (b1)->blk_birth == (b2)->blk_birth) - -/* - * Compare two bookmarks. - * - * For ADVANCE_PRE, the visitation order is: - * - * objset 0, 1, 2, ..., ZB_MAXOBJSET. - * object 0, 1, 2, ..., ZB_MAXOBJECT. - * blkoff 0, 1, 2, ... - * level ZB_MAXLEVEL, ..., 2, 1, 0. - * - * where blkoff = blkid << BP_SPAN_SHIFT(level, width), and thus a valid - * ordering vector is: - * - * < objset, object, blkoff, -level > - * - * For ADVANCE_POST, the starting offsets aren't sequential but ending - * offsets [blkoff = (blkid + 1) << BP_SPAN_SHIFT(level, width)] are. - * The visitation order is: - * - * objset 1, 2, ..., ZB_MAXOBJSET, 0. - * object 1, 2, ..., ZB_MAXOBJECT, 0. - * blkoff 1, 2, ... - * level 0, 1, 2, ..., ZB_MAXLEVEL. - * - * and thus a valid ordering vector is: - * - * < objset - 1, object - 1, blkoff, level > - * - * Both orderings can be expressed as: - * - * < objset + bias, object + bias, blkoff, level ^ bias > - * - * where 'bias' is either 0 or -1 (for ADVANCE_PRE or ADVANCE_POST) - * and 'blkoff' is (blkid - bias) << BP_SPAN_SHIFT(level, wshift). - * - * Special case: an objset's osphys is represented as level -1 of object 0. - * It is always either the very first or very last block we visit in an objset. - * Therefore, if either bookmark's level is -1, level alone determines order. - */ -static int -compare_bookmark(zbookmark_t *szb, zbookmark_t *ezb, dnode_phys_t *dnp, - int advance) -{ - int bias = (advance & ADVANCE_PRE) ? 
0 : -1; - uint64_t sblkoff, eblkoff; - int slevel, elevel, wshift; - - if (szb->zb_objset + bias < ezb->zb_objset + bias) - return (-1); - - if (szb->zb_objset + bias > ezb->zb_objset + bias) - return (1); - - slevel = szb->zb_level; - elevel = ezb->zb_level; - - if ((slevel | elevel) < 0) - return ((slevel ^ bias) - (elevel ^ bias)); - - if (szb->zb_object + bias < ezb->zb_object + bias) - return (-1); - - if (szb->zb_object + bias > ezb->zb_object + bias) - return (1); - - if (dnp == NULL) - return (0); - - wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - - sblkoff = (szb->zb_blkid - bias) << BP_SPAN_SHIFT(slevel, wshift); - eblkoff = (ezb->zb_blkid - bias) << BP_SPAN_SHIFT(elevel, wshift); - - if (sblkoff < eblkoff) - return (-1); - - if (sblkoff > eblkoff) - return (1); - - return ((elevel ^ bias) - (slevel ^ bias)); -} - -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -#define SET_BOOKMARK_LB(zb, level, blkid) \ -{ \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - -static int -advance_objset(zseg_t *zseg, uint64_t objset, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - if (advance & ADVANCE_PRE) { - if (objset >= ZB_MAXOBJSET) - return (ERANGE); - SET_BOOKMARK(zb, objset, 0, -1, 0); - } else { - if (objset >= ZB_MAXOBJSET) - objset = 0; - SET_BOOKMARK(zb, objset, 1, 0, 0); - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_object(zseg_t *zseg, uint64_t object, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - if (advance & ADVANCE_PRE) { - if (object >= ZB_MAXOBJECT) { - SET_BOOKMARK(zb, zb->zb_objset + 1, 0, -1, 0); - } else { - SET_BOOKMARK(zb, zb->zb_objset, object, ZB_MAXLEVEL, 0); - } - } else { - if (zb->zb_object == 0) { - SET_BOOKMARK(zb, zb->zb_objset, 0, -1, 0); - } else { - if (object >= ZB_MAXOBJECT) - object = 0; - SET_BOOKMARK(zb, zb->zb_objset, object, 0, 0); - } - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_from_osphys(zseg_t *zseg, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - - ASSERT(zb->zb_object == 0); - ASSERT(zb->zb_level == -1); - ASSERT(zb->zb_blkid == 0); - - if (advance & ADVANCE_PRE) { - SET_BOOKMARK_LB(zb, ZB_MAXLEVEL, 0); - } else { - if (zb->zb_objset == 0) - return (ERANGE); - SET_BOOKMARK(zb, zb->zb_objset + 1, 1, 0, 0); - } - - if (compare_bookmark(zb, &zseg->seg_end, NULL, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -static int -advance_block(zseg_t *zseg, dnode_phys_t *dnp, int rc, int advance) -{ - zbookmark_t *zb = &zseg->seg_start; - int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - int maxlevel = dnp->dn_nlevels - 1; - int level = zb->zb_level; - uint64_t blkid = zb->zb_blkid; - - if (advance & ADVANCE_PRE) { - if (level > 0 && rc == 0) { - level--; - blkid <<= wshift; - } else { - blkid++; - - if ((blkid << BP_SPAN_SHIFT(level, wshift)) > - dnp->dn_maxblkid) - return (ERANGE); - - while (level < maxlevel) { - if (P2PHASE(blkid, 1ULL << wshift)) - break; - blkid >>= wshift; - level++; - } - } - } else { - if (level >= maxlevel || P2PHASE(blkid + 1, 1ULL << wshift)) { - blkid = (blkid + 1) << BP_SPAN_SHIFT(level, wshift); - level = 0; - } else { - blkid >>= wshift; - level++; - } - - while ((blkid << BP_SPAN_SHIFT(level, wshift)) > - dnp->dn_maxblkid) { - if (level == maxlevel) - 
return (ERANGE); - blkid >>= wshift; - level++; - } - } - SET_BOOKMARK_LB(zb, level, blkid); - - if (compare_bookmark(zb, &zseg->seg_end, dnp, advance) > 0) - return (ERANGE); - - return (EAGAIN); -} - -/* - * The traverse_callback function will call the function specified in th_func. - * In the event of an error the callee, specified by th_func, must return - * one of the following errors: - * - * EINTR - Indicates that the callee wants the traversal to - * abort immediately. - * ERESTART - The callee has acknowledged the error and would - * like to continue. - */ -static int -traverse_callback(traverse_handle_t *th, zseg_t *zseg, traverse_blk_cache_t *bc) -{ - /* - * Before we issue the callback, prune against maxtxg. - * - * We prune against mintxg before we get here because it's a big win. - * If a given block was born in txg 37, then we know that the entire - * subtree below that block must have been born in txg 37 or earlier. - * We can therefore lop off huge branches of the tree as we go. - * - * There's no corresponding optimization for maxtxg because knowing - * that bp->blk_birth >= maxtxg doesn't imply anything about the bp's - * children. In fact, the copy-on-write design of ZFS ensures that - * top-level blocks will pretty much always be new. - * - * Therefore, in the name of simplicity we don't prune against - * maxtxg until the last possible moment -- that being right now. - */ - if (bc->bc_errno == 0 && bc->bc_blkptr.blk_birth >= zseg->seg_maxtxg) - return (0); - - /* - * Debugging: verify that the order we visit things agrees with the - * order defined by compare_bookmark(). We don't check this for - * log blocks because there's no defined ordering for them; they're - * always visited (or not) as part of visiting the objset_phys_t. - */ - if (bc->bc_errno == 0 && bc != &th->th_zil_cache) { - zbookmark_t *zb = &bc->bc_bookmark; - zbookmark_t *szb = &zseg->seg_start; - zbookmark_t *ezb = &zseg->seg_end; - zbookmark_t *lzb = &th->th_lastcb; - dnode_phys_t *dnp = bc->bc_dnode; - - ASSERT(compare_bookmark(zb, ezb, dnp, th->th_advance) <= 0); - ASSERT(compare_bookmark(zb, szb, dnp, th->th_advance) == 0); - ASSERT(compare_bookmark(lzb, zb, dnp, th->th_advance) < 0 || - lzb->zb_level == ZB_NO_LEVEL); - *lzb = *zb; - } - - th->th_callbacks++; - return (th->th_func(bc, th->th_spa, th->th_arg)); -} - -static int -traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, - dnode_phys_t *dnp) -{ - zbookmark_t *zb = &bc->bc_bookmark; - int error; - - th->th_hits++; - - bc->bc_dnode = dnp; - bc->bc_errno = 0; - - if (BP_EQUAL(&bc->bc_blkptr, bp)) - return (0); - - bc->bc_blkptr = *bp; - - if (bc->bc_data == NULL) - return (0); - - if (BP_IS_HOLE(bp)) { - ASSERT(th->th_advance & ADVANCE_HOLES); - return (0); - } - - if (compare_bookmark(zb, &th->th_noread, dnp, 0) == 0) { - error = EIO; - } else if (arc_tryread(th->th_spa, bp, bc->bc_data) == 0) { - error = 0; - th->th_arc_hits++; - } else { - error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, - BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); - - if (BP_SHOULD_BYTESWAP(bp) && error == 0) - (zb->zb_level > 0 ? 
byteswap_uint64_array : - dmu_ot[BP_GET_TYPE(bp)].ot_byteswap)(bc->bc_data, - BP_GET_LSIZE(bp)); - th->th_reads++; - } - - if (error) { - bc->bc_errno = error; - error = traverse_callback(th, NULL, bc); - ASSERT(error == EAGAIN || error == EINTR || error == ERESTART); - bc->bc_blkptr.blk_birth = -1ULL; - } - - dprintf("cache %02x error %d <%llu, %llu, %d, %llx>\n", - bc - &th->th_cache[0][0], error, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); - - return (error); -} - -static int -find_block(traverse_handle_t *th, zseg_t *zseg, dnode_phys_t *dnp, int depth) -{ - zbookmark_t *zb = &zseg->seg_start; - traverse_blk_cache_t *bc; - blkptr_t *bp = dnp->dn_blkptr; - int i, first, level; - int nbp = dnp->dn_nblkptr; - int minlevel = zb->zb_level; - int maxlevel = dnp->dn_nlevels - 1; - int wshift = dnp->dn_indblkshift - SPA_BLKPTRSHIFT; - int bp_shift = BP_SPAN_SHIFT(maxlevel - minlevel, wshift); - uint64_t blkid = zb->zb_blkid >> bp_shift; - int do_holes = (th->th_advance & ADVANCE_HOLES) && depth == ZB_DN_CACHE; - int rc; - - if (minlevel > maxlevel || blkid >= nbp) - return (ERANGE); - - for (level = maxlevel; level >= minlevel; level--) { - first = P2PHASE(blkid, 1ULL << wshift); - - for (i = first; i < nbp; i++) - if (bp[i].blk_birth > zseg->seg_mintxg || - BP_IS_HOLE(&bp[i]) && do_holes) - break; - - if (i != first) { - i--; - SET_BOOKMARK_LB(zb, level, blkid + (i - first)); - return (ENOTBLK); - } - - bc = &th->th_cache[depth][level]; - - SET_BOOKMARK(&bc->bc_bookmark, zb->zb_objset, zb->zb_object, - level, blkid); - - if (rc = traverse_read(th, bc, bp + i, dnp)) { - if (rc != EAGAIN) { - SET_BOOKMARK_LB(zb, level, blkid); - } - return (rc); - } - - if (BP_IS_HOLE(&bp[i])) { - SET_BOOKMARK_LB(zb, level, blkid); - th->th_lastcb.zb_level = ZB_NO_LEVEL; - return (0); - } - - nbp = 1 << wshift; - bp = bc->bc_data; - bp_shift -= wshift; - blkid = zb->zb_blkid >> bp_shift; - } - - return (0); -} - -static int -get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, - uint64_t *objectp, dnode_phys_t **dnpp, uint64_t txg, int type, int depth) -{ - zseg_t zseg; - zbookmark_t *zb = &zseg.seg_start; - uint64_t object = *objectp; - int i, rc; - - SET_BOOKMARK(zb, objset, 0, 0, object / DNODES_PER_BLOCK); - SET_BOOKMARK(&zseg.seg_end, objset, 0, 0, ZB_MAXBLKID); - - zseg.seg_mintxg = txg; - zseg.seg_maxtxg = -1ULL; - - for (;;) { - rc = find_block(th, &zseg, mdn, depth); - - if (rc == EAGAIN || rc == EINTR || rc == ERANGE) - break; - - if (rc == 0 && zb->zb_level == 0) { - dnode_phys_t *dnp = th->th_cache[depth][0].bc_data; - for (i = 0; i < DNODES_PER_BLOCK; i++) { - object = (zb->zb_blkid * DNODES_PER_BLOCK) + i; - if (object >= *objectp && - dnp[i].dn_type != DMU_OT_NONE && - (type == -1 || dnp[i].dn_type == type)) { - *objectp = object; - *dnpp = &dnp[i]; - return (0); - } - } - } - - rc = advance_block(&zseg, mdn, rc, ADVANCE_PRE); - - if (rc == ERANGE) - break; - } - - if (rc == ERANGE) - *objectp = ZB_MAXOBJECT; - - return (rc); -} - -/* ARGSUSED */ -static void -traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) -{ - traverse_handle_t *th = arg; - traverse_blk_cache_t *bc = &th->th_zil_cache; - zbookmark_t *zb = &bc->bc_bookmark; - zseg_t *zseg = list_head(&th->th_seglist); - - if (bp->blk_birth <= zseg->seg_mintxg) - return; - - if (claim_txg != 0 || bp->blk_birth < spa_first_txg(th->th_spa)) { - zb->zb_object = 0; - zb->zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - bc->bc_blkptr = *bp; - (void) traverse_callback(th, zseg, bc); - } -} 
- -/* ARGSUSED */ -static void -traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) -{ - traverse_handle_t *th = arg; - traverse_blk_cache_t *bc = &th->th_zil_cache; - zbookmark_t *zb = &bc->bc_bookmark; - zseg_t *zseg = list_head(&th->th_seglist); - - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - - if (bp->blk_birth <= zseg->seg_mintxg) - return; - - if (claim_txg != 0 && bp->blk_birth >= claim_txg) { - zb->zb_object = lr->lr_foid; - zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - bc->bc_blkptr = *bp; - (void) traverse_callback(th, zseg, bc); - } - } -} - -static void -traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc) -{ - spa_t *spa = th->th_spa; - dsl_pool_t *dp = spa_get_dsl(spa); - objset_phys_t *osphys = bc->bc_data; - zil_header_t *zh = &osphys->os_zil_header; - uint64_t claim_txg = zh->zh_claim_txg; - zilog_t *zilog; - - ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); - ASSERT(bc->bc_bookmark.zb_level == -1); - - /* - * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). - */ - if (claim_txg == 0 && (spa_mode & FWRITE)) - return; - - th->th_zil_cache.bc_bookmark = bc->bc_bookmark; - - zilog = zil_alloc(dp->dp_meta_objset, zh); - - (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, - claim_txg); - - zil_free(zilog); -} - -static int -traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) -{ - zbookmark_t *zb = &zseg->seg_start; - traverse_blk_cache_t *bc; - dnode_phys_t *dn, *dn_tmp; - int worklimit = 100; - int rc; - - dprintf("<%llu, %llu, %d, %llx>\n", - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid); - - bc = &th->th_cache[ZB_MOS_CACHE][ZB_MAXLEVEL - 1]; - dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; - - SET_BOOKMARK(&bc->bc_bookmark, 0, 0, -1, 0); - - rc = traverse_read(th, bc, mosbp, dn); - - if (rc) /* If we get ERESTART, we've got nowhere left to go */ - return (rc == ERESTART ? EINTR : rc); - - ASSERT(dn->dn_nlevels < ZB_MAXLEVEL); - - if (zb->zb_objset != 0) { - uint64_t objset = zb->zb_objset; - dsl_dataset_phys_t *dsp; - - rc = get_dnode(th, 0, dn, &objset, &dn_tmp, 0, - DMU_OT_DSL_DATASET, ZB_MOS_CACHE); - - if (objset != zb->zb_objset) - rc = advance_objset(zseg, objset, th->th_advance); - - if (rc != 0) - return (rc); - - dsp = DN_BONUS(dn_tmp); - - bc = &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]; - dn = &((objset_phys_t *)bc->bc_data)->os_meta_dnode; - - SET_BOOKMARK(&bc->bc_bookmark, objset, 0, -1, 0); - - /* - * If we're traversing an open snapshot, we know that it - * can't be deleted (because it's open) and it can't change - * (because it's a snapshot). Therefore, once we've gotten - * from the uberblock down to the snapshot's objset_phys_t, - * we no longer need to synchronize with spa_sync(); we're - * traversing a completely static block tree from here on. 
- */ - if (th->th_advance & ADVANCE_NOLOCK) { - ASSERT(th->th_locked); - rw_exit(spa_traverse_rwlock(th->th_spa)); - th->th_locked = 0; - } - - if (BP_IS_HOLE(&dsp->ds_bp)) - rc = ERESTART; - else - rc = traverse_read(th, bc, &dsp->ds_bp, dn); - - if (rc != 0) { - if (rc == ERESTART) - rc = advance_objset(zseg, zb->zb_objset + 1, - th->th_advance); - return (rc); - } - - if (th->th_advance & ADVANCE_PRUNE) - zseg->seg_mintxg = - MAX(zseg->seg_mintxg, dsp->ds_prev_snap_txg); - } - - if (zb->zb_level == -1) { - ASSERT(zb->zb_object == 0); - ASSERT(zb->zb_blkid == 0); - ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); - - if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { - rc = traverse_callback(th, zseg, bc); - if (rc) { - ASSERT(rc == EINTR); - return (rc); - } - if ((th->th_advance & ADVANCE_ZIL) && - zb->zb_objset != 0) - traverse_zil(th, bc); - } - - return (advance_from_osphys(zseg, th->th_advance)); - } - - if (zb->zb_object != 0) { - uint64_t object = zb->zb_object; - - rc = get_dnode(th, zb->zb_objset, dn, &object, &dn_tmp, - zseg->seg_mintxg, -1, ZB_MDN_CACHE); - - if (object != zb->zb_object) - rc = advance_object(zseg, object, th->th_advance); - - if (rc != 0) - return (rc); - - dn = dn_tmp; - } - - if (zb->zb_level == ZB_MAXLEVEL) - zb->zb_level = dn->dn_nlevels - 1; - - for (;;) { - rc = find_block(th, zseg, dn, ZB_DN_CACHE); - - if (rc == EAGAIN || rc == EINTR || rc == ERANGE) - break; - - if (rc == 0) { - bc = &th->th_cache[ZB_DN_CACHE][zb->zb_level]; - ASSERT(bc->bc_dnode == dn); - ASSERT(bc->bc_blkptr.blk_birth <= mosbp->blk_birth); - rc = traverse_callback(th, zseg, bc); - if (rc) { - ASSERT(rc == EINTR); - return (rc); - } - if (BP_IS_HOLE(&bc->bc_blkptr)) { - ASSERT(th->th_advance & ADVANCE_HOLES); - rc = ENOTBLK; - } - } - - rc = advance_block(zseg, dn, rc, th->th_advance); - - if (rc == ERANGE) - break; - - /* - * Give spa_sync() a chance to run. - */ - if (th->th_locked && spa_traverse_wanted(th->th_spa)) { - th->th_syncs++; - return (EAGAIN); - } - - if (--worklimit == 0) - return (EAGAIN); - } - - if (rc == ERANGE) - rc = advance_object(zseg, zb->zb_object + 1, th->th_advance); - - return (rc); -} - -/* - * It is the caller's responsibility to ensure that the dsl_dataset_t - * doesn't go away during traversal. 
- */ -int -traverse_dsl_dataset(dsl_dataset_t *ds, uint64_t txg_start, int advance, - blkptr_cb_t func, void *arg) -{ - spa_t *spa = ds->ds_dir->dd_pool->dp_spa; - traverse_handle_t *th; - int err; - - th = traverse_init(spa, func, arg, advance, ZIO_FLAG_MUSTSUCCEED); - - traverse_add_objset(th, txg_start, -1ULL, ds->ds_object); - - while ((err = traverse_more(th)) == EAGAIN) - continue; - - traverse_fini(th); - return (err); -} - -int -traverse_zvol(objset_t *os, int advance, blkptr_cb_t func, void *arg) -{ - spa_t *spa = dmu_objset_spa(os); - traverse_handle_t *th; - int err; - - th = traverse_init(spa, func, arg, advance, ZIO_FLAG_CANFAIL); - - traverse_add_dnode(th, 0, -1ULL, dmu_objset_id(os), ZVOL_OBJ); - - while ((err = traverse_more(th)) == EAGAIN) - continue; - - traverse_fini(th); - return (err); -} - -int -traverse_more(traverse_handle_t *th) -{ - zseg_t *zseg = list_head(&th->th_seglist); - uint64_t save_txg; /* XXX won't be necessary with real itinerary */ - krwlock_t *rw = spa_traverse_rwlock(th->th_spa); - blkptr_t *mosbp = spa_get_rootblkptr(th->th_spa); - int rc; - - if (zseg == NULL) - return (0); - - th->th_restarts++; - - save_txg = zseg->seg_mintxg; - - rw_enter(rw, RW_READER); - th->th_locked = 1; - - rc = traverse_segment(th, zseg, mosbp); - ASSERT(rc == ERANGE || rc == EAGAIN || rc == EINTR); - - if (th->th_locked) - rw_exit(rw); - th->th_locked = 0; - - zseg->seg_mintxg = save_txg; - - if (rc == ERANGE) { - list_remove(&th->th_seglist, zseg); - kmem_free(zseg, sizeof (*zseg)); - return (EAGAIN); - } - - return (rc); -} - -/* - * Note: (mintxg, maxtxg) is an open interval; mintxg and maxtxg themselves - * are not included. The blocks covered by this segment will all have - * mintxg < birth < maxtxg. - */ -static void -traverse_add_segment(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t sobjset, uint64_t sobject, int slevel, uint64_t sblkid, - uint64_t eobjset, uint64_t eobject, int elevel, uint64_t eblkid) -{ - zseg_t *zseg; - - zseg = kmem_alloc(sizeof (zseg_t), KM_SLEEP); - - zseg->seg_mintxg = mintxg; - zseg->seg_maxtxg = maxtxg; - - zseg->seg_start.zb_objset = sobjset; - zseg->seg_start.zb_object = sobject; - zseg->seg_start.zb_level = slevel; - zseg->seg_start.zb_blkid = sblkid; - - zseg->seg_end.zb_objset = eobjset; - zseg->seg_end.zb_object = eobject; - zseg->seg_end.zb_level = elevel; - zseg->seg_end.zb_blkid = eblkid; - - list_insert_tail(&th->th_seglist, zseg); -} - -void -traverse_add_dnode(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t objset, uint64_t object) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - objset, object, ZB_MAXLEVEL, 0, - objset, object, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - objset, object, 0, 0, - objset, object, 0, ZB_MAXBLKID); -} - -void -traverse_add_objset(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg, - uint64_t objset) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - objset, 0, -1, 0, - objset, ZB_MAXOBJECT, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - objset, 1, 0, 0, - objset, 0, -1, 0); -} - -void -traverse_add_pool(traverse_handle_t *th, uint64_t mintxg, uint64_t maxtxg) -{ - if (th->th_advance & ADVANCE_PRE) - traverse_add_segment(th, mintxg, maxtxg, - 0, 0, -1, 0, - ZB_MAXOBJSET, ZB_MAXOBJECT, 0, ZB_MAXBLKID); - else - traverse_add_segment(th, mintxg, maxtxg, - 1, 1, 0, 0, - 0, 0, -1, 0); -} - -traverse_handle_t * -traverse_init(spa_t *spa, 
-traverse_init(spa_t *spa, blkptr_cb_t func, void *arg, int advance,
-    int zio_flags)
-{
-	traverse_handle_t *th;
-	int d, l;
-
-	th = kmem_zalloc(sizeof (*th), KM_SLEEP);
-
-	th->th_spa = spa;
-	th->th_func = func;
-	th->th_arg = arg;
-	th->th_advance = advance;
-	th->th_lastcb.zb_level = ZB_NO_LEVEL;
-	th->th_noread.zb_level = ZB_NO_LEVEL;
-	th->th_zio_flags = zio_flags;
-
-	list_create(&th->th_seglist, sizeof (zseg_t),
-	    offsetof(zseg_t, seg_node));
-
-	for (d = 0; d < ZB_DEPTH; d++) {
-		for (l = 0; l < ZB_MAXLEVEL; l++) {
-			if ((advance & ADVANCE_DATA) ||
-			    l != 0 || d != ZB_DN_CACHE)
-				th->th_cache[d][l].bc_data =
-				    zio_buf_alloc(SPA_MAXBLOCKSIZE);
-		}
-	}
-
-	return (th);
-}
-
-void
-traverse_fini(traverse_handle_t *th)
-{
-	int d, l;
-	zseg_t *zseg;
-
-	for (d = 0; d < ZB_DEPTH; d++)
-		for (l = 0; l < ZB_MAXLEVEL; l++)
-			if (th->th_cache[d][l].bc_data != NULL)
-				zio_buf_free(th->th_cache[d][l].bc_data,
-				    SPA_MAXBLOCKSIZE);
-
-	while ((zseg = list_head(&th->th_seglist)) != NULL) {
-		list_remove(&th->th_seglist, zseg);
-		kmem_free(zseg, sizeof (*zseg));
-	}
-
-	list_destroy(&th->th_seglist);
-
-	dprintf("%llu hit, %llu ARC, %llu IO, %llu cb, %llu sync, %llu again\n",
-	    th->th_hits, th->th_arc_hits, th->th_reads, th->th_callbacks,
-	    th->th_syncs, th->th_restarts);
-
-	kmem_free(th, sizeof (*th));
-}
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/zil.h>
+#include <sys/callb.h>
+
+#define	SET_BOOKMARK(zb, objset, object, level, blkid)	\
+{							\
+	(zb)->zb_objset = objset;			\
+	(zb)->zb_object = object;			\
+	(zb)->zb_level = level;				\
+	(zb)->zb_blkid = blkid;				\
+}
+
+struct prefetch_data {
+	kmutex_t pd_mtx;
+	kcondvar_t pd_cv;
+	int pd_blks_max;
+	int pd_blks_fetched;
+	int pd_flags;
+	boolean_t pd_cancel;
+	boolean_t pd_exited;
+};
+
+struct traverse_data {
+	spa_t *td_spa;
+	uint64_t td_objset;
+	blkptr_t *td_rootbp;
+	uint64_t td_min_txg;
+	int td_flags;
+	struct prefetch_data *td_pfd;
+	blkptr_cb_t *td_func;
+	void *td_arg;
+};
+
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	struct traverse_data *td = arg;
+	zbookmark_t zb;
+
+	if (bp->blk_birth == 0)
+		return;
+
+	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
+		return;
+
+	zb.zb_objset = td->td_objset;
+	zb.zb_object = 0;
+	zb.zb_level = -1;
+	zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ];
+	VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	struct traverse_data *td = arg;
+
+	if (lrc->lrc_txtype == TX_WRITE) {
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_t zb;
+
+		if (bp->blk_birth == 0)
+			return;
+
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return;
+
+		zb.zb_objset = td->td_objset;
+		zb.zb_object = lr->lr_foid;
+		zb.zb_level = BP_GET_LEVEL(bp);
+		zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+		VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg));
+	}
+}
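An aside on the SET_BOOKMARK macro defined near the top of this file: because it expands to a bare brace block, a call followed by a semicolon will not parse between if and else. Every call site in this patch uses it as a standalone statement, so the hazard is only latent, but the conventional defensive form is a do/while (0) wrapper. A sketch of the equivalent macro (not what the patch ships):

    #define SET_BOOKMARK(zb, os, obj, lvl, blk)	\
            do {					\
                    (zb)->zb_objset = (os);		\
                    (zb)->zb_object = (obj);	\
                    (zb)->zb_level = (lvl);		\
                    (zb)->zb_blkid = (blk);		\
            } while (0)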
+
+static void
+traverse_zil(struct traverse_data *td, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+	zilog_t *zilog;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed (or, in read-only mode, blocks that *would* be claimed).
+	 */
+	if (claim_txg == 0 && (spa_mode & FWRITE))
+		return;
+
+	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
+
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
+	    claim_txg);
+
+	zil_free(zilog);
+}
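For readers new to this idiom: traverse_visitbp() below (the consumer) and traverse_prefetcher() further down (the producer) rendezvous over pd_blks_fetched, bounded by pd_blks_max, so the prefetch thread stays a fixed number of blocks ahead of the visitor. A minimal user-land sketch of the same credit handshake, using POSIX threads in place of the kernel kmutex/kcondvar (all names here are illustrative, not from the patch):

    #include <pthread.h>

    typedef struct credit_gate {
            pthread_mutex_t cg_mtx;
            pthread_cond_t cg_cv;
            int cg_credits;         /* like pd_blks_fetched */
            int cg_max;             /* like pd_blks_max */
            int cg_done;            /* like pd_cancel/pd_exited */
    } credit_gate_t;

    /* producer: block once cg_max credits are outstanding */
    void
    cg_produce(credit_gate_t *cg)
    {
            (void) pthread_mutex_lock(&cg->cg_mtx);
            while (!cg->cg_done && cg->cg_credits >= cg->cg_max)
                    (void) pthread_cond_wait(&cg->cg_cv, &cg->cg_mtx);
            cg->cg_credits++;
            (void) pthread_cond_broadcast(&cg->cg_cv);
            (void) pthread_mutex_unlock(&cg->cg_mtx);
    }

    /* consumer: take one credit, waiting until the producer supplies it */
    void
    cg_consume(credit_gate_t *cg)
    {
            (void) pthread_mutex_lock(&cg->cg_mtx);
            while (cg->cg_credits == 0 && !cg->cg_done)
                    (void) pthread_cond_wait(&cg->cg_cv, &cg->cg_mtx);
            if (cg->cg_credits > 0)
                    cg->cg_credits--;
            (void) pthread_cond_broadcast(&cg->cg_cv);
            (void) pthread_mutex_unlock(&cg->cg_mtx);
    }

The cg_done flag mirrors the pd_cancel/pd_exited pair: either side can shut the gate and wake the other, which is how traverse_impl() tears the prefetcher down.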
+
+static int
+traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp,
+    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
+{
+	int err = 0;
+	arc_buf_t *buf = NULL;
+	struct prefetch_data *pd = td->td_pfd;
+
+	if (bp->blk_birth == 0) {
+		err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg);
+		return (err);
+	}
+
+	if (bp->blk_birth <= td->td_min_txg)
+		return (0);
+
+	if (pd && !pd->pd_exited &&
+	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
+	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+		mutex_enter(&pd->pd_mtx);
+		ASSERT(pd->pd_blks_fetched >= 0);
+		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
+			cv_wait(&pd->pd_cv, &pd->pd_mtx);
+		pd->pd_blks_fetched--;
+		cv_broadcast(&pd->pd_cv);
+		mutex_exit(&pd->pd_mtx);
+	}
+
+	if (td->td_flags & TRAVERSE_PRE) {
+		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+		if (err)
+			return (err);
+	}
+
+	if (BP_GET_LEVEL(bp) > 0) {
+		uint32_t flags = ARC_WAIT;
+		int i;
+		blkptr_t *cbp;
+		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+
+		err = arc_read(NULL, td->td_spa, bp, pbuf,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err)
+			return (err);
+
+		/* recursively visitbp() blocks below this */
+		cbp = buf->b_data;
+		for (i = 0; i < epb; i++, cbp++) {
+			zbookmark_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+			    zb->zb_level - 1,
+			    zb->zb_blkid * epb + i);
+			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
+			if (err)
+				break;
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+		uint32_t flags = ARC_WAIT;
+		int i, j;
+		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+		err = arc_read(NULL, td->td_spa, bp, pbuf,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err)
+			return (err);
+
+		/* recursively visitbp() blocks below this */
+		dnp = buf->b_data;
+		for (i = 0; i < epb && err == 0; i++, dnp++) {
+			for (j = 0; j < dnp->dn_nblkptr; j++) {
+				zbookmark_t czb;
+
+				SET_BOOKMARK(&czb, zb->zb_objset,
+				    zb->zb_blkid * epb + i,
+				    dnp->dn_nlevels - 1, j);
+				err = traverse_visitbp(td, dnp, buf,
+				    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
+				if (err)
+					break;
+			}
+		}
+	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+		uint32_t flags = ARC_WAIT;
+		objset_phys_t *osp;
+		int j;
+
+		err = arc_read_nolock(NULL, td->td_spa, bp,
+		    arc_getbuf_func, &buf,
+		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
+		if (err)
+			return (err);
+
+		osp = buf->b_data;
+		/*
+		 * traverse_zil is just here for zdb's leak checking.
+		 * For other consumers, there will be no ZIL blocks.
+		 */
+		traverse_zil(td, &osp->os_zil_header);
+
+		for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) {
+			zbookmark_t czb;
+
+			SET_BOOKMARK(&czb, zb->zb_objset, 0,
+			    osp->os_meta_dnode.dn_nlevels - 1, j);
+			err = traverse_visitbp(td, &osp->os_meta_dnode, buf,
+			    (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j],
+			    &czb);
+			if (err)
+				break;
+		}
+	}
+
+	if (buf)
+		(void) arc_buf_remove_ref(buf, &buf);
+
+	if (err == 0 && (td->td_flags & TRAVERSE_POST))
+		err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg);
+
+	return (err);
+}
+
+/* ARGSUSED */
+static int
+traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp, void *arg)
+{
+	struct prefetch_data *pfd = arg;
+	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+
+	ASSERT(pfd->pd_blks_fetched >= 0);
+	if (pfd->pd_cancel)
+		return (EINTR);
+
+	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
+	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0))
+		return (0);
+
+	mutex_enter(&pfd->pd_mtx);
+	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
+		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
+	pfd->pd_blks_fetched++;
+	cv_broadcast(&pfd->pd_cv);
+	mutex_exit(&pfd->pd_mtx);
+
+	(void) arc_read_nolock(NULL, spa, bp, NULL, NULL,
+	    ZIO_PRIORITY_ASYNC_READ,
+	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+	    &aflags, zb);
+
+	return (0);
+}
+
+static void
+traverse_prefetch_thread(void *arg)
+{
+	struct traverse_data *td_main = arg;
+	struct traverse_data td = *td_main;
+	zbookmark_t czb;
+
+	td.td_func = traverse_prefetcher;
+	td.td_arg = td_main->td_pfd;
+	td.td_pfd = NULL;
+
+	SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0);
+	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);
+
+	mutex_enter(&td_main->td_pfd->pd_mtx);
+	td_main->td_pfd->pd_exited = B_TRUE;
+	cv_broadcast(&td_main->td_pfd->pd_cv);
+	mutex_exit(&td_main->td_pfd->pd_mtx);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+static int
+traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp,
+    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
+{
+	struct traverse_data td;
+	struct prefetch_data pd = { 0 };
+	zbookmark_t czb;
+	int err;
+
+	td.td_spa = spa;
+	td.td_objset = objset;
+	td.td_rootbp = rootbp;
+	td.td_min_txg = txg_start;
+	td.td_func = func;
+	td.td_arg = arg;
+	td.td_pfd = &pd;
+	td.td_flags = flags;
+
+	pd.pd_blks_max = 100;
+	pd.pd_flags = flags;
+	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
+
+	if (!(flags & TRAVERSE_PREFETCH) ||
+	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
+	    &td, TQ_NOQUEUE))
+		pd.pd_exited = B_TRUE;
+
+	SET_BOOKMARK(&czb, objset, 0, -1, 0);
+	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);
+
+	mutex_enter(&pd.pd_mtx);
+	pd.pd_cancel = B_TRUE;
+	cv_broadcast(&pd.pd_cv);
+	while (!pd.pd_exited)
+		cv_wait(&pd.pd_cv, &pd.pd_mtx);
+	mutex_exit(&pd.pd_mtx);
+
+	mutex_destroy(&pd.pd_mtx);
+	cv_destroy(&pd.pd_cv);
+
+	return (err);
+}
+
+/*
+ * NB: dataset must not be changing on-disk (eg, is a snapshot or we are
+ * in syncing context).
+ */
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
+    blkptr_cb_t func, void *arg)
+{
+	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds->ds_object,
+	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
+}
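To make the new interface concrete, here is a hypothetical consumer of traverse_dataset() above (not part of the patch): it counts every block born after a given txg. Holes are delivered to the callback with bp == NULL, which is also why kill_blkptr() in the dsl_dataset.c hunk below starts with a NULL check.

    static int
    count_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
        const dnode_phys_t *dnp, void *arg)
    {
            if (bp != NULL)         /* holes arrive as bp == NULL */
                    (*(uint64_t *)arg)++;
            return (0);             /* nonzero aborts the traversal */
    }

    static uint64_t
    count_blocks(dsl_dataset_t *ds, uint64_t txg)
    {
            uint64_t n = 0;

            /* pre-order walk with metadata prefetch, as zvol_get_lbas does */
            (void) traverse_dataset(ds, txg,
                TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, count_cb, &n);
            return (n);
    }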
+
+/*
+ * NB: pool must not be changing on-disk (eg, from zdb or sync context).
+ */
+int
+traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg)
+{
+	int err;
+	uint64_t obj;
+	dsl_pool_t *dp = spa_get_dsl(spa);
+	objset_t *mos = dp->dp_meta_objset;
+
+	/* visit the MOS */
+	err = traverse_impl(spa, 0, spa_get_rootblkptr(spa),
+	    0, TRAVERSE_PRE, func, arg);
+	if (err)
+		return (err);
+
+	/* visit each dataset */
+	for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) {
+		dmu_object_info_t doi;
+
+		err = dmu_object_info(mos, obj, &doi);
+		if (err)
+			return (err);
+
+		if (doi.doi_type == DMU_OT_DSL_DATASET) {
+			dsl_dataset_t *ds;
+			rw_enter(&dp->dp_config_rwlock, RW_READER);
+			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
+			rw_exit(&dp->dp_config_rwlock);
+			if (err)
+				return (err);
+			err = traverse_dataset(ds,
+			    ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE,
+			    func, arg);
+			dsl_dataset_rele(ds, FTAG);
+			if (err)
+				return (err);
+		}
+	}
+	if (err == ESRCH)
+		err = 0;
+	return (err);
+}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index db2171c7d9..93ea8aa111 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -1156,12 +1156,13 @@ struct killarg {
 
 /* ARGSUSED */
 static int
-kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp, void *arg)
 {
 	struct killarg *ka = arg;
-	blkptr_t *bp = &bc->bc_blkptr;
 
-	ASSERT3U(bc->bc_errno, ==, 0);
+	if (bp == NULL)
+		return (0);
 
 	ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
 	(void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx);
@@ -1189,7 +1190,7 @@ dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx)
 		return (EINVAL);
 
 	/*
-	 * If we made changes this txg, traverse_dsl_dataset won't find
+	 * If we made changes this txg, traverse_dataset won't find
 	 * them.  Try again.
 	 */
 	if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg)
@@ -1256,8 +1257,8 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		ka.ds = ds;
 		ka.zio = zio;
 		ka.tx = tx;
-		(void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-		    ADVANCE_POST, kill_blkptr, &ka);
+		(void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+		    TRAVERSE_POST, kill_blkptr, &ka);
 		(void) zio_wait(zio);
 	}
@@ -1650,8 +1651,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx)
 	ka.ds = ds;
 	ka.zio = zio;
 	ka.tx = tx;
-	err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
-	    ADVANCE_POST, kill_blkptr, &ka);
+	err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg,
+	    TRAVERSE_POST, kill_blkptr, &ka);
 	ASSERT3U(err, ==, 0);
 	ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE ||
 	    ds->ds_phys->ds_unique_bytes == 0);
@@ -2837,6 +2838,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	    csa->cds->ds_phys->ds_deadlist_obj));
 	VERIFY(0 == bplist_open(&csa->ohds->ds_deadlist, dp->dp_meta_objset,
 	    csa->ohds->ds_phys->ds_deadlist_obj));
+
+	dsl_pool_ds_clone_swapped(csa->ohds, csa->cds, tx);
 }
 
 /*
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 91587c3800..dacc57c81c 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -226,6 +226,8 @@ dsl_pool_close(dsl_pool_t *dp)
 	rw_destroy(&dp->dp_config_rwlock);
 	mutex_destroy(&dp->dp_lock);
 	mutex_destroy(&dp->dp_scrub_cancel_lock);
+	if (dp->dp_blkstats)
+		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
 	kmem_free(dp, sizeof (dsl_pool_t));
 }
 
diff --git a/usr/src/uts/common/fs/zfs/dsl_scrub.c b/usr/src/uts/common/fs/zfs/dsl_scrub.c
index 5f675b787d..950a91f783 100644
--- a/usr/src/uts/common/fs/zfs/dsl_scrub.c
+++ b/usr/src/uts/common/fs/zfs/dsl_scrub.c
@@ -107,6 +107,12 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 
 	/* back to the generic stuff */
 
+	if (dp->dp_blkstats == NULL) {
+		dp->dp_blkstats =
+		    kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+	}
+	bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+
 	if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB)
 		ot = DMU_OT_ZAP_OTHER;
 
@@ -575,6 +581,37 @@ dsl_pool_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 	}
 }
 
+void
+dsl_pool_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+	dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+
+	if (dp->dp_scrub_func == SCRUB_FUNC_NONE)
+		return;
+
+	if (dp->dp_scrub_bookmark.zb_objset == ds1->ds_object) {
+		dp->dp_scrub_bookmark.zb_objset = ds2->ds_object;
+	} else if (dp->dp_scrub_bookmark.zb_objset == ds2->ds_object) {
+		dp->dp_scrub_bookmark.zb_objset = ds1->ds_object;
+	}
+
+	if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+	    ds1->ds_object, tx) == 0) {
+		int err = zap_add_int(dp->dp_meta_objset,
+		    dp->dp_scrub_queue_obj, ds2->ds_object, tx);
+		VERIFY(err == 0 || err == EEXIST);
+		if (err == EEXIST) {
+			/* Both were there to begin with */
+			VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+			    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
+		}
+	} else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj,
+	    ds2->ds_object, tx) == 0) {
+		VERIFY(0 == zap_add_int(dp->dp_meta_objset,
+		    dp->dp_scrub_queue_obj, ds1->ds_object, tx));
+	}
+}
+
 struct enqueue_clones_arg {
 	dmu_tx_t *tx;
 	uint64_t originobj;
@@ -817,6 +854,52 @@ dsl_pool_scrub_restart(dsl_pool_t *dp)
  */
 
 static void
+count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+{
+	int i;
+
+	/*
+	 * If we resume after a reboot, zab will be NULL; don't record
+	 * incomplete stats in that case.
+	 */
+	if (zab == NULL)
+		return;
+
+	for (i = 0; i < 4; i++) {
+		int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
+		int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
+		zfs_blkstat_t *zb = &zab->zab_type[l][t];
+		int equal;
+
+		zb->zb_count++;
+		zb->zb_asize += BP_GET_ASIZE(bp);
+		zb->zb_lsize += BP_GET_LSIZE(bp);
+		zb->zb_psize += BP_GET_PSIZE(bp);
+		zb->zb_gangs += BP_COUNT_GANG(bp);
+
+		switch (BP_GET_NDVAS(bp)) {
+		case 2:
+			if (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1]))
+				zb->zb_ditto_2_of_2_samevdev++;
+			break;
+		case 3:
+			equal = (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[1])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[0]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2])) +
+			    (DVA_GET_VDEV(&bp->blk_dva[1]) ==
+			    DVA_GET_VDEV(&bp->blk_dva[2]));
+			if (equal == 1)
+				zb->zb_ditto_2_of_3_samevdev++;
+			else if (equal == 3)
+				zb->zb_ditto_3_of_3_samevdev++;
+			break;
+		}
+	}
+}
+
+static void
 dsl_pool_scrub_clean_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
@@ -844,6 +927,8 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp,
 	int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL;
 	int zio_priority;
 
+	count_block(dp->dp_blkstats, bp);
+
 	if (dp->dp_scrub_isresilver == 0) {
 		/* It's a scrub */
 		zio_flags |= ZIO_FLAG_SCRUB;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index fa01a4d729..3420dc2fb4 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -4059,11 +4059,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 		spa->spa_config_syncing = NULL;
 	}
 
-	spa->spa_traverse_wanted = B_TRUE;
-	rw_enter(&spa->spa_traverse_lock, RW_WRITER);
-	spa->spa_traverse_wanted = B_FALSE;
 	spa->spa_ubsync = spa->spa_uberblock;
-	rw_exit(&spa->spa_traverse_lock);
 
 	/*
 	 * Clean up the ZIL records for the synced txg.
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index d5cadc052c..36046e6df1 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -424,8 +424,6 @@ spa_add(const char *name, const char *altroot)
 
 	spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP);
 
-	rw_init(&spa->spa_traverse_lock, NULL, RW_DEFAULT, NULL);
-
 	mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -509,8 +507,6 @@ spa_remove(spa_t *spa)
 
 	spa_config_lock_destroy(spa);
 
-	rw_destroy(&spa->spa_traverse_lock);
-
 	cv_destroy(&spa->spa_async_cv);
 	cv_destroy(&spa->spa_async_root_cv);
 	cv_destroy(&spa->spa_scrub_io_cv);
@@ -1123,16 +1119,10 @@ zfs_panic_recover(const char *fmt, ...)
 * ==========================================================================
  */
 
-krwlock_t *
-spa_traverse_rwlock(spa_t *spa)
-{
-	return (&spa->spa_traverse_lock);
-}
-
 boolean_t
-spa_traverse_wanted(spa_t *spa)
+spa_shutting_down(spa_t *spa)
 {
-	return (spa->spa_traverse_wanted);
+	return (spa->spa_async_suspended);
 }
 
 dsl_pool_t *
@@ -1201,7 +1191,7 @@ spa_first_txg(spa_t *spa)
 	return (spa->spa_first_txg);
 }
 
-int
+pool_state_t
 spa_state(spa_t *spa)
 {
 	return (spa->spa_state);
diff --git a/usr/src/lib/libzpool/common/llib-lzpool b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
similarity index 58%
copy from usr/src/lib/libzpool/common/llib-lzpool
copy to usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
index e715898a2f..3e02689115 100644
--- a/usr/src/lib/libzpool/common/llib-lzpool
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -23,27 +23,35 @@
  * Use is subject to license terms.
  */
 
-/* LINTLIBRARY */
-/* PROTOLIB1 */
+#ifndef _SYS_DMU_TRAVERSE_H
+#define	_SYS_DMU_TRAVERSE_H
 
 #include <sys/zfs_context.h>
-#include
-#include
-#include
-#include
-#include
-#include
-#include
 #include <sys/spa.h>
-#include
-#include
-#include
-#include
-#include
 #include <sys/zio.h>
-#include
-#include
-#include
-#include
 
-extern uint64_t metaslab_gang_bang;
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dnode_phys;
+struct dsl_dataset;
+
+typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp,
+    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+
+#define	TRAVERSE_PRE			(1<<0)
+#define	TRAVERSE_POST			(1<<1)
+#define	TRAVERSE_PREFETCH_METADATA	(1<<2)
+#define	TRAVERSE_PREFETCH_DATA		(1<<3)
+#define	TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA)
+
+int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start,
+    int flags, blkptr_cb_t func, void *arg);
+int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DMU_TRAVERSE_H */
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 4dd88fe6fa..3bb4ad4efe 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -31,6 +31,7 @@
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
+#include <sys/dnode.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -48,6 +49,25 @@ enum scrub_func {
 	SCRUB_FUNC_NUMFUNCS
 };
 
+/* These macros are for indexing into the zfs_all_blkstats_t. */
+#define	DMU_OT_DEFERRED	DMU_OT_NONE
+#define	DMU_OT_TOTAL	DMU_OT_NUMTYPES
+
+typedef struct zfs_blkstat {
+	uint64_t	zb_count;
+	uint64_t	zb_asize;
+	uint64_t	zb_lsize;
+	uint64_t	zb_psize;
+	uint64_t	zb_gangs;
+	uint64_t	zb_ditto_2_of_2_samevdev;
+	uint64_t	zb_ditto_2_of_3_samevdev;
+	uint64_t	zb_ditto_3_of_3_samevdev;
+} zfs_blkstat_t;
+
+typedef struct zfs_all_blkstats {
+	zfs_blkstat_t	zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+} zfs_all_blkstats_t;
+
 typedef struct dsl_pool {
 	/* Immutable */
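The shape of zab_type[][] just defined pairs with the four-iteration loop in count_block() in the dsl_scrub.c hunk above: every block is credited to its own (level, type) cell, to the DMU_OT_TOTAL column, to the DN_MAX_LEVELS row, and to the corner cell, so ::zfs_blkstats can print per-type, per-level, and total rows without a summing pass. The loop unrolled, as a sketch (zb_count only; the real code also updates the size and ditto fields):

    static void
    count_one(zfs_all_blkstats_t *zab, int level, int type)
    {
            zab->zab_type[level][DMU_OT_TOTAL].zb_count++;          /* i == 0 */
            zab->zab_type[level][type].zb_count++;                  /* i == 1 */
            zab->zab_type[DN_MAX_LEVELS][DMU_OT_TOTAL].zb_count++;  /* i == 2 */
            zab->zab_type[DN_MAX_LEVELS][type].zb_count++;          /* i == 3 */
    }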
@@ -94,6 +114,8 @@
 	 * nobody else could possibly have it for write.
 	 */
 	krwlock_t dp_config_rwlock;
+
+	zfs_all_blkstats_t *dp_blkstats;
 } dsl_pool_t;
 
 int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
@@ -111,6 +133,8 @@ int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp,
     zio_done_func_t *done, void *private, uint32_t arc_flags);
 void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx);
 void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx);
+void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1,
+    struct dsl_dataset *ds2, struct dmu_tx *tx);
 void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx);
 void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx);
 
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 69628e7efc..24b3ca4476 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -44,7 +44,6 @@ typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
 typedef struct zilog zilog_t;
-typedef struct traverse_handle traverse_handle_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 
 struct dsl_pool;
@@ -437,8 +436,7 @@ extern void spa_vdev_state_enter(spa_t *spa);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Accessor functions */
-extern krwlock_t *spa_traverse_rwlock(spa_t *spa);
-extern boolean_t spa_traverse_wanted(spa_t *spa);
+extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
@@ -449,7 +447,7 @@ extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
-extern int spa_state(spa_t *spa);
+extern pool_state_t spa_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_alloc(spa_t *spa);
 extern uint64_t spa_get_space(spa_t *spa);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index ab41ba605c..8aeb414fe9 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -101,9 +101,8 @@ struct spa {
 	nvlist_t *spa_config_syncing;	/* currently syncing config */
 	uint64_t spa_config_txg;	/* txg of last config change */
 	int spa_sync_pass;		/* iterate-to-convergence */
-	int spa_state;			/* pool state */
+	pool_state_t spa_state;		/* pool state */
 	int spa_inject_ref;		/* injection references */
-	uint8_t spa_traverse_wanted;	/* traverse lock wanted */
 	uint8_t spa_sync_on;		/* sync threads are running */
 	spa_load_state_t spa_load_state; /* current load operation */
 	taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
@@ -125,7 +124,6 @@ struct spa {
 	uint64_t spa_syncing_txg;	/* txg currently syncing */
 	uint64_t spa_sync_bplist_obj;	/* object for deferred frees */
 	bplist_t spa_sync_bplist;	/* deferred-free bplist */
-	krwlock_t spa_traverse_lock;	/* traverse vs. spa_sync() */
 	uberblock_t spa_ubsync;		/* last synced uberblock */
 	uberblock_t spa_uberblock;	/* current uberblock */
 	kmutex_t spa_scrub_lock;	/* resilver/scrub lock */
diff --git a/usr/src/uts/common/fs/zfs/sys/txg_impl.h b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
index a58be84be5..7413c662b3 100644
--- a/usr/src/uts/common/fs/zfs/sys/txg_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/txg_impl.h
@@ -26,8 +26,6 @@
 #ifndef _SYS_TXG_IMPL_H
 #define	_SYS_TXG_IMPL_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/spa.h>
 #include <sys/txg.h>
 
@@ -66,7 +64,6 @@ typedef struct tx_state {
 
 	kthread_t	*tx_sync_thread;
 	kthread_t	*tx_quiesce_thread;
-	kthread_t	*tx_timelimit_thread;
 } tx_state_t;
 
 #ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 9e1ed51d63..2bbf2f086c 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -281,12 +281,14 @@ txg_sync_thread(dsl_pool_t *dp)
 		uint64_t txg;
 
 		/*
-		 * We sync when there's someone waiting on us, or the
-		 * quiesce thread has handed off a txg to us, or we have
-		 * reached our timeout.
+		 * We sync when we're scrubbing, there's someone waiting
+		 * on us, or the quiesce thread has handed off a txg to
+		 * us, or we have reached our timeout.
 		 */
 		timer = (delta >= timeout ? 0 : timeout - delta);
-		while (!tx->tx_exiting && timer > 0 &&
+		while ((dp->dp_scrub_func == SCRUB_FUNC_NONE ||
+		    spa_shutting_down(dp->dp_spa)) &&
+		    !tx->tx_exiting && timer > 0 &&
 		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
 		    tx->tx_quiesced_txg == 0) {
 			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
index ca9e5603c4..5a7b59f6ed 100644
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -267,7 +267,7 @@ vdev_cache_read(zio_t *zio)
 	/*
 	 * If the I/O straddles two or more cache blocks, don't cache it.
 	 */
-	if (P2CROSS(zio->io_offset, zio->io_offset + zio->io_size - 1, VCBS))
+	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
 		return (EXDEV);
 
 	ASSERT(cache_phase + zio->io_size <= VCBS);
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 06870e15ac..9f77005718 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -91,23 +91,13 @@ static void *zvol_state;
 static kmutex_t zvol_state_lock;
 static uint32_t zvol_minors;
 
-#define	NUM_EXTENTS	((SPA_MAXBLOCKSIZE) / sizeof (zvol_extent_t))
-
 typedef struct zvol_extent {
+	list_node_t	ze_node;
 	dva_t		ze_dva;		/* dva associated with this extent */
-	uint64_t	ze_stride;	/* extent stride */
-	uint64_t	ze_size;	/* number of blocks in extent */
+	uint64_t	ze_nblks;	/* number of blocks in extent */
 } zvol_extent_t;
 
 /*
- * The list of extents associated with the dump device
- */
-typedef struct zvol_ext_list {
-	zvol_extent_t zl_extents[NUM_EXTENTS];
-	struct zvol_ext_list *zl_next;
-} zvol_ext_list_t;
-
-/*
  * The in-core state of each volume.
  */
 typedef struct zvol_state {
@@ -122,7 +112,7 @@ typedef struct zvol_state {
 	uint32_t zv_open_count[OTYPCNT];	/* open counts */
 	uint32_t zv_total_opens;	/* total open count */
 	zilog_t *zv_zilog;	/* ZIL handle */
-	zvol_ext_list_t *zv_list;	/* List of extents for dump */
+	list_t zv_extents;	/* List of extents for dump */
 	uint64_t zv_txg_assign;	/* txg to assign during ZIL replay */
 	znode_t zv_znode;	/* for range locking */
 } zvol_state_t;
@@ -258,128 +248,81 @@ zvol_minor_lookup(const char *name)
 	return (zv);
 }
 
-void
-zvol_init_extent(zvol_extent_t *ze, blkptr_t *bp)
-{
-	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
-	ze->ze_stride = 0;
-	ze->ze_size = 1;
-}
-
 /* extent mapping arg */
 struct maparg {
-	zvol_ext_list_t	*ma_list;
-	zvol_extent_t	*ma_extent;
-	int		ma_gang;
+	zvol_state_t	*ma_zv;
+	uint64_t	ma_blks;
 };
 
 /*ARGSUSED*/
 static int
-zvol_map_block(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
+zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb,
+    const dnode_phys_t *dnp, void *arg)
 {
-	zbookmark_t *zb = &bc->bc_bookmark;
-	blkptr_t *bp = &bc->bc_blkptr;
-	void *data = bc->bc_data;
-	dnode_phys_t *dnp = bc->bc_dnode;
-	struct maparg *ma = (struct maparg *)arg;
-	uint64_t stride;
-
-	/* If there is an error, then keep trying to make progress */
-	if (bc->bc_errno)
-		return (ERESTART);
-
-#ifdef ZFS_DEBUG
-	if (zb->zb_level == -1) {
-		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
-		ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
-	} else {
-		ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type);
-		ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level);
-	}
+	struct maparg *ma = arg;
+	zvol_extent_t *ze;
+	int bs = ma->ma_zv->zv_volblocksize;
 
-	if (zb->zb_level > 0) {
-		uint64_t fill = 0;
-		blkptr_t *bpx, *bpend;
+	if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+		return (0);
 
-		for (bpx = data, bpend = bpx + BP_GET_LSIZE(bp) / sizeof (*bpx);
-		    bpx < bpend; bpx++) {
-			if (bpx->blk_birth != 0) {
-				fill += bpx->blk_fill;
-			} else {
-				ASSERT(bpx->blk_fill == 0);
-			}
-		}
-		ASSERT3U(fill, ==, bp->blk_fill);
-	}
+	VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
+	ma->ma_blks++;
 
-	if (zb->zb_level == 0 && dnp->dn_type == DMU_OT_DNODE) {
-		uint64_t fill = 0;
-		dnode_phys_t *dnx, *dnend;
+	/* Abort immediately if we have encountered gang blocks */
+	if (BP_IS_GANG(bp))
+		return (EFRAGS);
 
-		for (dnx = data, dnend = dnx + (BP_GET_LSIZE(bp)>>DNODE_SHIFT);
-		    dnx < dnend; dnx++) {
-			if (dnx->dn_type != DMU_OT_NONE)
-				fill++;
-		}
-		ASSERT3U(fill, ==, bp->blk_fill);
+	/*
+	 * See if the block is at the end of the previous extent.
+	 */
+	ze = list_tail(&ma->ma_zv->zv_extents);
+	if (ze &&
+	    DVA_GET_VDEV(BP_IDENTITY(bp)) == DVA_GET_VDEV(&ze->ze_dva) &&
+	    DVA_GET_OFFSET(BP_IDENTITY(bp)) ==
+	    DVA_GET_OFFSET(&ze->ze_dva) + ze->ze_nblks * bs) {
+		ze->ze_nblks++;
+		return (0);
 	}
-#endif
 
-	if (zb->zb_level || dnp->dn_type == DMU_OT_DNODE)
-		return (0);
+	dprintf_bp(bp, "%s", "next blkptr:");
 
-	/* Abort immediately if we have encountered gang blocks */
-	if (BP_IS_GANG(bp)) {
-		ma->ma_gang++;
-		return (EINTR);
-	}
+	/* start a new extent */
+	ze = kmem_zalloc(sizeof (zvol_extent_t), KM_SLEEP);
+	ze->ze_dva = bp->blk_dva[0];	/* structure assignment */
+	ze->ze_nblks = 1;
+	list_insert_tail(&ma->ma_zv->zv_extents, ze);
+	return (0);
+}
-	/* first time? */
-	if (ma->ma_extent->ze_size == 0) {
-		zvol_init_extent(ma->ma_extent, bp);
-		return (0);
-	}
+static void
+zvol_free_extents(zvol_state_t *zv)
+{
+	zvol_extent_t *ze;
 
-	stride = (DVA_GET_OFFSET(&bp->blk_dva[0])) -
-	    ((DVA_GET_OFFSET(&ma->ma_extent->ze_dva)) +
-	    (ma->ma_extent->ze_size - 1) * (ma->ma_extent->ze_stride));
-	if (DVA_GET_VDEV(BP_IDENTITY(bp)) ==
-	    DVA_GET_VDEV(&ma->ma_extent->ze_dva)) {
-		if (ma->ma_extent->ze_stride == 0) {
-			/* second block in this extent */
-			ma->ma_extent->ze_stride = stride;
-			ma->ma_extent->ze_size++;
-			return (0);
-		} else if (ma->ma_extent->ze_stride == stride) {
-			/*
-			 * the block we allocated has the same
-			 * stride
-			 */
-			ma->ma_extent->ze_size++;
-			return (0);
-		}
+	while (ze = list_head(&zv->zv_extents)) {
+		list_remove(&zv->zv_extents, ze);
+		kmem_free(ze, sizeof (zvol_extent_t));
 	}
+}
 
-	/*
-	 * dtrace -n 'zfs-dprintf
-	 * /stringof(arg0) == "zvol.c"/
-	 * {
-	 *	printf("%s: %s", stringof(arg1), stringof(arg3))
-	 * } '
-	 */
-	dprintf("ma_extent 0x%lx mrstride 0x%lx stride %lx\n",
-	    ma->ma_extent->ze_size, ma->ma_extent->ze_stride, stride);
-	dprintf_bp(bp, "%s", "next blkptr:");
-	/* start a new extent */
-	if (ma->ma_extent == &ma->ma_list->zl_extents[NUM_EXTENTS - 1]) {
-		ma->ma_list->zl_next = kmem_zalloc(sizeof (zvol_ext_list_t),
-		    KM_SLEEP);
-		ma->ma_list = ma->ma_list->zl_next;
-		ma->ma_extent = &ma->ma_list->zl_extents[0];
-	} else {
-		ma->ma_extent++;
+static int
+zvol_get_lbas(zvol_state_t *zv)
+{
+	struct maparg	ma;
+	int		err;
+
+	ma.ma_zv = zv;
+	ma.ma_blks = 0;
+	zvol_free_extents(zv);
+
+	err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0,
+	    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma);
+	if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) {
+		zvol_free_extents(zv);
+		return (err ? err : EIO);
+	}
+
+	return (0);
+}
@@ -477,106 +420,6 @@ zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
 };
 
 /*
- * reconstruct dva that gets us to the desired offset (offset
- * is in bytes)
- */
-int
-zvol_get_dva(zvol_state_t *zv, uint64_t offset, dva_t *dva)
-{
-	zvol_ext_list_t *zl;
-	zvol_extent_t *ze;
-	int idx;
-	uint64_t tmp;
-
-	if ((zl = zv->zv_list) == NULL)
-		return (EIO);
-	idx = 0;
-	ze = &zl->zl_extents[0];
-	while (offset >= ze->ze_size * zv->zv_volblocksize) {
-		offset -= ze->ze_size * zv->zv_volblocksize;
-
-		if (idx == NUM_EXTENTS - 1) {
-			/* we've reached the end of this array */
-			ASSERT(zl->zl_next != NULL);
-			if (zl->zl_next == NULL)
-				return (-1);
-			zl = zl->zl_next;
-			ze = &zl->zl_extents[0];
-			idx = 0;
-		} else {
-			ze++;
-			idx++;
-		}
-	}
-	DVA_SET_VDEV(dva, DVA_GET_VDEV(&ze->ze_dva));
-	tmp = DVA_GET_OFFSET((&ze->ze_dva));
-	tmp += (ze->ze_stride * (offset / zv->zv_volblocksize));
-	DVA_SET_OFFSET(dva, tmp);
-	return (0);
-}
-
-static void
-zvol_free_extents(zvol_state_t *zv)
-{
-	zvol_ext_list_t *zl;
-	zvol_ext_list_t *tmp;
-
-	if (zv->zv_list != NULL) {
-		zl = zv->zv_list;
-		while (zl != NULL) {
-			tmp = zl->zl_next;
-			kmem_free(zl, sizeof (zvol_ext_list_t));
-			zl = tmp;
-		}
-		zv->zv_list = NULL;
-	}
-}
-
-int
-zvol_get_lbas(zvol_state_t *zv)
-{
-	struct maparg	ma;
-	zvol_ext_list_t	*zl;
-	zvol_extent_t	*ze;
-	uint64_t	blocks = 0;
-	int		err;
-
-	ma.ma_list = zl = kmem_zalloc(sizeof (zvol_ext_list_t), KM_SLEEP);
-	ma.ma_extent = &ma.ma_list->zl_extents[0];
-	ma.ma_gang = 0;
-	zv->zv_list = ma.ma_list;
-
-	err = traverse_zvol(zv->zv_objset, ADVANCE_PRE, zvol_map_block, &ma);
-	if (err == EINTR && ma.ma_gang) {
-		/*
-		 * We currently don't support dump devices when the pool
-		 * is so fragmented that our allocation has resulted in
-		 * gang blocks.
-		 */
-		zvol_free_extents(zv);
-		return (EFRAGS);
-	}
-	ASSERT3U(err, ==, 0);
-
-	ze = &zl->zl_extents[0];
-	while (ze) {
-		blocks += ze->ze_size;
-		if (ze == &zl->zl_extents[NUM_EXTENTS - 1]) {
-			zl = zl->zl_next;
-			ze = &zl->zl_extents[0];
-		} else {
-			ze++;
-		}
-	}
-	if (blocks != (zv->zv_volsize / zv->zv_volblocksize)) {
-		zvol_free_extents(zv);
-		return (EIO);
-	}
-
-	return (0);
-}
-
-/*
  * Create a minor node (plus a whole lot more) for the specified volume.
  */
 int
@@ -708,6 +551,8 @@ zvol_create_minor(const char *name, major_t maj)
 	mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL);
 	avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare,
 	    sizeof (rl_t), offsetof(rl_t, r_node));
+	list_create(&zv->zv_extents, sizeof (zvol_extent_t),
+	    offsetof(zvol_extent_t, ze_node));
 	/* get and cache the blocksize */
 	error = dmu_object_info(os, ZVOL_OBJ, &doi);
 	ASSERT(error == 0);
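For orientation before the next hunk: the zv_extents list that zvol_get_lbas() builds above is consumed by zvol_dumpio(), which walks the list to translate a logical volume offset into a (vdev, physical offset) pair. The translation in isolation, as a hypothetical helper (the patch inlines this walk and relies on zvol_get_lbas() having validated full coverage, so it does no bounds check):

    static int
    extent_lookup(list_t *extents, uint64_t blksz, uint64_t offset,
        uint64_t *vdevp, uint64_t *physp)
    {
            zvol_extent_t *ze;

            for (ze = list_head(extents); ze != NULL;
                ze = list_next(extents, ze)) {
                    uint64_t len = ze->ze_nblks * blksz;

                    if (offset < len) {
                            *vdevp = DVA_GET_VDEV(&ze->ze_dva);
                            *physp = DVA_GET_OFFSET(&ze->ze_dva) + offset;
                            return (0);
                    }
                    offset -= len;
            }
            return (EIO);   /* ran off the end of the extent list */
    }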
@@ -917,15 +762,24 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
 	zvol_state_t *zv;
 	dmu_tx_t *tx;
 	int error;
+	boolean_t needlock;
 
-	mutex_enter(&zvol_state_lock);
+	/*
+	 * The lock may already be held if we are being called from
+	 * zvol_dump_init().
+	 */
+	needlock = !MUTEX_HELD(&zvol_state_lock);
+	if (needlock)
+		mutex_enter(&zvol_state_lock);
 
 	if ((zv = zvol_minor_lookup(name)) == NULL) {
-		mutex_exit(&zvol_state_lock);
+		if (needlock)
+			mutex_exit(&zvol_state_lock);
 		return (ENXIO);
 	}
 	if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) {
-		mutex_exit(&zvol_state_lock);
+		if (needlock)
+			mutex_exit(&zvol_state_lock);
 		return (EROFS);
 	}
 
@@ -940,9 +794,12 @@ zvol_set_volblocksize(const char *name, uint64_t volblocksize)
 		if (error == ENOTSUP)
 			error = EBUSY;
 		dmu_tx_commit(tx);
+		if (error == 0)
+			zv->zv_volblocksize = volblocksize;
 	}
 
-	mutex_exit(&zvol_state_lock);
+	if (needlock)
+		mutex_exit(&zvol_state_lock);
 
 	return (error);
 }
@@ -1142,20 +999,22 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len)
 	}
 }
 
-int
-zvol_dumpio(vdev_t *vd, uint64_t size, uint64_t offset, void *addr,
-    int bflags, int isdump)
+static int
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
+    boolean_t doread, boolean_t isdump)
 {
 	vdev_disk_t *dvd;
-	int direction;
 	int c;
 	int numerrors = 0;
 
 	for (c = 0; c < vd->vdev_children; c++) {
-		if (zvol_dumpio(vd->vdev_child[c], size, offset,
-		    addr, bflags, isdump) != 0) {
+		ASSERT(vd->vdev_ops == &vdev_mirror_ops);
+		int err = zvol_dumpio_vdev(vd->vdev_child[c],
+		    addr, offset, size, doread, isdump);
+		ASSERT3U(err, ==, 0);
+		if (err != 0) {
 			numerrors++;
-		} else if (bflags & B_READ) {
+		} else if (doread) {
 			break;
 		}
 	}
@@ -1163,51 +1022,52 @@ zvol_dumpio(vdev_t *vd, uint64_t size, uint64_t offset, void *addr,
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (numerrors < vd->vdev_children ? 0 : EIO);
 
+	ASSERT(vdev_writeable(vd));
 	if (!vdev_writeable(vd))
 		return (EIO);
 
 	dvd = vd->vdev_tsd;
 	ASSERT3P(dvd, !=, NULL);
-	direction = bflags & (B_WRITE | B_READ);
-	ASSERT(ISP2(direction));
 	offset += VDEV_LABEL_START_SIZE;
 
 	if (ddi_in_panic() || isdump) {
-		if (direction & B_READ)
+		ASSERT(!doread);
+		if (doread)
 			return (EIO);
 		return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
 		    lbtodb(size)));
 	} else {
 		return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
-		    direction));
+		    doread ? B_READ : B_WRITE));
 	}
 }
 
-int
-zvol_physio(zvol_state_t *zv, int bflags, uint64_t off,
-    uint64_t size, void *addr, int isdump)
+static int
+zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
+    boolean_t doread, boolean_t isdump)
 {
-	dva_t dva;
 	vdev_t *vd;
 	int error;
+	zvol_extent_t *ze;
 	spa_t *spa = dmu_objset_spa(zv->zv_objset);
 
-	ASSERT(size <= zv->zv_volblocksize);
-
-	/* restrict requests to multiples of the system block size */
-	if (P2PHASE(off, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE))
+	/* Must be sector aligned, and not straddle a block boundary. */
+	if (P2PHASE(offset, DEV_BSIZE) || P2PHASE(size, DEV_BSIZE) ||
+	    P2BOUNDARY(offset, size, zv->zv_volblocksize)) {
 		return (EINVAL);
+	}
+	ASSERT(size <= zv->zv_volblocksize);
 
-	if (zvol_get_dva(zv, off, &dva) != 0)
-		return (EIO);
-
+	/* Locate the extent this belongs to */
+	ze = list_head(&zv->zv_extents);
+	while (offset >= ze->ze_nblks * zv->zv_volblocksize) {
+		offset -= ze->ze_nblks * zv->zv_volblocksize;
+		ze = list_next(&zv->zv_extents, ze);
+	}
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
-	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
-
-	error = zvol_dumpio(vd, size,
-	    DVA_GET_OFFSET(&dva) + (off % zv->zv_volblocksize),
-	    addr, bflags & (B_READ | B_WRITE | B_PHYS), isdump);
-
+	vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
+	offset += DVA_GET_OFFSET(&ze->ze_dva);
+	error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	return (error);
 }
 
@@ -1217,12 +1077,13 @@ zvol_strategy(buf_t *bp)
 {
 	zvol_state_t *zv = ddi_get_soft_state(zvol_state, getminor(bp->b_edev));
 	uint64_t off, volsize;
-	size_t size, resid;
+	size_t resid;
 	char *addr;
 	objset_t *os;
 	rl_t *rl;
 	int error = 0;
-	boolean_t reading, is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
+	boolean_t doread = bp->b_flags & B_READ;
+	boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED;
 
 	if (zv == NULL) {
 		bioerror(bp, ENXIO);
@@ -1254,29 +1115,26 @@ zvol_strategy(buf_t *bp)
 	addr = bp->b_un.b_addr;
 	resid = bp->b_bcount;
 
-	if (resid > 0 && (off < 0 || off >= volsize))
-		return (EIO);
+	if (resid > 0 && (off < 0 || off >= volsize)) {
+		bioerror(bp, EIO);
+		biodone(bp);
+		return (0);
+	}
 
 	/*
 	 * There must be no buffer changes when doing a dmu_sync() because
 	 * we can't change the data whilst calculating the checksum.
 	 */
-	reading = bp->b_flags & B_READ;
 	rl = zfs_range_lock(&zv->zv_znode, off, resid,
-	    reading ? RL_READER : RL_WRITER);
-
-	if (resid > volsize - off)	/* don't write past the end */
-		resid = volsize - off;
+	    doread ? RL_READER : RL_WRITER);
 
 	while (resid != 0 && off < volsize) {
-
-		size = MIN(resid, zvol_maxphys);
+		size_t size = MIN(resid, zvol_maxphys);
 
 		if (is_dump) {
-			/* can't straddle a block boundary */
 			size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
-			error = zvol_physio(zv, bp->b_flags, off, size,
-			    addr, 0);
-		} else if (reading) {
+			error = zvol_dumpio(zv, addr, off, size,
+			    doread, B_FALSE);
+		} else if (doread) {
 			error = dmu_read(os, ZVOL_OBJ, off, size, addr);
 		} else {
 			dmu_tx_t *tx = dmu_tx_create(os);
@@ -1305,7 +1163,7 @@ zvol_strategy(buf_t *bp)
 
 	if ((bp->b_resid = resid) == bp->b_bcount)
 		bioerror(bp, off > volsize ? EINVAL : error);
 
-	if (!(bp->b_flags & B_ASYNC) && !reading && !zil_disable && !is_dump)
+	if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump)
 		zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ);
 
 	biodone(bp);
 
@@ -1346,16 +1204,12 @@ zvol_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblocks)
 	boff = ldbtob(blkno);
 	resid = ldbtob(nblocks);
-	if (boff + resid > zv->zv_volsize) {
-		/* dump should know better than to write here */
-		ASSERT(blkno + resid <= zv->zv_volsize);
-		return (EIO);
-	}
+
+	VERIFY3U(boff + resid, <=, zv->zv_volsize);
+
 	while (resid) {
-		/* can't straddle a block boundary */
 		size = MIN(resid, P2END(boff, zv->zv_volblocksize) - boff);
-
-		error = zvol_physio(zv, B_WRITE, boff, size, addr, 1);
+		error = zvol_dumpio(zv, addr, boff, size, B_FALSE, B_TRUE);
 		if (error)
 			break;
 		boff += size;
@@ -1388,6 +1242,12 @@ zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
 	    (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
 		return (EIO);
 
+	if (zv->zv_flags & ZVOL_DUMPIFIED) {
+		error = physio(zvol_strategy, NULL, dev, B_READ,
+		    zvol_minphys, uio);
+		return (error);
+	}
+
 	rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
 	    RL_READER);
 	while (uio->uio_resid > 0 && uio->uio_loffset < volsize) {
@@ -1669,7 +1529,6 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 	int error = 0;
 	objset_t *os = zv->zv_objset;
 	nvlist_t *nv = NULL;
-	uint64_t checksum, compress, refresrv;
 
 	ASSERT(MUTEX_HELD(&zvol_state_lock));
 
@@ -1692,12 +1551,16 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize)
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
 		    &zv->zv_volsize, tx);
 	} else {
+		uint64_t checksum, compress, refresrv, vbs;
+
 		error = dsl_prop_get_integer(zv->zv_name,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
 		error = error ? error : dsl_prop_get_integer(zv->zv_name,
 		    zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
 		error = error ? error : dsl_prop_get_integer(zv->zv_name,
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
+		error = error ? error : dsl_prop_get_integer(zv->zv_name,
+		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
 
 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
@@ -1707,6 +1570,9 @@
 		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
 		    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
 		    &refresrv, tx);
+		error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+		    &vbs, tx);
 	}
 	dmu_tx_commit(tx);
 
@@ -1732,6 +1598,9 @@
 	VERIFY(nvlist_add_uint64(nv,
 	    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 	    ZIO_CHECKSUM_OFF) == 0);
+	VERIFY(nvlist_add_uint64(nv,
+	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE),
+	    SPA_MAXBLOCKSIZE) == 0);
 	error = zfs_set_prop_nvlist(zv->zv_name, nv);
 	nvlist_free(nv);
 
@@ -1811,7 +1680,7 @@ zvol_dump_fini(zvol_state_t *zv)
 	objset_t *os = zv->zv_objset;
 	nvlist_t *nv;
 	int error = 0;
-	uint64_t checksum, compress, refresrv;
+	uint64_t checksum, compress, refresrv, vbs;
 
 	/*
 	 * Attempt to restore the zvol back to its pre-dumpified state.
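A note on the chunking idiom used in zvol_dump() and zvol_strategy() above: P2END(x, align) evaluates to the next align boundary strictly above x, so MIN(resid, P2END(off, blksz) - off) yields the largest transfer that does not cross a block boundary. The loop skeleton in isolation (variable names illustrative, not from the patch):

    uint64_t off = boff, left = resid;

    while (left != 0) {
            /* bytes remaining up to, but not across, the next boundary */
            uint64_t size = MIN(left, P2END(off, blksz) - off);

            /* ... issue one aligned transfer of 'size' bytes at 'off' ... */
            off += size;
            left -= size;
    }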
@@ -1836,6 +1705,8 @@
 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, &compress);
 	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &refresrv);
+	(void) zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ,
+	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs);
 
 	VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	(void) nvlist_add_uint64(nv,
@@ -1844,6 +1715,8 @@
 	    zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress);
 	(void) nvlist_add_uint64(nv,
 	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv);
+	(void) nvlist_add_uint64(nv,
+	    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs);
 	(void) zfs_set_prop_nvlist(zv->zv_name, nv);
 	nvlist_free(nv);
 
diff --git a/usr/src/uts/common/os/vmem.c b/usr/src/uts/common/os/vmem.c
index 767c62bb85..49c3e91611 100644
--- a/usr/src/uts/common/os/vmem.c
+++ b/usr/src/uts/common/os/vmem.c
@@ -23,8 +23,6 @@
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 /*
  * Big Theory Statement for the virtual memory allocator.
 *
@@ -1032,7 +1030,7 @@ do_alloc:
 		start = MAX(vsp->vs_start, (uintptr_t)minaddr);
 		end = MIN(vsp->vs_end - 1, (uintptr_t)maxaddr - 1) + 1;
 		taddr = P2PHASEUP(start, align, phase);
-		if (P2CROSS(taddr, taddr + size - 1, nocross))
+		if (P2BOUNDARY(taddr, size, nocross))
 			taddr += P2ROUNDUP(P2NPHASE(taddr, nocross), align);
 		if ((taddr - start) + size > end - start ||
@@ -1158,7 +1156,7 @@ do_alloc:
 	if (xvaddr)
 		vmp->vm_source_free(vmp->vm_source, xvaddr, xsize);
 	ASSERT(P2PHASE(addr, align) == phase);
-	ASSERT(!P2CROSS(addr, addr + size - 1, nocross));
+	ASSERT(!P2BOUNDARY(addr, size, nocross));
 	ASSERT(addr >= (uintptr_t)minaddr);
 	ASSERT(addr + size - 1 <= (uintptr_t)maxaddr - 1);
 	return ((void *)addr);
diff --git a/usr/src/uts/common/sys/sysmacros.h b/usr/src/uts/common/sys/sysmacros.h
index 001e2fba9f..9695982dcc 100644
--- a/usr/src/uts/common/sys/sysmacros.h
+++ b/usr/src/uts/common/sys/sysmacros.h
@@ -23,15 +23,13 @@
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef _SYS_SYSMACROS_H
 #define	_SYS_SYSMACROS_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/param.h>
 
 #ifdef __cplusplus
@@ -222,18 +220,69 @@ extern unsigned char bcd_to_byte[256];
 #define	ISP2(x)		(((x) & ((x) - 1)) == 0)
 
 /*
- * Macros for various sorts of alignment and rounding when the alignment
- * is known to be a power of 2.
+ * Macros for various sorts of alignment and rounding.  The "align" must
+ * be a power of 2.  Often it is a block, sector, or page.
+ */
+
+/*
+ * return x rounded down to an align boundary
+ * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
  */
 #define	P2ALIGN(x, align)		((x) & -(align))
+
+/*
+ * return x % (mod) align
+ * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
 #define	P2PHASE(x, align)		((x) & ((align) - 1))
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
 #define	P2NPHASE(x, align)		(-(x) & ((align) - 1))
+
+/*
+ * return x rounded up to an align boundary
+ * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
 #define	P2ROUNDUP(x, align)		(-(-(x) & -(align)))
+
+/*
+ * return the address just past the end of the block that x is in, ie,
+ * x rounded up to the next align boundary (or x + align if x is
+ * already aligned)
+ * eg, P2END(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2END(0x5600, 0x100) == 0x5700 (0x57*align)
+ */
 #define	P2END(x, align)			(-(~(x) & -(align)))
+
+/*
+ * return x rounded up to the next phase (offset) within align.
+ * phase should be < align.
+ * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase)
+ * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase)
+ */
 #define	P2PHASEUP(x, align, phase)	((phase) - (((phase) - (x)) & -(align)))
-#define	P2CROSS(x, y, align)	(((x) ^ (y)) > (align) - 1)
+
+/*
+ * return TRUE if adding len to off would cause it to cross an align
+ * boundary.
+ * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314)
+ * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284)
+ */
+#define	P2BOUNDARY(off, len, align) \
+	(((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
 /*
- * Determine whether two numbers have the same high-order bit.
+ * Return TRUE if they have the same highest bit set.
+ * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000)
+ * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000)
  */
 #define	P2SAMEHIGHBIT(x, y)		(((x) ^ (y)) < ((x) & (y)))
 
diff --git a/usr/src/uts/i86pc/os/ddi_impl.c b/usr/src/uts/i86pc/os/ddi_impl.c
index f384c895f3..43dfc12941 100644
--- a/usr/src/uts/i86pc/os/ddi_impl.c
+++ b/usr/src/uts/i86pc/os/ddi_impl.c
@@ -1461,8 +1461,8 @@ kalloca(size_t size, size_t align, int cansleep, int physcontig,
 	}
 
 kallocdone:
-	ASSERT(!P2CROSS((uintptr_t)raddr, (uintptr_t)raddr + rsize - 1,
-	    PAGESIZE) || rsize > PAGESIZE);
+	ASSERT(!P2BOUNDARY((uintptr_t)raddr, rsize, PAGESIZE) ||
+	    rsize > PAGESIZE);
 
 	addr = (size_t *)P2ROUNDUP((uintptr_t)raddr + hdrsize, align);
 	ASSERT((uintptr_t)addr + size - (uintptr_t)raddr <= rsize);
-- 
2.11.4.GIT
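As a closing cross-check of the sysmacros.h comments above, this small user-land program exercises the P2 macros with the documented example values (definitions copied from the header; expected output in the comments):

    #include <stdio.h>

    #define P2ALIGN(x, align)       ((x) & -(align))
    #define P2PHASE(x, align)       ((x) & ((align) - 1))
    #define P2NPHASE(x, align)      (-(x) & ((align) - 1))
    #define P2ROUNDUP(x, align)     (-(-(x) & -(align)))
    #define P2END(x, align)         (-(~(x) & -(align)))
    #define P2BOUNDARY(off, len, align) \
            (((off) ^ ((off) + (len) - 1)) > (align) - 1)

    int
    main(void)
    {
            (void) printf("%x\n", P2ALIGN(0x1234, 0x100));   /* 1200 */
            (void) printf("%x\n", P2PHASE(0x1234, 0x100));   /* 34 */
            (void) printf("%x\n", P2NPHASE(0x1234, 0x100));  /* cc */
            (void) printf("%x\n", P2ROUNDUP(0x1234, 0x100)); /* 1300 */
            (void) printf("%x\n", P2END(0x1234, 0x100));     /* 1300 */
            (void) printf("%d\n", P2BOUNDARY(0x1234, 0xe0, 0x100)); /* 1 */
            (void) printf("%d\n", P2BOUNDARY(0x1234, 0x50, 0x100)); /* 0 */
            return (0);
    }

Note that P2END and P2ROUNDUP coincide here only because 0x1234 is unaligned; for an aligned x they differ (P2ROUNDUP(0x5600, 0x100) == 0x5600, while P2END(0x5600, 0x100) == 0x5700), which is why the boundary-limited transfer loops above use P2END.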