From 74e7dc986c89efca1f2e4451c7a572e05e4a6e4f Mon Sep 17 00:00:00 2001 From: Matthew Ahrens Date: Fri, 22 Aug 2008 10:29:35 -0700 Subject: [PATCH] PSARC/2008/518 ZFS space accounting enhancements 6730799 want snapused property 6738349 zfs promote fails without enough space --- usr/src/cmd/mdb/common/modules/zfs/zfs.c | 7 - usr/src/cmd/zdb/zdb.c | 41 +- usr/src/cmd/zfs/zfs_main.c | 14 +- usr/src/cmd/zpool/zpool_main.c | 1 + usr/src/common/zfs/zfs_prop.c | 11 +- usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h | 4 +- usr/src/lib/libzfs/common/libzfs_dataset.c | 9 + usr/src/lib/libzfs/common/libzfs_util.c | 125 +++-- usr/src/uts/common/fs/zfs/bplist.c | 40 +- usr/src/uts/common/fs/zfs/dsl_dataset.c | 596 +++++++++++++++--------- usr/src/uts/common/fs/zfs/dsl_dir.c | 153 ++++-- usr/src/uts/common/fs/zfs/sys/bplist.h | 4 +- usr/src/uts/common/fs/zfs/sys/dsl_dataset.h | 7 +- usr/src/uts/common/fs/zfs/sys/dsl_dir.h | 24 +- usr/src/uts/common/sys/fs/zfs.h | 12 +- 15 files changed, 672 insertions(+), 376 deletions(-) diff --git a/usr/src/cmd/mdb/common/modules/zfs/zfs.c b/usr/src/cmd/mdb/common/modules/zfs/zfs.c index 889ff60c17..06ca389b41 100644 --- a/usr/src/cmd/mdb/common/modules/zfs/zfs.c +++ b/usr/src/cmd/mdb/common/modules/zfs/zfs.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -1324,7 +1322,6 @@ typedef struct mdb_spa { typedef struct mdb_dsl_dir { uintptr_t dd_phys; - uint64_t dd_used_bytes; int64_t dd_space_towrite[TXG_SIZE]; } mdb_dsl_dir_t; @@ -1428,8 +1425,6 @@ spa_space(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) dp_root_dir, dp_root_dir) || GETMEMB(dp_root_dir, struct dsl_dir, dd_phys, dd.dd_phys) || GETMEMB(dp_root_dir, struct dsl_dir, - dd_used_bytes, dd.dd_used_bytes) || - GETMEMB(dp_root_dir, struct dsl_dir, dd_space_towrite, dd.dd_space_towrite) || GETMEMB(dd.dd_phys, struct dsl_dir_phys, dd_used_bytes, dsp.dd_used_bytes) || @@ -1445,8 +1440,6 @@ spa_space(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) dd.dd_space_towrite[1] >> shift, suffix, dd.dd_space_towrite[2] >> shift, suffix, dd.dd_space_towrite[3] >> shift, suffix); - mdb_printf("dd_used_bytes = %llu%s\n", - dd.dd_used_bytes >> shift, suffix); mdb_printf("dd_phys.dd_used_bytes = %llu%s\n", dsp.dd_used_bytes >> shift, suffix); diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index bb506a4058..6fd2e96cf5 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -719,7 +717,7 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) { dsl_dir_phys_t *dd = data; time_t crtime; - char used[6], compressed[6], uncompressed[6], quota[6], resv[6]; + char nice[6]; if (dd == NULL) return; @@ -727,12 +725,6 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); crtime = dd->dd_creation_time; - nicenum(dd->dd_used_bytes, used); - nicenum(dd->dd_compressed_bytes, compressed); - nicenum(dd->dd_uncompressed_bytes, uncompressed); - nicenum(dd->dd_quota, quota); - nicenum(dd->dd_reserved, resv); - (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\thead_dataset_obj = %llu\n", (u_longlong_t)dd->dd_head_dataset_obj); @@ -742,15 +734,32 @@ dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) (u_longlong_t)dd->dd_origin_obj); (void) printf("\t\tchild_dir_zapobj = %llu\n", (u_longlong_t)dd->dd_child_dir_zapobj); - (void) printf("\t\tused_bytes = %s\n", used); - (void) printf("\t\tcompressed_bytes = %s\n", compressed); - (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); - (void) printf("\t\tquota = %s\n", quota); - (void) printf("\t\treserved = %s\n", resv); + nicenum(dd->dd_used_bytes, nice); + (void) printf("\t\tused_bytes = %s\n", nice); + nicenum(dd->dd_compressed_bytes, nice); + (void) printf("\t\tcompressed_bytes = %s\n", nice); + nicenum(dd->dd_uncompressed_bytes, nice); + (void) printf("\t\tuncompressed_bytes = %s\n", nice); + nicenum(dd->dd_quota, nice); + (void) printf("\t\tquota = %s\n", nice); + nicenum(dd->dd_reserved, nice); + (void) printf("\t\treserved = %s\n", nice); (void) printf("\t\tprops_zapobj = %llu\n", (u_longlong_t)dd->dd_props_zapobj); (void) printf("\t\tdeleg_zapobj = %llu\n", (u_longlong_t)dd->dd_deleg_zapobj); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)dd->dd_flags); + +#define DO(which) \ + 
nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice); \ + (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) + DO(HEAD); + DO(SNAP); + DO(CHILD); + DO(CHILD_RSRV); + DO(REFRSRV); +#undef DO } /*ARGSUSED*/ @@ -1145,8 +1154,8 @@ dump_dir(objset_t *os) if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; usedobjs = os->os->os_rootbp->blk_fill; - refdbytes = - os->os->os_spa->spa_dsl_pool->dp_mos_dir->dd_used_bytes; + refdbytes = os->os->os_spa->spa_dsl_pool-> + dp_mos_dir->dd_phys->dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } diff --git a/usr/src/cmd/zfs/zfs_main.c b/usr/src/cmd/zfs/zfs_main.c index cf1c9d364c..2cb34c5c4e 100644 --- a/usr/src/cmd/zfs/zfs_main.c +++ b/usr/src/cmd/zfs/zfs_main.c @@ -361,7 +361,7 @@ usage(boolean_t requested) (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); - (void) fprintf(fp, gettext("\n\nUser-defined properties can " + (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); } else if (show_permissions) { @@ -1756,12 +1756,12 @@ zfs_do_list(int argc, char **argv) "name,used,available,referenced,mountpoint"; int types = ZFS_TYPE_DATASET; char *fields = NULL; - char *basic_fields = default_fields; list_cbdata_t cb = { 0 }; char *value; int ret; char *type_subopts[] = { "filesystem", "volume", "snapshot", NULL }; zfs_sort_column_t *sortcol = NULL; + boolean_t gottypes = B_FALSE; /* check options */ while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) { @@ -1792,6 +1792,7 @@ zfs_do_list(int argc, char **argv) } break; case 't': + gottypes = B_TRUE; types = 0; while (*optarg != '\0') { switch (getsubopt(&optarg, type_subopts, @@ -1829,7 +1830,14 @@ zfs_do_list(int argc, char **argv) argv += optind; if (fields == NULL) - fields = basic_fields; + fields = default_fields; + + /* + * If they only specified "-o space" and no types, don't display 
+ * snapshots. + */ + if (strcmp(fields, "space") == 0 && !gottypes) + types &= ~ZFS_TYPE_SNAPSHOT; /* * If the user specifies '-o all', the zprop_get_list() doesn't diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 3551b51c20..645049871f 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -3485,6 +3485,7 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); + (void) printf(gettext(" 13 snapused property\n")); (void) printf(gettext("For more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" diff --git a/usr/src/common/zfs/zfs_prop.c b/usr/src/common/zfs/zfs_prop.c index 11461c3f91..6893ea172a 100644 --- a/usr/src/common/zfs/zfs_prop.c +++ b/usr/src/common/zfs/zfs_prop.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -269,6 +267,15 @@ zfs_prop_init(void) register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); + register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); + register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDDS"); + register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDCHILD"); + register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, + PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); /* default number properties */ register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, diff --git a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h index 1c6d133817..5450018afb 100644 --- a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h +++ b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h @@ -24,12 +24,10 @@ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * On-disk version number. */ -#define SPA_VERSION 12ULL +#define SPA_VERSION 13ULL /* * The following are configuration names used in the nvlist describing a pool's diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c index d8d018e2d3..b3526d12b6 100644 --- a/usr/src/lib/libzfs/common/libzfs_dataset.c +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c @@ -2225,6 +2225,15 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); + /* + * If we tried to use a default value for a + * readonly property, it means that it was not + * present; return an error. 
+ */ + if (zfs_prop_readonly(prop) && + *source && (*source)[0] == '\0') { + return (-1); + } break; case PROP_TYPE_STRING: diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c index b6caa797bd..e10f5b3b06 100644 --- a/usr/src/lib/libzfs/common/libzfs_util.c +++ b/usr/src/lib/libzfs/common/libzfs_util.c @@ -1178,6 +1178,51 @@ error: return (-1); } +static int +addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, + zfs_type_t type) +{ + int prop; + zprop_list_t *entry; + + prop = zprop_name_to_prop(propname, type); + + if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type)) + prop = ZPROP_INVAL; + + /* + * When no property table entry can be found, return failure if + * this is a pool property or if this isn't a user-defined + * dataset property, + */ + if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || + !zfs_prop_user(propname))) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + return (zfs_error(hdl, EZFS_BADPROP, + dgettext(TEXT_DOMAIN, "bad property list"))); + } + + if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) + return (-1); + + entry->pl_prop = prop; + if (prop == ZPROP_INVAL) { + if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == NULL) { + free(entry); + return (-1); + } + entry->pl_width = strlen(propname); + } else { + entry->pl_width = zprop_width(prop, &entry->pl_fixed, + type); + } + + *listp = entry; + + return (0); +} + /* * Given a comma-separated list of properties, construct a property list * containing both user-defined and native properties. This function will @@ -1188,15 +1233,7 @@ int zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, zfs_type_t type) { - size_t len; - char *s, *p; - char c; - int prop; - zprop_list_t *entry; - zprop_list_t **last; - *listp = NULL; - last = listp; /* * If 'all' is specified, return a NULL list. 
@@ -1218,13 +1255,16 @@ zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, * It would be nice to use getsubopt() here, but the inclusion of column * aliases makes this more effort than it's worth. */ - s = props; - while (*s != '\0') { - if ((p = strchr(s, ',')) == NULL) { - len = strlen(s); - p = s + len; + while (*props != '\0') { + size_t len; + char *p; + char c; + + if ((p = strchr(props, ',')) == NULL) { + len = strlen(props); + p = props + len; } else { - len = p - s; + len = p - props; } /* @@ -1240,48 +1280,31 @@ zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, /* * Check all regular property names. */ - c = s[len]; - s[len] = '\0'; - prop = zprop_name_to_prop(s, type); - - if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type)) - prop = ZPROP_INVAL; - - /* - * When no property table entry can be found, return failure if - * this is a pool property or if this isn't a user-defined - * dataset property, - */ - if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - !zfs_prop_user(s))) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property '%s'"), s); - return (zfs_error(hdl, EZFS_BADPROP, - dgettext(TEXT_DOMAIN, "bad property list"))); - } - - if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) - return (-1); - - entry->pl_prop = prop; - if (prop == ZPROP_INVAL) { - if ((entry->pl_user_prop = zfs_strdup(hdl, s)) - == NULL) { - free(entry); - return (-1); + c = props[len]; + props[len] = '\0'; + + if (strcmp(props, "space") == 0) { + static char *spaceprops[] = { + "name", "avail", "used", "usedbysnapshots", + "usedbydataset", "usedbyrefreservation", + "usedbychildren", NULL + }; + int i; + + for (i = 0; spaceprops[i]; i++) { + if (addlist(hdl, spaceprops[i], listp, type)) + return (-1); + listp = &(*listp)->pl_next; } - entry->pl_width = strlen(s); } else { - entry->pl_width = zprop_width(prop, &entry->pl_fixed, - type); + if (addlist(hdl, props, listp, type)) + return (-1); + listp = 
&(*listp)->pl_next; } - *last = entry; - last = &entry->pl_next; - - s = p; + props = p; if (c == ',') - s++; + props++; } return (0); diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c index a30c3e167e..93b7741d77 100644 --- a/usr/src/uts/common/fs/zfs/bplist.c +++ b/usr/src/uts/common/fs/zfs/bplist.c @@ -23,8 +23,6 @@ * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -311,3 +309,41 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) return (err); } + +/* + * Return (in *dasizep) the amount of space on the deadlist which is: + * mintxg < blk_birth <= maxtxg + */ +int +bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, + uint64_t *dasizep) +{ + uint64_t size = 0; + uint64_t itor = 0; + blkptr_t bp; + int err; + + /* + * As an optimization, if they want the whole txg range, just + * get bpl_bytes rather than iterating over the bps. + */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX) { + mutex_enter(&bpl->bpl_lock); + err = bplist_hold(bpl); + if (err == 0) + *dasizep = bpl->bpl_phys->bpl_bytes; + mutex_exit(&bpl->bpl_lock); + return (err); + } + + while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { + if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { + size += + bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + } + } + if (err == ENOENT) + err = 0; + *dasizep = size; + return (err); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index 1d93acd589..6deb3977d2 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -97,7 +97,7 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) * dsl_dir. 
*/ ASSERT3U(compressed, ==, uncompressed); /* it's all metadata */ - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, used, compressed, uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); return; @@ -110,7 +110,10 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) ds->ds_phys->ds_uncompressed_bytes += uncompressed; ds->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, delta, compressed, uncompressed, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, + compressed, uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); } int @@ -137,13 +140,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); ASSERT(err == 0); - dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, + dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); dsl_dir_dirty(tx->tx_pool->dp_mos_dir, tx); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + ASSERT(!dsl_dataset_is_snapshot(ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { @@ -152,7 +156,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dprintf_bp(bp, "freeing: %s", ""); err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, pio ? ARC_NOWAIT: ARC_WAIT); + tx->tx_txg, bp, NULL, NULL, pio ? 
ARC_NOWAIT : ARC_WAIT); ASSERT(err == 0); mutex_enter(&ds->ds_lock); @@ -161,8 +165,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, delta = parent_delta(ds, -used); ds->ds_phys->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); - dsl_dir_diduse_space(ds->ds_dir, + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); @@ -178,6 +184,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, ds->ds_prev->ds_phys->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } + if (bp->blk_birth > ds->ds_origin_txg) { + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } } mutex_enter(&ds->ds_lock); ASSERT3U(ds->ds_phys->ds_used_bytes, >=, used); @@ -371,18 +381,31 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, return (err); } - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { + if (!dsl_dataset_is_snapshot(ds)) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { err = dsl_dataset_get_ref(dp, ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev); } + + if (err == 0 && dsl_dir_is_clone(ds->ds_dir)) { + dsl_dataset_t *origin; + + err = dsl_dataset_hold_obj(dp, + ds->ds_dir->dd_phys->dd_origin_obj, + FTAG, &origin); + if (err == 0) { + ds->ds_origin_txg = + origin->ds_phys->ds_creation_txg; + dsl_dataset_rele(origin, FTAG); + } + } } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { err = dsl_dataset_get_snapname(ds); } - if (!dsl_dataset_is_snapshot(ds)) { + if (err == 0 && !dsl_dataset_is_snapshot(ds)) { /* * In sync context, we're called with either no lock * or with the write lock. 
If we're not syncing, @@ -1121,13 +1144,12 @@ dsl_dataset_unique(dsl_dataset_t *ds) } struct killarg { - int64_t *usedp; - int64_t *compressedp; - int64_t *uncompressedp; + dsl_dataset_t *ds; zio_t *zio; dmu_tx_t *tx; }; +/* ARGSUSED */ static int kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) { @@ -1136,16 +1158,9 @@ kill_blkptr(traverse_blk_cache_t *bc, spa_t *spa, void *arg) ASSERT3U(bc->bc_errno, ==, 0); - /* - * Since this callback is not called concurrently, no lock is - * needed on the accounting values. - */ - *ka->usedp += bp_get_dasize(spa, bp); - *ka->compressedp += BP_GET_PSIZE(bp); - *ka->uncompressedp += BP_GET_UCSIZE(bp); - /* XXX check for EIO? */ - (void) dsl_free(ka->zio, spa_get_dsl(spa), ka->tx->tx_txg, - bp, NULL, NULL, ARC_NOWAIT); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); + return (0); } @@ -1208,6 +1223,16 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_user_ptr = NULL; } + /* Transfer space that was freed since last snap back to the head. */ + { + uint64_t used; + + VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, + ds->ds_origin_txg, UINT64_MAX, &used)); + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_SNAP, DD_USED_HEAD, tx); + } + /* Zero out the deadlist. 
*/ bplist_close(&ds->ds_deadlist); bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); @@ -1219,31 +1244,29 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { /* Free blkptrs that we gave birth to */ zio_t *zio; - int64_t used = 0, compressed = 0, uncompressed = 0; struct killarg ka; - int64_t delta; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; (void) traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); (void) zio_wait(zio); - - /* only deduct space beyond any refreservation */ - delta = parent_delta(ds, -used); - dsl_dir_diduse_space(ds->ds_dir, - delta, -compressed, -uncompressed, tx); } + ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || + ds->ds_phys->ds_unique_bytes == 0); + if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { /* Change our contents to that of the prev snapshot */ + ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); + ASSERT3U(ds->ds_phys->ds_used_bytes, <=, + ds->ds_prev->ds_phys->ds_used_bytes); + ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; ds->ds_phys->ds_used_bytes = ds->ds_prev->ds_phys->ds_used_bytes; @@ -1252,7 +1275,6 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_uncompressed_bytes = ds->ds_prev->ds_phys->ds_uncompressed_bytes; ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - ds->ds_phys->ds_unique_bytes = 0; if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); @@ -1261,13 +1283,17 @@ dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } else { objset_impl_t *osi; - /* Zero out our contents, recreate objset */ + ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); + ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); + 
ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); + bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); - ds->ds_phys->ds_used_bytes = 0; - ds->ds_phys->ds_compressed_bytes = 0; - ds->ds_phys->ds_uncompressed_bytes = 0; ds->ds_phys->ds_flags = 0; ds->ds_phys->ds_unique_bytes = 0; + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, &ds->ds_phys->ds_bp, *ost, tx); #ifdef _KERNEL @@ -1403,7 +1429,6 @@ void dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - int64_t used = 0, compressed = 0, uncompressed = 0; zio_t *zio; int err; int after_branch_point = FALSE; @@ -1476,6 +1501,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsl_dataset_t *ds_next; uint64_t itor = 0; uint64_t old_unique; + int64_t used = 0, compressed = 0, uncompressed = 0; VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next)); @@ -1519,6 +1545,12 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } } + ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); + + /* change snapused */ + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -compressed, -uncompressed, tx); + /* free next's deadlist */ bplist_close(&ds_next->ds_deadlist); bplist_destroy(mos, ds_next->ds_phys->ds_deadlist_obj, tx); @@ -1545,21 +1577,17 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) * config lock held */ dsl_dataset_t *ds_after_next; + uint64_t space; VERIFY(0 == dsl_dataset_hold_obj(dp, ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_after_next)); - itor = 0; - while (bplist_iterate(&ds_after_next->ds_deadlist, - &itor, &bp) == 0) { - if (bp.blk_birth > - ds->ds_phys->ds_prev_snap_txg && - bp.blk_birth <= - ds->ds_phys->ds_creation_txg) { - ds_next->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); - } - } + + 
VERIFY(0 == + bplist_space_birthrange(&ds_after_next->ds_deadlist, + ds->ds_phys->ds_prev_snap_txg, + ds->ds_phys->ds_creation_txg, &space)); + ds_next->ds_phys->ds_unique_bytes += space; dsl_dataset_rele(ds_after_next, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); @@ -1588,18 +1616,11 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) ASSERT(old_unique <= new_unique); mrsdelta = MIN(new_unique - old_unique, ds_next->ds_reserved - old_unique); - dsl_dir_diduse_space(ds->ds_dir, -mrsdelta, - 0, 0, tx); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); } } dsl_dataset_rele(ds_next, FTAG); - - /* - * NB: unique_bytes might not be accurate for the head objset. - * Before SPA_VERSION 9, we didn't update its value when we - * deleted the most recent snapshot. - */ - ASSERT3U(used, ==, ds->ds_phys->ds_unique_bytes); } else { /* * There's no next snapshot, so this is a head dataset. @@ -1618,26 +1639,22 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * - * XXX we're doing this long task with the config lock held + * NB: this should be very quick, because we already + * freed all the objects in open context. 
*/ - ka.usedp = &used; - ka.compressedp = &compressed; - ka.uncompressedp = &uncompressed; + ka.ds = ds; ka.zio = zio; ka.tx = tx; err = traverse_dsl_dataset(ds, ds->ds_phys->ds_prev_snap_txg, ADVANCE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); - ASSERT(spa_version(dp->dp_spa) < - SPA_VERSION_UNIQUE_ACCURATE || - used == ds->ds_phys->ds_unique_bytes); + ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ds->ds_phys->ds_unique_bytes == 0); } err = zio_wait(zio); ASSERT3U(err, ==, 0); - dsl_dir_diduse_space(ds->ds_dir, -used, -compressed, -uncompressed, tx); - if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); @@ -1681,10 +1698,8 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == dmu_object_free(mos, ds->ds_phys->ds_next_clones_obj, tx)); } - if (ds->ds_phys->ds_props_obj != 0) { - VERIFY(0 == zap_destroy(mos, - ds->ds_phys->ds_props_obj, tx)); - } + if (ds->ds_phys->ds_props_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); @@ -1831,7 +1846,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) */ if (ds->ds_reserved) { int64_t add = MIN(dsl_dataset_unique(ds), ds->ds_reserved); - dsl_dir_diduse_space(ds->ds_dir, add, 0, 0, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, + add, 0, 0, tx); } bplist_close(&ds->ds_deadlist); @@ -2229,25 +2245,21 @@ struct promotenode { }; struct promotearg { - list_t snap_list; - dsl_dataset_t *clone_origin, *old_head; - uint64_t used, comp, uncomp, unique; - uint64_t newnext_obj; + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin, *origin_head; + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; }; +static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); + /* ARGSUSED */ static int 
dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->snap_list); - dsl_pool_t *dp = hds->ds_dir->dd_pool; + struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; - dsl_dataset_t *newnext_ds; - char *name; - uint64_t itor = 0; - blkptr_t bp; int err; /* Check that it is a real clone */ @@ -2261,93 +2273,109 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) return (EXDEV); - /* find origin's new next ds */ - newnext_ds = hds; - while (newnext_ds->ds_phys->ds_prev_snap_obj != origin_ds->ds_object) { - dsl_dataset_t *prev; - - err = dsl_dataset_hold_obj(dp, - newnext_ds->ds_phys->ds_prev_snap_obj, FTAG, &prev); - if (newnext_ds != hds) - dsl_dataset_rele(newnext_ds, FTAG); - if (err) - return (err); - newnext_ds = prev; - } - pa->newnext_obj = newnext_ds->ds_object; - /* compute origin's new unique space */ - pa->unique = 0; - while ((err = bplist_iterate(&newnext_ds->ds_deadlist, - &itor, &bp)) == 0) { - if (bp.blk_birth > origin_ds->ds_phys->ds_prev_snap_txg) - pa->unique += bp_get_dasize(dp->dp_spa, &bp); - } - if (newnext_ds != hds) - dsl_dataset_rele(newnext_ds, FTAG); - if (err != ENOENT) + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + err = bplist_space_birthrange(&snap->ds->ds_deadlist, + origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX, &pa->unique); + if (err) return (err); - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - /* * Walk the snapshots that we are moving * - * Compute space to transfer. Each snapshot gave birth to: - * (my used) - (prev's used) + (deadlist's used) + * Compute space to transfer. 
Consider the incremental changes + * to used for each snapshot: + * (my used) = (prev's used) + (blocks born) - (blocks killed) + * So each snapshot gave birth to: + * (blocks born) = (my used) - (prev's used) + (blocks killed) * So a sequence would look like: - * uN - u(N-1) + dN + ... + u1 - u0 + d1 + u0 - 0 + d0 + * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) * Which simplifies to: - * uN + dN + ... + d1 + d0 + * uN + kN + kN-1 + ... + k1 + k0 * Note however, if we stop before we reach the ORIGIN we get: - * uN + dN + ... + dM - uM-1 + * uN + kN + kN-1 + ... + kM - uM-1 */ pa->used = origin_ds->ds_phys->ds_used_bytes; pa->comp = origin_ds->ds_phys->ds_compressed_bytes; pa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes; - do { + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; /* Check that the snapshot name does not conflict */ - dsl_dataset_name(ds, name); + VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) - err = EEXIST; + return (EEXIST); if (err != ENOENT) - break; - err = 0; + return (err); /* The very first snapshot does not have a deadlist */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { - if (err = bplist_space(&ds->ds_deadlist, - &dlused, &dlcomp, &dluncomp)) - break; - pa->used += dlused; - pa->comp += dlcomp; - pa->uncomp += dluncomp; - } - } while (snap = list_next(&pa->snap_list, snap)); + if (ds->ds_phys->ds_prev_snap_obj == 0) + continue; + + if (err = bplist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp)) + return (err); + pa->used += dlused; + pa->comp += dlcomp; + pa->uncomp += dluncomp; + } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. 
*/ - if (pa->clone_origin) { - pa->used -= pa->clone_origin->ds_phys->ds_used_bytes; - pa->comp -= pa->clone_origin->ds_phys->ds_compressed_bytes; - pa->uncomp -= pa->clone_origin->ds_phys->ds_uncompressed_bytes; + if (pa->origin_origin) { + pa->used -= pa->origin_origin->ds_phys->ds_used_bytes; + pa->comp -= pa->origin_origin->ds_phys->ds_compressed_bytes; + pa->uncomp -= pa->origin_origin->ds_phys->ds_uncompressed_bytes; } - kmem_free(name, MAXPATHLEN); - /* Check that there is enough space here */ - if (err == 0) { - dsl_dir_t *odd = origin_ds->ds_dir; - err = dsl_dir_transfer_possible(odd, hds->ds_dir, pa->used); + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + pa->used); + if (err) + return (err); + + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so snap->ds->ds_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> bplist_space_birthrange() + * calls will be fast because they do not have to + * iterate over all bps. 
+ */ + snap = list_head(&pa->origin_snaps); + err = snaplist_space(&pa->shared_snaps, + snap->ds->ds_origin_txg, &pa->cloneusedsnap); + if (err) + return (err); + + err = snaplist_space(&pa->clone_snaps, + snap->ds->ds_origin_txg, &space); + if (err) + return (err); + pa->cloneusedsnap += space; + } + if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&pa->origin_snaps, + origin_ds->ds_phys->ds_creation_txg, &pa->originusedsnap); + if (err) + return (err); } - return (err); + return (0); } static void @@ -2355,16 +2383,20 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *hds = arg1; struct promotearg *pa = arg2; - struct promotenode *snap = list_head(&pa->snap_list); + struct promotenode *snap = list_head(&pa->shared_snaps); dsl_dataset_t *origin_ds = snap->ds; + dsl_dataset_t *origin_head; dsl_dir_t *dd = hds->ds_dir; dsl_pool_t *dp = hds->ds_dir->dd_pool; dsl_dir_t *odd = NULL; - char *name; uint64_t oldnext_obj; + int64_t delta; ASSERT(0 == (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE)); + snap = list_head(&pa->origin_snaps); + origin_head = snap->ds; + /* * We need to explicitly open odd, since origin_ds's dd will be * changing. 
@@ -2375,13 +2407,15 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj; - origin_ds->ds_phys->ds_next_snap_obj = pa->newnext_obj; + snap = list_tail(&pa->clone_snaps); + ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object); + origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, - pa->newnext_obj, tx)); + origin_ds->ds_phys->ds_next_snap_obj, tx)); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); @@ -2391,12 +2425,14 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object); dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj; + hds->ds_origin_txg = origin_head->ds_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); odd->dd_phys->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_origin_txg = origin_ds->ds_phys->ds_creation_txg; /* move snapshots to this dir */ - name = kmem_alloc(MAXPATHLEN, KM_SLEEP); - do { + for (snap = list_head(&pa->shared_snaps); snap; + snap = list_next(&pa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; /* unregister props as dsl_dir is changing */ @@ -2405,8 +2441,8 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_user_ptr = NULL; } /* move snap name entry */ - dsl_dataset_name(ds, name); - VERIFY(0 == dsl_dataset_snap_remove(pa->old_head, + VERIFY(0 == dsl_dataset_get_snapname(ds)); + VERIFY(0 == dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname, @@ -2421,11 +2457,31 
@@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) NULL, ds, &ds->ds_dir)); ASSERT3U(dsl_prop_numcb(ds), ==, 0); - } while (snap = list_next(&pa->snap_list, snap)); + } + + /* + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. + */ + + delta = pa->cloneusedsnap - + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(pa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + pa->used - delta, pa->comp, pa->uncomp, tx); + + delta = pa->originusedsnap - + odd->dd_phys->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(pa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -pa->used - delta, -pa->comp, -pa->uncomp, tx); - /* change space accounting */ - dsl_dir_diduse_space(odd, -pa->used, -pa->comp, -pa->uncomp, tx); - dsl_dir_diduse_space(dd, pa->used, pa->comp, pa->uncomp, tx); origin_ds->ds_phys->ds_unique_bytes = pa->unique; /* log history record */ @@ -2433,9 +2489,106 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) cr, "dataset = %llu", hds->ds_object); dsl_dir_close(odd, FTAG); - kmem_free(name, MAXPATHLEN); } +static char *snaplist_tag = "snaplist"; +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. 
+ */ +static int +snaplist_make(dsl_pool_t *dp, boolean_t own, + uint64_t first_obj, uint64_t last_obj, list_t *l) +{ + uint64_t obj = last_obj; + + ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); + + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; + + if (own) { + err = dsl_dataset_own_obj(dp, obj, + 0, snaplist_tag, &ds); + if (err == 0) + dsl_dataset_make_exclusive(ds, snaplist_tag); + } else { + err = dsl_dataset_hold_obj(dp, obj, snaplist_tag, &ds); + } + if (err == ENOENT) { + /* lost race with snapshot destroy */ + struct promotenode *last = list_tail(l); + ASSERT(obj != last->ds->ds_phys->ds_prev_snap_obj); + obj = last->ds->ds_phys->ds_prev_snap_obj; + continue; + } else if (err) { + return (err); + } + + if (first_obj == 0) + first_obj = ds->ds_dir->dd_phys->dd_origin_obj; + + snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = ds->ds_phys->ds_prev_snap_obj; + } + + return (0); +} + +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; + + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used; + int err = bplist_space_birthrange(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used); + if (err) + return (err); + *spacep += used; + } + return (0); +} + +static void +snaplist_destroy(list_t *l, boolean_t own) +{ + struct promotenode *snap; + + if (!list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + if (own) + dsl_dataset_disown(snap->ds, snaplist_tag); + else + dsl_dataset_rele(snap->ds, snaplist_tag); + kmem_free(snap, sizeof (struct promotenode)); + } + list_destroy(l); +} + +/* + * Promote a clone. 
Nomenclature note: + * "clone" or "cds": the original clone which is being promoted + * "origin" or "ods": the snapshot which is originally clone's origin + * "origin head" or "ohds": the dataset which is the head + * (filesystem/volume) for the origin + * "origin origin": the origin of the origin's filesystem (typically + * NULL, indicating that the clone is not a clone of a clone). + */ int dsl_dataset_promote(const char *name) { @@ -2443,10 +2596,8 @@ dsl_dataset_promote(const char *name) dsl_dir_t *dd; dsl_pool_t *dp; dmu_object_info_t doi; - struct promotearg pa; + struct promotearg pa = { 0 }; struct promotenode *snap; - uint64_t snap_obj; - uint64_t last_snap = 0; int err; err = dsl_dataset_hold(name, FTAG, &ds); @@ -2462,88 +2613,62 @@ dsl_dataset_promote(const char *name) return (err); } + if (dsl_dataset_is_snapshot(ds) || dd->dd_phys->dd_origin_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + /* * We are going to inherit all the snapshots taken before our * origin (i.e., our new origin will be our parent's origin). * Take ownership of them so that we can rename them into our * namespace. */ - pa.clone_origin = NULL; - list_create(&pa.snap_list, - sizeof (struct promotenode), offsetof(struct promotenode, link)); rw_enter(&dp->dp_config_rwlock, RW_READER); - ASSERT(dd->dd_phys->dd_origin_obj != 0); - snap_obj = dd->dd_phys->dd_origin_obj; - while (snap_obj) { - dsl_dataset_t *snapds; - /* - * NB: this would be handled by the below check for - * clone of a clone, but then we'd always own_obj() the - * $ORIGIN, thus causing unnecessary EBUSYs. We don't - * need to set pa.clone_origin because the $ORIGIN has - * no data to account for. 
- */ - if (dp->dp_origin_snap && - snap_obj == dp->dp_origin_snap->ds_object) - break; + err = snaplist_make(dp, B_TRUE, 0, dd->dd_phys->dd_origin_obj, + &pa.shared_snaps); + if (err != 0) + goto out; - err = dsl_dataset_own_obj(dp, snap_obj, 0, FTAG, &snapds); - if (err == ENOENT) { - /* lost race with snapshot destroy */ - struct promotenode *last = list_tail(&pa.snap_list); - ASSERT(snap_obj != last->ds->ds_phys->ds_prev_snap_obj); - snap_obj = last->ds->ds_phys->ds_prev_snap_obj; - continue; - } else if (err) { - rw_exit(&dp->dp_config_rwlock); - goto out; - } + err = snaplist_make(dp, B_FALSE, 0, ds->ds_object, &pa.clone_snaps); + if (err != 0) + goto out; - /* - * We could be a clone of a clone. If we reach our - * parent's branch point, we're done. - */ - if (last_snap && - snapds->ds_phys->ds_next_snap_obj != last_snap) { - pa.clone_origin = snapds; - break; - } + snap = list_head(&pa.shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj); + err = snaplist_make(dp, B_FALSE, dd->dd_phys->dd_origin_obj, + snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, &pa.origin_snaps); + if (err != 0) + goto out; - snap = kmem_alloc(sizeof (struct promotenode), KM_SLEEP); - snap->ds = snapds; - list_insert_tail(&pa.snap_list, snap); - last_snap = snap_obj; - snap_obj = snap->ds->ds_phys->ds_prev_snap_obj; - dsl_dataset_make_exclusive(snapds, FTAG); - } - snap = list_head(&pa.snap_list); - ASSERT(snap != NULL); - err = dsl_dataset_hold_obj(dp, - snap->ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &pa.old_head); - rw_exit(&dp->dp_config_rwlock); + if (dsl_dir_is_clone(snap->ds->ds_dir)) { + err = dsl_dataset_own_obj(dp, + snap->ds->ds_dir->dd_phys->dd_origin_obj, + 0, FTAG, &pa.origin_origin); + if (err != 0) + goto out; + } - if (err) - goto out; +out: + rw_exit(&dp->dp_config_rwlock); /* * Add in 128x the snapnames zapobj size, since we will be moving * a bunch of snapnames to the promoted ds, and dirtying their * bonus buffers. 
*/ - err = dsl_sync_task_do(dp, dsl_dataset_promote_check, - dsl_dataset_promote_sync, ds, &pa, 2 + 2 * doi.doi_physical_blks); - - dsl_dataset_rele(pa.old_head, FTAG); -out: - while ((snap = list_tail(&pa.snap_list)) != NULL) { - list_remove(&pa.snap_list, snap); - dsl_dataset_disown(snap->ds, FTAG); - kmem_free(snap, sizeof (struct promotenode)); + if (err == 0) { + err = dsl_sync_task_do(dp, dsl_dataset_promote_check, + dsl_dataset_promote_sync, ds, &pa, + 2 + 2 * doi.doi_physical_blks); } - list_destroy(&pa.snap_list); - if (pa.clone_origin) - dsl_dataset_disown(pa.clone_origin, FTAG); + + snaplist_destroy(&pa.shared_snaps, B_TRUE); + snaplist_destroy(&pa.clone_snaps, B_FALSE); + snaplist_destroy(&pa.origin_snaps, B_FALSE); + if (pa.origin_origin) + dsl_dataset_disown(pa.origin_origin, FTAG); dsl_dataset_rele(ds, FTAG); return (err); } @@ -2604,10 +2729,6 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { struct cloneswaparg *csa = arg1; dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; - uint64_t itor = 0; - blkptr_t bp; - uint64_t unique = 0; - int err; ASSERT(csa->cds->ds_reserved == 0); ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); @@ -2627,16 +2748,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) csa->ohds->ds_user_ptr = NULL; } - /* compute unique space */ - while ((err = bplist_iterate(&csa->cds->ds_deadlist, - &itor, &bp)) == 0) { - if (bp.blk_birth > csa->cds->ds_prev->ds_phys->ds_prev_snap_txg) - unique += bp_get_dasize(dp->dp_spa, &bp); - } - VERIFY(err == ENOENT); - /* reset origin's unique bytes */ - csa->cds->ds_prev->ds_phys->ds_unique_bytes = unique; + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); /* swap blkptrs */ { @@ -2652,10 +2767,14 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) uint64_t cdl_used, cdl_comp, cdl_uncomp; 
uint64_t odl_used, odl_comp, odl_uncomp; + ASSERT3U(csa->cds->ds_dir->dd_phys-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + VERIFY(0 == bplist_space(&csa->cds->ds_deadlist, &cdl_used, &cdl_comp, &cdl_uncomp)); VERIFY(0 == bplist_space(&csa->ohds->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp)); + dused = csa->cds->ds_phys->ds_used_bytes + cdl_used - (csa->ohds->ds_phys->ds_used_bytes + odl_used); dcomp = csa->cds->ds_phys->ds_compressed_bytes + cdl_comp - @@ -2664,10 +2783,23 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) cdl_uncomp - (csa->ohds->ds_phys->ds_uncompressed_bytes + odl_uncomp); - dsl_dir_diduse_space(csa->ohds->ds_dir, + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); - dsl_dir_diduse_space(csa->cds->ds_dir, + dsl_dir_diduse_space(csa->cds->ds_dir, DD_USED_HEAD, -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). 
+ */ + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &cdl_used)); + VERIFY(0 == bplist_space_birthrange(&csa->ohds->ds_deadlist, + csa->ohds->ds_origin_txg, UINT64_MAX, &odl_used)); + dsl_dir_transfer_space(csa->ohds->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); } #define SWITCH64(x, y) \ @@ -2688,8 +2820,8 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) csa->cds->ds_phys->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ - dsl_dir_diduse_space(csa->ohds->ds_dir, csa->unused_refres_delta, - 0, 0, tx); + dsl_dir_diduse_space(csa->ohds->ds_dir, DD_USED_REFRSRV, + csa->unused_refres_delta, 0, 0, tx); /* swap deadlists */ bplist_close(&csa->cds->ds_deadlist); @@ -2937,7 +3069,7 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", new_reservation, cr, tx); - dsl_dir_diduse_space(ds->ds_dir, delta, 0, 0, tx); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); spa_history_internal_log(LOG_DS_REFRESERV, ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index d464633658..0441014af5 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -23,8 +23,6 @@ * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -59,8 +57,6 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) ASSERT(dd->dd_space_towrite[t] == 0); } - ASSERT3U(dd->dd_used_bytes, ==, dd->dd_phys->dd_used_bytes); - if (dd->dd_parent) dsl_dir_close(dd->dd_parent, dd); @@ -95,9 +91,9 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif - /* XXX assert bonus buffer size is correct */ if (dd == NULL) { dsl_dir_t *winner; int err; @@ -107,7 +103,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_dbuf = dbuf; dd->dd_pool = dp; dd->dd_phys = dbuf->db_data; - dd->dd_used_bytes = dd->dd_phys->dd_used_bytes; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), @@ -116,12 +111,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); - if (err) { - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } + if (err) + goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; @@ -137,13 +128,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys->dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } - if (err) { - dsl_dir_close(dd->dd_parent, dd); - mutex_destroy(&dd->dd_lock); - kmem_free(dd, sizeof (dsl_dir_t)); - dmu_buf_rele(dbuf, tag); - return (err); - } + if (err) + goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } @@ -176,6 +162,15 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); + +errout: + if (dd->dd_parent) + dsl_dir_close(dd->dd_parent, dd); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + 
return (err); + } void @@ -435,6 +430,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); dsphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + dsphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); @@ -476,6 +473,7 @@ dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dsl_dir_t *dd = arg1; objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t val, obj; + dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); @@ -483,8 +481,10 @@ dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* Remove our reservation. */ val = 0; dsl_dir_set_reservation_sync(dd, &val, cr, tx); - ASSERT3U(dd->dd_used_bytes, ==, 0); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT3U(dd->dd_phys->dd_used_breakdown[t], ==, 0); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx)); VERIFY(0 == zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx)); @@ -510,7 +510,8 @@ void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_used_bytes); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dd->dd_phys->dd_used_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dd->dd_phys->dd_reserved); @@ -518,6 +519,17 @@ dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) dd->dd_phys->dd_compressed_bytes == 0 ? 
100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); + } mutex_exit(&dd->dd_lock); rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); @@ -567,7 +579,6 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; - dd->dd_phys->dd_used_bytes = dd->dd_used_bytes; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ @@ -615,11 +626,9 @@ dsl_dir_space_available(dsl_dir_t *dd, mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; - used = dd->dd_used_bytes; + used = dd->dd_phys->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); - if (dd == ancestor) - used += delta; if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); @@ -634,6 +643,14 @@ dsl_dir_space_available(dsl_dir_t *dd, parentspace += dd->dd_phys->dd_reserved - used; } + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + if (used > quota) { /* over quota */ myspace = 0; @@ -691,7 +708,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; - used_on_disk = dd->dd_used_bytes; + used_on_disk = 
dd->dd_phys->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and @@ -882,7 +899,7 @@ dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; - est_used = dsl_dir_space_towrite(dd) + dd->dd_used_bytes; + est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); @@ -908,33 +925,73 @@ dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) /* call from syncing context when we actually write/free space for this dd */ void -dsl_dir_diduse_space(dsl_dir_t *dd, +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); dsl_dir_dirty(dd, tx); mutex_enter(&dd->dd_lock); - accounted_delta = parent_delta(dd, dd->dd_used_bytes, used); - ASSERT(used >= 0 || dd->dd_used_bytes >= -used); + accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); + ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); - dd->dd_used_bytes += used; + dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; + + if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used > 0 || + dd->dd_phys->dd_used_breakdown[type] >= -used); + dd->dd_phys->dd_used_breakdown[type] += used; +#ifdef DEBUG + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += dd->dd_phys->dd_used_breakdown[t]; + ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); +#endif + } mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { - dsl_dir_diduse_space(dd->dd_parent, + dsl_dir_diduse_space(dd->dd_parent, 
DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); + dsl_dir_transfer_space(dd->dd_parent, + used - accounted_delta, + DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } +void +dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) + return; + + dsl_dir_dirty(dd, tx); + mutex_enter(&dd->dd_lock); + ASSERT(delta > 0 ? + dd->dd_phys->dd_used_breakdown[oldtype] >= delta : + dd->dd_phys->dd_used_breakdown[newtype] >= -delta); + ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); + dd->dd_phys->dd_used_breakdown[oldtype] -= delta; + dd->dd_phys->dd_used_breakdown[newtype] += delta; + mutex_exit(&dd->dd_lock); +} + + static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { @@ -957,7 +1014,7 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) towrite = dsl_dir_space_towrite(dd); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (new_quota < dd->dd_phys->dd_reserved || - new_quota < dd->dd_used_bytes + towrite)) { + new_quota < dd->dd_phys->dd_used_bytes + towrite)) { err = ENOSPC; } mutex_exit(&dd->dd_lock); @@ -1027,7 +1084,7 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) return (0); mutex_enter(&dd->dd_lock); - used = dd->dd_used_bytes; + used = dd->dd_phys->dd_used_bytes; delta = MAX(used, new_reservation) - MAX(used, dd->dd_phys->dd_reserved); mutex_exit(&dd->dd_lock); @@ -1060,7 +1117,7 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); - used = dd->dd_used_bytes; + used = dd->dd_phys->dd_used_bytes; delta = MAX(used, new_reservation) - MAX(used, dd->dd_phys->dd_reserved); dd->dd_phys->dd_reserved = new_reservation; @@ -1068,7 +1125,8 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t 
*cr, dmu_tx_t *tx) if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ - dsl_dir_diduse_space(dd->dd_parent, delta, 0, 0, tx); + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + delta, 0, 0, tx); } spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, @@ -1115,7 +1173,7 @@ would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) return (delta); mutex_enter(&dd->dd_lock); - delta = parent_delta(dd, dd->dd_used_bytes, delta); + delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } @@ -1151,7 +1209,7 @@ dsl_dir_rename_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ra->newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); + MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ if (closest_common_ancestor(dd, ra->newparent) == dd) @@ -1177,15 +1235,24 @@ dsl_dir_rename_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ASSERT(dmu_buf_refcount(dd->dd_dbuf) <= 2); if (ra->newparent != dd->dd_parent) { - uint64_t myspace = - MAX(dd->dd_used_bytes, dd->dd_phys->dd_reserved); - - dsl_dir_diduse_space(dd->dd_parent, -myspace, + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); - dsl_dir_diduse_space(ra->newparent, myspace, + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD, + dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); + + if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { + uint64_t unused_rsrv = dd->dd_phys->dd_reserved - + dd->dd_phys->dd_used_bytes; + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + -unused_rsrv, 0, 0, tx); + dsl_dir_diduse_space(ra->newparent, DD_USED_CHILD_RSRV, + unused_rsrv, 0, 0, tx); + } } 
dmu_buf_will_dirty(dd->dd_dbuf, tx); diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h index 0615b7ac86..cdb93a6c35 100644 --- a/usr/src/uts/common/fs/zfs/sys/bplist.h +++ b/usr/src/uts/common/fs/zfs/sys/bplist.h @@ -26,8 +26,6 @@ #ifndef _SYS_BPLIST_H #define _SYS_BPLIST_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -81,6 +79,8 @@ extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); extern int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +extern int bplist_space_birthrange(bplist_t *bpl, + uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index 9ffa2b1bef..3c5141ab35 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -26,8 +26,6 @@ #ifndef _SYS_DSL_DATASET_H #define _SYS_DSL_DATASET_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -106,8 +104,9 @@ typedef struct dsl_dataset { uint64_t ds_object; uint64_t ds_fsid_guid; - /* only used in syncing context: */ - struct dsl_dataset *ds_prev; /* only valid for non-snapshots */ + /* only used in syncing context, only valid for non-snapshots: */ + struct dsl_dataset *ds_prev; + uint64_t ds_origin_txg; /* has internal locking: */ bplist_t ds_deadlist; diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index 557314a3b3..59d45d0b94 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -26,8 +26,6 @@ #ifndef _SYS_DSL_DIR_H #define _SYS_DSL_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -40,6 +38,17 @@ extern "C" { struct dsl_dataset; +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + 
DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + typedef struct dsl_dir_phys { uint64_t dd_creation_time; /* not actually used */ uint64_t dd_head_dataset_obj; @@ -59,7 +68,9 @@ typedef struct dsl_dir_phys { uint64_t dd_reserved; uint64_t dd_props_zapobj; uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ - uint64_t dd_pad[20]; /* pad out to 256 bytes for good measure */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_pad[14]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { @@ -79,9 +90,6 @@ struct dsl_dir { kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ - /* Accounting */ - /* reflects any changes to dd_phys->dd_used_bytes made this syncing */ - int64_t dd_used_bytes; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; /* amount of space we expect to write; == amount of dirty data */ @@ -114,8 +122,10 @@ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); -void dsl_dir_diduse_space(dsl_dir_t *dd, +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); int dsl_dir_set_quota(const char *ddname, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index d5ca372dbf..9e95932144 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -26,8 +26,6 @@ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -103,6 
+101,10 @@ typedef enum { ZFS_PROP_GUID, ZFS_PROP_PRIMARYCACHE, ZFS_PROP_SECONDARYCACHE, + ZFS_PROP_USEDSNAP, + ZFS_PROP_USEDDS, + ZFS_PROP_USEDCHILD, + ZFS_PROP_USEDREFRESERV, ZFS_NUM_PROPS } zfs_prop_t; @@ -245,13 +247,14 @@ typedef enum zfs_cache_type { #define SPA_VERSION_10 10ULL #define SPA_VERSION_11 11ULL #define SPA_VERSION_12 12ULL +#define SPA_VERSION_13 13ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. */ -#define SPA_VERSION SPA_VERSION_12 -#define SPA_VERSION_STRING "12" +#define SPA_VERSION SPA_VERSION_13 +#define SPA_VERSION_STRING "13" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -285,6 +288,7 @@ typedef enum zfs_cache_type { #define SPA_VERSION_ORIGIN SPA_VERSION_11 #define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 +#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 /* * ZPL version - rev'd whenever an incompatible on-disk format change -- 2.11.4.GIT